author	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:54:49 -0400
commit	d1a76187a5be4f89c6cb19d800cb5fb7aac735c5 (patch)
tree	2fac3ffbfffc7560eeef8364b541d0d7a0057920 /arch/x86
parent	c7e78cff6b7518212247fb20b1dc6411540dc9af (diff)
parent	0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into core/locking
Conflicts:
	arch/um/include/asm/system.h
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 222
-rw-r--r--  arch/x86/Kconfig.cpu | 149
-rw-r--r--  arch/x86/Kconfig.debug | 13
-rw-r--r--  arch/x86/Makefile | 6
-rw-r--r--  arch/x86/Makefile_32.cpu | 6
-rw-r--r--  arch/x86/boot/Makefile | 5
-rw-r--r--  arch/x86/boot/compressed/Makefile | 8
-rw-r--r--  arch/x86/boot/compressed/head_32.S | 5
-rw-r--r--  arch/x86/boot/compressed/misc.c | 12
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 2
-rw-r--r--  arch/x86/boot/cpu.c | 17
-rw-r--r--  arch/x86/boot/edd.c | 7
-rw-r--r--  arch/x86/boot/header.S | 1
-rw-r--r--  arch/x86/boot/mkcpustr.c | 40
-rw-r--r--  arch/x86/boot/video-bios.c | 4
-rw-r--r--  arch/x86/boot/video-vesa.c | 15
-rw-r--r--  arch/x86/configs/i386_defconfig | 22
-rw-r--r--  arch/x86/configs/x86_64_defconfig | 31
-rw-r--r--  arch/x86/crypto/Makefile | 2
-rw-r--r--  arch/x86/crypto/crc32c-intel.c | 197
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 11
-rw-r--r--  arch/x86/ia32/ia32_signal.c | 138
-rw-r--r--  arch/x86/ia32/ia32entry.S | 30
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 108
-rw-r--r--  arch/x86/include/asm/Kbuild | 24
-rw-r--r--  arch/x86/include/asm/a.out-core.h | 73
-rw-r--r--  arch/x86/include/asm/a.out.h | 20
-rw-r--r--  arch/x86/include/asm/acpi.h | 178
-rw-r--r--  arch/x86/include/asm/agp.h | 35
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 22
-rw-r--r--  arch/x86/include/asm/alternative.h | 183
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 35
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 404
-rw-r--r--  arch/x86/include/asm/apic.h | 199
-rw-r--r--  arch/x86/include/asm/apicdef.h | 417
-rw-r--r--  arch/x86/include/asm/arch_hooks.h | 26
-rw-r--r--  arch/x86/include/asm/asm.h | 47
-rw-r--r--  arch/x86/include/asm/atomic.h | 5
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 259
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 473
-rw-r--r--  arch/x86/include/asm/auxvec.h | 12
-rw-r--r--  arch/x86/include/asm/bigsmp/apic.h | 139
-rw-r--r--  arch/x86/include/asm/bigsmp/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/bigsmp/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/bios_ebda.h | 36
-rw-r--r--  arch/x86/include/asm/bitops.h | 451
-rw-r--r--  arch/x86/include/asm/boot.h | 26
-rw-r--r--  arch/x86/include/asm/bootparam.h | 111
-rw-r--r--  arch/x86/include/asm/bug.h | 39
-rw-r--r--  arch/x86/include/asm/bugs.h | 12
-rw-r--r--  arch/x86/include/asm/byteorder.h | 81
-rw-r--r--  arch/x86/include/asm/cache.h | 20
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 118
-rw-r--r--  arch/x86/include/asm/calgary.h | 72
-rw-r--r--  arch/x86/include/asm/calling.h | 170
-rw-r--r--  arch/x86/include/asm/checksum.h | 5
-rw-r--r--  arch/x86/include/asm/checksum_32.h | 189
-rw-r--r--  arch/x86/include/asm/checksum_64.h | 191
-rw-r--r--  arch/x86/include/asm/cmpxchg.h | 5
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 344
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 185
-rw-r--r--  arch/x86/include/asm/compat.h | 218
-rw-r--r--  arch/x86/include/asm/cpu.h | 20
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 271
-rw-r--r--  arch/x86/include/asm/cputime.h | 1
-rw-r--r--  arch/x86/include/asm/current.h | 39
-rw-r--r--  arch/x86/include/asm/debugreg.h | 70
-rw-r--r--  arch/x86/include/asm/delay.h | 31
-rw-r--r--  arch/x86/include/asm/desc.h | 409
-rw-r--r--  arch/x86/include/asm/desc_defs.h | 95
-rw-r--r--  arch/x86/include/asm/device.h | 16
-rw-r--r--  arch/x86/include/asm/div64.h | 60
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 308
-rw-r--r--  arch/x86/include/asm/dma.h | 318
-rw-r--r--  arch/x86/include/asm/dmi.h | 26
-rw-r--r--  arch/x86/include/asm/ds.h | 238
-rw-r--r--  arch/x86/include/asm/dwarf2.h | 61
-rw-r--r--  arch/x86/include/asm/e820.h | 146
-rw-r--r--  arch/x86/include/asm/edac.h | 18
-rw-r--r--  arch/x86/include/asm/efi.h | 110
-rw-r--r--  arch/x86/include/asm/elf.h | 336
-rw-r--r--  arch/x86/include/asm/emergency-restart.h | 18
-rw-r--r--  arch/x86/include/asm/errno.h | 1
-rw-r--r--  arch/x86/include/asm/es7000/apic.h | 193
-rw-r--r--  arch/x86/include/asm/es7000/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/es7000/ipi.h | 24
-rw-r--r--  arch/x86/include/asm/es7000/mpparse.h | 30
-rw-r--r--  arch/x86/include/asm/es7000/wakecpu.h | 59
-rw-r--r--  arch/x86/include/asm/fb.h | 21
-rw-r--r--  arch/x86/include/asm/fcntl.h | 1
-rw-r--r--  arch/x86/include/asm/fixmap.h | 68
-rw-r--r--  arch/x86/include/asm/fixmap_32.h | 123
-rw-r--r--  arch/x86/include/asm/fixmap_64.h | 83
-rw-r--r--  arch/x86/include/asm/floppy.h | 281
-rw-r--r--  arch/x86/include/asm/frame.h | 27
-rw-r--r--  arch/x86/include/asm/ftrace.h | 24
-rw-r--r--  arch/x86/include/asm/futex.h | 140
-rw-r--r--  arch/x86/include/asm/gart.h | 73
-rw-r--r--  arch/x86/include/asm/genapic.h | 5
-rw-r--r--  arch/x86/include/asm/genapic_32.h | 126
-rw-r--r--  arch/x86/include/asm/genapic_64.h | 58
-rw-r--r--  arch/x86/include/asm/geode.h | 253
-rw-r--r--  arch/x86/include/asm/gpio.h | 56
-rw-r--r--  arch/x86/include/asm/hardirq.h | 11
-rw-r--r--  arch/x86/include/asm/hardirq_32.h | 28
-rw-r--r--  arch/x86/include/asm/hardirq_64.h | 23
-rw-r--r--  arch/x86/include/asm/highmem.h | 82
-rw-r--r--  arch/x86/include/asm/hpet.h | 114
-rw-r--r--  arch/x86/include/asm/hugetlb.h | 93
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 131
-rw-r--r--  arch/x86/include/asm/hypertransport.h | 45
-rw-r--r--  arch/x86/include/asm/i387.h | 400
-rw-r--r--  arch/x86/include/asm/i8253.h | 18
-rw-r--r--  arch/x86/include/asm/i8259.h | 63
-rw-r--r--  arch/x86/include/asm/ia32.h | 170
-rw-r--r--  arch/x86/include/asm/ia32_unistd.h | 18
-rw-r--r--  arch/x86/include/asm/idle.h | 16
-rw-r--r--  arch/x86/include/asm/intel_arch_perfmon.h | 31
-rw-r--r--  arch/x86/include/asm/io.h | 91
-rw-r--r--  arch/x86/include/asm/io_32.h | 284
-rw-r--r--  arch/x86/include/asm/io_64.h | 244
-rw-r--r--  arch/x86/include/asm/io_apic.h | 204
-rw-r--r--  arch/x86/include/asm/ioctl.h | 1
-rw-r--r--  arch/x86/include/asm/ioctls.h | 94
-rw-r--r--  arch/x86/include/asm/iommu.h | 50
-rw-r--r--  arch/x86/include/asm/ipcbuf.h | 28
-rw-r--r--  arch/x86/include/asm/ipi.h | 138
-rw-r--r--  arch/x86/include/asm/irq.h | 50
-rw-r--r--  arch/x86/include/asm/irq_regs.h | 5
-rw-r--r--  arch/x86/include/asm/irq_regs_32.h | 29
-rw-r--r--  arch/x86/include/asm/irq_regs_64.h | 1
-rw-r--r--  arch/x86/include/asm/irq_remapping.h | 8
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 164
-rw-r--r--  arch/x86/include/asm/irqflags.h | 211
-rw-r--r--  arch/x86/include/asm/ist.h | 34
-rw-r--r--  arch/x86/include/asm/k8.h | 15
-rw-r--r--  arch/x86/include/asm/kdebug.h | 37
-rw-r--r--  arch/x86/include/asm/kexec.h | 175
-rw-r--r--  arch/x86/include/asm/kgdb.h | 79
-rw-r--r--  arch/x86/include/asm/kmap_types.h | 29
-rw-r--r--  arch/x86/include/asm/kprobes.h | 88
-rw-r--r--  arch/x86/include/asm/kvm.h | 211
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 752
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 147
-rw-r--r--  arch/x86/include/asm/kvm_x86_emulate.h | 184
-rw-r--r--  arch/x86/include/asm/ldt.h | 40
-rw-r--r--  arch/x86/include/asm/lguest.h | 94
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 71
-rw-r--r--  arch/x86/include/asm/linkage.h | 61
-rw-r--r--  arch/x86/include/asm/local.h | 235
-rw-r--r--  arch/x86/include/asm/mach-default/apm.h | 73
-rw-r--r--  arch/x86/include/asm/mach-default/do_timer.h | 16
-rw-r--r--  arch/x86/include/asm/mach-default/entry_arch.h | 36
-rw-r--r--  arch/x86/include/asm/mach-default/mach_apic.h | 156
-rw-r--r--  arch/x86/include/asm/mach-default/mach_apicdef.h | 24
-rw-r--r--  arch/x86/include/asm/mach-default/mach_ipi.h | 64
-rw-r--r--  arch/x86/include/asm/mach-default/mach_mpparse.h | 17
-rw-r--r--  arch/x86/include/asm/mach-default/mach_mpspec.h | 12
-rw-r--r--  arch/x86/include/asm/mach-default/mach_timer.h | 48
-rw-r--r--  arch/x86/include/asm/mach-default/mach_traps.h | 33
-rw-r--r--  arch/x86/include/asm/mach-default/mach_wakecpu.h | 42
-rw-r--r--  arch/x86/include/asm/mach-default/pci-functions.h | 19
-rw-r--r--  arch/x86/include/asm/mach-default/setup_arch.h | 3
-rw-r--r--  arch/x86/include/asm/mach-default/smpboot_hooks.h | 59
-rw-r--r--  arch/x86/include/asm/mach-generic/gpio.h | 15
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_apic.h | 33
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_apicdef.h | 11
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_ipi.h | 10
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_mpparse.h | 10
-rw-r--r--  arch/x86/include/asm/mach-generic/mach_mpspec.h | 12
-rw-r--r--  arch/x86/include/asm/mach-rdc321x/gpio.h | 60
-rw-r--r--  arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h | 12
-rw-r--r--  arch/x86/include/asm/mach-voyager/do_timer.h | 17
-rw-r--r--  arch/x86/include/asm/mach-voyager/entry_arch.h | 26
-rw-r--r--  arch/x86/include/asm/mach-voyager/setup_arch.h | 12
-rw-r--r--  arch/x86/include/asm/math_emu.h | 31
-rw-r--r--  arch/x86/include/asm/mc146818rtc.h | 104
-rw-r--r--  arch/x86/include/asm/mca.h | 43
-rw-r--r--  arch/x86/include/asm/mca_dma.h | 201
-rw-r--r--  arch/x86/include/asm/mce.h | 130
-rw-r--r--  arch/x86/include/asm/microcode.h | 47
-rw-r--r--  arch/x86/include/asm/mman.h | 20
-rw-r--r--  arch/x86/include/asm/mmconfig.h | 12
-rw-r--r--  arch/x86/include/asm/mmu.h | 26
-rw-r--r--  arch/x86/include/asm/mmu_context.h | 37
-rw-r--r--  arch/x86/include/asm/mmu_context_32.h | 56
-rw-r--r--  arch/x86/include/asm/mmu_context_64.h | 54
-rw-r--r--  arch/x86/include/asm/mmx.h | 14
-rw-r--r--  arch/x86/include/asm/mmzone.h | 5
-rw-r--r--  arch/x86/include/asm/mmzone_32.h | 134
-rw-r--r--  arch/x86/include/asm/mmzone_64.h | 51
-rw-r--r--  arch/x86/include/asm/module.h | 80
-rw-r--r--  arch/x86/include/asm/mpspec.h | 145
-rw-r--r--  arch/x86/include/asm/mpspec_def.h | 180
-rw-r--r--  arch/x86/include/asm/msgbuf.h | 39
-rw-r--r--  arch/x86/include/asm/msidef.h | 55
-rw-r--r--  arch/x86/include/asm/msr-index.h | 332
-rw-r--r--  arch/x86/include/asm/msr.h | 247
-rw-r--r--  arch/x86/include/asm/mtrr.h | 173
-rw-r--r--  arch/x86/include/asm/mutex.h | 5
-rw-r--r--  arch/x86/include/asm/mutex_32.h | 125
-rw-r--r--  arch/x86/include/asm/mutex_64.h | 100
-rw-r--r--  arch/x86/include/asm/nmi.h | 81
-rw-r--r--  arch/x86/include/asm/nops.h | 118
-rw-r--r--  arch/x86/include/asm/numa.h | 5
-rw-r--r--  arch/x86/include/asm/numa_32.h | 11
-rw-r--r--  arch/x86/include/asm/numa_64.h | 43
-rw-r--r--  arch/x86/include/asm/numaq.h | 169
-rw-r--r--  arch/x86/include/asm/numaq/apic.h | 136
-rw-r--r--  arch/x86/include/asm/numaq/apicdef.h | 14
-rw-r--r--  arch/x86/include/asm/numaq/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/numaq/mpparse.h | 7
-rw-r--r--  arch/x86/include/asm/numaq/wakecpu.h | 43
-rw-r--r--  arch/x86/include/asm/olpc.h | 132
-rw-r--r--  arch/x86/include/asm/page.h | 209
-rw-r--r--  arch/x86/include/asm/page_32.h | 136
-rw-r--r--  arch/x86/include/asm/page_64.h | 105
-rw-r--r--  arch/x86/include/asm/param.h | 22
-rw-r--r--  arch/x86/include/asm/paravirt.h | 1650
-rw-r--r--  arch/x86/include/asm/parport.h | 10
-rw-r--r--  arch/x86/include/asm/pat.h | 22
-rw-r--r--  arch/x86/include/asm/pci-direct.h | 21
-rw-r--r--  arch/x86/include/asm/pci.h | 114
-rw-r--r--  arch/x86/include/asm/pci_32.h | 34
-rw-r--r--  arch/x86/include/asm/pci_64.h | 66
-rw-r--r--  arch/x86/include/asm/pda.h | 137
-rw-r--r--  arch/x86/include/asm/percpu.h | 218
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 114
-rw-r--r--  arch/x86/include/asm/pgtable-2level-defs.h | 20
-rw-r--r--  arch/x86/include/asm/pgtable-2level.h | 79
-rw-r--r--  arch/x86/include/asm/pgtable-3level-defs.h | 28
-rw-r--r--  arch/x86/include/asm/pgtable-3level.h | 175
-rw-r--r--  arch/x86/include/asm/pgtable.h | 562
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 191
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 285
-rw-r--r--  arch/x86/include/asm/poll.h | 1
-rw-r--r--  arch/x86/include/asm/posix_types.h | 13
-rw-r--r--  arch/x86/include/asm/posix_types_32.h | 85
-rw-r--r--  arch/x86/include/asm/posix_types_64.h | 119
-rw-r--r--  arch/x86/include/asm/prctl.h | 10
-rw-r--r--  arch/x86/include/asm/processor-cyrix.h | 38
-rw-r--r--  arch/x86/include/asm/processor-flags.h | 100
-rw-r--r--  arch/x86/include/asm/processor.h | 936
-rw-r--r--  arch/x86/include/asm/proto.h | 32
-rw-r--r--  arch/x86/include/asm/ptrace-abi.h | 145
-rw-r--r--  arch/x86/include/asm/ptrace.h | 280
-rw-r--r--  arch/x86/include/asm/pvclock-abi.h | 42
-rw-r--r--  arch/x86/include/asm/pvclock.h | 14
-rw-r--r--  arch/x86/include/asm/reboot.h | 21
-rw-r--r--  arch/x86/include/asm/reboot_fixups.h | 6
-rw-r--r--  arch/x86/include/asm/required-features.h | 82
-rw-r--r--  arch/x86/include/asm/resource.h | 1
-rw-r--r--  arch/x86/include/asm/resume-trace.h | 21
-rw-r--r--  arch/x86/include/asm/rio.h | 63
-rw-r--r--  arch/x86/include/asm/rtc.h | 1
-rw-r--r--  arch/x86/include/asm/rwlock.h | 8
-rw-r--r--  arch/x86/include/asm/rwsem.h | 265
-rw-r--r--  arch/x86/include/asm/scatterlist.h | 33
-rw-r--r--  arch/x86/include/asm/seccomp.h | 5
-rw-r--r--  arch/x86/include/asm/seccomp_32.h | 17
-rw-r--r--  arch/x86/include/asm/seccomp_64.h | 25
-rw-r--r--  arch/x86/include/asm/sections.h | 1
-rw-r--r--  arch/x86/include/asm/segment.h | 209
-rw-r--r--  arch/x86/include/asm/sembuf.h | 24
-rw-r--r--  arch/x86/include/asm/serial.h | 29
-rw-r--r--  arch/x86/include/asm/setup.h | 105
-rw-r--r--  arch/x86/include/asm/shmbuf.h | 51
-rw-r--r--  arch/x86/include/asm/shmparam.h | 6
-rw-r--r--  arch/x86/include/asm/sigcontext.h | 284
-rw-r--r--  arch/x86/include/asm/sigcontext32.h | 75
-rw-r--r--  arch/x86/include/asm/siginfo.h | 10
-rw-r--r--  arch/x86/include/asm/signal.h | 262
-rw-r--r--  arch/x86/include/asm/smp.h | 229
-rw-r--r--  arch/x86/include/asm/socket.h | 57
-rw-r--r--  arch/x86/include/asm/sockios.h | 13
-rw-r--r--  arch/x86/include/asm/sparsemem.h | 34
-rw-r--r--  arch/x86/include/asm/spinlock.h | 364
-rw-r--r--  arch/x86/include/asm/spinlock_types.h | 20
-rw-r--r--  arch/x86/include/asm/srat.h | 39
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 21
-rw-r--r--  arch/x86/include/asm/stat.h | 114
-rw-r--r--  arch/x86/include/asm/statfs.h | 12
-rw-r--r--  arch/x86/include/asm/string.h | 5
-rw-r--r--  arch/x86/include/asm/string_32.h | 326
-rw-r--r--  arch/x86/include/asm/string_64.h | 60
-rw-r--r--  arch/x86/include/asm/summit/apic.h | 184
-rw-r--r--  arch/x86/include/asm/summit/apicdef.h | 13
-rw-r--r--  arch/x86/include/asm/summit/ipi.h | 25
-rw-r--r--  arch/x86/include/asm/summit/mpparse.h | 109
-rw-r--r--  arch/x86/include/asm/suspend.h | 5
-rw-r--r--  arch/x86/include/asm/suspend_32.h | 51
-rw-r--r--  arch/x86/include/asm/suspend_64.h | 52
-rw-r--r--  arch/x86/include/asm/swiotlb.h | 58
-rw-r--r--  arch/x86/include/asm/sync_bitops.h | 130
-rw-r--r--  arch/x86/include/asm/syscall.h | 213
-rw-r--r--  arch/x86/include/asm/syscalls.h | 93
-rw-r--r--  arch/x86/include/asm/system.h | 425
-rw-r--r--  arch/x86/include/asm/system_64.h | 22
-rw-r--r--  arch/x86/include/asm/tce.h | 48
-rw-r--r--  arch/x86/include/asm/termbits.h | 198
-rw-r--r--  arch/x86/include/asm/termios.h | 113
-rw-r--r--  arch/x86/include/asm/therm_throt.h | 9
-rw-r--r--  arch/x86/include/asm/thread_info.h | 264
-rw-r--r--  arch/x86/include/asm/time.h | 63
-rw-r--r--  arch/x86/include/asm/timer.h | 66
-rw-r--r--  arch/x86/include/asm/timex.h | 19
-rw-r--r--  arch/x86/include/asm/tlb.h | 11
-rw-r--r--  arch/x86/include/asm/tlbflush.h | 178
-rw-r--r--  arch/x86/include/asm/topology.h | 258
-rw-r--r--  arch/x86/include/asm/trampoline.h | 21
-rw-r--r--  arch/x86/include/asm/traps.h | 81
-rw-r--r--  arch/x86/include/asm/tsc.h | 62
-rw-r--r--  arch/x86/include/asm/types.h | 36
-rw-r--r--  arch/x86/include/asm/uaccess.h | 456
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 218
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 208
-rw-r--r--  arch/x86/include/asm/ucontext.h | 18
-rw-r--r--  arch/x86/include/asm/unaligned.h | 14
-rw-r--r--  arch/x86/include/asm/unistd.h | 13
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 379
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 693
-rw-r--r--  arch/x86/include/asm/unwind.h | 13
-rw-r--r--  arch/x86/include/asm/user.h | 5
-rw-r--r--  arch/x86/include/asm/user32.h | 70
-rw-r--r--  arch/x86/include/asm/user_32.h | 131
-rw-r--r--  arch/x86/include/asm/user_64.h | 137
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 94
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 332
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 354
-rw-r--r--  arch/x86/include/asm/uv/uv_irq.h | 36
-rw-r--r--  arch/x86/include/asm/uv/uv_mmrs.h | 1295
-rw-r--r--  arch/x86/include/asm/vdso.h | 47
-rw-r--r--  arch/x86/include/asm/vga.h | 20
-rw-r--r--  arch/x86/include/asm/vgtod.h | 29
-rw-r--r--  arch/x86/include/asm/vic.h | 61
-rw-r--r--  arch/x86/include/asm/visws/cobalt.h | 125
-rw-r--r--  arch/x86/include/asm/visws/lithium.h | 53
-rw-r--r--  arch/x86/include/asm/visws/piix4.h | 107
-rw-r--r--  arch/x86/include/asm/visws/sgivw.h | 5
-rw-r--r--  arch/x86/include/asm/vm86.h | 208
-rw-r--r--  arch/x86/include/asm/vmi.h | 263
-rw-r--r--  arch/x86/include/asm/vmi_time.h | 98
-rw-r--r--  arch/x86/include/asm/voyager.h | 528
-rw-r--r--  arch/x86/include/asm/vsyscall.h | 44
-rw-r--r--  arch/x86/include/asm/xcr.h | 49
-rw-r--r--  arch/x86/include/asm/xen/events.h | 24
-rw-r--r--  arch/x86/include/asm/xen/grant_table.h | 7
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h | 527
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 82
-rw-r--r--  arch/x86/include/asm/xen/interface.h | 175
-rw-r--r--  arch/x86/include/asm/xen/interface_32.h | 97
-rw-r--r--  arch/x86/include/asm/xen/interface_64.h | 159
-rw-r--r--  arch/x86/include/asm/xen/page.h | 165
-rw-r--r--  arch/x86/include/asm/xor.h | 5
-rw-r--r--  arch/x86/include/asm/xor_32.h | 888
-rw-r--r--  arch/x86/include/asm/xor_64.h | 361
-rw-r--r--  arch/x86/include/asm/xsave.h | 118
-rw-r--r--  arch/x86/kernel/Makefile | 23
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 56
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 7
-rw-r--r--  arch/x86/kernel/alternative.c | 10
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 359
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 196
-rw-r--r--  arch/x86/kernel/aperture_64.c | 6
-rw-r--r--  arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c) | 1038
-rw-r--r--  arch/x86/kernel/apic_64.c | 1390
-rw-r--r--  arch/x86/kernel/apm_32.c | 4
-rw-r--r--  arch/x86/kernel/asm-offsets_64.c | 2
-rw-r--r--  arch/x86/kernel/bios_uv.c | 137
-rw-r--r--  arch/x86/kernel/cpu/.gitignore | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 34
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 88
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 548
-rw-r--r--  arch/x86/kernel/cpu/amd_64.c | 224
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 4
-rw-r--r--  arch/x86/kernel/cpu/centaur_64.c | 6
-rw-r--r--  arch/x86/kernel/cpu/cmpxchg.c | 72
-rw-r--r--  arch/x86/kernel/cpu/common.c | 1001
-rw-r--r--  arch/x86/kernel/cpu/common_64.c | 712
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 16
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 42
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 4
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 23
-rw-r--r--  arch/x86/kernel/cpu/feature_names.c | 84
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 365
-rw-r--r--  arch/x86/kernel/cpu/intel_64.c | 95
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 169
-rw-r--r--  arch/x86/kernel/cpu/mcheck/k7.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_32.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_64.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mcheck/non-fatal.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mkcapflags.pl | 32
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 4
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 276
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 97
-rw-r--r--  arch/x86/kernel/cpu/powerflags.c | 20
-rw-r--r--  arch/x86/kernel/cpu/proc.c | 6
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 32
-rw-r--r--  arch/x86/kernel/cpu/umc.c | 3
-rw-r--r--  arch/x86/kernel/cpuid.c | 5
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 3
-rw-r--r--  arch/x86/kernel/crash_dump_64.c | 14
-rw-r--r--  arch/x86/kernel/doublefault_32.c | 2
-rw-r--r--  arch/x86/kernel/ds.c | 954
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 449
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 575
-rw-r--r--  arch/x86/kernel/e820.c | 32
-rw-r--r--  arch/x86/kernel/early-quirks.c | 115
-rw-r--r--  arch/x86/kernel/early_printk.c | 748
-rw-r--r--  arch/x86/kernel/efi.c | 10
-rw-r--r--  arch/x86/kernel/entry_32.S | 36
-rw-r--r--  arch/x86/kernel/entry_64.S | 45
-rw-r--r--  arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c) | 115
-rw-r--r--  arch/x86/kernel/ftrace.c | 124
-rw-r--r--  arch/x86/kernel/genapic_64.c | 88
-rw-r--r--  arch/x86/kernel/genapic_flat_64.c | 64
-rw-r--r--  arch/x86/kernel/genx2apic_cluster.c | 159
-rw-r--r--  arch/x86/kernel/genx2apic_phys.c | 154
-rw-r--r--  arch/x86/kernel/genx2apic_uv_x.c | 138
-rw-r--r--  arch/x86/kernel/head.c | 1
-rw-r--r--  arch/x86/kernel/head64.c | 5
-rw-r--r--  arch/x86/kernel/head_32.S | 34
-rw-r--r--  arch/x86/kernel/head_64.S | 4
-rw-r--r--  arch/x86/kernel/hpet.c | 459
-rw-r--r--  arch/x86/kernel/i387.c | 168
-rw-r--r--  arch/x86/kernel/i8259.c | 24
-rw-r--r--  arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c) | 2071
-rw-r--r--  arch/x86/kernel/io_apic_32.c | 2901
-rw-r--r--  arch/x86/kernel/ioport.c | 1
-rw-r--r--  arch/x86/kernel/ipi.c | 3
-rw-r--r--  arch/x86/kernel/irq.c | 189
-rw-r--r--  arch/x86/kernel/irq_32.c | 194
-rw-r--r--  arch/x86/kernel/irq_64.c | 169
-rw-r--r--  arch/x86/kernel/irqinit_32.c | 84
-rw-r--r--  arch/x86/kernel/irqinit_64.c | 69
-rw-r--r--  arch/x86/kernel/k8.c | 5
-rw-r--r--  arch/x86/kernel/kgdb.c | 50
-rw-r--r--  arch/x86/kernel/kvm.c | 2
-rw-r--r--  arch/x86/kernel/kvmclock.c | 30
-rw-r--r--  arch/x86/kernel/ldt.c | 10
-rw-r--r--  arch/x86/kernel/microcode.c | 853
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 435
-rw-r--r--  arch/x86/kernel/microcode_core.c | 508
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 480
-rw-r--r--  arch/x86/kernel/mpparse.c | 2
-rw-r--r--  arch/x86/kernel/msr.c | 4
-rw-r--r--  arch/x86/kernel/nmi.c | 11
-rw-r--r--  arch/x86/kernel/numaq_32.c | 7
-rw-r--r--  arch/x86/kernel/olpc.c | 6
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 37
-rw-r--r--  arch/x86/kernel/paravirt.c | 30
-rw-r--r--  arch/x86/kernel/paravirt_patch_32.c | 2
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 36
-rw-r--r--  arch/x86/kernel/pci-dma.c | 199
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 170
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 10
-rw-r--r--  arch/x86/kernel/pcspeaker.c | 13
-rw-r--r--  arch/x86/kernel/process.c | 21
-rw-r--r--  arch/x86/kernel/process_32.c | 105
-rw-r--r--  arch/x86/kernel/process_64.c | 204
-rw-r--r--  arch/x86/kernel/ptrace.c | 522
-rw-r--r--  arch/x86/kernel/pvclock.c | 12
-rw-r--r--  arch/x86/kernel/quirks.c | 44
-rw-r--r--  arch/x86/kernel/reboot.c | 6
-rw-r--r--  arch/x86/kernel/rtc.c | 42
-rw-r--r--  arch/x86/kernel/setup.c | 236
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 28
-rw-r--r--  arch/x86/kernel/sigframe.h | 19
-rw-r--r--  arch/x86/kernel/signal_32.c | 273
-rw-r--r--  arch/x86/kernel/signal_64.c | 362
-rw-r--r--  arch/x86/kernel/smp.c | 6
-rw-r--r--  arch/x86/kernel/smpboot.c | 238
-rw-r--r--  arch/x86/kernel/summit_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 2
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 44
-rw-r--r--  arch/x86/kernel/syscall_64.c | 4
-rw-r--r--  arch/x86/kernel/time_32.c | 8
-rw-r--r--  arch/x86/kernel/time_64.c | 23
-rw-r--r--  arch/x86/kernel/tlb_32.c | 8
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 2
-rw-r--r--  arch/x86/kernel/tls.c | 1
-rw-r--r--  arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c) | 888
-rw-r--r--  arch/x86/kernel/traps_64.c | 1212
-rw-r--r--  arch/x86/kernel/tsc.c | 290
-rw-r--r--  arch/x86/kernel/uv_irq.c | 79
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 72
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 48
-rw-r--r--  arch/x86/kernel/vm86_32.c | 1
-rw-r--r--  arch/x86/kernel/vmi_32.c | 16
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 3
-rw-r--r--  arch/x86/kernel/vmlinux_32.lds.S | 9
-rw-r--r--  arch/x86/kernel/vmlinux_64.lds.S | 9
-rw-r--r--  arch/x86/kernel/vsmp_64.c | 2
-rw-r--r--  arch/x86/kernel/xsave.c | 345
-rw-r--r--  arch/x86/kvm/Makefile | 5
-rw-r--r--  arch/x86/kvm/i8254.c | 87
-rw-r--r--  arch/x86/kvm/i8254.h | 7
-rw-r--r--  arch/x86/kvm/i8259.c | 53
-rw-r--r--  arch/x86/kvm/irq.c | 3
-rw-r--r--  arch/x86/kvm/irq.h | 6
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 32
-rw-r--r--  arch/x86/kvm/lapic.c | 49
-rw-r--r--  arch/x86/kvm/mmu.c | 680
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 249
-rw-r--r--  arch/x86/kvm/svm.c | 156
-rw-r--r--  arch/x86/kvm/vmx.c | 712
-rw-r--r--  arch/x86/kvm/vmx.h | 18
-rw-r--r--  arch/x86/kvm/x86.c | 554
-rw-r--r--  arch/x86/kvm/x86.h | 22
-rw-r--r--  arch/x86/kvm/x86_emulate.c | 170
-rw-r--r--  arch/x86/lguest/boot.c | 40
-rw-r--r--  arch/x86/lib/Makefile | 3
-rw-r--r--  arch/x86/lib/msr-on-cpu.c | 78
-rw-r--r--  arch/x86/lib/string_32.c | 42
-rw-r--r--  arch/x86/lib/strstr_32.c | 6
-rw-r--r--  arch/x86/lib/usercopy_32.c | 7
-rw-r--r--  arch/x86/mach-default/setup.c | 19
-rw-r--r--  arch/x86/mach-es7000/Makefile | 5
-rw-r--r--  arch/x86/mach-es7000/es7000.h | 114
-rw-r--r--  arch/x86/mach-generic/Makefile | 1
-rw-r--r--  arch/x86/mach-generic/bigsmp.c | 13
-rw-r--r--  arch/x86/mach-generic/es7000.c | 47
-rw-r--r--  arch/x86/mach-generic/numaq.c | 26
-rw-r--r--  arch/x86/mach-generic/summit.c | 25
-rw-r--r--  arch/x86/mach-voyager/voyager_smp.c | 6
-rw-r--r--  arch/x86/mm/Makefile | 6
-rw-r--r--  arch/x86/mm/dump_pagetables.c | 4
-rw-r--r--  arch/x86/mm/fault.c | 67
-rw-r--r--  arch/x86/mm/gup.c | 10
-rw-r--r--  arch/x86/mm/highmem_32.c | 1
-rw-r--r--  arch/x86/mm/init_32.c | 90
-rw-r--r--  arch/x86/mm/init_64.c | 184
-rw-r--r--  arch/x86/mm/ioremap.c | 197
-rw-r--r--  arch/x86/mm/memtest.c | 7
-rw-r--r--  arch/x86/mm/mmio-mod.c | 87
-rw-r--r--  arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c) | 2
-rw-r--r--  arch/x86/mm/numa_64.c | 10
-rw-r--r--  arch/x86/mm/pageattr-test.c | 9
-rw-r--r--  arch/x86/mm/pageattr.c | 476
-rw-r--r--  arch/x86/mm/pat.c | 132
-rw-r--r--  arch/x86/mm/pf_in.c | 121
-rw-r--r--  arch/x86/mm/pgtable.c | 6
-rw-r--r--  arch/x86/mm/pgtable_32.c | 3
-rw-r--r--  arch/x86/mm/srat_64.c | 2
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 4
-rw-r--r--  arch/x86/oprofile/Makefile | 2
-rw-r--r--  arch/x86/oprofile/backtrace.c | 3
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 204
-rw-r--r--  arch/x86/oprofile/op_counter.h | 18
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 546
-rw-r--r--  arch/x86/oprofile/op_model_athlon.c | 190
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 207
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 120
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 17
-rw-r--r--  arch/x86/pci/acpi.c | 5
-rw-r--r--  arch/x86/pci/amd_bus.c | 2
-rw-r--r--  arch/x86/pci/fixup.c | 28
-rw-r--r--  arch/x86/pci/i386.c | 3
-rw-r--r--  arch/x86/pci/irq.c | 86
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 12
-rw-r--r--  arch/x86/power/cpu_32.c | 7
-rw-r--r--  arch/x86/power/cpu_64.c | 7
-rw-r--r--  arch/x86/power/hibernate_asm_32.S | 14
-rw-r--r--  arch/x86/xen/Kconfig | 12
-rw-r--r--  arch/x86/xen/Makefile | 12
-rw-r--r--  arch/x86/xen/debugfs.c | 123
-rw-r--r--  arch/x86/xen/debugfs.h | 10
-rw-r--r--  arch/x86/xen/enlighten.c | 318
-rw-r--r--  arch/x86/xen/irq.c | 141
-rw-r--r--  arch/x86/xen/mmu.c | 315
-rw-r--r--  arch/x86/xen/mmu.h | 3
-rw-r--r--  arch/x86/xen/multicalls.c | 115
-rw-r--r--  arch/x86/xen/smp.c | 245
-rw-r--r--  arch/x86/xen/spinlock.c | 428
-rw-r--r--  arch/x86/xen/time.c | 23
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 2
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 22
-rw-r--r--  arch/x86/xen/xen-ops.h | 8
586 files changed, 58376 insertions(+), 14873 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index ed92864d132..350bee1d54d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,6 +18,7 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
@@ -25,10 +26,12 @@ config X86
 	select HAVE_KPROBES
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_KRETPROBES
+	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
+	select HAVE_ARCH_TRACEHOOK
 	select HAVE_GENERIC_DMA_COHERENT if X86_32
 	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 
@@ -37,10 +40,6 @@ config ARCH_DEFCONFIG
 	default "arch/x86/configs/i386_defconfig" if X86_32
 	default "arch/x86/configs/x86_64_defconfig" if X86_64
 
-
-config GENERIC_LOCKBREAK
-	def_bool n
-
 config GENERIC_TIME
 	def_bool y
 
@@ -93,7 +92,7 @@ config GENERIC_HWEIGHT
 	def_bool y
 
 config GENERIC_GPIO
-	def_bool n
+	bool
 
 config ARCH_MAY_HAVE_PC_FDC
 	def_bool y
@@ -104,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK
 config RWSEM_XCHGADD_ALGORITHM
 	def_bool X86_XADD
 
-config ARCH_HAS_ILOG2_U32
-	def_bool n
-
-config ARCH_HAS_ILOG2_U64
-	def_bool n
-
 config ARCH_HAS_CPU_IDLE_WAIT
 	def_bool y
 
@@ -123,6 +116,9 @@ config GENERIC_TIME_VSYSCALL
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
+config ARCH_HAS_DEFAULT_IDLE
+	def_bool y
+
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
@@ -151,9 +147,6 @@ config AUDIT_ARCH
 	bool
 	default X86_64
 
-config ARCH_SUPPORTS_AOUT
-	def_bool y
-
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
 
@@ -204,6 +197,7 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
@@ -553,6 +547,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 config AMD_IOMMU
 	bool "AMD IOMMU support"
 	select SWIOTLB
+	select PCI_MSI
 	depends on X86_64 && PCI && ACPI
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
@@ -758,9 +753,8 @@ config I8K
 	  Say N otherwise.
 
 config X86_REBOOTFIXUPS
-	def_bool n
-	prompt "Enable X86 board specific fixups for reboot"
-	depends on X86_32 && X86
+	bool "Enable X86 board specific fixups for reboot"
+	depends on X86_32
 	---help---
 	  This enables chipset and/or board specific fixups to be done
 	  in order to get reboot to work correctly. This is only needed on
@@ -776,23 +770,45 @@ config X86_REBOOTFIXUPS
 	  Say N otherwise.
 
 config MICROCODE
-	tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+	tristate "/dev/cpu/microcode - microcode support"
 	select FW_LOADER
 	---help---
 	  If you say Y here, you will be able to update the microcode on
-	  Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
-	  Pentium III, Pentium 4, Xeon etc. You will obviously need the
-	  actual microcode binary data itself which is not shipped with the
-	  Linux kernel.
+	  certain Intel and AMD processors. The Intel support is for the
+	  IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
+	  Pentium 4, Xeon etc. The AMD support is for family 0x10 and
+	  0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
+	  You will obviously need the actual microcode binary data itself
+	  which is not shipped with the Linux kernel.
 
-	  For latest news and information on obtaining all the required
-	  ingredients for this driver, check:
-	  <http://www.urbanmyth.org/microcode/>.
+	  This option selects the general module only, you need to select
+	  at least one vendor specific module as well.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called microcode.
 
-config MICROCODE_OLD_INTERFACE
+config MICROCODE_INTEL
+	bool "Intel microcode patch loading support"
+	depends on MICROCODE
+	default MICROCODE
+	select FW_LOADER
+	--help---
+	  This options enables microcode patch loading support for Intel
+	  processors.
+
+	  For latest news and information on obtaining all the required
+	  Intel ingredients for this driver, check:
+	  <http://www.urbanmyth.org/microcode/>.
+
+config MICROCODE_AMD
+	bool "AMD microcode patch loading support"
+	depends on MICROCODE
+	select FW_LOADER
+	--help---
+	  If you select this option, microcode patch loading support for AMD
+	  processors will be enabled.
+
+ config MICROCODE_OLD_INTERFACE
 	def_bool y
 	depends on MICROCODE
 
@@ -922,16 +938,17 @@ config HIGHMEM
 	depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
 
 config X86_PAE
-	def_bool n
-	prompt "PAE (Physical Address Extension) Support"
+	bool "PAE (Physical Address Extension) Support"
 	depends on X86_32 && !HIGHMEM4G
-	select RESOURCES_64BIT
 	help
 	  PAE is required for NX support, and furthermore enables
 	  larger swapspace support for non-overcommit purposes. It
 	  has the cost of more pagetable lookup overhead, and also
 	  consumes more pagetable space per process.
 
+config ARCH_PHYS_ADDR_T_64BIT
+	def_bool X86_64 || X86_PAE
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
@@ -1020,7 +1037,7 @@ config HAVE_ARCH_ALLOC_REMAP
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
-	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
+	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
@@ -1036,7 +1053,7 @@ config ARCH_SPARSEMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
 	select SPARSEMEM_STATIC if X86_32
 	select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
@@ -1059,6 +1076,56 @@ config HIGHPTE
 	  low memory. Setting this option will put user-space page table
 	  entries in high memory.
 
+config X86_CHECK_BIOS_CORRUPTION
+	bool "Check for low memory corruption"
+	help
+	  Periodically check for memory corruption in low memory, which
+	  is suspected to be caused by BIOS. Even when enabled in the
+	  configuration, it is disabled at runtime. Enable it by
+	  setting "memory_corruption_check=1" on the kernel command
+	  line. By default it scans the low 64k of memory every 60
+	  seconds; see the memory_corruption_check_size and
+	  memory_corruption_check_period parameters in
+	  Documentation/kernel-parameters.txt to adjust this.
+
+	  When enabled with the default parameters, this option has
+	  almost no overhead, as it reserves a relatively small amount
+	  of memory and scans it infrequently. It both detects corruption
+	  and prevents it from affecting the running system.
+
+	  It is, however, intended as a diagnostic tool; if repeatable
+	  BIOS-originated corruption always affects the same memory,
+	  you can use memmap= to prevent the kernel from using that
+	  memory.
+
+config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+	bool "Set the default setting of memory_corruption_check"
+	depends on X86_CHECK_BIOS_CORRUPTION
+	default y
+	help
+	  Set whether the default state of memory_corruption_check is
+	  on or off.
+
+config X86_RESERVE_LOW_64K
+	bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+	default y
+	help
+	  Reserve the first 64K of physical RAM on BIOSes that are known
+	  to potentially corrupt that memory range. A numbers of BIOSes are
+	  known to utilize this area during suspend/resume, so it must not
+	  be used by the kernel.
+
+	  Set this to N if you are absolutely sure that you trust the BIOS
+	  to get all its memory reservations and usages right.
+
+	  If you have doubts about the BIOS (e.g. suspend/resume does not
+	  work or there's kernel crashes after certain hardware hotplug
+	  events) and it's not AMI or Phoenix, then you might want to enable
+	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+	  corruption patterns.
+
+	  Say Y if unsure.
+
 config MATH_EMULATION
 	bool
 	prompt "Math emulation" if X86_32
@@ -1117,10 +1184,10 @@ config MTRR
 	  You can safely say Y even if your machine doesn't have MTRRs, you'll
 	  just add about 9 KB to your kernel.
 
-	  See <file:Documentation/mtrr.txt> for more information.
+	  See <file:Documentation/x86/mtrr.txt> for more information.
 
 config MTRR_SANITIZER
-	bool
+	def_bool y
 	prompt "MTRR cleanup support"
 	depends on MTRR
 	help
@@ -1131,7 +1198,7 @@ config MTRR_SANITIZER
 	  The largest mtrr entry size for a continous block can be set with
 	  mtrr_chunk_size.
 
-	  If unsure, say N.
+	  If unsure, say Y.
 
 config MTRR_SANITIZER_ENABLE_DEFAULT
 	int "MTRR cleanup enable value (0-1)"
@@ -1166,8 +1233,7 @@ config X86_PAT
 	  If unsure, say Y.
 
 config EFI
-	def_bool n
-	prompt "EFI runtime service support"
+	bool "EFI runtime service support"
 	depends on ACPI
 	---help---
 	  This enables the kernel to use EFI runtime services that are
@@ -1180,18 +1246,9 @@ config EFI
 	  resultant kernel should continue to boot on existing non-EFI
 	  platforms.
 
-config IRQBALANCE
-	def_bool y
-	prompt "Enable kernel irq balancing"
-	depends on X86_32 && SMP && X86_IO_APIC
-	help
-	  The default yes will allow the kernel to do irq load balancing.
-	  Saying no will keep the kernel from doing irq load balancing.
-
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
-	depends on PROC_FS
 	help
 	  This kernel feature is useful for number crunching applications
 	  that may need to compute untrusted bytecode during their
@@ -1199,7 +1256,7 @@ config SECCOMP
 	  the process as file descriptors supporting the read/write
 	  syscalls, it's possible to isolate those applications in
 	  their own address space using seccomp. Once seccomp is
-	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
@@ -1356,14 +1413,14 @@ config PHYSICAL_ALIGN
 	  Don't change this unless you know what you are doing.
 
 config HOTPLUG_CPU
-	bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
-	depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
+	bool "Support for hot-pluggable CPUs"
+	depends on SMP && HOTPLUG && !X86_VOYAGER
 	---help---
-	  Say Y here to experiment with turning CPUs off and on, and to
-	  enable suspend on SMP systems. CPUs can be controlled through
-	  /sys/devices/system/cpu.
-	  Say N if you want to disable CPU hotplug and don't need to
-	  suspend.
+	  Say Y here to allow turning CPUs off and on. CPUs can be
+	  controlled through /sys/devices/system/cpu.
+	  ( Note: power management support will enable this option
+	    automatically on SMP systems. )
+	  Say N if you want to disable CPU hotplug.
 
 config COMPAT_VDSO
 	def_bool y
@@ -1378,6 +1435,51 @@ config COMPAT_VDSO
 
 	  If unsure, say Y.
 
+config CMDLINE_BOOL
+	bool "Built-in kernel command line"
+	default n
+	help
+	  Allow for specifying boot arguments to the kernel at
+	  build time. On some systems (e.g. embedded ones), it is
+	  necessary or convenient to provide some or all of the
+	  kernel boot arguments with the kernel itself (that is,
+	  to not rely on the boot loader to provide them.)
+
+	  To compile command line arguments into the kernel,
+	  set this option to 'Y', then fill in the
+	  the boot arguments in CONFIG_CMDLINE.
+
+	  Systems with fully functional boot loaders (i.e. non-embedded)
+	  should leave this option set to 'N'.
+
+config CMDLINE
+	string "Built-in kernel command string"
+	depends on CMDLINE_BOOL
+	default ""
+	help
+	  Enter arguments here that should be compiled into the kernel
+	  image and used at boot time. If the boot loader provides a
+	  command line at boot time, it is appended to this string to
+	  form the full kernel command line, when the system boots.
+
+	  However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+	  change this behavior.
+
+	  In most cases, the command line (whether built-in or provided
+	  by the boot loader) should specify the device for the root
+	  file system.
+
+config CMDLINE_OVERRIDE
+	bool "Built-in command line overrides boot loader arguments"
+	default n
+	depends on CMDLINE_BOOL
+	help
+	  Set this option to 'Y' to have the kernel ignore the boot loader
+	  command line, and use ONLY the built-in command line.
+
+	  This is used to work around broken boot loaders. This should
+	  be set to 'N' under normal conditions.
+
 endmenu
 
 config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1536,6 +1638,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig"
 
 source "drivers/cpuidle/Kconfig"
 
+source "drivers/idle/Kconfig"
+
 endmenu
 
 
@@ -1643,6 +1747,14 @@ config DMAR_FLOPPY_WA
 	  workaround will setup a 1:1 mapping for the first
 	  16M to make floppy (an ISA device) work.
 
+config INTR_REMAP
+	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
+	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
+	help
+	  Supports Interrupt remapping for IO-APIC and MSI devices.
+	  To use x2apic mode in the CPU's which support x2APIC enhancements or
+	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
+
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
@@ -1759,7 +1871,7 @@ config IA32_EMULATION
 
 config IA32_AOUT
 	tristate "IA32 a.out support"
-	depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT
+	depends on IA32_EMULATION
 	help
 	  Support old a.out binaries in the 32bit emulation.
 
@@ -1773,7 +1885,7 @@ config COMPAT_FOR_U64_ALIGNMENT
 
 config SYSVIPC_COMPAT
 	def_bool y
-	depends on X86_64 && COMPAT && SYSVIPC
+	depends on COMPAT && SYSVIPC
 
 endmenu
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index b225219c448..0b7c4a3f065 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
 	  - "Crusoe" for the Transmeta Crusoe series.
 	  - "Efficeon" for the Transmeta Efficeon series.
 	  - "Winchip-C6" for original IDT Winchip.
-	  - "Winchip-2" for IDT Winchip 2.
-	  - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
+	  - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
 	  - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
 	  - "Geode GX/LX" For AMD Geode GX and LX processors.
 	  - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
 	  treat this chip as a 586TSC with some extended instructions
 	  and alignment requirements.
 
-config MWINCHIP2
-	bool "Winchip-2"
-	depends on X86_32
-	help
-	  Select this for an IDT Winchip-2. Linux and GCC
-	  treat this chip as a 586TSC with some extended instructions
-	  and alignment requirements.
-
 config MWINCHIP3D
-	bool "Winchip-2A/Winchip-3"
+	bool "Winchip-2/Winchip-2A/Winchip-3"
 	depends on X86_32
 	help
-	  Select this for an IDT Winchip-2A or 3. Linux and GCC
+	  Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
 	  treat this chip as a 586TSC with some extended instructions
 	  and alignment requirements. Also enable out of order memory
 	  stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
 	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
-	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
 
 config X86_XADD
@@ -360,7 +351,7 @@ config X86_POPAD_OK
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
@@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
 
 config X86_USE_3DNOW
 	def_bool y
@@ -376,7 +367,7 @@ config X86_USE_3DNOW
 
 config X86_OOSTORE
 	def_bool y
-	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
+	depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
@@ -396,7 +387,7 @@ config X86_P6_NOP
 
 config X86_TSC
 	def_bool y
-	depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
+	depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
@@ -406,7 +397,7 @@ config X86_CMPXCHG64
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
 
 config X86_MINIMUM_CPU_FAMILY
 	int
@@ -417,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY
 
 config X86_DEBUGCTLMSR
 	def_bool y
-	depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
+
+menuconfig PROCESSOR_SELECT
+	bool "Supported processor vendors" if EMBEDDED
+	help
+	  This lets you choose what x86 vendor support code your kernel
+	  will include.
+
+config CPU_SUP_INTEL
+	default y
+	bool "Support Intel processors" if PROCESSOR_SELECT
+	help
+	  This enables detection, tunings and quirks for Intel processors
+
+	  You need this enabled if you want your kernel to run on an
+	  Intel CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on an Intel
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_CYRIX_32
+	default y
+	bool "Support Cyrix processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables detection, tunings and quirks for Cyrix processors
+
+	  You need this enabled if you want your kernel to run on a
+	  Cyrix CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a Cyrix
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_AMD
+	default y
+	bool "Support AMD processors" if PROCESSOR_SELECT
+	help
+	  This enables detection, tunings and quirks for AMD processors
+
+	  You need this enabled if you want your kernel to run on an
+	  AMD CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on an AMD
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_CENTAUR_32
+	default y
+	bool "Support Centaur processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables detection, tunings and quirks for Centaur processors
+
+	  You need this enabled if you want your kernel to run on a
+	  Centaur CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a Centaur
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_CENTAUR_64
+	default y
+	bool "Support Centaur processors" if PROCESSOR_SELECT
+	depends on 64BIT
+	help
+	  This enables detection, tunings and quirks for Centaur processors
+
+	  You need this enabled if you want your kernel to run on a
+	  Centaur CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a Centaur
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_TRANSMETA_32
+	default y
+	bool "Support Transmeta processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables detection, tunings and quirks for Transmeta processors
+
+	  You need this enabled if you want your kernel to run on a
+	  Transmeta CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a Transmeta
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config CPU_SUP_UMC_32
+	default y
+	bool "Support UMC processors" if PROCESSOR_SELECT
+	depends on !64BIT
+	help
+	  This enables detection, tunings and quirks for UMC processors
+
+	  You need this enabled if you want your kernel to run on a
+	  UMC CPU. Disabling this option on other types of CPUs
+	  makes the kernel a tiny bit smaller. Disabling it on a UMC
+	  CPU might render the kernel unbootable.
+
+	  If unsure, say N.
+
+config X86_DS
+	bool "Debug Store support"
+	default y
+	help
+	  Add support for Debug Store.
+	  This allows the kernel to provide a memory buffer to the hardware
+	  to store various profiling and tracing events.
+
+config X86_PTRACE_BTS
+	bool "ptrace interface to Branch Trace Store"
+	default y
+	depends on (X86_DS && X86_DEBUGCTLMSR)
+	help
+	  Add a ptrace interface to allow collecting an execution trace
+	  of the traced task.
+	  This collects control flow changes in a (cyclic) buffer and allows
+	  debuggers to fill in the gaps and show an execution trace of the debuggee.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 092f019e033..2a3dfbd5e67 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -43,6 +43,19 @@ config EARLY_PRINTK
 	  with klogd/syslogd or the X server. You should normally N here,
 	  unless you want to debug such a crash.
 
+config EARLY_PRINTK_DBGP
+	bool "Early printk via EHCI debug port"
+	default n
+	depends on EARLY_PRINTK && PCI
+	help
+	  Write kernel log output directly into the EHCI debug port.
+
+	  This is useful for kernel debugging when your machine crashes very
+	  early before the console code is initialized. For normal operation
+	  it is not recommended because it looks ugly and doesn't cooperate
+	  with klogd/syslogd or the X server. You should normally N here,
+	  unless you want to debug such a crash. You need usb debug device.
+
 config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index f5631da585b..d1a47adb5ae 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,16 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
 mcore-y := arch/x86/mach-default/
 
 # Voyager subarch support
-mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager
+mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
 mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
 
 # generic subarchitecture
-mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic
+mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
 fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
 mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
 
 # default subarch .h files
-mflags-y += -Iinclude/asm-x86/mach-default
+mflags-y += -Iarch/x86/include/asm/mach-default
 
 # 64 bit does not support subarch support - clear sub arch variables
 fcore-$(CONFIG_X86_64) :=
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e91..80177ec052f 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) 30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
31cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 31cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 32cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 44# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 45cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 46
47# Bug fix for binutils: this option is required in order to keep
48# binutils from generating NOPL instructions against our will.
49ifneq ($(CONFIG_X86_P6_NOP),y)
50cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
51endif
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f..cd48c721001 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93..1771c804e02 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec..29c5fbf0839 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
137 */ 137 */
138 movl output_len(%ebx), %eax 138 movl output_len(%ebx), %eax
139 pushl %eax 139 pushl %eax
140 # push arguments for decompress_kernel:
140 pushl %ebp # output address 141 pushl %ebp # output address
141 movl input_len(%ebx), %eax 142 movl input_len(%ebx), %eax
142 pushl %eax # input_len 143 pushl %eax # input_len
143 leal input_data(%ebx), %eax 144 leal input_data(%ebx), %eax
144 pushl %eax # input_data 145 pushl %eax # input_data
145 leal boot_heap(%ebx), %eax 146 leal boot_heap(%ebx), %eax
146 pushl %eax # heap area as third argument 147 pushl %eax # heap area
147 pushl %esi # real mode pointer as second arg 148 pushl %esi # real mode pointer
148 call decompress_kernel 149 call decompress_kernel
149 addl $20, %esp 150 addl $20, %esp
150 popl %ecx 151 popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 9fea7370647..da062216948 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
16 */ 16 */
17#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
19#define _ASM_DESC_H_ 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/screen_info.h> 28#include <linux/screen_info.h>
29#include <linux/elf.h> 29#include <linux/elf.h>
30#include <asm/io.h> 30#include <linux/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h> 33#include <asm/bootparam.h>
@@ -251,7 +251,7 @@ static void __putstr(int error, const char *s)
251 y--; 251 y--;
252 } 252 }
253 } else { 253 } else {
254 vidmem [(x + cols * y) * 2] = c; 254 vidmem[(x + cols * y) * 2] = c;
255 if (++x >= cols) { 255 if (++x >= cols) {
256 x = 0; 256 x = 0;
257 if (++y >= lines) { 257 if (++y >= lines) {
@@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
277 int i; 277 int i;
278 char *ss = s; 278 char *ss = s;
279 279
280 for (i = 0; i < n; i++) ss[i] = c; 280 for (i = 0; i < n; i++)
281 ss[i] = c;
281 return s; 282 return s;
282} 283}
283 284
@@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
287 const char *s = src; 288 const char *s = src;
288 char *d = dest; 289 char *d = dest;
289 290
290 for (i = 0; i < n; i++) d[i] = s[i]; 291 for (i = 0; i < n; i++)
292 d[i] = s[i];
291 return dest; 293 return dest;
292} 294}
293 295
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0..857e492c571 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
492 continue; 492 continue;
493 } 493 }
494 sh_symtab = sec_symtab->symtab; 494 sh_symtab = sec_symtab->symtab;
495 sym_strtab = sec->link->strtab; 495 sym_strtab = sec_symtab->link->strtab;
496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
497 Elf32_Rel *rel; 497 Elf32_Rel *rel;
498 Elf32_Sym *sym; 498 Elf32_Sym *sym;
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 75298fe2edc..6ec6bb6e995 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -59,17 +59,18 @@ int validate_cpu(void)
59 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
60 60
61 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
62 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
63 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
64 /* Skip to the next string */ 64 /* Skip to the next string */
65 do { 65 msg_strs += 2;
66 msg_strs++; 66 while (*msg_strs++)
67 } while (*msg_strs); 67 ;
68 msg_strs++;
69 } 68 }
70 if (e & 1) { 69 if (e & 1) {
71 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
72 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
73 else 74 else
74 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
75 } 76 }
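
The rewritten loop above matches a capability-string table that now
carries an explicit (word, bit) prefix per name instead of a single
byte index; the mkcpustr change further down generates that table. A
minimal standalone sketch of the lookup, assuming as the kernel does
that entries are sorted and the final entry is unconditional:

#include <stddef.h>

/* msg_strs format: two prefix bytes (word, bit), then a NUL-terminated
 * name. The unconditional final entry bounds the scan for any valid
 * (word, bit) pair. */
static const char *lookup_flag(const unsigned char *msg_strs,
                               unsigned int word, unsigned int bit)
{
    for (;;) {
        unsigned int w = msg_strs[0], b = msg_strs[1];

        if (w > word || (w == word && b > bit))
            return NULL;        /* not in the table */
        if (w == word && b == bit)
            return msg_strs[2] ? (const char *)msg_strs + 2 : NULL;
        /* skip the two-byte header plus the NUL-terminated name */
        msg_strs += 2;
        while (*msg_strs++)
            ;
    }
}
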
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index d93cbc6464d..1aae8f3e5ca 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
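
The added lines refuse to trust the 32-bit NT disk signature unless the
sector also carries the 0xAA55 boot-record magic, and the memset keeps
a failed read from leaking stale heap contents into the signature. The
same validation on a raw 512-byte sector image, as a hedged standalone
sketch (0x1b8 is EDD_MBR_SIG_OFFSET in the kernel headers):

#include <stdint.h>
#include <string.h>

#define EDD_MBR_SIG_OFFSET  0x1b8  /* byte 440: NT disk signature */
#define MBR_MAGIC_OFFSET    510    /* 0x55 0xaa boot-record magic */

static int read_disk_sig(const uint8_t sector[512], uint32_t *sig)
{
    uint16_t magic;

    memcpy(&magic, sector + MBR_MAGIC_OFFSET, sizeof(magic));
    if (magic != 0xAA55)        /* assumes a little-endian host */
        return -1;

    memcpy(sig, sector + EDD_MBR_SIG_OFFSET, sizeof(*sig));
    return 0;
}
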
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acf..b993062e9a5 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ 30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */ 31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ 32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
34 33
35#ifndef SVGA_MODE 34#ifndef SVGA_MODE
36#define SVGA_MODE ASK_VGA 35#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae..8ef60f20b37 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
15 15
16#include <stdio.h> 16#include <stdio.h>
17 17
18#include "../kernel/cpu/feature_names.c" 18#include "../kernel/cpu/capflags.c"
19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23 19
24int main(void) 20int main(void)
25{ 21{
26 int i; 22 int i, j;
27 const char *str; 23 const char *str;
28 24
29 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] = \n");
30 26
31 for (i = 0; i < NCAPINTS*32; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
32 str = x86_cap_flags[i]; 28 for (j = 0; j < 32; j++) {
33 29 str = x86_cap_flags[i*32+j];
34 if (i == NCAPINTS*32-1) { 30
35 /* The last entry must be unconditional; this 31 if (i == NCAPINTS-1 && j == 31) {
36 also consumes the compiler-added null character */ 32 /* The last entry must be unconditional; this
37 if (!str) 33 also consumes the compiler-added null
38 str = ""; 34 character */
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str); 35 if (!str)
40 } else if (str) { 36 str = "";
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n" 37 printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
42 "\t\"\\x%02x\"\"%s\\0\"\n" 38 i, j, str);
43 "#endif\n", 39 } else if (str) {
44 i >> 5, i & 31, i, str); 40 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
41 "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
42 "#endif\n",
43 i, j, i, j, str);
44 }
45 } 45 }
46 } 46 }
47 printf("\t;\n"); 47 printf("\t;\n");
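
For reference, the generated table now looks roughly like the
hand-written excerpt below: each name carries a two-byte (word, bit)
prefix, and the final entry is emitted unconditionally so the
compiler-added NUL terminates the table. The entries shown are
illustrative, not actual generator output.

static const char x86_cap_strs[] =
    "\x00\x00" "fpu\0"     /* word 0, bit 0  */
    "\x00\x19" "xmm\0"     /* word 0, bit 25 */
    "\x04\x14" "sse4_2\0"  /* word 4, bit 20 */
    "\x07\x1f" ""          /* unconditional final entry */
    ;
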
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 49f26aaaebc..3fa979c9c36 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -17,7 +17,7 @@
17#include "boot.h" 17#include "boot.h"
18#include "video.h" 18#include "video.h"
19 19
20__videocard video_bios; 20static __videocard video_bios;
21 21
22/* Set a conventional BIOS mode */ 22/* Set a conventional BIOS mode */
23static int set_bios_mode(u8 mode); 23static int set_bios_mode(u8 mode);
@@ -119,7 +119,7 @@ static int bios_probe(void)
119 return nmodes; 119 return nmodes;
120} 120}
121 121
122__videocard video_bios = 122static __videocard video_bios =
123{ 123{
124 .card_name = "BIOS", 124 .card_name = "BIOS",
125 .probe = bios_probe, 125 .probe = bios_probe,
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad0..75115849af3 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -20,7 +20,7 @@
20static struct vesa_general_info vginfo; 20static struct vesa_general_info vginfo;
21static struct vesa_mode_info vminfo; 21static struct vesa_mode_info vminfo;
22 22
23__videocard video_vesa; 23static __videocard video_vesa;
24 24
25#ifndef _WAKEUP 25#ifndef _WAKEUP
26static void vesa_store_mode_params_graphics(void); 26static void vesa_store_mode_params_graphics(void);
@@ -88,14 +88,11 @@ static int vesa_probe(void)
88 (vminfo.memory_layout == 4 || 88 (vminfo.memory_layout == 4 ||
89 vminfo.memory_layout == 6) && 89 vminfo.memory_layout == 6) &&
90 vminfo.memory_planes == 1) { 90 vminfo.memory_planes == 1) {
91#ifdef CONFIG_FB 91#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
92 /* Graphics mode, color, linear frame buffer 92 /* Graphics mode, color, linear frame buffer
93 supported. Only register the mode if 93 supported. Only register the mode if
94 the framebuffer is configured, however, 94 the framebuffer is configured, however,
95 otherwise the user will be left without a screen. 95 otherwise the user will be left without a screen. */
96 We don't require CONFIG_FB_VESA, however, since
97 some of the other framebuffer drivers can use
98 this mode-setting, too. */
99 mi = GET_HEAP(struct mode_info, 1); 96 mi = GET_HEAP(struct mode_info, 1);
100 mi->mode = mode + VIDEO_FIRST_VESA; 97 mi->mode = mode + VIDEO_FIRST_VESA;
101 mi->depth = vminfo.bpp; 98 mi->depth = vminfo.bpp;
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode)
133 if ((vminfo.mode_attr & 0x15) == 0x05) { 130 if ((vminfo.mode_attr & 0x15) == 0x05) {
134 /* It's a supported text mode */ 131 /* It's a supported text mode */
135 is_graphic = 0; 132 is_graphic = 0;
133#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
136 } else if ((vminfo.mode_attr & 0x99) == 0x99) { 134 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
137 /* It's a graphics mode with linear frame buffer */ 135 /* It's a graphics mode with linear frame buffer */
138 is_graphic = 1; 136 is_graphic = 1;
139 vesa_mode |= 0x4000; /* Request linear frame buffer */ 137 vesa_mode |= 0x4000; /* Request linear frame buffer */
138#endif
140 } else { 139 } else {
141 return -1; /* Invalid mode */ 140 return -1; /* Invalid mode */
142 } 141 }
@@ -224,7 +223,7 @@ static void vesa_store_pm_info(void)
224static void vesa_store_mode_params_graphics(void) 223static void vesa_store_mode_params_graphics(void)
225{ 224{
226 /* Tell the kernel we're in VESA graphics mode */ 225 /* Tell the kernel we're in VESA graphics mode */
227 boot_params.screen_info.orig_video_isVGA = 0x23; 226 boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
228 227
229 /* Mode parameters */ 228 /* Mode parameters */
230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr; 229 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
@@ -294,7 +293,7 @@ void vesa_store_edid(void)
294 293
295#endif /* not _WAKEUP */ 294#endif /* not _WAKEUP */
296 295
297__videocard video_vesa = 296static __videocard video_vesa =
298{ 297{
299 .card_name = "VESA", 298 .card_name = "VESA",
300 .probe = vesa_probe, 299 .probe = vesa_probe,
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 104275e191a..13b8c86ae98 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc4 3# Linux kernel version: 2.6.27-rc5
4# Mon Aug 25 15:04:00 2008 4# Wed Sep 3 17:23:09 2008
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
@@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
202# CONFIG_M586 is not set 202# CONFIG_M586 is not set
203# CONFIG_M586TSC is not set 203# CONFIG_M586TSC is not set
204# CONFIG_M586MMX is not set 204# CONFIG_M586MMX is not set
205# CONFIG_M686 is not set 205CONFIG_M686=y
206# CONFIG_MPENTIUMII is not set 206# CONFIG_MPENTIUMII is not set
207# CONFIG_MPENTIUMIII is not set 207# CONFIG_MPENTIUMIII is not set
208# CONFIG_MPENTIUMM is not set 208# CONFIG_MPENTIUMM is not set
@@ -213,7 +213,6 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
213# CONFIG_MCRUSOE is not set 213# CONFIG_MCRUSOE is not set
214# CONFIG_MEFFICEON is not set 214# CONFIG_MEFFICEON is not set
215# CONFIG_MWINCHIPC6 is not set 215# CONFIG_MWINCHIPC6 is not set
216# CONFIG_MWINCHIP2 is not set
217# CONFIG_MWINCHIP3D is not set 216# CONFIG_MWINCHIP3D is not set
218# CONFIG_MGEODEGX1 is not set 217# CONFIG_MGEODEGX1 is not set
219# CONFIG_MGEODE_LX is not set 218# CONFIG_MGEODE_LX is not set
@@ -221,13 +220,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
221# CONFIG_MVIAC3_2 is not set 220# CONFIG_MVIAC3_2 is not set
222# CONFIG_MVIAC7 is not set 221# CONFIG_MVIAC7 is not set
223# CONFIG_MPSC is not set 222# CONFIG_MPSC is not set
224CONFIG_MCORE2=y 223# CONFIG_MCORE2 is not set
225# CONFIG_GENERIC_CPU is not set 224# CONFIG_GENERIC_CPU is not set
226CONFIG_X86_GENERIC=y 225CONFIG_X86_GENERIC=y
227CONFIG_X86_CPU=y 226CONFIG_X86_CPU=y
228CONFIG_X86_CMPXCHG=y 227CONFIG_X86_CMPXCHG=y
229CONFIG_X86_L1_CACHE_SHIFT=7 228CONFIG_X86_L1_CACHE_SHIFT=7
230CONFIG_X86_XADD=y 229CONFIG_X86_XADD=y
230# CONFIG_X86_PPRO_FENCE is not set
231CONFIG_X86_WP_WORKS_OK=y 231CONFIG_X86_WP_WORKS_OK=y
232CONFIG_X86_INVLPG=y 232CONFIG_X86_INVLPG=y
233CONFIG_X86_BSWAP=y 233CONFIG_X86_BSWAP=y
@@ -235,14 +235,15 @@ CONFIG_X86_POPAD_OK=y
235CONFIG_X86_INTEL_USERCOPY=y 235CONFIG_X86_INTEL_USERCOPY=y
236CONFIG_X86_USE_PPRO_CHECKSUM=y 236CONFIG_X86_USE_PPRO_CHECKSUM=y
237CONFIG_X86_TSC=y 237CONFIG_X86_TSC=y
238CONFIG_X86_CMOV=y
238CONFIG_X86_MINIMUM_CPU_FAMILY=4 239CONFIG_X86_MINIMUM_CPU_FAMILY=4
239CONFIG_X86_DEBUGCTLMSR=y 240CONFIG_X86_DEBUGCTLMSR=y
240CONFIG_HPET_TIMER=y 241CONFIG_HPET_TIMER=y
241CONFIG_HPET_EMULATE_RTC=y 242CONFIG_HPET_EMULATE_RTC=y
242CONFIG_DMI=y 243CONFIG_DMI=y
243# CONFIG_IOMMU_HELPER is not set 244# CONFIG_IOMMU_HELPER is not set
244CONFIG_NR_CPUS=4 245CONFIG_NR_CPUS=64
245# CONFIG_SCHED_SMT is not set 246CONFIG_SCHED_SMT=y
246CONFIG_SCHED_MC=y 247CONFIG_SCHED_MC=y
247# CONFIG_PREEMPT_NONE is not set 248# CONFIG_PREEMPT_NONE is not set
248CONFIG_PREEMPT_VOLUNTARY=y 249CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +255,8 @@ CONFIG_VM86=y
254# CONFIG_TOSHIBA is not set 255# CONFIG_TOSHIBA is not set
255# CONFIG_I8K is not set 256# CONFIG_I8K is not set
256CONFIG_X86_REBOOTFIXUPS=y 257CONFIG_X86_REBOOTFIXUPS=y
257# CONFIG_MICROCODE is not set 258CONFIG_MICROCODE=y
259CONFIG_MICROCODE_OLD_INTERFACE=y
258CONFIG_X86_MSR=y 260CONFIG_X86_MSR=y
259CONFIG_X86_CPUID=y 261CONFIG_X86_CPUID=y
260# CONFIG_NOHIGHMEM is not set 262# CONFIG_NOHIGHMEM is not set
@@ -285,7 +287,6 @@ CONFIG_MTRR=y
285# CONFIG_MTRR_SANITIZER is not set 287# CONFIG_MTRR_SANITIZER is not set
286CONFIG_X86_PAT=y 288CONFIG_X86_PAT=y
287CONFIG_EFI=y 289CONFIG_EFI=y
288# CONFIG_IRQBALANCE is not set
289CONFIG_SECCOMP=y 290CONFIG_SECCOMP=y
290# CONFIG_HZ_100 is not set 291# CONFIG_HZ_100 is not set
291# CONFIG_HZ_250 is not set 292# CONFIG_HZ_250 is not set
@@ -1532,7 +1533,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1532CONFIG_VGA_CONSOLE=y 1533CONFIG_VGA_CONSOLE=y
1533CONFIG_VGACON_SOFT_SCROLLBACK=y 1534CONFIG_VGACON_SOFT_SCROLLBACK=y
1534CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1535CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1535CONFIG_VIDEO_SELECT=y
1536CONFIG_DUMMY_CONSOLE=y 1536CONFIG_DUMMY_CONSOLE=y
1537# CONFIG_FRAMEBUFFER_CONSOLE is not set 1537# CONFIG_FRAMEBUFFER_CONSOLE is not set
1538CONFIG_LOGO=y 1538CONFIG_LOGO=y
@@ -2115,7 +2115,7 @@ CONFIG_IO_DELAY_0X80=y
2115CONFIG_DEFAULT_IO_DELAY_TYPE=0 2115CONFIG_DEFAULT_IO_DELAY_TYPE=0
2116CONFIG_DEBUG_BOOT_PARAMS=y 2116CONFIG_DEBUG_BOOT_PARAMS=y
2117# CONFIG_CPA_DEBUG is not set 2117# CONFIG_CPA_DEBUG is not set
2118# CONFIG_OPTIMIZE_INLINING is not set 2118CONFIG_OPTIMIZE_INLINING=y
2119 2119
2120# 2120#
2121# Security options 2121# Security options
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 678c8acefe0..f0a03d7a7d6 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,7 +1,7 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.27-rc4 3# Linux kernel version: 2.6.27-rc5
4# Mon Aug 25 14:40:46 2008 4# Wed Sep 3 17:13:39 2008
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
@@ -210,7 +210,6 @@ CONFIG_X86_PC=y
210# CONFIG_MCRUSOE is not set 210# CONFIG_MCRUSOE is not set
211# CONFIG_MEFFICEON is not set 211# CONFIG_MEFFICEON is not set
212# CONFIG_MWINCHIPC6 is not set 212# CONFIG_MWINCHIPC6 is not set
213# CONFIG_MWINCHIP2 is not set
214# CONFIG_MWINCHIP3D is not set 213# CONFIG_MWINCHIP3D is not set
215# CONFIG_MGEODEGX1 is not set 214# CONFIG_MGEODEGX1 is not set
216# CONFIG_MGEODE_LX is not set 215# CONFIG_MGEODE_LX is not set
@@ -218,17 +217,14 @@ CONFIG_X86_PC=y
218# CONFIG_MVIAC3_2 is not set 217# CONFIG_MVIAC3_2 is not set
219# CONFIG_MVIAC7 is not set 218# CONFIG_MVIAC7 is not set
220# CONFIG_MPSC is not set 219# CONFIG_MPSC is not set
221CONFIG_MCORE2=y 220# CONFIG_MCORE2 is not set
222# CONFIG_GENERIC_CPU is not set 221CONFIG_GENERIC_CPU=y
223CONFIG_X86_CPU=y 222CONFIG_X86_CPU=y
224CONFIG_X86_L1_CACHE_BYTES=64 223CONFIG_X86_L1_CACHE_BYTES=128
225CONFIG_X86_INTERNODE_CACHE_BYTES=64 224CONFIG_X86_INTERNODE_CACHE_BYTES=128
226CONFIG_X86_CMPXCHG=y 225CONFIG_X86_CMPXCHG=y
227CONFIG_X86_L1_CACHE_SHIFT=6 226CONFIG_X86_L1_CACHE_SHIFT=7
228CONFIG_X86_WP_WORKS_OK=y 227CONFIG_X86_WP_WORKS_OK=y
229CONFIG_X86_INTEL_USERCOPY=y
230CONFIG_X86_USE_PPRO_CHECKSUM=y
231CONFIG_X86_P6_NOP=y
232CONFIG_X86_TSC=y 228CONFIG_X86_TSC=y
233CONFIG_X86_CMPXCHG64=y 229CONFIG_X86_CMPXCHG64=y
234CONFIG_X86_CMOV=y 230CONFIG_X86_CMOV=y
@@ -243,9 +239,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
243CONFIG_AMD_IOMMU=y 239CONFIG_AMD_IOMMU=y
244CONFIG_SWIOTLB=y 240CONFIG_SWIOTLB=y
245CONFIG_IOMMU_HELPER=y 241CONFIG_IOMMU_HELPER=y
246# CONFIG_MAXSMP is not set 242CONFIG_NR_CPUS=64
247CONFIG_NR_CPUS=4 243CONFIG_SCHED_SMT=y
248# CONFIG_SCHED_SMT is not set
249CONFIG_SCHED_MC=y 244CONFIG_SCHED_MC=y
250# CONFIG_PREEMPT_NONE is not set 245# CONFIG_PREEMPT_NONE is not set
251CONFIG_PREEMPT_VOLUNTARY=y 246CONFIG_PREEMPT_VOLUNTARY=y
@@ -254,7 +249,8 @@ CONFIG_X86_LOCAL_APIC=y
254CONFIG_X86_IO_APIC=y 249CONFIG_X86_IO_APIC=y
255# CONFIG_X86_MCE is not set 250# CONFIG_X86_MCE is not set
256# CONFIG_I8K is not set 251# CONFIG_I8K is not set
257# CONFIG_MICROCODE is not set 252CONFIG_MICROCODE=y
253CONFIG_MICROCODE_OLD_INTERFACE=y
258CONFIG_X86_MSR=y 254CONFIG_X86_MSR=y
259CONFIG_X86_CPUID=y 255CONFIG_X86_CPUID=y
260CONFIG_NUMA=y 256CONFIG_NUMA=y
@@ -290,7 +286,7 @@ CONFIG_BOUNCE=y
290CONFIG_VIRT_TO_BUS=y 286CONFIG_VIRT_TO_BUS=y
291CONFIG_MTRR=y 287CONFIG_MTRR=y
292# CONFIG_MTRR_SANITIZER is not set 288# CONFIG_MTRR_SANITIZER is not set
293# CONFIG_X86_PAT is not set 289CONFIG_X86_PAT=y
294CONFIG_EFI=y 290CONFIG_EFI=y
295CONFIG_SECCOMP=y 291CONFIG_SECCOMP=y
296# CONFIG_HZ_100 is not set 292# CONFIG_HZ_100 is not set
@@ -1508,7 +1504,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1508CONFIG_VGA_CONSOLE=y 1504CONFIG_VGA_CONSOLE=y
1509CONFIG_VGACON_SOFT_SCROLLBACK=y 1505CONFIG_VGACON_SOFT_SCROLLBACK=y
1510CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1506CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1511CONFIG_VIDEO_SELECT=y
1512CONFIG_DUMMY_CONSOLE=y 1507CONFIG_DUMMY_CONSOLE=y
1513# CONFIG_FRAMEBUFFER_CONSOLE is not set 1508# CONFIG_FRAMEBUFFER_CONSOLE is not set
1514CONFIG_LOGO=y 1509CONFIG_LOGO=y
@@ -2089,7 +2084,7 @@ CONFIG_IO_DELAY_0X80=y
2089CONFIG_DEFAULT_IO_DELAY_TYPE=0 2084CONFIG_DEFAULT_IO_DELAY_TYPE=0
2090CONFIG_DEBUG_BOOT_PARAMS=y 2085CONFIG_DEBUG_BOOT_PARAMS=y
2091# CONFIG_CPA_DEBUG is not set 2086# CONFIG_CPA_DEBUG is not set
2092# CONFIG_OPTIMIZE_INLINING is not set 2087CONFIG_OPTIMIZE_INLINING=y
2093 2088
2094# 2089#
2095# Security options 2090# Security options
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de540..903de4aa509 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12 12
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14
13aes-i586-y := aes-i586-asm_32.o aes_glue.o 15aes-i586-y := aes-i586-asm_32.o aes_glue.o
14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 16twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 17salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 00000000000..070afc5b6c9
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
1/*
2 * Using the hardware-provided CRC32 instruction to accelerate CRC32C computation.
3 * CRC32C polynomial: 0x1EDC6F41 (BE) / 0x82F63B78 (LE)
4 * CRC32 is a new instruction in Intel SSE4.2; the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/kernel.h>
22#include <crypto/internal/hash.h>
23
24#include <asm/cpufeature.h>
25
26#define CHKSUM_BLOCK_SIZE 1
27#define CHKSUM_DIGEST_SIZE 4
28
29#define SCALE_F sizeof(unsigned long)
30
31#ifdef CONFIG_X86_64
32#define REX_PRE "0x48, "
33#else
34#define REX_PRE
35#endif
36
37static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
38{
39 while (length--) {
40 __asm__ __volatile__(
41 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
42 :"=S"(crc)
43 :"0"(crc), "c"(*data)
44 );
45 data++;
46 }
47
48 return crc;
49}
50
51static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
52{
53 unsigned int iquotient = len / SCALE_F;
54 unsigned int iremainder = len % SCALE_F;
55 unsigned long *ptmp = (unsigned long *)p;
56
57 while (iquotient--) {
58 __asm__ __volatile__(
59 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
60 :"=S"(crc)
61 :"0"(crc), "c"(*ptmp)
62 );
63 ptmp++;
64 }
65
66 if (iremainder)
67 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
68 iremainder);
69
70 return crc;
71}
72
73/*
74 * Setting the seed allows arbitrary accumulators and flexible XOR policy
75 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed.
77 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
79 unsigned int keylen)
80{
81 u32 *mctx = crypto_ahash_ctx(hash);
82
83 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL;
86 }
87 *mctx = le32_to_cpup((__le32 *)key);
88 return 0;
89}
90
91static int crc32c_intel_init(struct ahash_request *req)
92{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
94 u32 *crcp = ahash_request_ctx(req);
95
96 *crcp = *mctx;
97
98 return 0;
99}
100
101static int crc32c_intel_update(struct ahash_request *req)
102{
103 struct crypto_hash_walk walk;
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111
112 *crcp = crc;
113 return 0;
114}
115
116static int crc32c_intel_final(struct ahash_request *req)
117{
118 u32 *crcp = ahash_request_ctx(req);
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0;
122}
123
124static int crc32c_intel_digest(struct ahash_request *req)
125{
126 struct crypto_hash_walk walk;
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
128 u32 crc = *mctx;
129 int nbytes;
130
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
132 nbytes = crypto_hash_walk_done(&walk, 0))
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
134
135 *(__le32 *)req->result = ~cpu_to_le32(crc);
136 return 0;
137}
138
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{
141 u32 *key = crypto_tfm_ctx(tfm);
142
143 *key = ~0;
144
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0;
148}
149
150static struct crypto_alg alg = {
151 .cra_name = "crc32c",
152 .cra_driver_name = "crc32c-intel",
153 .cra_priority = 200,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE,
156 .cra_alignmask = 3,
157 .cra_ctxsize = sizeof(u32),
158 .cra_module = THIS_MODULE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list),
160 .cra_init = crc32c_intel_cra_init,
161 .cra_type = &crypto_ahash_type,
162 .cra_u = {
163 .ahash = {
164 .digestsize = CHKSUM_DIGEST_SIZE,
165 .setkey = crc32c_intel_setkey,
166 .init = crc32c_intel_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 }
172};
173
174
175static int __init crc32c_intel_mod_init(void)
176{
177 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg);
179 else
180 return -ENODEV;
181}
182
183static void __exit crc32c_intel_mod_fini(void)
184{
185 crypto_unregister_alg(&alg);
186}
187
188module_init(crc32c_intel_mod_init);
189module_exit(crc32c_intel_mod_fini);
190
191MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
192MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel hardware.");
193MODULE_LICENSE("GPL");
194
195MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel");
197
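
As a usage illustration, the byte-at-a-time loop above can be exercised
from userspace with the same raw opcode bytes; the ~0 seed and the
final inversion mirror what cra_init and crc32c_intel_final do. A
hedged sketch, requiring an SSE4.2-capable CPU (the module checks this
via cpu_has_xmm4_2):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c_sse42(uint32_t crc, const unsigned char *data,
                             size_t len)
{
    while (len--) {
        /* crc32 %cl, %esi encoded as raw bytes, as in the driver */
        __asm__ __volatile__(
            ".byte 0xf2, 0x0f, 0x38, 0xf0, 0xf1"
            : "=S" (crc)
            : "0" (crc), "c" (*data));
        data++;
    }
    return crc;
}

int main(void)
{
    const char *msg = "123456789";
    uint32_t crc = ~crc32c_sse42(~0u, (const unsigned char *)msg,
                                 strlen(msg));

    printf("crc32c = 0x%08x\n", crc);   /* expected: 0xe3069283 */
    return 0;
}
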
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index a0e1dbe67dc..127ec3f0721 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
85 dump->regs.ax = regs->ax; 85 dump->regs.ax = regs->ax;
86 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
87 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 savesegment(fs, fs);
89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 dump->regs.fs = fs;
90 savesegment(gs, gs);
91 dump->regs.gs = gs;
90 dump->regs.orig_ax = regs->orig_ax; 92 dump->regs.orig_ax = regs->orig_ax;
91 dump->regs.ip = regs->ip; 93 dump->regs.ip = regs->ip;
92 dump->regs.cs = regs->cs; 94 dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
430 current->mm->start_stack = 432 current->mm->start_stack =
431 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); 433 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
432 /* start thread */ 434 /* start thread */
433 asm volatile("movl %0,%%fs" :: "r" (0)); \ 435 loadsegment(fs, 0);
434 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 436 loadsegment(ds, __USER32_DS);
437 loadsegment(es, __USER32_DS);
435 load_gs_index(0); 438 load_gs_index(0);
436 (regs)->ip = ex.a_entry; 439 (regs)->ip = ex.a_entry;
437 (regs)->sp = current->mm->start_stack; 440 (regs)->sp = current->mm->start_stack;
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 20af4c79579..4bc02b23674 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -179,9 +179,10 @@ struct sigframe
179 u32 pretcode; 179 u32 pretcode;
180 int sig; 180 int sig;
181 struct sigcontext_ia32 sc; 181 struct sigcontext_ia32 sc;
182 struct _fpstate_ia32 fpstate; 182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
183 unsigned int extramask[_COMPAT_NSIG_WORDS-1]; 183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
184 char retcode[8]; 184 char retcode[8];
185 /* fp state follows here */
185}; 186};
186 187
187struct rt_sigframe 188struct rt_sigframe
@@ -192,8 +193,8 @@ struct rt_sigframe
192 u32 puc; 193 u32 puc;
193 compat_siginfo_t info; 194 compat_siginfo_t info;
194 struct ucontext_ia32 uc; 195 struct ucontext_ia32 uc;
195 struct _fpstate_ia32 fpstate;
196 char retcode[8]; 196 char retcode[8];
197 /* fp state follows here */
197}; 198};
198 199
199#define COPY(x) { \ 200#define COPY(x) { \
@@ -206,7 +207,7 @@ struct rt_sigframe
206 { unsigned int cur; \ 207 { unsigned int cur; \
207 unsigned short pre; \ 208 unsigned short pre; \
208 err |= __get_user(pre, &sc->seg); \ 209 err |= __get_user(pre, &sc->seg); \
209 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 210 savesegment(seg, cur); \
210 pre |= mask; \ 211 pre |= mask; \
211 if (pre != cur) loadsegment(seg, pre); } 212 if (pre != cur) loadsegment(seg, pre); }
212 213
@@ -215,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
215 unsigned int *peax) 216 unsigned int *peax)
216{ 217{
217 unsigned int tmpflags, gs, oldgs, err = 0; 218 unsigned int tmpflags, gs, oldgs, err = 0;
218 struct _fpstate_ia32 __user *buf; 219 void __user *buf;
219 u32 tmp; 220 u32 tmp;
220 221
221 /* Always make any pending restarted system calls return -EINTR */ 222 /* Always make any pending restarted system calls return -EINTR */
@@ -235,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
235 */ 236 */
236 err |= __get_user(gs, &sc->gs); 237 err |= __get_user(gs, &sc->gs);
237 gs |= 3; 238 gs |= 3;
238 asm("movl %%gs,%0" : "=r" (oldgs)); 239 savesegment(gs, oldgs);
239 if (gs != oldgs) 240 if (gs != oldgs)
240 load_gs_index(gs); 241 load_gs_index(gs);
241 242
@@ -259,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
259 260
260 err |= __get_user(tmp, &sc->fpstate); 261 err |= __get_user(tmp, &sc->fpstate);
261 buf = compat_ptr(tmp); 262 buf = compat_ptr(tmp);
262 if (buf) { 263 err |= restore_i387_xstate_ia32(buf);
263 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
264 goto badframe;
265 err |= restore_i387_ia32(buf);
266 } else {
267 struct task_struct *me = current;
268
269 if (used_math()) {
270 clear_fpu(me);
271 clear_used_math();
272 }
273 }
274 264
275 err |= __get_user(tmp, &sc->ax); 265 err |= __get_user(tmp, &sc->ax);
276 *peax = tmp; 266 *peax = tmp;
277 267
278 return err; 268 return err;
279
280badframe:
281 return 1;
282} 269}
283 270
284asmlinkage long sys32_sigreturn(struct pt_regs *regs) 271asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -350,46 +337,42 @@ badframe:
350 */ 337 */
351 338
352static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, 339static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
353 struct _fpstate_ia32 __user *fpstate, 340 void __user *fpstate,
354 struct pt_regs *regs, unsigned int mask) 341 struct pt_regs *regs, unsigned int mask)
355{ 342{
356 int tmp, err = 0; 343 int tmp, err = 0;
357 344
358 tmp = 0; 345 savesegment(gs, tmp);
359 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
360 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 346 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
361 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); 347 savesegment(fs, tmp);
362 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 348 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
363 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); 349 savesegment(ds, tmp);
364 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 350 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
365 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 351 savesegment(es, tmp);
366 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
367 353
368 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
369 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
370 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
371 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
372 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
373 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
374 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
375 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
376 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
377 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
378 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
379 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
380 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
381 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
382 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
383 369
384 tmp = save_i387_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
385 if (tmp < 0) 371 if (tmp < 0)
386 err = -EFAULT; 372 err = -EFAULT;
387 else { 373 else
388 clear_used_math();
389 stts();
390 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
391 &sc->fpstate); 375 &sc->fpstate);
392 }
393 376
394 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
395 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -402,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
402 * Determine which stack to use.. 385 * Determine which stack to use..
403 */ 386 */
404static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 387static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
405 size_t frame_size) 388 size_t frame_size,
389 void **fpstate)
406{ 390{
407 unsigned long sp; 391 unsigned long sp;
408 392
@@ -421,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
421 ka->sa.sa_restorer) 405 ka->sa.sa_restorer)
422 sp = (unsigned long) ka->sa.sa_restorer; 406 sp = (unsigned long) ka->sa.sa_restorer;
423 407
408 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp;
411 }
412
424 sp -= frame_size; 413 sp -= frame_size;
425 /* Align the stack pointer according to the i386 ABI, 414 /* Align the stack pointer according to the i386 ABI,
426 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 415 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -434,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
434 struct sigframe __user *frame; 423 struct sigframe __user *frame;
435 void __user *restorer; 424 void __user *restorer;
436 int err = 0; 425 int err = 0;
426 void __user *fpstate = NULL;
437 427
438 /* copy_to_user optimizes that into a single 8 byte store */ 428 /* copy_to_user optimizes that into a single 8 byte store */
439 static const struct { 429 static const struct {
@@ -448,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
448 0, 438 0,
449 }; 439 };
450 440
451 frame = get_sigframe(ka, regs, sizeof(*frame)); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
452 442
453 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
454 goto give_sigsegv; 444 return -EFAULT;
455 445
456 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
457 if (err) 447 return -EFAULT;
458 goto give_sigsegv;
459 448
460 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
461 set->sig[0]); 450 return -EFAULT;
462 if (err)
463 goto give_sigsegv;
464 451
465 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
466 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
467 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
468 if (err) 455 return -EFAULT;
469 goto give_sigsegv;
470 } 456 }
471 457
472 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -487,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
487 */ 473 */
488 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
489 if (err) 475 if (err)
490 goto give_sigsegv; 476 return -EFAULT;
491 477
492 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
493 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -498,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
498 regs->dx = 0; 484 regs->dx = 0;
499 regs->cx = 0; 485 regs->cx = 0;
500 486
501 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 487 loadsegment(ds, __USER32_DS);
502 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 488 loadsegment(es, __USER32_DS);
503 489
504 regs->cs = __USER32_CS; 490 regs->cs = __USER32_CS;
505 regs->ss = __USER32_DS; 491 regs->ss = __USER32_DS;
@@ -510,10 +496,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
510#endif 496#endif
511 497
512 return 0; 498 return 0;
513
514give_sigsegv:
515 force_sigsegv(sig, current);
516 return -EFAULT;
517} 499}
518 500
519int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
@@ -522,6 +504,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
522 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
523 void __user *restorer; 505 void __user *restorer;
524 int err = 0; 506 int err = 0;
507 void __user *fpstate = NULL;
525 508
526 /* __copy_to_user optimizes that into a single 8 byte store */ 509 /* __copy_to_user optimizes that into a single 8 byte store */
527 static const struct { 510 static const struct {
@@ -537,30 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
537 0, 520 0,
538 }; 521 };
539 522
540 frame = get_sigframe(ka, regs, sizeof(*frame)); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
541 524
542 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
543 goto give_sigsegv; 526 return -EFAULT;
544 527
545 err |= __put_user(sig, &frame->sig); 528 err |= __put_user(sig, &frame->sig);
546 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
547 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
548 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
549 if (err) 532 if (err)
550 goto give_sigsegv; 533 return -EFAULT;
551 534
552 /* Create the ucontext. */ 535 /* Create the ucontext. */
553 err |= __put_user(0, &frame->uc.uc_flags); 536 if (cpu_has_xsave)
537 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
538 else
539 err |= __put_user(0, &frame->uc.uc_flags);
554 err |= __put_user(0, &frame->uc.uc_link); 540 err |= __put_user(0, &frame->uc.uc_link);
555 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 541 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
556 err |= __put_user(sas_ss_flags(regs->sp), 542 err |= __put_user(sas_ss_flags(regs->sp),
557 &frame->uc.uc_stack.ss_flags); 543 &frame->uc.uc_stack.ss_flags);
558 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 544 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
559 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 545 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
560 regs, set->sig[0]); 546 regs, set->sig[0]);
561 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
562 if (err) 548 if (err)
563 goto give_sigsegv; 549 return -EFAULT;
564 550
565 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
566 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -575,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
575 */ 561 */
576 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
577 if (err) 563 if (err)
578 goto give_sigsegv; 564 return -EFAULT;
579 565
580 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
581 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -591,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
591 regs->dx = (unsigned long) &frame->info; 577 regs->dx = (unsigned long) &frame->info;
592 regs->cx = (unsigned long) &frame->uc; 578 regs->cx = (unsigned long) &frame->uc;
593 579
594 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 580 loadsegment(ds, __USER32_DS);
595 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 581 loadsegment(es, __USER32_DS);
596 582
597 regs->cs = __USER32_CS; 583 regs->cs = __USER32_CS;
598 regs->ss = __USER32_DS; 584 regs->ss = __USER32_DS;
@@ -603,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
603#endif 589#endif
604 590
605 return 0; 591 return 0;
606
607give_sigsegv:
608 force_sigsegv(sig, current);
609 return -EFAULT;
610} 592}
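
With the fixed fpstate member gone from both frame structures, the FP
or extended state is saved in a variable-sized area that get_sigframe()
carves out above the frame, as the hunks above show. A hedged sketch of
that stack arithmetic, with xstate_size standing in for the kernel's
runtime sig_xstate_ia32_size:

#include <stddef.h>
#include <stdint.h>

static void *carve_sigframe(uintptr_t sp, size_t frame_size,
                            size_t xstate_size, int used_math,
                            void **fpstate)
{
    if (used_math) {
        sp -= xstate_size;          /* fp state follows the frame */
        *fpstate = (void *)sp;
    }

    sp -= frame_size;
    /* i386 ABI: on function entry ((sp + 4) & 15) == 0 */
    sp = ((sp + 4) & ~(uintptr_t)15) - 4;

    return (void *)sp;
}
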
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index ffc1bb4fed7..256b00b6189 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -39,11 +39,11 @@
39 .endm 39 .endm
40 40
41 /* clobbers %eax */ 41 /* clobbers %eax */
42 .macro CLEAR_RREGS 42 .macro CLEAR_RREGS _r9=rax
43 xorl %eax,%eax 43 xorl %eax,%eax
44 movq %rax,R11(%rsp) 44 movq %rax,R11(%rsp)
45 movq %rax,R10(%rsp) 45 movq %rax,R10(%rsp)
46 movq %rax,R9(%rsp) 46 movq %\_r9,R9(%rsp)
47 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
48 .endm 48 .endm
49 49
@@ -52,11 +52,10 @@
52 * We don't reload %eax because syscall_trace_enter() returned 52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup. 53 * the value it wants us to use in the table lookup.
54 */ 54 */
55 .macro LOAD_ARGS32 offset 55 .macro LOAD_ARGS32 offset, _r9=0
56 movl \offset(%rsp),%r11d 56 .if \_r9
57 movl \offset+8(%rsp),%r10d
58 movl \offset+16(%rsp),%r9d 57 movl \offset+16(%rsp),%r9d
59 movl \offset+24(%rsp),%r8d 58 .endif
60 movl \offset+40(%rsp),%ecx 59 movl \offset+40(%rsp),%ecx
61 movl \offset+48(%rsp),%edx 60 movl \offset+48(%rsp),%edx
62 movl \offset+56(%rsp),%esi 61 movl \offset+56(%rsp),%esi
@@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target)
145 SAVE_ARGS 0,0,1 144 SAVE_ARGS 0,0,1
146 /* no need to do an access_ok check here because rbp has been 145 /* no need to do an access_ok check here because rbp has been
147 32bit zero extended */ 146 32bit zero extended */
1481: movl (%rbp),%r9d 1471: movl (%rbp),%ebp
149 .section __ex_table,"a" 148 .section __ex_table,"a"
150 .quad 1b,ia32_badarg 149 .quad 1b,ia32_badarg
151 .previous 150 .previous
@@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target)
157 cmpl $(IA32_NR_syscalls-1),%eax 156 cmpl $(IA32_NR_syscalls-1),%eax
158 ja ia32_badsys 157 ja ia32_badsys
159sysenter_do_call: 158sysenter_do_call:
160 IA32_ARG_FIXUP 1 159 IA32_ARG_FIXUP
161sysenter_dispatch: 160sysenter_dispatch:
162 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
163 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
@@ -234,20 +233,17 @@ sysexit_audit:
234#endif 233#endif
235 234
236sysenter_tracesys: 235sysenter_tracesys:
237 xchgl %r9d,%ebp
238#ifdef CONFIG_AUDITSYSCALL 236#ifdef CONFIG_AUDITSYSCALL
239 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) 237 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
240 jz sysenter_auditsys 238 jz sysenter_auditsys
241#endif 239#endif
242 SAVE_REST 240 SAVE_REST
243 CLEAR_RREGS 241 CLEAR_RREGS
244 movq %r9,R9(%rsp)
245 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 242 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
246 movq %rsp,%rdi /* &pt_regs -> arg1 */ 243 movq %rsp,%rdi /* &pt_regs -> arg1 */
247 call syscall_trace_enter 244 call syscall_trace_enter
248 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 245 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
249 RESTORE_REST 246 RESTORE_REST
250 xchgl %ebp,%r9d
251 cmpl $(IA32_NR_syscalls-1),%eax 247 cmpl $(IA32_NR_syscalls-1),%eax
252 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 248 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
253 jmp sysenter_do_call 249 jmp sysenter_do_call
@@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target)
314 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
315 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
316 jnz cstar_tracesys 312 jnz cstar_tracesys
317cstar_do_call:
318 cmpl $IA32_NR_syscalls-1,%eax 313 cmpl $IA32_NR_syscalls-1,%eax
319 ja ia32_badsys 314 ja ia32_badsys
315cstar_do_call:
320 IA32_ARG_FIXUP 1 316 IA32_ARG_FIXUP 1
321cstar_dispatch: 317cstar_dispatch:
322 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
@@ -357,15 +353,13 @@ cstar_tracesys:
357#endif 353#endif
358 xchgl %r9d,%ebp 354 xchgl %r9d,%ebp
359 SAVE_REST 355 SAVE_REST
360 CLEAR_RREGS 356 CLEAR_RREGS r9
361 movq %r9,R9(%rsp)
362 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
363 movq %rsp,%rdi /* &pt_regs -> arg1 */ 358 movq %rsp,%rdi /* &pt_regs -> arg1 */
364 call syscall_trace_enter 359 call syscall_trace_enter
365 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 360 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
366 RESTORE_REST 361 RESTORE_REST
367 xchgl %ebp,%r9d 362 xchgl %ebp,%r9d
368 movl RSP-ARGOFFSET(%rsp), %r8d
369 cmpl $(IA32_NR_syscalls-1),%eax 363 cmpl $(IA32_NR_syscalls-1),%eax
370 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 364 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
371 jmp cstar_do_call 365 jmp cstar_do_call
@@ -577,8 +571,8 @@ ia32_sys_call_table:
577 .quad compat_sys_setrlimit /* 75 */ 571 .quad compat_sys_setrlimit /* 75 */
578 .quad compat_sys_old_getrlimit /* old_getrlimit */ 572 .quad compat_sys_old_getrlimit /* old_getrlimit */
579 .quad compat_sys_getrusage 573 .quad compat_sys_getrusage
580 .quad sys32_gettimeofday 574 .quad compat_sys_gettimeofday
581 .quad sys32_settimeofday 575 .quad compat_sys_settimeofday
582 .quad sys_getgroups16 /* 80 */ 576 .quad sys_getgroups16 /* 80 */
583 .quad sys_setgroups16 577 .quad sys_setgroups16
584 .quad sys32_old_select 578 .quad sys32_old_select
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index d3c64088b98..2e09dcd3c0a 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@
49 49
50#define AA(__x) ((unsigned long)(__x)) 50#define AA(__x) ((unsigned long)(__x))
51 51
52int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
53{
54 compat_ino_t ino;
55
56 typeof(ubuf->st_uid) uid = 0;
57 typeof(ubuf->st_gid) gid = 0;
58 SET_UID(uid, kbuf->uid);
59 SET_GID(gid, kbuf->gid);
60 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
61 return -EOVERFLOW;
62 if (kbuf->size >= 0x7fffffff)
63 return -EOVERFLOW;
64 ino = kbuf->ino;
65 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
66 return -EOVERFLOW;
67 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
68 __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
69 __put_user(ino, &ubuf->st_ino) ||
70 __put_user(kbuf->mode, &ubuf->st_mode) ||
71 __put_user(kbuf->nlink, &ubuf->st_nlink) ||
72 __put_user(uid, &ubuf->st_uid) ||
73 __put_user(gid, &ubuf->st_gid) ||
74 __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
75 __put_user(kbuf->size, &ubuf->st_size) ||
76 __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
77 __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
78 __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
79 __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
80 __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
81 __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
82 __put_user(kbuf->blksize, &ubuf->st_blksize) ||
83 __put_user(kbuf->blocks, &ubuf->st_blocks))
84 return -EFAULT;
85 return 0;
86}
87 52
88asmlinkage long sys32_truncate64(char __user *filename, 53asmlinkage long sys32_truncate64(char __user *filename,
89 unsigned long offset_low, 54 unsigned long offset_low,
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
 	return 0;
 }
 
-static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
-{
-	int err = -EFAULT;
-
-	if (access_ok(VERIFY_READ, i, sizeof(*i))) {
-		err = __get_user(o->tv_sec, &i->tv_sec);
-		err |= __get_user(o->tv_usec, &i->tv_usec);
-	}
-	return err;
-}
-
-static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
-{
-	int err = -EFAULT;
-
-	if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
-		err = __put_user(i->tv_sec, &o->tv_sec);
-		err |= __put_user(i->tv_usec, &o->tv_usec);
-	}
-	return err;
-}
-
 asmlinkage long sys32_alarm(unsigned int seconds)
 {
 	return alarm_setitimer(seconds);
 }
 
-/*
- * Translations due to time_t size differences. Which affects all
- * sorts of things, like timeval and itimerval.
- */
-asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
-				   struct timezone __user *tz)
-{
-	if (tv) {
-		struct timeval ktv;
-
-		do_gettimeofday(&ktv);
-		if (put_tv32(tv, &ktv))
-			return -EFAULT;
-	}
-	if (tz) {
-		if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
-			return -EFAULT;
-	}
-	return 0;
-}
-
-asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
-				   struct timezone __user *tz)
-{
-	struct timeval ktv;
-	struct timespec kts;
-	struct timezone ktz;
-
-	if (tv) {
-		if (get_tv32(&ktv, tv))
-			return -EFAULT;
-		kts.tv_sec = ktv.tv_sec;
-		kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
-	}
-	if (tz) {
-		if (copy_from_user(&ktz, tz, sizeof(ktz)))
-			return -EFAULT;
-	}
-
-	return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
-}
-
 struct sel_arg_struct {
 	unsigned int n;
 	unsigned int inp;
@@ -556,15 +457,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
 	return ret;
 }
 
-/* These are here just in case some old ia32 binary calls it. */
-asmlinkage long sys32_pause(void)
-{
-	current->state = TASK_INTERRUPTIBLE;
-	schedule();
-	return -ERESTARTNOHAND;
-}
-
-
 #ifdef CONFIG_SYSCTL_SYSCALL
 struct sysctl_ia32 {
 	unsigned int name;
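The sys_ia32.c hunks above delete the file-private timeval helpers and wrappers; the syscall-table hunk earlier points slots 78 and 79 at the generic compat_sys_gettimeofday()/compat_sys_settimeofday() instead, which perform the same 32-bit time_t translation for all architectures. For orientation, a minimal sketch of the conversion being centralized (illustrative only, not the exact generic implementation):

	/* sketch: widen a 32-bit user timeval into the kernel's native one */
	static int tv32_to_timeval(struct timeval *out,
				   const struct compat_timeval __user *in)
	{
		compat_time_t sec;
		s32 usec;

		if (get_user(sec, &in->tv_sec) || get_user(usec, &in->tv_usec))
			return -EFAULT;
		out->tv_sec  = sec;	/* seconds widen to the native time_t */
		out->tv_usec = usec;
		return 0;
	}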
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
new file mode 100644
index 00000000000..4a8e80cdcfa
--- /dev/null
+++ b/arch/x86/include/asm/Kbuild
@@ -0,0 +1,24 @@
1include include/asm-generic/Kbuild.asm
2
3header-y += boot.h
4header-y += bootparam.h
5header-y += debugreg.h
6header-y += ldt.h
7header-y += msr-index.h
8header-y += prctl.h
9header-y += ptrace-abi.h
10header-y += sigcontext32.h
11header-y += ucontext.h
12header-y += processor-flags.h
13
14unifdef-y += e820.h
15unifdef-y += ist.h
16unifdef-y += mce.h
17unifdef-y += msr.h
18unifdef-y += mtrr.h
19unifdef-y += posix_types_32.h
20unifdef-y += posix_types_64.h
21unifdef-y += unistd_32.h
22unifdef-y += unistd_64.h
23unifdef-y += vm86.h
24unifdef-y += vsyscall.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
new file mode 100644
index 00000000000..37822206083
--- /dev/null
+++ b/arch/x86/include/asm/a.out-core.h
@@ -0,0 +1,73 @@
1/* a.out coredump register dumper
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _ASM_X86_A_OUT_CORE_H
13#define _ASM_X86_A_OUT_CORE_H
14
15#ifdef __KERNEL__
16#ifdef CONFIG_X86_32
17
18#include <linux/user.h>
19#include <linux/elfcore.h>
20
21/*
22 * fill in the user structure for an a.out core dump
23 */
24static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
25{
26 u16 gs;
27
28/* changed the size calculations - should hopefully work better. lbt */
29 dump->magic = CMAGIC;
30 dump->start_code = 0;
31 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
32 dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
33 dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
34 >> PAGE_SHIFT;
35 dump->u_dsize -= dump->u_tsize;
36 dump->u_ssize = 0;
37 dump->u_debugreg[0] = current->thread.debugreg0;
38 dump->u_debugreg[1] = current->thread.debugreg1;
39 dump->u_debugreg[2] = current->thread.debugreg2;
40 dump->u_debugreg[3] = current->thread.debugreg3;
41 dump->u_debugreg[4] = 0;
42 dump->u_debugreg[5] = 0;
43 dump->u_debugreg[6] = current->thread.debugreg6;
44 dump->u_debugreg[7] = current->thread.debugreg7;
45
46 if (dump->start_stack < TASK_SIZE)
47 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
48 >> PAGE_SHIFT;
49
50 dump->regs.bx = regs->bx;
51 dump->regs.cx = regs->cx;
52 dump->regs.dx = regs->dx;
53 dump->regs.si = regs->si;
54 dump->regs.di = regs->di;
55 dump->regs.bp = regs->bp;
56 dump->regs.ax = regs->ax;
57 dump->regs.ds = (u16)regs->ds;
58 dump->regs.es = (u16)regs->es;
59 dump->regs.fs = (u16)regs->fs;
60 savesegment(gs, gs);
61 dump->regs.orig_ax = regs->orig_ax;
62 dump->regs.ip = regs->ip;
63 dump->regs.cs = (u16)regs->cs;
64 dump->regs.flags = regs->flags;
65 dump->regs.sp = regs->sp;
66 dump->regs.ss = (u16)regs->ss;
67
68 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
69}
70
71#endif /* CONFIG_X86_32 */
72#endif /* __KERNEL__ */
73#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h
new file mode 100644
index 00000000000..4684f97a5bb
--- /dev/null
+++ b/arch/x86/include/asm/a.out.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_A_OUT_H
2#define _ASM_X86_A_OUT_H
3
4struct exec
5{
6 unsigned int a_info; /* Use macros N_MAGIC, etc for access */
7 unsigned a_text; /* length of text, in bytes */
8 unsigned a_data; /* length of data, in bytes */
9 unsigned a_bss; /* length of uninitialized data area for file, in bytes */
10 unsigned a_syms; /* length of symbol table data in file, in bytes */
11 unsigned a_entry; /* start address */
12 unsigned a_trsize; /* length of relocation info for text, in bytes */
13 unsigned a_drsize; /* length of relocation info for data, in bytes */
14};
15
16#define N_TRSIZE(a) ((a).a_trsize)
17#define N_DRSIZE(a) ((a).a_drsize)
18#define N_SYMSIZE(a) ((a).a_syms)
19
20#endif /* _ASM_X86_A_OUT_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
new file mode 100644
index 00000000000..8d676d8ecde
--- /dev/null
+++ b/arch/x86/include/asm/acpi.h
@@ -0,0 +1,178 @@
1#ifndef _ASM_X86_ACPI_H
2#define _ASM_X86_ACPI_H
3
4/*
5 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 */
26#include <acpi/pdc_intel.h>
27
28#include <asm/numa.h>
29#include <asm/processor.h>
30#include <asm/mmu.h>
31#include <asm/mpspec.h>
32
33#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long
35
36/*
37 * Calling conventions:
38 *
39 * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
40 * ACPI_EXTERNAL_XFACE - External ACPI interfaces
41 * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
42 * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
43 */
44#define ACPI_SYSTEM_XFACE
45#define ACPI_EXTERNAL_XFACE
46#define ACPI_INTERNAL_XFACE
47#define ACPI_INTERNAL_VAR_XFACE
48
49/* Asm macros */
50
51#define ACPI_ASM_MACROS
52#define BREAKPOINT3
53#define ACPI_DISABLE_IRQS() local_irq_disable()
54#define ACPI_ENABLE_IRQS() local_irq_enable()
55#define ACPI_FLUSH_CPU_CACHE() wbinvd()
56
57int __acpi_acquire_global_lock(unsigned int *lock);
58int __acpi_release_global_lock(unsigned int *lock);
59
60#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
61 ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
62
63#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
64 ((Acq) = __acpi_release_global_lock(&facs->global_lock))
65
66/*
67 * Math helper asm macros
68 */
69#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
70 asm("divl %2;" \
71 : "=a"(q32), "=d"(r32) \
72 : "r"(d32), \
73 "0"(n_lo), "1"(n_hi))
74
75
76#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
77 asm("shrl $1,%2 ;" \
78 "rcrl $1,%3;" \
79 : "=r"(n_hi), "=r"(n_lo) \
80 : "0"(n_hi), "1"(n_lo))
81
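A usage sketch for the division helper above. divl divides the 64-bit value in EDX:EAX (here n_hi:n_lo) by the 32-bit operand, leaving the quotient in EAX and the remainder in EDX; it faults if the quotient overflows 32 bits, so callers such as ACPICA's math support keep n_hi < d32. The values below are hypothetical:

	u32 q, r;
	u32 hi = 0x00000002, lo = 0x00000005;	/* dividend = 0x200000005 */

	ACPI_DIV_64_BY_32(hi, lo, 10, q, r);	/* q = 0x33333333, r = 7 */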
82#ifdef CONFIG_ACPI
83extern int acpi_lapic;
84extern int acpi_ioapic;
85extern int acpi_noirq;
86extern int acpi_strict;
87extern int acpi_disabled;
88extern int acpi_ht;
89extern int acpi_pci_disabled;
90extern int acpi_skip_timer_override;
91extern int acpi_use_timer_override;
92
93extern u8 acpi_sci_flags;
94extern int acpi_sci_override_gsi;
95void acpi_pic_sci_set_trigger(unsigned int, u16);
96
97static inline void disable_acpi(void)
98{
99 acpi_disabled = 1;
100 acpi_ht = 0;
101 acpi_pci_disabled = 1;
102 acpi_noirq = 1;
103}
104
105/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
106#define FIX_ACPI_PAGES 4
107
108extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
109
110static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
111static inline void acpi_disable_pci(void)
112{
113 acpi_pci_disabled = 1;
114 acpi_noirq_set();
115}
116extern int acpi_irq_balance_set(char *str);
117
118/* routines for saving/restoring kernel state */
119extern int acpi_save_state_mem(void);
120extern void acpi_restore_state_mem(void);
121
122extern unsigned long acpi_wakeup_address;
123
124/* early initialization routine */
125extern void acpi_reserve_bootmem(void);
126
127/*
128 * Check if the CPU can handle C2 and deeper
129 */
130static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
131{
132 /*
133 * Early models (<=5) of AMD Opterons are not supposed to go into
134 * C2 state.
135 *
136 * Steppings 0x0A and later are good
137 */
138 if (boot_cpu_data.x86 == 0x0F &&
139 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
140 boot_cpu_data.x86_model <= 0x05 &&
141 boot_cpu_data.x86_mask < 0x0A)
142 return 1;
143 else if (boot_cpu_has(X86_FEATURE_AMDC1E))
144 return 1;
145 else
146 return max_cstate;
147}
148
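A usage sketch for acpi_processor_cstate_check() (this mirrors how the ACPI processor idle code would clamp its limit; treat it as illustrative):

	unsigned int max_cstate = 2;	/* hypothetical requested limit */

	max_cstate = acpi_processor_cstate_check(max_cstate);
	/* on the affected early Opterons this now reads 1, i.e. C1 only */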
149#else /* !CONFIG_ACPI */
150
151#define acpi_lapic 0
152#define acpi_ioapic 0
153static inline void acpi_noirq_set(void) { }
154static inline void acpi_disable_pci(void) { }
155static inline void disable_acpi(void) { }
156
157#endif /* !CONFIG_ACPI */
158
159#define ARCH_HAS_POWER_INIT 1
160
161struct bootnode;
162
163#ifdef CONFIG_ACPI_NUMA
164extern int acpi_numa;
165extern int acpi_scan_nodes(unsigned long start, unsigned long end);
166#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
167extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
168 int num_nodes);
169#else
170static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
171 int num_nodes)
172{
173}
174#endif
175
176#define acpi_unlazy_tlb(x) leave_mm(x)
177
178#endif /* _ASM_X86_ACPI_H */
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
new file mode 100644
index 00000000000..9825cd64c9b
--- /dev/null
+++ b/arch/x86/include/asm/agp.h
@@ -0,0 +1,35 @@
1#ifndef _ASM_X86_AGP_H
2#define _ASM_X86_AGP_H
3
4#include <asm/pgtable.h>
5#include <asm/cacheflush.h>
6
7/*
8 * Functions to keep the agpgart mappings coherent with the MMU. The
9 * GART gives the CPU a physical alias of pages in memory. The alias
10 * region is mapped uncacheable. Make sure there are no conflicting
 11 * mappings with different cacheability attributes for the same
12 * page. This avoids data corruption on some CPUs.
13 */
14
15#define map_page_into_agp(page) set_pages_uc(page, 1)
16#define unmap_page_from_agp(page) set_pages_wb(page, 1)
17
18/*
19 * Could use CLFLUSH here if the cpu supports it. But then it would
20 * need to be called for each cacheline of the whole page so it may
21 * not be worth it. Would need a page for it.
22 */
23#define flush_agp_cache() wbinvd()
24
25/* Convert a physical address to an address suitable for the GART. */
26#define phys_to_gart(x) (x)
27#define gart_to_phys(x) (x)
28
29/* GATT allocation. Returns/accepts GATT kernel virtual address. */
30#define alloc_gatt_pages(order) \
31 ((char *)__get_free_pages(GFP_KERNEL, (order)))
32#define free_gatt_pages(table, order) \
33 free_pages((unsigned long)(table), (order))
34
35#endif /* _ASM_X86_AGP_H */
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
new file mode 100644
index 00000000000..e2077d343c3
--- /dev/null
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -0,0 +1,22 @@
1#ifdef __ASSEMBLY__
2
3#ifdef CONFIG_X86_32
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8
9#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX
111: lock
12 .section .smp_locks,"a"
13 .align 4
14 X86_ALIGN 1b
15 .previous
16 .endm
17#else
18 .macro LOCK_PREFIX
19 .endm
20#endif
21
22#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
new file mode 100644
index 00000000000..f6aa18eadf7
--- /dev/null
+++ b/arch/x86/include/asm/alternative.h
@@ -0,0 +1,183 @@
1#ifndef _ASM_X86_ALTERNATIVE_H
2#define _ASM_X86_ALTERNATIVE_H
3
4#include <linux/types.h>
5#include <linux/stddef.h>
6#include <asm/asm.h>
7
8/*
9 * Alternative inline assembly for SMP.
10 *
11 * The LOCK_PREFIX macro defined here replaces the LOCK and
12 * LOCK_PREFIX macros used everywhere in the source tree.
13 *
14 * SMP alternatives use the same data structures as the other
15 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
 16 * UP system running an SMP kernel. The existing apply_alternatives() 
 17 * works fine for patching an SMP kernel for UP. 
18 *
19 * The SMP alternative tables can be kept after boot and contain both
20 * UP and SMP versions of the instructions to allow switching back to
21 * SMP at runtime, when hotplugging in a new CPU, which is especially
22 * useful in virtualized environments.
23 *
 24 * The very common lock prefix is handled as a special case in a 
25 * separate table which is a pure address list without replacement ptr
26 * and size information. That keeps the table sizes small.
27 */
28
29#ifdef CONFIG_SMP
30#define LOCK_PREFIX \
31 ".section .smp_locks,\"a\"\n" \
32 _ASM_ALIGN "\n" \
33 _ASM_PTR "661f\n" /* address */ \
34 ".previous\n" \
35 "661:\n\tlock; "
36
37#else /* ! CONFIG_SMP */
38#define LOCK_PREFIX ""
39#endif
40
41/* This must be included *after* the definition of LOCK_PREFIX */
42#include <asm/cpufeature.h>
43
44struct alt_instr {
45 u8 *instr; /* original instruction */
46 u8 *replacement;
47 u8 cpuid; /* cpuid bit set for replacement */
48 u8 instrlen; /* length of original instruction */
49 u8 replacementlen; /* length of new instruction, <= instrlen */
50 u8 pad1;
51#ifdef CONFIG_X86_64
52 u32 pad2;
53#endif
54};
55
56extern void alternative_instructions(void);
57extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
58
59struct module;
60
61#ifdef CONFIG_SMP
62extern void alternatives_smp_module_add(struct module *mod, char *name,
63 void *locks, void *locks_end,
64 void *text, void *text_end);
65extern void alternatives_smp_module_del(struct module *mod);
66extern void alternatives_smp_switch(int smp);
67#else
68static inline void alternatives_smp_module_add(struct module *mod, char *name,
69 void *locks, void *locks_end,
70 void *text, void *text_end) {}
71static inline void alternatives_smp_module_del(struct module *mod) {}
72static inline void alternatives_smp_switch(int smp) {}
73#endif /* CONFIG_SMP */
74
75const unsigned char *const *find_nop_table(void);
76
77/*
78 * Alternative instructions for different CPU types or capabilities.
79 *
 80 * This allows the use of optimized instructions even on generic
 81 * kernel binaries.
 82 *
 83 * The length of oldinstr must be greater than or equal to the length of
 84 * newinstr; oldinstr can be padded with nops as needed.
 85 *
 86 * For non-barrier-like inlines please define new variants
 87 * without volatile and the memory clobber.
88 */
89#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102
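For a feel of how alternative() is used elsewhere in the tree, 32-bit mb() in asm/system.h of this era is (quoted from memory, so verify against the source):

	#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)

On CPUs with SSE2 the locked add is patched into an mfence at boot; older CPUs keep the locked add, which has the same ordering effect and is long enough to hold the replacement.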
103/*
104 * Alternative inline assembly with input.
105 *
 106 * Peculiarities:
 107 * No memory clobber here.
 108 * Argument numbers start with 1.
 109 * It is best to use fixed-size constraints (like (%1) ... "r").
 110 * If you use variable-sized constraints like "m" or "g" in the
 111 * replacement, make sure to pad to the worst-case length.
112 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
115 ".section .altinstructions,\"a\"\n" \
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126
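A representative user of alternative_input() is prefetch() in asm/processor.h (quoted from memory):

	static inline void prefetch(const void *x)
	{
		alternative_input(BASE_PREFETCH,
				  "prefetchnta (%1)",
				  X86_FEATURE_XMM,
				  "r" (x));
	}

The fixed-size "r" constraint keeps the replacement the same length no matter how x is computed, as the comment above recommends.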
127/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
130 ".section .altinstructions,\"a\"\n" \
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141
142/*
 143 * Use this macro if you need more than one output parameter
 144 * in alternative_io.
145 */
146#define ASM_OUTPUT2(a, b) a, b
147
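A concrete user of alternative_io() and ASM_OUTPUT2() appears later in this same patch: native_apic_mem_write() in asm/apic.h swaps a plain movl for an xchgl on CPUs with the 11AP erratum, passing both the value and the memory slot through ASM_OUTPUT2:

	alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
		       ASM_OUTPUT2("=r" (v), "=m" (*addr)),
		       ASM_OUTPUT2("0" (v), "m" (*addr)));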
148struct paravirt_patch_site;
149#ifdef CONFIG_PARAVIRT
150void apply_paravirt(struct paravirt_patch_site *start,
151 struct paravirt_patch_site *end);
152#else
153static inline void apply_paravirt(struct paravirt_patch_site *start,
154 struct paravirt_patch_site *end)
155{}
156#define __parainstructions NULL
157#define __parainstructions_end NULL
158#endif
159
160extern void add_nops(void *insns, unsigned int len);
161
162/*
163 * Clear and restore the kernel write-protection flag on the local CPU.
164 * Allows the kernel to edit read-only pages.
165 * Side-effect: any interrupt handler running between save and restore will have
166 * the ability to write to read-only pages.
167 *
168 * Warning:
169 * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
170 * no thread can be preempted in the instructions being modified (no iret to an
171 * invalid instruction possible) or if the instructions are changed from a
172 * consistent state to another consistent state atomically.
173 * More care must be taken when modifying code in the SMP case because of
174 * Intel's errata.
 175 * On the local CPU you need to be protected against NMI or MCE handlers seeing an
176 * inconsistent instruction while you patch.
177 * The _early version expects the memory to already be RW.
178 */
179
180extern void *text_poke(void *addr, const void *opcode, size_t len);
181extern void *text_poke_early(void *addr, const void *opcode, size_t len);
182
183#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
new file mode 100644
index 00000000000..f712344329b
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_H
21#define _ASM_X86_AMD_IOMMU_H
22
23#include <linux/irqreturn.h>
24
25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30#else
31static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { }
33#endif
34
35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
new file mode 100644
index 00000000000..1a30c0440c6
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -0,0 +1,404 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
21#define _ASM_X86_AMD_IOMMU_TYPES_H
22
23#include <linux/types.h>
24#include <linux/list.h>
25#include <linux/spinlock.h>
26
27/*
28 * some size calculation constants
29 */
30#define DEV_TABLE_ENTRY_SIZE 32
31#define ALIAS_TABLE_ENTRY_SIZE 2
32#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
33
34/* Length of the MMIO region for the AMD IOMMU */
35#define MMIO_REGION_LENGTH 0x4000
36
37/* Capability offsets used by the driver */
38#define MMIO_CAP_HDR_OFFSET 0x00
39#define MMIO_RANGE_OFFSET 0x0c
40#define MMIO_MISC_OFFSET 0x10
41
42/* Masks, shifts and macros to parse the device range capability */
43#define MMIO_RANGE_LD_MASK 0xff000000
44#define MMIO_RANGE_FD_MASK 0x00ff0000
45#define MMIO_RANGE_BUS_MASK 0x0000ff00
46#define MMIO_RANGE_LD_SHIFT 24
47#define MMIO_RANGE_FD_SHIFT 16
48#define MMIO_RANGE_BUS_SHIFT 8
49#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
50#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
51#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
52#define MMIO_MSI_NUM(x) ((x) & 0x1f)
53
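A decoding sketch for the range capability above. The register read is hypothetical here (read_pci_cap_range() is a made-up accessor; in the real init code the value comes from the IOMMU's PCI capability block), and calc_devid() is defined at the bottom of this header:

	u32 range = read_pci_cap_range(iommu);			/* hypothetical */
	u8  bus   = MMIO_GET_BUS(range);
	u16 first = calc_devid(bus, MMIO_GET_FD(range));	/* first devfn handled */
	u16 last  = calc_devid(bus, MMIO_GET_LD(range));	/* last devfn handled */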
54/* Flag masks for the AMD IOMMU exclusion range */
55#define MMIO_EXCL_ENABLE_MASK 0x01ULL
56#define MMIO_EXCL_ALLOW_MASK 0x02ULL
57
58/* Used offsets into the MMIO space */
59#define MMIO_DEV_TABLE_OFFSET 0x0000
60#define MMIO_CMD_BUF_OFFSET 0x0008
61#define MMIO_EVT_BUF_OFFSET 0x0010
62#define MMIO_CONTROL_OFFSET 0x0018
63#define MMIO_EXCL_BASE_OFFSET 0x0020
64#define MMIO_EXCL_LIMIT_OFFSET 0x0028
65#define MMIO_CMD_HEAD_OFFSET 0x2000
66#define MMIO_CMD_TAIL_OFFSET 0x2008
67#define MMIO_EVT_HEAD_OFFSET 0x2010
68#define MMIO_EVT_TAIL_OFFSET 0x2018
69#define MMIO_STATUS_OFFSET 0x2020
70
71/* MMIO status bits */
72#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
73
74/* event logging constants */
75#define EVENT_ENTRY_SIZE 0x10
76#define EVENT_TYPE_SHIFT 28
77#define EVENT_TYPE_MASK 0xf
78#define EVENT_TYPE_ILL_DEV 0x1
79#define EVENT_TYPE_IO_FAULT 0x2
80#define EVENT_TYPE_DEV_TAB_ERR 0x3
81#define EVENT_TYPE_PAGE_TAB_ERR 0x4
82#define EVENT_TYPE_ILL_CMD 0x5
83#define EVENT_TYPE_CMD_HARD_ERR 0x6
84#define EVENT_TYPE_IOTLB_INV_TO 0x7
85#define EVENT_TYPE_INV_DEV_REQ 0x8
86#define EVENT_DEVID_MASK 0xffff
87#define EVENT_DEVID_SHIFT 0
88#define EVENT_DOMID_MASK 0xffff
89#define EVENT_DOMID_SHIFT 0
90#define EVENT_FLAGS_MASK 0xfff
91#define EVENT_FLAGS_SHIFT 0x10
92
93/* feature control bits */
94#define CONTROL_IOMMU_EN 0x00ULL
95#define CONTROL_HT_TUN_EN 0x01ULL
96#define CONTROL_EVT_LOG_EN 0x02ULL
97#define CONTROL_EVT_INT_EN 0x03ULL
98#define CONTROL_COMWAIT_EN 0x04ULL
99#define CONTROL_PASSPW_EN 0x08ULL
100#define CONTROL_RESPASSPW_EN 0x09ULL
101#define CONTROL_COHERENT_EN 0x0aULL
102#define CONTROL_ISOC_EN 0x0bULL
103#define CONTROL_CMDBUF_EN 0x0cULL
104#define CONTROL_PPFLOG_EN 0x0dULL
105#define CONTROL_PPFINT_EN 0x0eULL
106
107/* command specific defines */
108#define CMD_COMPL_WAIT 0x01
109#define CMD_INV_DEV_ENTRY 0x02
110#define CMD_INV_IOMMU_PAGES 0x03
111
112#define CMD_COMPL_WAIT_STORE_MASK 0x01
113#define CMD_COMPL_WAIT_INT_MASK 0x02
114#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
115#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
116
117#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
118
119/* macros and definitions for device table entries */
120#define DEV_ENTRY_VALID 0x00
121#define DEV_ENTRY_TRANSLATION 0x01
122#define DEV_ENTRY_IR 0x3d
123#define DEV_ENTRY_IW 0x3e
124#define DEV_ENTRY_NO_PAGE_FAULT 0x62
125#define DEV_ENTRY_EX 0x67
126#define DEV_ENTRY_SYSMGT1 0x68
127#define DEV_ENTRY_SYSMGT2 0x69
128#define DEV_ENTRY_INIT_PASS 0xb8
129#define DEV_ENTRY_EINT_PASS 0xb9
130#define DEV_ENTRY_NMI_PASS 0xba
131#define DEV_ENTRY_LINT0_PASS 0xbe
132#define DEV_ENTRY_LINT1_PASS 0xbf
133#define DEV_ENTRY_MODE_MASK 0x07
134#define DEV_ENTRY_MODE_SHIFT 0x09
135
136/* constants to configure the command buffer */
137#define CMD_BUFFER_SIZE 8192
138#define CMD_BUFFER_ENTRIES 512
139#define MMIO_CMD_SIZE_SHIFT 56
140#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
141
142/* constants for event buffer handling */
143#define EVT_BUFFER_SIZE 8192 /* 512 entries */
144#define EVT_LEN_MASK (0x9ULL << 56)
145
146#define PAGE_MODE_1_LEVEL 0x01
147#define PAGE_MODE_2_LEVEL 0x02
148#define PAGE_MODE_3_LEVEL 0x03
149
150#define IOMMU_PDE_NL_0 0x000ULL
151#define IOMMU_PDE_NL_1 0x200ULL
152#define IOMMU_PDE_NL_2 0x400ULL
153#define IOMMU_PDE_NL_3 0x600ULL
154
155#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
156#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
157#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
158
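These index macros split a DMA address into the three 9-bit page-table levels. A sketch of the constant-time leaf lookup they enable, using the pte_pages array kept in struct dma_ops_domain further down (fetch_pte is an illustrative name; the real walk lives in the driver):

	static u64 *fetch_pte(struct dma_ops_domain *dom, unsigned long addr)
	{
		u64 *pt = dom->pte_pages[IOMMU_PTE_L1_INDEX(addr)];

		return pt ? &pt[IOMMU_PTE_L0_INDEX(addr)] : NULL;
	}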
159#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
160#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
161#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
162
163#define IOMMU_PTE_P (1ULL << 0)
164#define IOMMU_PTE_TV (1ULL << 1)
165#define IOMMU_PTE_U (1ULL << 59)
166#define IOMMU_PTE_FC (1ULL << 60)
167#define IOMMU_PTE_IR (1ULL << 61)
168#define IOMMU_PTE_IW (1ULL << 62)
169
170#define IOMMU_L1_PDE(address) \
171 ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
172#define IOMMU_L2_PDE(address) \
173 ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
174
175#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
176#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
177#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
178#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
179
180#define IOMMU_PROT_MASK 0x03
181#define IOMMU_PROT_IR 0x01
182#define IOMMU_PROT_IW 0x02
183
184/* IOMMU capabilities */
185#define IOMMU_CAP_IOTLB 24
186#define IOMMU_CAP_NPCACHE 26
187
188#define MAX_DOMAIN_ID 65536
189
190/* FIXME: move this macro to <linux/pci.h> */
191#define PCI_BUS(x) (((x) >> 8) & 0xff)
192
193/*
194 * This structure contains generic data for IOMMU protection domains
195 * independent of their use.
196 */
197struct protection_domain {
 198 spinlock_t lock; /* mostly used to lock the page table */
199 u16 id; /* the domain id written to the device table */
200 int mode; /* paging mode (0-6 levels) */
201 u64 *pt_root; /* page table root pointer */
202 void *priv; /* private data */
203};
204
205/*
206 * Data container for a dma_ops specific protection domain
207 */
208struct dma_ops_domain {
209 struct list_head list;
210
211 /* generic protection domain information */
212 struct protection_domain domain;
213
214 /* size of the aperture for the mappings */
215 unsigned long aperture_size;
216
217 /* address we start to search for free addresses */
218 unsigned long next_bit;
219
220 /* address allocation bitmap */
221 unsigned long *bitmap;
222
223 /*
224 * Array of PTE pages for the aperture. In this array we save all the
225 * leaf pages of the domain page table used for the aperture. This way
226 * we don't need to walk the page table to find a specific PTE. We can
227 * just calculate its address in constant time.
228 */
229 u64 **pte_pages;
230
231 /* This will be set to true when TLB needs to be flushed */
232 bool need_flush;
233
234 /*
235 * if this is a preallocated domain, keep the device for which it was
236 * preallocated in this variable
237 */
238 u16 target_dev;
239};
240
241/*
242 * Structure where we save information about one hardware AMD IOMMU in the
243 * system.
244 */
245struct amd_iommu {
246 struct list_head list;
247
248 /* locks the accesses to the hardware */
249 spinlock_t lock;
250
251 /* Pointer to PCI device of this IOMMU */
252 struct pci_dev *dev;
253
254 /*
255 * Capability pointer. There could be more than one IOMMU per PCI
 256 * device function if there is more than one AMD IOMMU capability
 257 * pointer.
258 */
259 u16 cap_ptr;
260
261 /* physical address of MMIO space */
262 u64 mmio_phys;
263 /* virtual address of MMIO space */
264 u8 *mmio_base;
265
266 /* capabilities of that IOMMU read from ACPI */
267 u32 cap;
268
269 /* pci domain of this IOMMU */
270 u16 pci_seg;
271
272 /* first device this IOMMU handles. read from PCI */
273 u16 first_device;
274 /* last device this IOMMU handles. read from PCI */
275 u16 last_device;
276
277 /* start of exclusion range of that IOMMU */
278 u64 exclusion_start;
279 /* length of exclusion range of that IOMMU */
280 u64 exclusion_length;
281
282 /* command buffer virtual address */
283 u8 *cmd_buf;
284 /* size of command buffer */
285 u32 cmd_buf_size;
286
287 /* event buffer virtual address */
288 u8 *evt_buf;
289 /* size of event buffer */
290 u32 evt_buf_size;
291 /* MSI number for event interrupt */
292 u16 evt_msi_num;
293
294 /* if one, we need to send a completion wait command */
295 int need_sync;
296
297 /* true if interrupts for this IOMMU are already enabled */
298 bool int_enabled;
299
300 /* default dma_ops domain for that IOMMU */
301 struct dma_ops_domain *default_dom;
302};
303
304/*
305 * List with all IOMMUs in the system. This list is not locked because it is
306 * only written and read at driver initialization or suspend time
307 */
308extern struct list_head amd_iommu_list;
309
310/*
311 * Structure defining one entry in the device table
312 */
313struct dev_table_entry {
314 u32 data[8];
315};
316
317/*
318 * One entry for unity mappings parsed out of the ACPI table.
319 */
320struct unity_map_entry {
321 struct list_head list;
322
323 /* starting device id this entry is used for (including) */
324 u16 devid_start;
325 /* end device id this entry is used for (including) */
326 u16 devid_end;
327
328 /* start address to unity map (including) */
329 u64 address_start;
330 /* end address to unity map (including) */
331 u64 address_end;
332
333 /* required protection */
334 int prot;
335};
336
337/*
 338 * List of all unity mappings. It is not locked because at runtime it is only
339 * read. It is created at ACPI table parsing time.
340 */
341extern struct list_head amd_iommu_unity_map;
342
343/*
344 * Data structures for device handling
345 */
346
347/*
348 * Device table used by hardware. Read and write accesses by software are
349 * locked with the amd_iommu_pd_table lock.
350 */
351extern struct dev_table_entry *amd_iommu_dev_table;
352
353/*
 354 * Alias table mapping requestor ids to device ids. Not locked because it is
 355 * only read at runtime.
356 */
357extern u16 *amd_iommu_alias_table;
358
359/*
360 * Reverse lookup table to find the IOMMU which translates a specific device.
361 */
362extern struct amd_iommu **amd_iommu_rlookup_table;
363
364/* size of the dma_ops aperture as power of 2 */
365extern unsigned amd_iommu_aperture_order;
366
367/* largest PCI device id we expect translation requests for */
368extern u16 amd_iommu_last_bdf;
369
370/* data structures for protection domain handling */
371extern struct protection_domain **amd_iommu_pd_table;
372
373/* allocation bitmap for domain ids */
374extern unsigned long *amd_iommu_pd_alloc_bitmap;
375
376/* will be 1 if device isolation is enabled */
377extern int amd_iommu_isolate;
378
379/*
 380 * If true, the addresses will be flushed at unmap time, not when
381 * they are reused
382 */
383extern bool amd_iommu_unmap_flush;
384
385/* takes a PCI device id and prints it out in a readable form */
386static inline void print_devid(u16 devid, int nl)
387{
388 int bus = devid >> 8;
389 int dev = devid >> 3 & 0x1f;
390 int fn = devid & 0x07;
391
392 printk("%02x:%02x.%x", bus, dev, fn);
393 if (nl)
394 printk("\n");
395}
396
397/* takes bus and device/function and returns the device id
398 * FIXME: should that be in generic PCI code? */
399static inline u16 calc_devid(u8 bus, u8 devfn)
400{
401 return (((u16)bus) << 8) | devfn;
402}
403
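A usage sketch tying the two helpers above together (dev is assumed to be a struct pci_dev *; this is how the driver derives a device table index for a function it discovers):

	u16 devid = calc_devid(dev->bus->number, dev->devfn);

	print_devid(devid, 1);	/* prints e.g. "00:18.0" plus a newline */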
404#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
new file mode 100644
index 00000000000..3b1510b4fc5
--- /dev/null
+++ b/arch/x86/include/asm/apic.h
@@ -0,0 +1,199 @@
1#ifndef _ASM_X86_APIC_H
2#define _ASM_X86_APIC_H
3
4#include <linux/pm.h>
5#include <linux/delay.h>
6
7#include <asm/alternative.h>
8#include <asm/fixmap.h>
9#include <asm/apicdef.h>
10#include <asm/processor.h>
11#include <asm/system.h>
12#include <asm/cpufeature.h>
13#include <asm/msr.h>
14
15#define ARCH_APICTIMER_STOPS_ON_C3 1
16
17/*
18 * Debugging macros
19 */
20#define APIC_QUIET 0
21#define APIC_VERBOSE 1
22#define APIC_DEBUG 2
23
24/*
 25 * Define the default level of output to be very little.
26 * This can be turned up by using apic=verbose for more
27 * information and apic=debug for _lots_ of information.
28 * apic_verbosity is defined in apic.c
29 */
30#define apic_printk(v, s, a...) do { \
31 if ((v) <= apic_verbosity) \
32 printk(s, ##a); \
33 } while (0)
34
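A usage sketch for apic_printk() (the pattern matches callers in apic.c; apic_phys is a local in the real caller):

	apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
		    APIC_BASE, apic_phys);

Nothing is printed unless the user boots with apic=verbose or apic=debug.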
35
36extern void generic_apic_probe(void);
37
38#ifdef CONFIG_X86_LOCAL_APIC
39
40extern unsigned int apic_verbosity;
41extern int local_apic_timer_c2_ok;
42
43extern int disable_apic;
44/*
45 * Basic functions accessing APICs.
46 */
47#ifdef CONFIG_PARAVIRT
48#include <asm/paravirt.h>
49#else
50#define setup_boot_clock setup_boot_APIC_clock
51#define setup_secondary_clock setup_secondary_APIC_clock
52#endif
53
54extern int is_vsmp_box(void);
55extern void xapic_wait_icr_idle(void);
56extern u32 safe_xapic_wait_icr_idle(void);
57extern u64 xapic_icr_read(void);
58extern void xapic_icr_write(u32, u32);
59extern int setup_profiling_timer(unsigned int);
60
61static inline void native_apic_mem_write(u32 reg, u32 v)
62{
63 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
64
65 alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
66 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
67 ASM_OUTPUT2("0" (v), "m" (*addr)));
68}
69
70static inline u32 native_apic_mem_read(u32 reg)
71{
72 return *((volatile u32 *)(APIC_BASE + reg));
73}
74
75static inline void native_apic_msr_write(u32 reg, u32 v)
76{
77 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
78 reg == APIC_LVR)
79 return;
80
81 wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
82}
83
84static inline u32 native_apic_msr_read(u32 reg)
85{
86 u32 low, high;
87
88 if (reg == APIC_DFR)
89 return -1;
90
91 rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
92 return low;
93}
94
95#ifndef CONFIG_X86_32
96extern int x2apic, x2apic_preenabled;
97extern void check_x2apic(void);
98extern void enable_x2apic(void);
99extern void enable_IR_x2apic(void);
100extern void x2apic_icr_write(u32 low, u32 id);
101static inline int x2apic_enabled(void)
102{
103 int msr, msr2;
104
105 if (!cpu_has_x2apic)
106 return 0;
107
108 rdmsr(MSR_IA32_APICBASE, msr, msr2);
109 if (msr & X2APIC_ENABLE)
110 return 1;
111 return 0;
112}
113#else
114#define x2apic_enabled() 0
115#endif
116
117struct apic_ops {
118 u32 (*read)(u32 reg);
119 void (*write)(u32 reg, u32 v);
120 u64 (*icr_read)(void);
121 void (*icr_write)(u32 low, u32 high);
122 void (*wait_icr_idle)(void);
123 u32 (*safe_wait_icr_idle)(void);
124};
125
126extern struct apic_ops *apic_ops;
127
128#define apic_read (apic_ops->read)
129#define apic_write (apic_ops->write)
130#define apic_icr_read (apic_ops->icr_read)
131#define apic_icr_write (apic_ops->icr_write)
132#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
133#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
134
135extern int get_physical_broadcast(void);
136
137#ifdef CONFIG_X86_64
138static inline void ack_x2APIC_irq(void)
139{
140 /* Docs say use 0 for future compatibility */
141 native_apic_msr_write(APIC_EOI, 0);
142}
143#endif
144
145
146static inline void ack_APIC_irq(void)
147{
148 /*
149 * ack_APIC_irq() actually gets compiled as a single instruction
150 * ... yummie.
151 */
152
153 /* Docs say use 0 for future compatibility */
154 apic_write(APIC_EOI, 0);
155}
156
157extern int lapic_get_maxlvt(void);
158extern void clear_local_APIC(void);
159extern void connect_bsp_APIC(void);
160extern void disconnect_bsp_APIC(int virt_wire_setup);
161extern void disable_local_APIC(void);
162extern void lapic_shutdown(void);
163extern int verify_local_APIC(void);
164extern void cache_APIC_registers(void);
165extern void sync_Arb_IDs(void);
166extern void init_bsp_APIC(void);
167extern void setup_local_APIC(void);
168extern void end_local_APIC_setup(void);
169extern void init_apic_mappings(void);
170extern void setup_boot_APIC_clock(void);
171extern void setup_secondary_APIC_clock(void);
172extern int APIC_init_uniprocessor(void);
173extern void enable_NMI_through_LVT0(void);
174
175/*
176 * On 32bit this is mach-xxx local
177 */
178#ifdef CONFIG_X86_64
179extern void early_init_lapic_mapping(void);
180extern int apic_is_clustered_box(void);
181#else
182static inline int apic_is_clustered_box(void)
183{
184 return 0;
185}
186#endif
187
188extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
189extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
190
191
192#else /* !CONFIG_X86_LOCAL_APIC */
193static inline void lapic_shutdown(void) { }
194#define local_apic_timer_c2_ok 1
195static inline void init_apic_mappings(void) { }
196
197#endif /* !CONFIG_X86_LOCAL_APIC */
198
199#endif /* _ASM_X86_APIC_H */
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
new file mode 100644
index 00000000000..63134e31e8b
--- /dev/null
+++ b/arch/x86/include/asm/apicdef.h
@@ -0,0 +1,417 @@
1#ifndef _ASM_X86_APICDEF_H
2#define _ASM_X86_APICDEF_H
3
4/*
5 * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
6 *
7 * Alan Cox <Alan.Cox@linux.org>, 1995.
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */
10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12
13#define APIC_ID 0x20
14
15#define APIC_LVR 0x30
16#define APIC_LVR_MASK 0xFF00FF
17#define GET_APIC_VERSION(x) ((x) & 0xFFu)
18#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
19#ifdef CONFIG_X86_32
20# define APIC_INTEGRATED(x) ((x) & 0xF0u)
21#else
22# define APIC_INTEGRATED(x) (1)
23#endif
24#define APIC_XAPIC(x) ((x) >= 0x14)
25#define APIC_TASKPRI 0x80
26#define APIC_TPRI_MASK 0xFFu
27#define APIC_ARBPRI 0x90
28#define APIC_ARBPRI_MASK 0xFFu
29#define APIC_PROCPRI 0xA0
30#define APIC_EOI 0xB0
31#define APIC_EIO_ACK 0x0
32#define APIC_RRR 0xC0
33#define APIC_LDR 0xD0
34#define APIC_LDR_MASK (0xFFu << 24)
35#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu)
36#define SET_APIC_LOGICAL_ID(x) (((x) << 24))
37#define APIC_ALL_CPUS 0xFFu
38#define APIC_DFR 0xE0
39#define APIC_DFR_CLUSTER 0x0FFFFFFFul
40#define APIC_DFR_FLAT 0xFFFFFFFFul
41#define APIC_SPIV 0xF0
42#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
43#define APIC_SPIV_APIC_ENABLED (1 << 8)
44#define APIC_ISR 0x100
45#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
46#define APIC_TMR 0x180
47#define APIC_IRR 0x200
48#define APIC_ESR 0x280
49#define APIC_ESR_SEND_CS 0x00001
50#define APIC_ESR_RECV_CS 0x00002
51#define APIC_ESR_SEND_ACC 0x00004
52#define APIC_ESR_RECV_ACC 0x00008
53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000
59#define APIC_DEST_ALLBUT 0xC0000
60#define APIC_ICR_RR_MASK 0x30000
61#define APIC_ICR_RR_INVALID 0x00000
62#define APIC_ICR_RR_INPROG 0x10000
63#define APIC_ICR_RR_VALID 0x20000
64#define APIC_INT_LEVELTRIG 0x08000
65#define APIC_INT_ASSERT 0x04000
66#define APIC_ICR_BUSY 0x01000
67#define APIC_DEST_LOGICAL 0x00800
68#define APIC_DEST_PHYSICAL 0x00000
69#define APIC_DM_FIXED 0x00000
70#define APIC_DM_LOWEST 0x00100
71#define APIC_DM_SMI 0x00200
72#define APIC_DM_REMRD 0x00300
73#define APIC_DM_NMI 0x00400
74#define APIC_DM_INIT 0x00500
75#define APIC_DM_STARTUP 0x00600
76#define APIC_DM_EXTINT 0x00700
77#define APIC_VECTOR_MASK 0x000FF
78#define APIC_ICR2 0x310
79#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
80#define SET_APIC_DEST_FIELD(x) ((x) << 24)
81#define APIC_LVTT 0x320
82#define APIC_LVTTHMR 0x330
83#define APIC_LVTPC 0x340
84#define APIC_LVT0 0x350
85#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
86#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
87#define SET_APIC_TIMER_BASE(x) (((x) << 18))
88#define APIC_TIMER_BASE_CLKIN 0x0
89#define APIC_TIMER_BASE_TMBASE 0x1
90#define APIC_TIMER_BASE_DIV 0x2
91#define APIC_LVT_TIMER_PERIODIC (1 << 17)
92#define APIC_LVT_MASKED (1 << 16)
93#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
94#define APIC_LVT_REMOTE_IRR (1 << 14)
95#define APIC_INPUT_POLARITY (1 << 13)
96#define APIC_SEND_PENDING (1 << 12)
97#define APIC_MODE_MASK 0x700
98#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7)
99#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8))
100#define APIC_MODE_FIXED 0x0
101#define APIC_MODE_NMI 0x4
102#define APIC_MODE_EXTINT 0x7
103#define APIC_LVT1 0x360
104#define APIC_LVTERR 0x370
105#define APIC_TMICT 0x380
106#define APIC_TMCCT 0x390
107#define APIC_TDCR 0x3E0
108#define APIC_SELF_IPI 0x3F0
109#define APIC_TDR_DIV_TMBASE (1 << 2)
110#define APIC_TDR_DIV_1 0xB
111#define APIC_TDR_DIV_2 0x0
112#define APIC_TDR_DIV_4 0x1
113#define APIC_TDR_DIV_8 0x2
114#define APIC_TDR_DIV_16 0x3
115#define APIC_TDR_DIV_32 0x8
116#define APIC_TDR_DIV_64 0x9
117#define APIC_TDR_DIV_128 0xA
118#define APIC_EILVT0 0x500
119#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
120#define APIC_EILVT_NR_AMD_10H 4
121#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
122#define APIC_EILVT_MSG_FIX 0x0
123#define APIC_EILVT_MSG_SMI 0x2
124#define APIC_EILVT_MSG_NMI 0x4
125#define APIC_EILVT_MSG_EXT 0x7
126#define APIC_EILVT_MASKED (1 << 16)
127#define APIC_EILVT1 0x510
128#define APIC_EILVT2 0x520
129#define APIC_EILVT3 0x530
130
131#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
132#define APIC_BASE_MSR 0x800
133#define X2APIC_ENABLE (1UL << 10)
134
135#ifdef CONFIG_X86_32
136# define MAX_IO_APICS 64
137#else
138# define MAX_IO_APICS 128
139# define MAX_LOCAL_APIC 32768
140#endif
141
142/*
143 * All x86-64 systems are xAPIC compatible.
144 * In the following, "apicid" is a physical APIC ID.
145 */
146#define XAPIC_DEST_CPUS_SHIFT 4
147#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
148#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
149#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
150#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
151#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
152#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
153
154/*
155 * the local APIC register structure, memory mapped. Not terribly well
156 * tested, but we might eventually use this one in the future - the
 157 * reason we cannot use it right now is the P5 APIC: an erratum means it
 158 * cannot take 8-bit reads and writes, only 32-bit ones ...
159 */
160#define u32 unsigned int
161
162struct local_apic {
163
164/*000*/ struct { u32 __reserved[4]; } __reserved_01;
165
166/*010*/ struct { u32 __reserved[4]; } __reserved_02;
167
168/*020*/ struct { /* APIC ID Register */
169 u32 __reserved_1 : 24,
170 phys_apic_id : 4,
171 __reserved_2 : 4;
172 u32 __reserved[3];
173 } id;
174
175/*030*/ const
176 struct { /* APIC Version Register */
177 u32 version : 8,
178 __reserved_1 : 8,
179 max_lvt : 8,
180 __reserved_2 : 8;
181 u32 __reserved[3];
182 } version;
183
184/*040*/ struct { u32 __reserved[4]; } __reserved_03;
185
186/*050*/ struct { u32 __reserved[4]; } __reserved_04;
187
188/*060*/ struct { u32 __reserved[4]; } __reserved_05;
189
190/*070*/ struct { u32 __reserved[4]; } __reserved_06;
191
192/*080*/ struct { /* Task Priority Register */
193 u32 priority : 8,
194 __reserved_1 : 24;
195 u32 __reserved_2[3];
196 } tpr;
197
198/*090*/ const
199 struct { /* Arbitration Priority Register */
200 u32 priority : 8,
201 __reserved_1 : 24;
202 u32 __reserved_2[3];
203 } apr;
204
205/*0A0*/ const
206 struct { /* Processor Priority Register */
207 u32 priority : 8,
208 __reserved_1 : 24;
209 u32 __reserved_2[3];
210 } ppr;
211
212/*0B0*/ struct { /* End Of Interrupt Register */
213 u32 eoi;
214 u32 __reserved[3];
215 } eoi;
216
217/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
218
219/*0D0*/ struct { /* Logical Destination Register */
220 u32 __reserved_1 : 24,
221 logical_dest : 8;
222 u32 __reserved_2[3];
223 } ldr;
224
225/*0E0*/ struct { /* Destination Format Register */
226 u32 __reserved_1 : 28,
227 model : 4;
228 u32 __reserved_2[3];
229 } dfr;
230
231/*0F0*/ struct { /* Spurious Interrupt Vector Register */
232 u32 spurious_vector : 8,
233 apic_enabled : 1,
234 focus_cpu : 1,
235 __reserved_2 : 22;
236 u32 __reserved_3[3];
237 } svr;
238
239/*100*/ struct { /* In Service Register */
240/*170*/ u32 bitfield;
241 u32 __reserved[3];
242 } isr [8];
243
244/*180*/ struct { /* Trigger Mode Register */
245/*1F0*/ u32 bitfield;
246 u32 __reserved[3];
247 } tmr [8];
248
249/*200*/ struct { /* Interrupt Request Register */
250/*270*/ u32 bitfield;
251 u32 __reserved[3];
252 } irr [8];
253
254/*280*/ union { /* Error Status Register */
255 struct {
256 u32 send_cs_error : 1,
257 receive_cs_error : 1,
258 send_accept_error : 1,
259 receive_accept_error : 1,
260 __reserved_1 : 1,
261 send_illegal_vector : 1,
262 receive_illegal_vector : 1,
263 illegal_register_address : 1,
264 __reserved_2 : 24;
265 u32 __reserved_3[3];
266 } error_bits;
267 struct {
268 u32 errors;
269 u32 __reserved_3[3];
270 } all_errors;
271 } esr;
272
273/*290*/ struct { u32 __reserved[4]; } __reserved_08;
274
275/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
276
277/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
278
279/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
280
281/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
282
283/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
284
285/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
286
287/*300*/ struct { /* Interrupt Command Register 1 */
288 u32 vector : 8,
289 delivery_mode : 3,
290 destination_mode : 1,
291 delivery_status : 1,
292 __reserved_1 : 1,
293 level : 1,
294 trigger : 1,
295 __reserved_2 : 2,
296 shorthand : 2,
297 __reserved_3 : 12;
298 u32 __reserved_4[3];
299 } icr1;
300
301/*310*/ struct { /* Interrupt Command Register 2 */
302 union {
303 u32 __reserved_1 : 24,
304 phys_dest : 4,
305 __reserved_2 : 4;
306 u32 __reserved_3 : 24,
307 logical_dest : 8;
308 } dest;
309 u32 __reserved_4[3];
310 } icr2;
311
312/*320*/ struct { /* LVT - Timer */
313 u32 vector : 8,
314 __reserved_1 : 4,
315 delivery_status : 1,
316 __reserved_2 : 3,
317 mask : 1,
318 timer_mode : 1,
319 __reserved_3 : 14;
320 u32 __reserved_4[3];
321 } lvt_timer;
322
323/*330*/ struct { /* LVT - Thermal Sensor */
324 u32 vector : 8,
325 delivery_mode : 3,
326 __reserved_1 : 1,
327 delivery_status : 1,
328 __reserved_2 : 3,
329 mask : 1,
330 __reserved_3 : 15;
331 u32 __reserved_4[3];
332 } lvt_thermal;
333
334/*340*/ struct { /* LVT - Performance Counter */
335 u32 vector : 8,
336 delivery_mode : 3,
337 __reserved_1 : 1,
338 delivery_status : 1,
339 __reserved_2 : 3,
340 mask : 1,
341 __reserved_3 : 15;
342 u32 __reserved_4[3];
343 } lvt_pc;
344
345/*350*/ struct { /* LVT - LINT0 */
346 u32 vector : 8,
347 delivery_mode : 3,
348 __reserved_1 : 1,
349 delivery_status : 1,
350 polarity : 1,
351 remote_irr : 1,
352 trigger : 1,
353 mask : 1,
354 __reserved_2 : 15;
355 u32 __reserved_3[3];
356 } lvt_lint0;
357
358/*360*/ struct { /* LVT - LINT1 */
359 u32 vector : 8,
360 delivery_mode : 3,
361 __reserved_1 : 1,
362 delivery_status : 1,
363 polarity : 1,
364 remote_irr : 1,
365 trigger : 1,
366 mask : 1,
367 __reserved_2 : 15;
368 u32 __reserved_3[3];
369 } lvt_lint1;
370
371/*370*/ struct { /* LVT - Error */
372 u32 vector : 8,
373 __reserved_1 : 4,
374 delivery_status : 1,
375 __reserved_2 : 3,
376 mask : 1,
377 __reserved_3 : 15;
378 u32 __reserved_4[3];
379 } lvt_error;
380
381/*380*/ struct { /* Timer Initial Count Register */
382 u32 initial_count;
383 u32 __reserved_2[3];
384 } timer_icr;
385
386/*390*/ const
387 struct { /* Timer Current Count Register */
388 u32 curr_count;
389 u32 __reserved_2[3];
390 } timer_ccr;
391
392/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
393
394/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
395
396/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
397
398/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
399
400/*3E0*/ struct { /* Timer Divide Configuration Register */
401 u32 divisor : 4,
402 __reserved_1 : 28;
403 u32 __reserved_2[3];
404 } timer_dcr;
405
406/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
407
408} __attribute__ ((packed));
409
410#undef u32
411
412#ifdef CONFIG_X86_32
413 #define BAD_APICID 0xFFu
414#else
415 #define BAD_APICID 0xFFFFu
416#endif
417#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h
new file mode 100644
index 00000000000..cbd4957838a
--- /dev/null
+++ b/arch/x86/include/asm/arch_hooks.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_ARCH_HOOKS_H
2#define _ASM_X86_ARCH_HOOKS_H
3
4#include <linux/interrupt.h>
5
6/*
7 * linux/include/asm/arch_hooks.h
8 *
9 * define the architecture specific hooks
10 */
11
12/* these aren't arch hooks, they are generic routines
13 * that can be used by the hooks */
14extern void init_ISA_irqs(void);
15extern irqreturn_t timer_interrupt(int irq, void *dev_id);
16
17/* these are the defined hooks */
18extern void intr_init_hook(void);
19extern void pre_intr_init_hook(void);
20extern void pre_setup_arch_hook(void);
21extern void trap_init_hook(void);
22extern void pre_time_init_hook(void);
23extern void time_init_hook(void);
24extern void mca_nmi_hook(void);
25
26#endif /* _ASM_X86_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
new file mode 100644
index 00000000000..56be78f582f
--- /dev/null
+++ b/arch/x86/include/asm/asm.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_ASM_H
2#define _ASM_X86_ASM_H
3
4#ifdef __ASSEMBLY__
5# define __ASM_FORM(x) x
6# define __ASM_EX_SEC .section __ex_table
7#else
8# define __ASM_FORM(x) " " #x " "
9# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
10#endif
11
12#ifdef CONFIG_X86_32
13# define __ASM_SEL(a,b) __ASM_FORM(a)
14#else
15# define __ASM_SEL(a,b) __ASM_FORM(b)
16#endif
17
18#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
19#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
20
21#define _ASM_PTR __ASM_SEL(.long, .quad)
22#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
23
24#define _ASM_MOV __ASM_SIZE(mov)
25#define _ASM_INC __ASM_SIZE(inc)
26#define _ASM_DEC __ASM_SIZE(dec)
27#define _ASM_ADD __ASM_SIZE(add)
28#define _ASM_SUB __ASM_SIZE(sub)
29#define _ASM_XADD __ASM_SIZE(xadd)
30
31#define _ASM_AX __ASM_REG(ax)
32#define _ASM_BX __ASM_REG(bx)
33#define _ASM_CX __ASM_REG(cx)
34#define _ASM_DX __ASM_REG(dx)
35#define _ASM_SP __ASM_REG(sp)
36#define _ASM_BP __ASM_REG(bp)
37#define _ASM_SI __ASM_REG(si)
38#define _ASM_DI __ASM_REG(di)
39
40/* Exception table entry */
41# define _ASM_EXTABLE(from,to) \
42 __ASM_EX_SEC \
43 _ASM_ALIGN "\n" \
44 _ASM_PTR #from "," #to "\n" \
45 " .previous\n"
46
47#endif /* _ASM_X86_ASM_H */
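A usage sketch for _ASM_EXTABLE, modelled on the uaccess fixup pattern (the labels and fixup body are illustrative; uaddr is assumed to be a user pointer already validated with access_ok()):

	int err = 0, val = 0;

	asm volatile("1:	movl %2,%1\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	movl %3,%0\n"
		     "	jmp 2b\n"
		     ".previous\n"
		     _ASM_EXTABLE(1b, 3b)
		     : "=r" (err), "=r" (val)
		     : "m" (*uaddr), "i" (-EFAULT), "0" (err));

If the load at label 1 faults, the exception table entry sends the fault handler to label 3, which stores -EFAULT in err and resumes at label 2.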
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
new file mode 100644
index 00000000000..4e1b8873c47
--- /dev/null
+++ b/arch/x86/include/asm/atomic.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "atomic_32.h"
3#else
4# include "atomic_64.h"
5#endif
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
new file mode 100644
index 00000000000..ad5b9f6ecdd
--- /dev/null
+++ b/arch/x86/include/asm/atomic_32.h
@@ -0,0 +1,259 @@
1#ifndef _ASM_X86_ATOMIC_32_H
2#define _ASM_X86_ATOMIC_32_H
3
4#include <linux/compiler.h>
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7
8/*
9 * Atomic operations that C can't guarantee us. Useful for
10 * resource counting etc..
11 */
12
13/*
14 * Make sure gcc doesn't try to be clever and move things around
15 * on us. We need to use _exactly_ the address the user gave us,
16 * not some alias that contains the same information.
17 */
18typedef struct {
19 int counter;
20} atomic_t;
21
22#define ATOMIC_INIT(i) { (i) }
23
24/**
25 * atomic_read - read atomic variable
26 * @v: pointer of type atomic_t
27 *
28 * Atomically reads the value of @v.
29 */
30#define atomic_read(v) ((v)->counter)
31
32/**
33 * atomic_set - set atomic variable
34 * @v: pointer of type atomic_t
35 * @i: required value
36 *
37 * Atomically sets the value of @v to @i.
38 */
39#define atomic_set(v, i) (((v)->counter) = (i))
40
41/**
42 * atomic_add - add integer to atomic variable
43 * @i: integer value to add
44 * @v: pointer of type atomic_t
45 *
46 * Atomically adds @i to @v.
47 */
48static inline void atomic_add(int i, atomic_t *v)
49{
50 asm volatile(LOCK_PREFIX "addl %1,%0"
51 : "+m" (v->counter)
52 : "ir" (i));
53}
54
55/**
56 * atomic_sub - subtract integer from atomic variable
57 * @i: integer value to subtract
58 * @v: pointer of type atomic_t
59 *
60 * Atomically subtracts @i from @v.
61 */
62static inline void atomic_sub(int i, atomic_t *v)
63{
64 asm volatile(LOCK_PREFIX "subl %1,%0"
65 : "+m" (v->counter)
66 : "ir" (i));
67}
68
69/**
70 * atomic_sub_and_test - subtract value from variable and test result
71 * @i: integer value to subtract
72 * @v: pointer of type atomic_t
73 *
74 * Atomically subtracts @i from @v and returns
75 * true if the result is zero, or false for all
76 * other cases.
77 */
78static inline int atomic_sub_and_test(int i, atomic_t *v)
79{
80 unsigned char c;
81
82 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
83 : "+m" (v->counter), "=qm" (c)
84 : "ir" (i) : "memory");
85 return c;
86}
87
88/**
89 * atomic_inc - increment atomic variable
90 * @v: pointer of type atomic_t
91 *
92 * Atomically increments @v by 1.
93 */
94static inline void atomic_inc(atomic_t *v)
95{
96 asm volatile(LOCK_PREFIX "incl %0"
97 : "+m" (v->counter));
98}
99
100/**
101 * atomic_dec - decrement atomic variable
102 * @v: pointer of type atomic_t
103 *
104 * Atomically decrements @v by 1.
105 */
106static inline void atomic_dec(atomic_t *v)
107{
108 asm volatile(LOCK_PREFIX "decl %0"
109 : "+m" (v->counter));
110}
111
112/**
113 * atomic_dec_and_test - decrement and test
114 * @v: pointer of type atomic_t
115 *
116 * Atomically decrements @v by 1 and
117 * returns true if the result is 0, or false for all other
118 * cases.
119 */
120static inline int atomic_dec_and_test(atomic_t *v)
121{
122 unsigned char c;
123
124 asm volatile(LOCK_PREFIX "decl %0; sete %1"
125 : "+m" (v->counter), "=qm" (c)
126 : : "memory");
127 return c != 0;
128}
129
130/**
131 * atomic_inc_and_test - increment and test
132 * @v: pointer of type atomic_t
133 *
134 * Atomically increments @v by 1
135 * and returns true if the result is zero, or false for all
136 * other cases.
137 */
138static inline int atomic_inc_and_test(atomic_t *v)
139{
140 unsigned char c;
141
142 asm volatile(LOCK_PREFIX "incl %0; sete %1"
143 : "+m" (v->counter), "=qm" (c)
144 : : "memory");
145 return c != 0;
146}
147
148/**
149 * atomic_add_negative - add and test if negative
150 * @v: pointer of type atomic_t
151 * @i: integer value to add
152 *
153 * Atomically adds @i to @v and returns true
154 * if the result is negative, or false when
155 * result is greater than or equal to zero.
156 */
157static inline int atomic_add_negative(int i, atomic_t *v)
158{
159 unsigned char c;
160
161 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
162 : "+m" (v->counter), "=qm" (c)
163 : "ir" (i) : "memory");
164 return c;
165}
166
167/**
168 * atomic_add_return - add integer and return
169 * @v: pointer of type atomic_t
170 * @i: integer value to add
171 *
172 * Atomically adds @i to @v and returns @i + @v
173 */
174static inline int atomic_add_return(int i, atomic_t *v)
175{
176 int __i;
177#ifdef CONFIG_M386
178 unsigned long flags;
179 if (unlikely(boot_cpu_data.x86 <= 3))
180 goto no_xadd;
181#endif
182 /* Modern 486+ processor */
183 __i = i;
184 asm volatile(LOCK_PREFIX "xaddl %0, %1"
185 : "+r" (i), "+m" (v->counter)
186 : : "memory");
187 return i + __i;
188
189#ifdef CONFIG_M386
190no_xadd: /* Legacy 386 processor */
191 local_irq_save(flags);
192 __i = atomic_read(v);
193 atomic_set(v, i + __i);
194 local_irq_restore(flags);
195 return i + __i;
196#endif
197}
198
199/**
200 * atomic_sub_return - subtract integer and return
201 * @v: pointer of type atomic_t
202 * @i: integer value to subtract
203 *
204 * Atomically subtracts @i from @v and returns @v - @i
205 */
206static inline int atomic_sub_return(int i, atomic_t *v)
207{
208 return atomic_add_return(-i, v);
209}
210
211#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
212#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
213
214/**
215 * atomic_add_unless - add unless the number is already a given value
216 * @v: pointer of type atomic_t
217 * @a: the amount to add to v...
218 * @u: ...unless v is equal to u.
219 *
220 * Atomically adds @a to @v, so long as @v was not already @u.
221 * Returns non-zero if @v was not @u, and zero otherwise.
222 */
223static inline int atomic_add_unless(atomic_t *v, int a, int u)
224{
225 int c, old;
226 c = atomic_read(v);
227 for (;;) {
228 if (unlikely(c == (u)))
229 break;
230 old = atomic_cmpxchg((v), c, c + (a));
231 if (likely(old == c))
232 break;
233 c = old;
234 }
235 return c != (u);
236}
237
238#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
239
240#define atomic_inc_return(v) (atomic_add_return(1, v))
241#define atomic_dec_return(v) (atomic_sub_return(1, v))
242
243/* These are x86-specific, used by some header files */
244#define atomic_clear_mask(mask, addr) \
245 asm volatile(LOCK_PREFIX "andl %0,%1" \
246 : : "r" (~(mask)), "m" (*(addr)) : "memory")
247
248#define atomic_set_mask(mask, addr) \
249 asm volatile(LOCK_PREFIX "orl %0,%1" \
250 : : "r" (mask), "m" (*(addr)) : "memory")
251
252/* Atomic operations are already serializing on x86 */
253#define smp_mb__before_atomic_dec() barrier()
254#define smp_mb__after_atomic_dec() barrier()
255#define smp_mb__before_atomic_inc() barrier()
256#define smp_mb__after_atomic_inc() barrier()
257
258#include <asm-generic/atomic.h>
259#endif /* _ASM_X86_ATOMIC_32_H */
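
The xadd in atomic_add_return() above both adds the source register into memory and hands the counter's previous value back in that same register, which is why the addend is stashed in __i before the instruction clobbers i. Below is a minimal userspace sketch of the same lock-xadd technique, assuming GCC-style inline asm on a 486-or-later x86; my_add_return is an illustrative name, not a kernel symbol.

#include <stdio.h>

static int my_add_return(int i, int *counter)
{
	int old = i;

	/* xadd: *counter += old, and the previous *counter lands in old */
	asm volatile("lock; xaddl %0, %1"
		     : "+r" (old), "+m" (*counter)
		     : : "memory");
	return old + i;			/* previous value + addend = new value */
}

int main(void)
{
	int v = 40;

	printf("%d\n", my_add_return(2, &v));	/* prints 42 */
	printf("%d\n", v);			/* counter is now 42 */
	return 0;
}
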
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
new file mode 100644
index 00000000000..279d2a731f3
--- /dev/null
+++ b/arch/x86/include/asm/atomic_64.h
@@ -0,0 +1,473 @@
1#ifndef _ASM_X86_ATOMIC_64_H
2#define _ASM_X86_ATOMIC_64_H
3
4#include <asm/alternative.h>
5#include <asm/cmpxchg.h>
6
7/* atomic_t should be 32 bit signed type */
8
9/*
10 * Atomic operations that C can't guarantee us. Useful for
11 * resource counting etc..
12 */
13
14/*
15 * Make sure gcc doesn't try to be clever and move things around
16 * on us. We need to use _exactly_ the address the user gave us,
17 * not some alias that contains the same information.
18 */
19typedef struct {
20 int counter;
21} atomic_t;
22
23#define ATOMIC_INIT(i) { (i) }
24
25/**
26 * atomic_read - read atomic variable
27 * @v: pointer of type atomic_t
28 *
29 * Atomically reads the value of @v.
30 */
31#define atomic_read(v) ((v)->counter)
32
33/**
34 * atomic_set - set atomic variable
35 * @v: pointer of type atomic_t
36 * @i: required value
37 *
38 * Atomically sets the value of @v to @i.
39 */
40#define atomic_set(v, i) (((v)->counter) = (i))
41
42/**
43 * atomic_add - add integer to atomic variable
44 * @i: integer value to add
45 * @v: pointer of type atomic_t
46 *
47 * Atomically adds @i to @v.
48 */
49static inline void atomic_add(int i, atomic_t *v)
50{
51 asm volatile(LOCK_PREFIX "addl %1,%0"
52 : "=m" (v->counter)
53 : "ir" (i), "m" (v->counter));
54}
55
56/**
57 * atomic_sub - subtract integer from atomic variable
58 * @i: integer value to subtract
59 * @v: pointer of type atomic_t
60 *
61 * Atomically subtracts @i from @v.
62 */
63static inline void atomic_sub(int i, atomic_t *v)
64{
65 asm volatile(LOCK_PREFIX "subl %1,%0"
66 : "=m" (v->counter)
67 : "ir" (i), "m" (v->counter));
68}
69
70/**
71 * atomic_sub_and_test - subtract value from variable and test result
72 * @i: integer value to subtract
73 * @v: pointer of type atomic_t
74 *
75 * Atomically subtracts @i from @v and returns
76 * true if the result is zero, or false for all
77 * other cases.
78 */
79static inline int atomic_sub_and_test(int i, atomic_t *v)
80{
81 unsigned char c;
82
83 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
84 : "=m" (v->counter), "=qm" (c)
85 : "ir" (i), "m" (v->counter) : "memory");
86 return c;
87}
88
89/**
90 * atomic_inc - increment atomic variable
91 * @v: pointer of type atomic_t
92 *
93 * Atomically increments @v by 1.
94 */
95static inline void atomic_inc(atomic_t *v)
96{
97 asm volatile(LOCK_PREFIX "incl %0"
98 : "=m" (v->counter)
99 : "m" (v->counter));
100}
101
102/**
103 * atomic_dec - decrement atomic variable
104 * @v: pointer of type atomic_t
105 *
106 * Atomically decrements @v by 1.
107 */
108static inline void atomic_dec(atomic_t *v)
109{
110 asm volatile(LOCK_PREFIX "decl %0"
111 : "=m" (v->counter)
112 : "m" (v->counter));
113}
114
115/**
116 * atomic_dec_and_test - decrement and test
117 * @v: pointer of type atomic_t
118 *
119 * Atomically decrements @v by 1 and
120 * returns true if the result is 0, or false for all other
121 * cases.
122 */
123static inline int atomic_dec_and_test(atomic_t *v)
124{
125 unsigned char c;
126
127 asm volatile(LOCK_PREFIX "decl %0; sete %1"
128 : "=m" (v->counter), "=qm" (c)
129 : "m" (v->counter) : "memory");
130 return c != 0;
131}
132
133/**
134 * atomic_inc_and_test - increment and test
135 * @v: pointer of type atomic_t
136 *
137 * Atomically increments @v by 1
138 * and returns true if the result is zero, or false for all
139 * other cases.
140 */
141static inline int atomic_inc_and_test(atomic_t *v)
142{
143 unsigned char c;
144
145 asm volatile(LOCK_PREFIX "incl %0; sete %1"
146 : "=m" (v->counter), "=qm" (c)
147 : "m" (v->counter) : "memory");
148 return c != 0;
149}
150
151/**
152 * atomic_add_negative - add and test if negative
153 * @i: integer value to add
154 * @v: pointer of type atomic_t
155 *
156 * Atomically adds @i to @v and returns true
157 * if the result is negative, or false when
158 * result is greater than or equal to zero.
159 */
160static inline int atomic_add_negative(int i, atomic_t *v)
161{
162 unsigned char c;
163
164 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
165 : "=m" (v->counter), "=qm" (c)
166 : "ir" (i), "m" (v->counter) : "memory");
167 return c;
168}
169
170/**
171 * atomic_add_return - add and return
172 * @i: integer value to add
173 * @v: pointer of type atomic_t
174 *
175 * Atomically adds @i to @v and returns @i + @v
176 */
177static inline int atomic_add_return(int i, atomic_t *v)
178{
179 int __i = i;
180 asm volatile(LOCK_PREFIX "xaddl %0, %1"
181 : "+r" (i), "+m" (v->counter)
182 : : "memory");
183 return i + __i;
184}
185
186static inline int atomic_sub_return(int i, atomic_t *v)
187{
188 return atomic_add_return(-i, v);
189}
190
191#define atomic_inc_return(v) (atomic_add_return(1, v))
192#define atomic_dec_return(v) (atomic_sub_return(1, v))
193
194/* A 64-bit atomic type */
195
196typedef struct {
197 long counter;
198} atomic64_t;
199
200#define ATOMIC64_INIT(i) { (i) }
201
202/**
203 * atomic64_read - read atomic64 variable
204 * @v: pointer of type atomic64_t
205 *
206 * Atomically reads the value of @v.
207 * Doesn't imply a read memory barrier.
208 */
209#define atomic64_read(v) ((v)->counter)
210
211/**
212 * atomic64_set - set atomic64 variable
213 * @v: pointer to type atomic64_t
214 * @i: required value
215 *
216 * Atomically sets the value of @v to @i.
217 */
218#define atomic64_set(v, i) (((v)->counter) = (i))
219
220/**
221 * atomic64_add - add integer to atomic64 variable
222 * @i: integer value to add
223 * @v: pointer to type atomic64_t
224 *
225 * Atomically adds @i to @v.
226 */
227static inline void atomic64_add(long i, atomic64_t *v)
228{
229 asm volatile(LOCK_PREFIX "addq %1,%0"
230 : "=m" (v->counter)
231 : "er" (i), "m" (v->counter));
232}
233
234/**
235 * atomic64_sub - subtract integer from atomic64 variable
236 * @i: integer value to subtract
237 * @v: pointer to type atomic64_t
238 *
239 * Atomically subtracts @i from @v.
240 */
241static inline void atomic64_sub(long i, atomic64_t *v)
242{
243 asm volatile(LOCK_PREFIX "subq %1,%0"
244 : "=m" (v->counter)
245 : "er" (i), "m" (v->counter));
246}
247
248/**
249 * atomic64_sub_and_test - subtract value from variable and test result
250 * @i: integer value to subtract
251 * @v: pointer to type atomic64_t
252 *
253 * Atomically subtracts @i from @v and returns
254 * true if the result is zero, or false for all
255 * other cases.
256 */
257static inline int atomic64_sub_and_test(long i, atomic64_t *v)
258{
259 unsigned char c;
260
261 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
262 : "=m" (v->counter), "=qm" (c)
263 : "er" (i), "m" (v->counter) : "memory");
264 return c;
265}
266
267/**
268 * atomic64_inc - increment atomic64 variable
269 * @v: pointer to type atomic64_t
270 *
271 * Atomically increments @v by 1.
272 */
273static inline void atomic64_inc(atomic64_t *v)
274{
275 asm volatile(LOCK_PREFIX "incq %0"
276 : "=m" (v->counter)
277 : "m" (v->counter));
278}
279
280/**
281 * atomic64_dec - decrement atomic64 variable
282 * @v: pointer to type atomic64_t
283 *
284 * Atomically decrements @v by 1.
285 */
286static inline void atomic64_dec(atomic64_t *v)
287{
288 asm volatile(LOCK_PREFIX "decq %0"
289 : "=m" (v->counter)
290 : "m" (v->counter));
291}
292
293/**
294 * atomic64_dec_and_test - decrement and test
295 * @v: pointer to type atomic64_t
296 *
297 * Atomically decrements @v by 1 and
298 * returns true if the result is 0, or false for all other
299 * cases.
300 */
301static inline int atomic64_dec_and_test(atomic64_t *v)
302{
303 unsigned char c;
304
305 asm volatile(LOCK_PREFIX "decq %0; sete %1"
306 : "=m" (v->counter), "=qm" (c)
307 : "m" (v->counter) : "memory");
308 return c != 0;
309}
310
311/**
312 * atomic64_inc_and_test - increment and test
313 * @v: pointer to type atomic64_t
314 *
315 * Atomically increments @v by 1
316 * and returns true if the result is zero, or false for all
317 * other cases.
318 */
319static inline int atomic64_inc_and_test(atomic64_t *v)
320{
321 unsigned char c;
322
323 asm volatile(LOCK_PREFIX "incq %0; sete %1"
324 : "=m" (v->counter), "=qm" (c)
325 : "m" (v->counter) : "memory");
326 return c != 0;
327}
328
329/**
330 * atomic64_add_negative - add and test if negative
331 * @i: integer value to add
332 * @v: pointer to type atomic64_t
333 *
334 * Atomically adds @i to @v and returns true
335 * if the result is negative, or false when
336 * result is greater than or equal to zero.
337 */
338static inline int atomic64_add_negative(long i, atomic64_t *v)
339{
340 unsigned char c;
341
342 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
343 : "=m" (v->counter), "=qm" (c)
344 : "er" (i), "m" (v->counter) : "memory");
345 return c;
346}
347
348/**
349 * atomic64_add_return - add and return
350 * @i: integer value to add
351 * @v: pointer to type atomic64_t
352 *
353 * Atomically adds @i to @v and returns @i + @v
354 */
355static inline long atomic64_add_return(long i, atomic64_t *v)
356{
357 long __i = i;
358 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
359 : "+r" (i), "+m" (v->counter)
360 : : "memory");
361 return i + __i;
362}
363
364static inline long atomic64_sub_return(long i, atomic64_t *v)
365{
366 return atomic64_add_return(-i, v);
367}
368
369#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
370#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
371
372#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
373#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
374
375#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
376#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
377
378/**
379 * atomic_add_unless - add unless the number is a given value
380 * @v: pointer of type atomic_t
381 * @a: the amount to add to v...
382 * @u: ...unless v is equal to u.
383 *
384 * Atomically adds @a to @v, so long as @v was not already @u.
385 * Returns non-zero if @v was not @u, and zero otherwise.
386 */
387static inline int atomic_add_unless(atomic_t *v, int a, int u)
388{
389 int c, old;
390 c = atomic_read(v);
391 for (;;) {
392 if (unlikely(c == (u)))
393 break;
394 old = atomic_cmpxchg((v), c, c + (a));
395 if (likely(old == c))
396 break;
397 c = old;
398 }
399 return c != (u);
400}
401
402#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
403
404/**
405 * atomic64_add_unless - add unless the number is a given value
406 * @v: pointer of type atomic64_t
407 * @a: the amount to add to v...
408 * @u: ...unless v is equal to u.
409 *
410 * Atomically adds @a to @v, so long as @v was not already @u.
411 * Returns non-zero if @v was not @u, and zero otherwise.
412 */
413static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
414{
415 long c, old;
416 c = atomic64_read(v);
417 for (;;) {
418 if (unlikely(c == (u)))
419 break;
420 old = atomic64_cmpxchg((v), c, c + (a));
421 if (likely(old == c))
422 break;
423 c = old;
424 }
425 return c != (u);
426}
427
428/**
429 * atomic_inc_short - increment of a short integer
430 * @v: pointer to type short int
431 *
432 * Atomically adds 1 to @v
433 * Returns the new value of @v
434 */
435static inline short int atomic_inc_short(short int *v)
436{
437 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
438 return *v;
439}
440
441/**
442 * atomic_or_long - OR of two long integers
443 * @v1: pointer to type unsigned long
444 * @v2: value to OR into *@v1
445 *
446 * Atomically ORs @v2 into the value at @v1.
447 * There is no return value; the result is left in *@v1.
448 */
449static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
450{
451 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
452}
453
454#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
455
456/* These are x86-specific, used by some header files */
457#define atomic_clear_mask(mask, addr) \
458 asm volatile(LOCK_PREFIX "andl %0,%1" \
459 : : "r" (~(mask)), "m" (*(addr)) : "memory")
460
461#define atomic_set_mask(mask, addr) \
462 asm volatile(LOCK_PREFIX "orl %0,%1" \
463 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
464 : "memory")
465
466/* Atomic operations are already serializing on x86 */
467#define smp_mb__before_atomic_dec() barrier()
468#define smp_mb__after_atomic_dec() barrier()
469#define smp_mb__before_atomic_inc() barrier()
470#define smp_mb__after_atomic_inc() barrier()
471
472#include <asm-generic/atomic.h>
473#endif /* _ASM_X86_ATOMIC_64_H */
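
Both atomic_add_unless() and atomic64_add_unless() above are the canonical compare-and-swap retry loop: read the counter, compute the new value, and let cmpxchg commit it only if the counter has not changed in the meantime. A standalone sketch of the same loop, assuming GCC's __sync builtins in userspace; add_unless here is an illustrative name, not the kernel interface.

#include <stdio.h>

static int add_unless(long *v, long a, long u)
{
	long c = *v;

	for (;;) {
		if (c == u)		/* already at the forbidden value */
			return 0;
		/* returns whatever was actually in *v at the moment of the swap */
		long old = __sync_val_compare_and_swap(v, c, c + a);
		if (old == c)		/* no one raced with us: committed */
			return 1;
		c = old;		/* lost the race: retry with the fresh value */
	}
}

int main(void)
{
	long v = 1;

	printf("%d %ld\n", add_unless(&v, 1, 0), v);	/* 1 2: added */
	v = 0;
	printf("%d %ld\n", add_unless(&v, 1, 0), v);	/* 0 0: left alone */
	return 0;
}
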
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
new file mode 100644
index 00000000000..1316b4c3542
--- /dev/null
+++ b/arch/x86/include/asm/auxvec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_AUXVEC_H
2#define _ASM_X86_AUXVEC_H
3/*
4 * Architecture-neutral AT_ values in 0-17, leave some room
5 * for more of them, start the x86-specific ones at 32.
6 */
7#ifdef __i386__
8#define AT_SYSINFO 32
9#endif
10#define AT_SYSINFO_EHDR 33
11
12#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
new file mode 100644
index 00000000000..1d9543b9d35
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -0,0 +1,139 @@
1#ifndef __ASM_MACH_APIC_H
2#define __ASM_MACH_APIC_H
3
4#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
5#define esr_disable (1)
6
7static inline int apic_id_registered(void)
8{
9 return (1);
10}
11
12static inline cpumask_t target_cpus(void)
13{
14#ifdef CONFIG_SMP
15 return cpu_online_map;
16#else
17 return cpumask_of_cpu(0);
18#endif
19}
20
21#undef APIC_DEST_LOGICAL
22#define APIC_DEST_LOGICAL 0
23#define APIC_DFR_VALUE (APIC_DFR_FLAT)
24#define INT_DELIVERY_MODE (dest_Fixed)
25#define INT_DEST_MODE (0) /* phys delivery to target proc */
26#define NO_BALANCE_IRQ (0)
27#define WAKE_SECONDARY_VIA_INIT
28
29
30static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
31{
32 return (0);
33}
34
35static inline unsigned long check_apicid_present(int bit)
36{
37 return (1);
38}
39
40static inline unsigned long calculate_ldr(int cpu)
41{
42 unsigned long val, id;
43 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
44 id = xapic_phys_to_log_apicid(cpu);
45 val |= SET_APIC_LOGICAL_ID(id);
46 return val;
47}
48
49/*
50 * Set up the logical destination ID.
51 *
52 * Intel recommends to set DFR, LDR and TPR before enabling
53 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
54 * document number 292116). So here it goes...
55 */
56static inline void init_apic_ldr(void)
57{
58 unsigned long val;
59 int cpu = smp_processor_id();
60
61 apic_write(APIC_DFR, APIC_DFR_VALUE);
62 val = calculate_ldr(cpu);
63 apic_write(APIC_LDR, val);
64}
65
66static inline void setup_apic_routing(void)
67{
68 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
69 "Physflat", nr_ioapics);
70}
71
72static inline int multi_timer_check(int apic, int irq)
73{
74 return (0);
75}
76
77static inline int apicid_to_node(int logical_apicid)
78{
79 return apicid_2_node[hard_smp_processor_id()];
80}
81
82static inline int cpu_present_to_apicid(int mps_cpu)
83{
84 if (mps_cpu < NR_CPUS)
85 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
86
87 return BAD_APICID;
88}
89
90static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
91{
92 return physid_mask_of_physid(phys_apicid);
93}
94
95extern u8 cpu_2_logical_apicid[];
96/* Mapping from cpu number to logical apicid */
97static inline int cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= NR_CPUS)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
105{
106 /* For clustered we don't have a good way to do this yet - hack */
107 return physids_promote(0xFFL);
108}
109
110static inline void setup_portio_remap(void)
111{
112}
113
114static inline void enable_apic_mode(void)
115{
116}
117
118static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
119{
120 return (1);
121}
122
123/* As we are using single CPU as destination, pick only one CPU here */
124static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
125{
126 int cpu;
127 int apicid;
128
129 cpu = first_cpu(cpumask);
130 apicid = cpu_to_logical_apicid(cpu);
131 return apicid;
132}
133
134static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
135{
136 return cpuid_apic >> index_msb;
137}
138
139#endif /* __ASM_MACH_APIC_H */
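
calculate_ldr() above is a plain read-modify-write of the logical destination register: clear the logical-ID field, then OR in the new ID. The arithmetic in isolation, with the 0xFF000000 mask and the shift by 24 mirroring the xAPIC LDR layout, and a made-up value standing in for the apic_read() result:

#include <stdio.h>
#include <stdint.h>

#define LDR_MASK		0xFF000000u
#define SET_LOGICAL_ID(id)	((uint32_t)(id) << 24)

int main(void)
{
	uint32_t val = 0x00000012;	/* hypothetical APIC_LDR contents */

	val &= ~LDR_MASK;		/* drop the old logical ID */
	val |= SET_LOGICAL_ID(0x02);	/* install the new one */
	printf("LDR = %#010x\n", val);	/* 0x02000012 */
	return 0;
}
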
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
new file mode 100644
index 00000000000..392c3f5ef2f
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_MACH_APICDEF_H
2#define __ASM_MACH_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
new file mode 100644
index 00000000000..9404c535b7e
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_MACH_IPI_H
2#define __ASM_MACH_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
new file mode 100644
index 00000000000..3c7521063d3
--- /dev/null
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -0,0 +1,36 @@
1#ifndef _ASM_X86_BIOS_EBDA_H
2#define _ASM_X86_BIOS_EBDA_H
3
4#include <asm/io.h>
5
6/*
7 * there is a real-mode segmented pointer pointing to the
8 * 4K EBDA area at 0x40E.
9 */
10static inline unsigned int get_bios_ebda(void)
11{
12 unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
13 address <<= 4;
14 return address; /* 0 means none */
15}
16
17void reserve_ebda_region(void);
18
19#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
20/*
21 * This is obviously not a great place for this, but we want to be
22 * able to scatter it around anywhere in the kernel.
23 */
24void check_for_bios_corruption(void);
25void start_periodic_check_for_corruption(void);
26#else
27static inline void check_for_bios_corruption(void)
28{
29}
30
31static inline void start_periodic_check_for_corruption(void)
32{
33}
34#endif
35
36#endif /* _ASM_X86_BIOS_EBDA_H */
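
get_bios_ebda() above decodes a classic real-mode pointer: the 16-bit word at 0x40E is a segment, and shifting it left by 4 (multiplying by 16) yields the linear address. A self-contained illustration of the conversion, using 0x9FC0 as a plausible made-up segment value rather than anything read from a live machine:

#include <stdio.h>

int main(void)
{
	unsigned short segment = 0x9FC0;	/* hypothetical word at 0x40E */
	unsigned int address = (unsigned int)segment << 4;

	printf("EBDA at %#07x\n", address);	/* 0x9fc00, just below 640K */
	return 0;
}
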
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
new file mode 100644
index 00000000000..36001032271
--- /dev/null
+++ b/arch/x86/include/asm/bitops.h
@@ -0,0 +1,451 @@
1#ifndef _ASM_X86_BITOPS_H
2#define _ASM_X86_BITOPS_H
3
4/*
5 * Copyright 1992, Linus Torvalds.
6 */
7
8#ifndef _LINUX_BITOPS_H
9#error only <linux/bitops.h> can be included directly
10#endif
11
12#include <linux/compiler.h>
13#include <asm/alternative.h>
14
15/*
16 * These have to be done with inline assembly: that way the bit-setting
17 * is guaranteed to be atomic. All bit operations return 0 if the bit
18 * was cleared before the operation and != 0 if it was not.
19 *
20 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
21 */
22
23#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
24/* Technically wrong, but this avoids compilation errors on some gcc
25 versions. */
26#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
27#else
28#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
29#endif
30
31#define ADDR BITOP_ADDR(addr)
32
33/*
34 * We do the locked ops that don't return the old value as
35 * a mask operation on a byte.
36 */
37#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
38#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
39#define CONST_MASK(nr) (1 << ((nr) & 7))
40
41/**
42 * set_bit - Atomically set a bit in memory
43 * @nr: the bit to set
44 * @addr: the address to start counting from
45 *
46 * This function is atomic and may not be reordered. See __set_bit()
47 * if you do not require the atomic guarantees.
48 *
49 * Note: there are no guarantees that this function will not be reordered
50 * on non x86 architectures, so if you are writing portable code,
51 * make sure not to rely on its reordering guarantees.
52 *
53 * Note that @nr may be almost arbitrarily large; this function is not
54 * restricted to acting on a single-word quantity.
55 */
56static inline void set_bit(unsigned int nr, volatile unsigned long *addr)
57{
58 if (IS_IMMEDIATE(nr)) {
59 asm volatile(LOCK_PREFIX "orb %1,%0"
60 : CONST_MASK_ADDR(nr, addr)
61 : "iq" ((u8)CONST_MASK(nr))
62 : "memory");
63 } else {
64 asm volatile(LOCK_PREFIX "bts %1,%0"
65 : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
66 }
67}
68
69/**
70 * __set_bit - Set a bit in memory
71 * @nr: the bit to set
72 * @addr: the address to start counting from
73 *
74 * Unlike set_bit(), this function is non-atomic and may be reordered.
75 * If it's called on the same region of memory simultaneously, the effect
76 * may be that only one operation succeeds.
77 */
78static inline void __set_bit(int nr, volatile unsigned long *addr)
79{
80 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
81}
82
83/**
84 * clear_bit - Clears a bit in memory
85 * @nr: Bit to clear
86 * @addr: Address to start counting from
87 *
88 * clear_bit() is atomic and may not be reordered. However, it does
89 * not contain a memory barrier, so if it is used for locking purposes,
90 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
91 * in order to ensure changes are visible on other processors.
92 */
93static inline void clear_bit(int nr, volatile unsigned long *addr)
94{
95 if (IS_IMMEDIATE(nr)) {
96 asm volatile(LOCK_PREFIX "andb %1,%0"
97 : CONST_MASK_ADDR(nr, addr)
98 : "iq" ((u8)~CONST_MASK(nr)));
99 } else {
100 asm volatile(LOCK_PREFIX "btr %1,%0"
101 : BITOP_ADDR(addr)
102 : "Ir" (nr));
103 }
104}
105
106/*
107 * clear_bit_unlock - Clears a bit in memory
108 * @nr: Bit to clear
109 * @addr: Address to start counting from
110 *
111 * clear_bit() is atomic and implies release semantics before the memory
112 * operation. It can be used for an unlock.
113 */
114static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
115{
116 barrier();
117 clear_bit(nr, addr);
118}
119
120static inline void __clear_bit(int nr, volatile unsigned long *addr)
121{
122 asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
123}
124
125/*
126 * __clear_bit_unlock - Clears a bit in memory
127 * @nr: Bit to clear
128 * @addr: Address to start counting from
129 *
130 * __clear_bit() is non-atomic and implies release semantics before the memory
131 * operation. It can be used for an unlock if no other CPUs can concurrently
132 * modify other bits in the word.
133 *
134 * No memory barrier is required here, because x86 cannot reorder stores past
135 * older loads. Same principle as spin_unlock.
136 */
137static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
138{
139 barrier();
140 __clear_bit(nr, addr);
141}
142
143#define smp_mb__before_clear_bit() barrier()
144#define smp_mb__after_clear_bit() barrier()
145
146/**
147 * __change_bit - Toggle a bit in memory
148 * @nr: the bit to change
149 * @addr: the address to start counting from
150 *
151 * Unlike change_bit(), this function is non-atomic and may be reordered.
152 * If it's called on the same region of memory simultaneously, the effect
153 * may be that only one operation succeeds.
154 */
155static inline void __change_bit(int nr, volatile unsigned long *addr)
156{
157 asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
158}
159
160/**
161 * change_bit - Toggle a bit in memory
162 * @nr: Bit to change
163 * @addr: Address to start counting from
164 *
165 * change_bit() is atomic and may not be reordered.
166 * Note that @nr may be almost arbitrarily large; this function is not
167 * restricted to acting on a single-word quantity.
168 */
169static inline void change_bit(int nr, volatile unsigned long *addr)
170{
171 asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr));
172}
173
174/**
175 * test_and_set_bit - Set a bit and return its old value
176 * @nr: Bit to set
177 * @addr: Address to count from
178 *
179 * This operation is atomic and cannot be reordered.
180 * It also implies a memory barrier.
181 */
182static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
183{
184 int oldbit;
185
186 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
187 "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
188
189 return oldbit;
190}
191
192/**
193 * test_and_set_bit_lock - Set a bit and return its old value for lock
194 * @nr: Bit to set
195 * @addr: Address to count from
196 *
197 * This is the same as test_and_set_bit on x86.
198 */
199static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
200{
201 return test_and_set_bit(nr, addr);
202}
203
204/**
205 * __test_and_set_bit - Set a bit and return its old value
206 * @nr: Bit to set
207 * @addr: Address to count from
208 *
209 * This operation is non-atomic and can be reordered.
210 * If two instances of this operation race, one can appear to succeed
211 * but actually fail. You must protect multiple accesses with a lock.
212 */
213static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
214{
215 int oldbit;
216
217 asm("bts %2,%1\n\t"
218 "sbb %0,%0"
219 : "=r" (oldbit), ADDR
220 : "Ir" (nr));
221 return oldbit;
222}
223
224/**
225 * test_and_clear_bit - Clear a bit and return its old value
226 * @nr: Bit to clear
227 * @addr: Address to count from
228 *
229 * This operation is atomic and cannot be reordered.
230 * It also implies a memory barrier.
231 */
232static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
233{
234 int oldbit;
235
236 asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
237 "sbb %0,%0"
238 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
239
240 return oldbit;
241}
242
243/**
244 * __test_and_clear_bit - Clear a bit and return its old value
245 * @nr: Bit to clear
246 * @addr: Address to count from
247 *
248 * This operation is non-atomic and can be reordered.
249 * If two instances of this operation race, one can appear to succeed
250 * but actually fail. You must protect multiple accesses with a lock.
251 */
252static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
253{
254 int oldbit;
255
256 asm volatile("btr %2,%1\n\t"
257 "sbb %0,%0"
258 : "=r" (oldbit), ADDR
259 : "Ir" (nr));
260 return oldbit;
261}
262
263/* WARNING: non-atomic and it can be reordered! */
264static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
265{
266 int oldbit;
267
268 asm volatile("btc %2,%1\n\t"
269 "sbb %0,%0"
270 : "=r" (oldbit), ADDR
271 : "Ir" (nr) : "memory");
272
273 return oldbit;
274}
275
276/**
277 * test_and_change_bit - Change a bit and return its old value
278 * @nr: Bit to change
279 * @addr: Address to count from
280 *
281 * This operation is atomic and cannot be reordered.
282 * It also implies a memory barrier.
283 */
284static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
285{
286 int oldbit;
287
288 asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
289 "sbb %0,%0"
290 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
291
292 return oldbit;
293}
294
295static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
296{
297 return ((1UL << (nr % BITS_PER_LONG)) &
298 (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
299}
300
301static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
302{
303 int oldbit;
304
305 asm volatile("bt %2,%1\n\t"
306 "sbb %0,%0"
307 : "=r" (oldbit)
308 : "m" (*(unsigned long *)addr), "Ir" (nr));
309
310 return oldbit;
311}
312
313#if 0 /* Fool kernel-doc since it doesn't do macros yet */
314/**
315 * test_bit - Determine whether a bit is set
316 * @nr: bit number to test
317 * @addr: Address to start counting from
318 */
319static int test_bit(int nr, const volatile unsigned long *addr);
320#endif
321
322#define test_bit(nr, addr) \
323 (__builtin_constant_p((nr)) \
324 ? constant_test_bit((nr), (addr)) \
325 : variable_test_bit((nr), (addr)))
326
327/**
328 * __ffs - find first set bit in word
329 * @word: The word to search
330 *
331 * Undefined if no bit exists, so code should check against 0 first.
332 */
333static inline unsigned long __ffs(unsigned long word)
334{
335 asm("bsf %1,%0"
336 : "=r" (word)
337 : "rm" (word));
338 return word;
339}
340
341/**
342 * ffz - find first zero bit in word
343 * @word: The word to search
344 *
345 * Undefined if no zero exists, so code should check against ~0UL first.
346 */
347static inline unsigned long ffz(unsigned long word)
348{
349 asm("bsf %1,%0"
350 : "=r" (word)
351 : "r" (~word));
352 return word;
353}
354
355/*
356 * __fls: find last set bit in word
357 * @word: The word to search
358 *
359 * Undefined if no set bit exists, so code should check against 0 first.
360 */
361static inline unsigned long __fls(unsigned long word)
362{
363 asm("bsr %1,%0"
364 : "=r" (word)
365 : "rm" (word));
366 return word;
367}
368
369#ifdef __KERNEL__
370/**
371 * ffs - find first set bit in word
372 * @x: the word to search
373 *
374 * This is defined the same way as the libc and compiler builtin ffs
375 * routines, therefore differs in spirit from the other bitops.
376 *
377 * ffs(value) returns 0 if value is 0 or the position of the first
378 * set bit if value is nonzero. The first (least significant) bit
379 * is at position 1.
380 */
381static inline int ffs(int x)
382{
383 int r;
384#ifdef CONFIG_X86_CMOV
385 asm("bsfl %1,%0\n\t"
386 "cmovzl %2,%0"
387 : "=r" (r) : "rm" (x), "r" (-1));
388#else
389 asm("bsfl %1,%0\n\t"
390 "jnz 1f\n\t"
391 "movl $-1,%0\n"
392 "1:" : "=r" (r) : "rm" (x));
393#endif
394 return r + 1;
395}
396
397/**
398 * fls - find last set bit in word
399 * @x: the word to search
400 *
401 * This is defined in a similar way as the libc and compiler builtin
402 * ffs, but returns the position of the most significant set bit.
403 *
404 * fls(value) returns 0 if value is 0 or the position of the last
405 * set bit if value is nonzero. The last (most significant) bit is
406 * at position 32.
407 */
408static inline int fls(int x)
409{
410 int r;
411#ifdef CONFIG_X86_CMOV
412 asm("bsrl %1,%0\n\t"
413 "cmovzl %2,%0"
414 : "=&r" (r) : "rm" (x), "rm" (-1));
415#else
416 asm("bsrl %1,%0\n\t"
417 "jnz 1f\n\t"
418 "movl $-1,%0\n"
419 "1:" : "=r" (r) : "rm" (x));
420#endif
421 return r + 1;
422}
423#endif /* __KERNEL__ */
424
425#undef ADDR
426
427#ifdef __KERNEL__
428
429#include <asm-generic/bitops/sched.h>
430
431#define ARCH_HAS_FAST_MULTIPLIER 1
432
433#include <asm-generic/bitops/hweight.h>
434
435#endif /* __KERNEL__ */
436
437#include <asm-generic/bitops/fls64.h>
438
439#ifdef __KERNEL__
440
441#include <asm-generic/bitops/ext2-non-atomic.h>
442
443#define ext2_set_bit_atomic(lock, nr, addr) \
444 test_and_set_bit((nr), (unsigned long *)(addr))
445#define ext2_clear_bit_atomic(lock, nr, addr) \
446 test_and_clear_bit((nr), (unsigned long *)(addr))
447
448#include <asm-generic/bitops/minix.h>
449
450#endif /* __KERNEL__ */
451#endif /* _ASM_X86_BITOPS_H */
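
The test_and_*_bit() helpers above all lean on one trick: bts, btr and btc copy the bit's previous value into the carry flag, and "sbb %0,%0" then subtracts a register from itself with borrow, smearing CF into 0 or -1 without a branch. A userspace sketch of the atomic test-and-set variant, assuming GCC inline asm on x86; my_test_and_set_bit is an illustrative name, not the kernel symbol.

#include <stdio.h>

static int my_test_and_set_bit(int nr, volatile unsigned long *addr)
{
	int oldbit;

	/* bts: old bit -> CF, then set the bit; sbb %0,%0 turns
	 * CF into 0 (bit was clear) or -1 (bit was set). */
	asm volatile("lock; bts %2,%1\n\t"
		     "sbb %0,%0"
		     : "=r" (oldbit), "+m" (*addr)
		     : "Ir" (nr) : "memory");
	return oldbit;
}

int main(void)
{
	unsigned long word = 0;

	printf("%d\n", my_test_and_set_bit(3, &word) != 0);	/* 0: was clear */
	printf("%d\n", my_test_and_set_bit(3, &word) != 0);	/* 1: was set */
	printf("%#lx\n", word);					/* 0x8 */
	return 0;
}
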
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
new file mode 100644
index 00000000000..dd61616cb73
--- /dev/null
+++ b/arch/x86/include/asm/boot.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H
3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */
11#define ASK_VGA 0xfffd /* ask for it at bootup */
12
13/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
17
18#ifdef CONFIG_X86_64
19#define BOOT_HEAP_SIZE 0x7000
20#define BOOT_STACK_SIZE 0x4000
21#else
22#define BOOT_HEAP_SIZE 0x4000
23#define BOOT_STACK_SIZE 0x1000
24#endif
25
26#endif /* _ASM_X86_BOOT_H */
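
LOAD_PHYSICAL_ADDR above rounds CONFIG_PHYSICAL_START up with the standard power-of-two align-up idiom: add (align - 1), then mask off the low bits. The idiom in isolation, with made-up values standing in for the two config options; ALIGN_UP is an illustrative name.

#include <stdio.h>

/* Only valid when align is a power of two. */
#define ALIGN_UP(x, align)	(((x) + ((align) - 1)) & ~((align) - 1))

int main(void)
{
	printf("%#lx\n", ALIGN_UP(0x100000UL, 0x200000UL));	/* 0x200000 */
	printf("%#lx\n", ALIGN_UP(0x200000UL, 0x200000UL));	/* already aligned */
	return 0;
}
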
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
new file mode 100644
index 00000000000..433adaebf9b
--- /dev/null
+++ b/arch/x86/include/asm/bootparam.h
@@ -0,0 +1,111 @@
1#ifndef _ASM_X86_BOOTPARAM_H
2#define _ASM_X86_BOOTPARAM_H
3
4#include <linux/types.h>
5#include <linux/screen_info.h>
6#include <linux/apm_bios.h>
7#include <linux/edd.h>
8#include <asm/e820.h>
9#include <asm/ist.h>
10#include <video/edid.h>
11
12/* setup data types */
13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1
15
16/* extensible setup data list node */
17struct setup_data {
18 __u64 next;
19 __u32 type;
20 __u32 len;
21 __u8 data[0];
22};
23
24struct setup_header {
25 __u8 setup_sects;
26 __u16 root_flags;
27 __u32 syssize;
28 __u16 ram_size;
29#define RAMDISK_IMAGE_START_MASK 0x07FF
30#define RAMDISK_PROMPT_FLAG 0x8000
31#define RAMDISK_LOAD_FLAG 0x4000
32 __u16 vid_mode;
33 __u16 root_dev;
34 __u16 boot_flag;
35 __u16 jump;
36 __u32 header;
37 __u16 version;
38 __u32 realmode_swtch;
39 __u16 start_sys;
40 __u16 kernel_version;
41 __u8 type_of_loader;
42 __u8 loadflags;
43#define LOADED_HIGH (1<<0)
44#define QUIET_FLAG (1<<5)
45#define KEEP_SEGMENTS (1<<6)
46#define CAN_USE_HEAP (1<<7)
47 __u16 setup_move_size;
48 __u32 code32_start;
49 __u32 ramdisk_image;
50 __u32 ramdisk_size;
51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr;
53 __u16 _pad1;
54 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max;
56 __u32 kernel_alignment;
57 __u8 relocatable_kernel;
58 __u8 _pad2[3];
59 __u32 cmdline_size;
60 __u32 hardware_subarch;
61 __u64 hardware_subarch_data;
62 __u32 payload_offset;
63 __u32 payload_length;
64 __u64 setup_data;
65} __attribute__((packed));
66
67struct sys_desc_table {
68 __u16 length;
69 __u8 table[14];
70};
71
72struct efi_info {
73 __u32 efi_loader_signature;
74 __u32 efi_systab;
75 __u32 efi_memdesc_size;
76 __u32 efi_memdesc_version;
77 __u32 efi_memmap;
78 __u32 efi_memmap_size;
79 __u32 efi_systab_hi;
80 __u32 efi_memmap_hi;
81};
82
83/* The so-called "zeropage" */
84struct boot_params {
85 struct screen_info screen_info; /* 0x000 */
86 struct apm_bios_info apm_bios_info; /* 0x040 */
87 __u8 _pad2[12]; /* 0x054 */
88 struct ist_info ist_info; /* 0x060 */
89 __u8 _pad3[16]; /* 0x070 */
90 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
91 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
92 struct sys_desc_table sys_desc_table; /* 0x0a0 */
93 __u8 _pad4[144]; /* 0x0b0 */
94 struct edid_info edid_info; /* 0x140 */
95 struct efi_info efi_info; /* 0x1c0 */
96 __u32 alt_mem_k; /* 0x1e0 */
97 __u32 scratch; /* Scratch field! */ /* 0x1e4 */
98 __u8 e820_entries; /* 0x1e8 */
99 __u8 eddbuf_entries; /* 0x1e9 */
100 __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
101 __u8 _pad6[6]; /* 0x1eb */
102 struct setup_header hdr; /* setup header */ /* 0x1f1 */
103 __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
104 __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
105 struct e820entry e820_map[E820MAX]; /* 0x2d0 */
106 __u8 _pad8[48]; /* 0xcd0 */
107 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
108 __u8 _pad9[276]; /* 0xeec */
109} __attribute__((packed));
110
111#endif /* _ASM_X86_BOOTPARAM_H */
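
Every field of struct boot_params above sits at an offset fixed by the boot protocol, which is what the hex comments record. Invariants like these can be verified at build time with offsetof(); the sketch below applies that idiom to a toy packed struct rather than to boot_params itself, and CHECK_OFFSET is an illustrative stand-in for the kernel's BUILD_BUG_ON.

#include <stddef.h>
#include <stdio.h>

struct toy_params {
	unsigned char screen[0x40];	/* 0x000 */
	unsigned char apm[0x14];	/* 0x040 */
	unsigned char pad[12];		/* 0x054 */
	unsigned int ist;		/* 0x060 */
} __attribute__((packed));

/* Compile-time check: array size is 1 if the offset matches,
 * -1 (a build error) if it does not. */
#define CHECK_OFFSET(f, off) \
	typedef char check_##f[(offsetof(struct toy_params, f) == (off)) ? 1 : -1]

CHECK_OFFSET(apm, 0x040);
CHECK_OFFSET(ist, 0x060);

int main(void)
{
	printf("ist lives at %#zx\n", offsetof(struct toy_params, ist));
	return 0;
}
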
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
new file mode 100644
index 00000000000..3def2065fce
--- /dev/null
+++ b/arch/x86/include/asm/bug.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_BUG_H
2#define _ASM_X86_BUG_H
3
4#ifdef CONFIG_BUG
5#define HAVE_ARCH_BUG
6
7#ifdef CONFIG_DEBUG_BUGVERBOSE
8
9#ifdef CONFIG_X86_32
10# define __BUG_C0 "2:\t.long 1b, %c0\n"
11#else
12# define __BUG_C0 "2:\t.quad 1b, %c0\n"
13#endif
14
15#define BUG() \
16do { \
17 asm volatile("1:\tud2\n" \
18 ".pushsection __bug_table,\"a\"\n" \
19 __BUG_C0 \
20 "\t.word %c1, 0\n" \
21 "\t.org 2b+%c2\n" \
22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \
26} while (0)
27
28#else
29#define BUG() \
30do { \
31 asm volatile("ud2"); \
32 for (;;) ; \
33} while (0)
34#endif
35
36#endif /* CONFIG_BUG */
37
38#include <asm-generic/bug.h>
39#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
new file mode 100644
index 00000000000..08abf639075
--- /dev/null
+++ b/arch/x86/include/asm/bugs.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_BUGS_H
2#define _ASM_X86_BUGS_H
3
4extern void check_bugs(void);
5
6#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
7int ppro_with_ram_bug(void);
8#else
9static inline int ppro_with_ram_bug(void) { return 0; }
10#endif
11
12#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
new file mode 100644
index 00000000000..e02ae2d89ac
--- /dev/null
+++ b/arch/x86/include/asm/byteorder.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_BYTEORDER_H
2#define _ASM_X86_BYTEORDER_H
3
4#include <asm/types.h>
5#include <linux/compiler.h>
6
7#ifdef __GNUC__
8
9#ifdef __i386__
10
11static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
12{
13#ifdef CONFIG_X86_BSWAP
14 asm("bswap %0" : "=r" (x) : "0" (x));
15#else
16 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
17 "rorl $16,%0\n\t" /* swap words */
18 "xchgb %b0,%h0" /* swap higher bytes */
19 : "=q" (x)
20 : "0" (x));
21#endif
22 return x;
23}
24
25static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
26{
27 union {
28 struct {
29 __u32 a;
30 __u32 b;
31 } s;
32 __u64 u;
33 } v;
34 v.u = val;
35#ifdef CONFIG_X86_BSWAP
36 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
37 : "=r" (v.s.a), "=r" (v.s.b)
38 : "0" (v.s.a), "1" (v.s.b));
39#else
40 v.s.a = ___arch__swab32(v.s.a);
41 v.s.b = ___arch__swab32(v.s.b);
42 asm("xchgl %0,%1"
43 : "=r" (v.s.a), "=r" (v.s.b)
44 : "0" (v.s.a), "1" (v.s.b));
45#endif
46 return v.u;
47}
48
49#else /* __i386__ */
50
51static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
52{
53 asm("bswapq %0"
54 : "=r" (x)
55 : "0" (x));
56 return x;
57}
58
59static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
60{
61 asm("bswapl %0"
62 : "=r" (x)
63 : "0" (x));
64 return x;
65}
66
67#endif
68
69/* Do not define swab16. Gcc is smart enough to recognize the C version and
70   convert it into a rotation or exchange. */
71
72#define __arch__swab64(x) ___arch__swab64(x)
73#define __arch__swab32(x) ___arch__swab32(x)
74
75#define __BYTEORDER_HAS_U64__
76
77#endif /* __GNUC__ */
78
79#include <linux/byteorder/little_endian.h>
80
81#endif /* _ASM_X86_BYTEORDER_H */
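
The !CONFIG_X86_BSWAP branch of ___arch__swab32() above is the pre-486 byte swap: exchange the two bytes of the low half, rotate the halves past each other, exchange again. A portable C restatement of the same shuffle, which modern compilers will recognize and turn back into a single bswap:

#include <stdio.h>
#include <stdint.h>

static uint32_t my_swab32(uint32_t x)
{
	/* swap the bytes inside each 16-bit half (the xchgb steps) */
	x = ((x & 0x00FF00FFu) << 8) | ((x & 0xFF00FF00u) >> 8);
	/* swap the two halves (the rorl $16 step) */
	return (x << 16) | (x >> 16);
}

int main(void)
{
	printf("%#x\n", my_swab32(0x12345678));	/* 0x78563412 */
	return 0;
}
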
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
new file mode 100644
index 00000000000..5d367caa0e3
--- /dev/null
+++ b/arch/x86/include/asm/cache.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_CACHE_H
2#define _ASM_X86_CACHE_H
3
4/* L1 cache line size */
5#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
6#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
7
8#define __read_mostly __attribute__((__section__(".data.read_mostly")))
9
10#ifdef CONFIG_X86_VSMP
11/* vSMP Internode cacheline shift */
12#define INTERNODE_CACHE_SHIFT (12)
13#ifdef CONFIG_SMP
14#define __cacheline_aligned_in_smp \
15 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
16 __attribute__((__section__(".data.page_aligned")))
17#endif
18#endif
19
20#endif /* _ASM_X86_CACHE_H */
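
The point of __cacheline_aligned_in_smp above is to push independently written hot data onto separate cache lines (on vSMP, onto separate 4K internode lines). The same aligned-attribute technique in userspace, assuming a 64-byte L1 line, i.e. L1_CACHE_SHIFT == 6, which is typical but configuration dependent:

#include <stdio.h>

struct counters {
	unsigned long a __attribute__((aligned(64)));
	unsigned long b __attribute__((aligned(64)));	/* own line: no false sharing */
};

int main(void)
{
	printf("sizeof = %zu\n", sizeof(struct counters));	/* 128 */
	return 0;
}
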
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
new file mode 100644
index 00000000000..2f8466540fb
--- /dev/null
+++ b/arch/x86/include/asm/cacheflush.h
@@ -0,0 +1,118 @@
1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H
3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the intel. */
8#define flush_cache_all() do { } while (0)
9#define flush_cache_mm(mm) do { } while (0)
10#define flush_cache_dup_mm(mm) do { } while (0)
11#define flush_cache_range(vma, start, end) do { } while (0)
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
13#define flush_dcache_page(page) do { } while (0)
14#define flush_dcache_mmap_lock(mapping) do { } while (0)
15#define flush_dcache_mmap_unlock(mapping) do { } while (0)
16#define flush_icache_range(start, end) do { } while (0)
17#define flush_icache_page(vma, pg) do { } while (0)
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0)
19#define flush_cache_vmap(start, end) do { } while (0)
20#define flush_cache_vunmap(start, end) do { } while (0)
21
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
23 memcpy((dst), (src), (len))
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
25 memcpy((dst), (src), (len))
26
27#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB)
29
30/*
31 * The set_memory_* API can be used to change various attributes of a virtual
32 * address range. The attributes include:
33 * Cacheability : UnCached, WriteCombining, WriteBack
34 * Executability : eXecutable, NoteXecutable
35 * Read/Write : ReadOnly, ReadWrite
36 * Presence : NotPresent
37 *
38 * Within a category, the attributes are mutually exclusive.
39 *
40 * The implementation of this API will take care of various aspects that
41 * are associated with changing such attributes, such as:
42 * - Flushing TLBs
43 * - Flushing CPU caches
44 * - Making sure aliases of the memory behind the mapping don't violate
45 * coherency rules as defined by the CPU in the system.
46 *
47 * What this API does not do:
48 * - Provide exclusion between various callers - including callers that
49 * operate on other mappings of the same physical page
50 * - Restore default attributes when a page is freed
51 * - Guarantee that mappings other than the requested one are
52 * in any state, other than that these do not violate rules for
53 * the CPU you have. Do not depend on any effects on other mappings,
54 * CPUs other than the one you have may have more relaxed rules.
55 * The caller is required to take care of these.
56 */
57
58int _set_memory_uc(unsigned long addr, int numpages);
59int _set_memory_wc(unsigned long addr, int numpages);
60int _set_memory_wb(unsigned long addr, int numpages);
61int set_memory_uc(unsigned long addr, int numpages);
62int set_memory_wc(unsigned long addr, int numpages);
63int set_memory_wb(unsigned long addr, int numpages);
64int set_memory_x(unsigned long addr, int numpages);
65int set_memory_nx(unsigned long addr, int numpages);
66int set_memory_ro(unsigned long addr, int numpages);
67int set_memory_rw(unsigned long addr, int numpages);
68int set_memory_np(unsigned long addr, int numpages);
69int set_memory_4k(unsigned long addr, int numpages);
70
71int set_memory_array_uc(unsigned long *addr, int addrinarray);
72int set_memory_array_wb(unsigned long *addr, int addrinarray);
73
74/*
75 * For legacy compatibility with the old APIs, a few functions
76 * are provided that work on a "struct page".
77 * These functions operate ONLY on the 1:1 kernel mapping of the
78 * memory that the struct page represents, and internally just
79 * call the set_memory_* function. See the description of the
80 * set_memory_* function for more details on conventions.
81 *
82 * These APIs should be considered *deprecated* and are likely going to
83 * be removed in the future.
84 * The reason for this is the implicit operation on the 1:1 mapping only,
85 * making this not a generally useful API.
86 *
87 * Specifically, many users of the old APIs had a virtual address,
88 * called virt_to_page() or vmalloc_to_page() on that address to
89 * get a struct page* that the old API required.
90 * To convert these cases, use set_memory_*() on the original
91 * virtual address, do not use these functions.
92 */
93
94int set_pages_uc(struct page *page, int numpages);
95int set_pages_wb(struct page *page, int numpages);
96int set_pages_x(struct page *page, int numpages);
97int set_pages_nx(struct page *page, int numpages);
98int set_pages_ro(struct page *page, int numpages);
99int set_pages_rw(struct page *page, int numpages);
100
101
102void clflush_cache_range(void *addr, unsigned int size);
103
104#ifdef CONFIG_DEBUG_RODATA
105void mark_rodata_ro(void);
106extern const int rodata_test_data;
107#endif
108
109#ifdef CONFIG_DEBUG_RODATA_TEST
110int rodata_test(void);
111#else
112static inline int rodata_test(void)
113{
114 return 0;
115}
116#endif
117
118#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
new file mode 100644
index 00000000000..b03bedb62aa
--- /dev/null
+++ b/arch/x86/include/asm/calgary.h
@@ -0,0 +1,72 @@
1/*
2 * Derived from include/asm-powerpc/iommu.h
3 *
4 * Copyright IBM Corporation, 2006-2007
5 *
6 * Author: Jon Mason <jdmason@us.ibm.com>
7 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _ASM_X86_CALGARY_H
25#define _ASM_X86_CALGARY_H
26
27#include <linux/spinlock.h>
28#include <linux/device.h>
29#include <linux/dma-mapping.h>
30#include <linux/timer.h>
31#include <asm/types.h>
32
33struct iommu_table {
34 struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
35 unsigned long it_base; /* mapped address of tce table */
36 unsigned long it_hint; /* Hint for next alloc */
37 unsigned long *it_map; /* A simple allocation bitmap for now */
38 void __iomem *bbar; /* Bridge BAR */
39 u64 tar_val; /* Table Address Register */
40 struct timer_list watchdog_timer;
41 spinlock_t it_lock; /* Protects it_map */
42 unsigned int it_size; /* Size of iommu table in entries */
43 unsigned char it_busno; /* Bus number this table belongs to */
44};
45
46struct cal_chipset_ops {
47 void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
48 void (*tce_cache_blast)(struct iommu_table *tbl);
49 void (*dump_error_regs)(struct iommu_table *tbl);
50};
51
52#define TCE_TABLE_SIZE_UNSPECIFIED ~0
53#define TCE_TABLE_SIZE_64K 0
54#define TCE_TABLE_SIZE_128K 1
55#define TCE_TABLE_SIZE_256K 2
56#define TCE_TABLE_SIZE_512K 3
57#define TCE_TABLE_SIZE_1M 4
58#define TCE_TABLE_SIZE_2M 5
59#define TCE_TABLE_SIZE_4M 6
60#define TCE_TABLE_SIZE_8M 7
61
62extern int use_calgary;
63
64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void);
67#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; }
70#endif
71
72#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
new file mode 100644
index 00000000000..2bc162e0ec6
--- /dev/null
+++ b/arch/x86/include/asm/calling.h
@@ -0,0 +1,170 @@
1/*
2 * Some macros to handle stack frames in assembly.
3 */
4
5#define R15 0
6#define R14 8
7#define R13 16
8#define R12 24
9#define RBP 32
10#define RBX 40
11
12/* arguments: interrupts/non-tracing syscalls only save up to here */
13#define R11 48
14#define R10 56
15#define R9 64
16#define R8 72
17#define RAX 80
18#define RCX 88
19#define RDX 96
20#define RSI 104
21#define RDI 112
22#define ORIG_RAX 120 /* + error_code */
23/* end of arguments */
24
25/* cpu exception frame or undefined in case of fast syscall. */
26#define RIP 128
27#define CS 136
28#define EFLAGS 144
29#define RSP 152
30#define SS 160
31
32#define ARGOFFSET R11
33#define SWFRAME ORIG_RAX
34
35 .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
36 subq $9*8+\addskip, %rsp
37 CFI_ADJUST_CFA_OFFSET 9*8+\addskip
38 movq %rdi, 8*8(%rsp)
39 CFI_REL_OFFSET rdi, 8*8
40 movq %rsi, 7*8(%rsp)
41 CFI_REL_OFFSET rsi, 7*8
42 movq %rdx, 6*8(%rsp)
43 CFI_REL_OFFSET rdx, 6*8
44 .if \norcx
45 .else
46 movq %rcx, 5*8(%rsp)
47 CFI_REL_OFFSET rcx, 5*8
48 .endif
49 movq %rax, 4*8(%rsp)
50 CFI_REL_OFFSET rax, 4*8
51 .if \nor891011
52 .else
53 movq %r8, 3*8(%rsp)
54 CFI_REL_OFFSET r8, 3*8
55 movq %r9, 2*8(%rsp)
56 CFI_REL_OFFSET r9, 2*8
57 movq %r10, 1*8(%rsp)
58 CFI_REL_OFFSET r10, 1*8
59 movq %r11, (%rsp)
60 CFI_REL_OFFSET r11, 0*8
61 .endif
62 .endm
63
64#define ARG_SKIP 9*8
65
66 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
67 skipr8910=0, skiprdx=0
68 .if \skipr11
69 .else
70 movq (%rsp), %r11
71 CFI_RESTORE r11
72 .endif
73 .if \skipr8910
74 .else
75 movq 1*8(%rsp), %r10
76 CFI_RESTORE r10
77 movq 2*8(%rsp), %r9
78 CFI_RESTORE r9
79 movq 3*8(%rsp), %r8
80 CFI_RESTORE r8
81 .endif
82 .if \skiprax
83 .else
84 movq 4*8(%rsp), %rax
85 CFI_RESTORE rax
86 .endif
87 .if \skiprcx
88 .else
89 movq 5*8(%rsp), %rcx
90 CFI_RESTORE rcx
91 .endif
92 .if \skiprdx
93 .else
94 movq 6*8(%rsp), %rdx
95 CFI_RESTORE rdx
96 .endif
97 movq 7*8(%rsp), %rsi
98 CFI_RESTORE rsi
99 movq 8*8(%rsp), %rdi
100 CFI_RESTORE rdi
101 .if ARG_SKIP+\addskip > 0
102 addq $ARG_SKIP+\addskip, %rsp
103 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
104 .endif
105 .endm
106
107 .macro LOAD_ARGS offset, skiprax=0
108 movq \offset(%rsp), %r11
109 movq \offset+8(%rsp), %r10
110 movq \offset+16(%rsp), %r9
111 movq \offset+24(%rsp), %r8
112 movq \offset+40(%rsp), %rcx
113 movq \offset+48(%rsp), %rdx
114 movq \offset+56(%rsp), %rsi
115 movq \offset+64(%rsp), %rdi
116 .if \skiprax
117 .else
118 movq \offset+72(%rsp), %rax
119 .endif
120 .endm
121
122#define REST_SKIP 6*8
123
124 .macro SAVE_REST
125 subq $REST_SKIP, %rsp
126 CFI_ADJUST_CFA_OFFSET REST_SKIP
127 movq %rbx, 5*8(%rsp)
128 CFI_REL_OFFSET rbx, 5*8
129 movq %rbp, 4*8(%rsp)
130 CFI_REL_OFFSET rbp, 4*8
131 movq %r12, 3*8(%rsp)
132 CFI_REL_OFFSET r12, 3*8
133 movq %r13, 2*8(%rsp)
134 CFI_REL_OFFSET r13, 2*8
135 movq %r14, 1*8(%rsp)
136 CFI_REL_OFFSET r14, 1*8
137 movq %r15, (%rsp)
138 CFI_REL_OFFSET r15, 0*8
139 .endm
140
141 .macro RESTORE_REST
142 movq (%rsp), %r15
143 CFI_RESTORE r15
144 movq 1*8(%rsp), %r14
145 CFI_RESTORE r14
146 movq 2*8(%rsp), %r13
147 CFI_RESTORE r13
148 movq 3*8(%rsp), %r12
149 CFI_RESTORE r12
150 movq 4*8(%rsp), %rbp
151 CFI_RESTORE rbp
152 movq 5*8(%rsp), %rbx
153 CFI_RESTORE rbx
154 addq $REST_SKIP, %rsp
155 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
156 .endm
157
158 .macro SAVE_ALL
159 SAVE_ARGS
160 SAVE_REST
161 .endm
162
163 .macro RESTORE_ALL addskip=0
164 RESTORE_REST
165 RESTORE_ARGS 0, \addskip
166 .endm
167
168 .macro icebp
169 .byte 0xf1
170 .endm
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
new file mode 100644
index 00000000000..848850fd7d6
--- /dev/null
+++ b/arch/x86/include/asm/checksum.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "checksum_32.h"
3#else
4# include "checksum_64.h"
5#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
new file mode 100644
index 00000000000..7c5ef8b14d9
--- /dev/null
+++ b/arch/x86/include/asm/checksum_32.h
@@ -0,0 +1,189 @@
1#ifndef _ASM_X86_CHECKSUM_32_H
2#define _ASM_X86_CHECKSUM_32_H
3
4#include <linux/in6.h>
5
6#include <asm/uaccess.h>
7
8/*
9 * computes the checksum of a memory block at buff, length len,
10 * and adds in "sum" (32-bit)
11 *
12 * returns a 32-bit number suitable for feeding into itself
13 * or csum_tcpudp_magic
14 *
15 * this function must be called with even lengths, except
16 * for the last fragment, which may be odd
17 *
18 * it's best to have buff aligned on a 32-bit boundary
19 */
20asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
21
22/*
23 * the same as csum_partial, but copies from src while it
24 * checksums, and handles user-space pointer exceptions correctly, when needed.
25 *
26 * here even more important to align src and dst on a 32-bit (or even
27 * better 64-bit) boundary
28 */
29
30asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
31 int len, __wsum sum,
32 int *src_err_ptr, int *dst_err_ptr);
33
34/*
35 * Note: when you get a NULL pointer exception here this means someone
36 * passed in an incorrect kernel address to one of these functions.
37 *
38 * If you use these functions directly please don't forget the
39 * access_ok().
40 */
41static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
42 int len, __wsum sum)
43{
44 return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
45}
46
47static inline __wsum csum_partial_copy_from_user(const void __user *src,
48 void *dst,
49 int len, __wsum sum,
50 int *err_ptr)
51{
52 might_sleep();
53 return csum_partial_copy_generic((__force void *)src, dst,
54 len, sum, err_ptr, NULL);
55}
56
57/*
58 * This is a version of ip_compute_csum() optimized for IP headers,
59 * which are always checksummed on 4-octet boundaries.
60 *
61 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
62 * Arnt Gulbrandsen.
63 */
64static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
65{
66 unsigned int sum;
67
68 asm volatile("movl (%1), %0 ;\n"
69 "subl $4, %2 ;\n"
70 "jbe 2f ;\n"
71 "addl 4(%1), %0 ;\n"
72 "adcl 8(%1), %0 ;\n"
73 "adcl 12(%1), %0;\n"
74 "1: adcl 16(%1), %0 ;\n"
75 "lea 4(%1), %1 ;\n"
76 "decl %2 ;\n"
77 "jne 1b ;\n"
78 "adcl $0, %0 ;\n"
79 "movl %0, %2 ;\n"
80 "shrl $16, %0 ;\n"
81 "addw %w2, %w0 ;\n"
82 "adcl $0, %0 ;\n"
83 "notl %0 ;\n"
84 "2: ;\n"
85 /* Since the input registers which are loaded with iph and ihl
86 are modified, we must also specify them as outputs, or gcc
87 will assume they contain their original values. */
88 : "=r" (sum), "=r" (iph), "=r" (ihl)
89 : "1" (iph), "2" (ihl)
90 : "memory");
91 return (__force __sum16)sum;
92}
93
94/*
95 * Fold a partial checksum
96 */
97
98static inline __sum16 csum_fold(__wsum sum)
99{
100 asm("addl %1, %0 ;\n"
101 "adcl $0xffff, %0 ;\n"
102 : "=r" (sum)
103 : "r" ((__force u32)sum << 16),
104 "0" ((__force u32)sum & 0xffff0000));
105 return (__force __sum16)(~(__force u32)sum >> 16);
106}
107
108static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
109 unsigned short len,
110 unsigned short proto,
111 __wsum sum)
112{
113 asm("addl %1, %0 ;\n"
114 "adcl %2, %0 ;\n"
115 "adcl %3, %0 ;\n"
116 "adcl $0, %0 ;\n"
117 : "=r" (sum)
118 : "g" (daddr), "g"(saddr),
119 "g" ((len + proto) << 8), "0" (sum));
120 return sum;
121}
122
123/*
124 * computes the checksum of the TCP/UDP pseudo-header
125 * returns a 16-bit checksum, already complemented
126 */
127static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
128 unsigned short len,
129 unsigned short proto,
130 __wsum sum)
131{
132 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
133}
134
135/*
136 * this routine is used for miscellaneous IP-like checksums, mainly
137 * in icmp.c
138 */
139
140static inline __sum16 ip_compute_csum(const void *buff, int len)
141{
142 return csum_fold(csum_partial(buff, len, 0));
143}
144
145#define _HAVE_ARCH_IPV6_CSUM
146static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
147 const struct in6_addr *daddr,
148 __u32 len, unsigned short proto,
149 __wsum sum)
150{
151 asm("addl 0(%1), %0 ;\n"
152 "adcl 4(%1), %0 ;\n"
153 "adcl 8(%1), %0 ;\n"
154 "adcl 12(%1), %0 ;\n"
155 "adcl 0(%2), %0 ;\n"
156 "adcl 4(%2), %0 ;\n"
157 "adcl 8(%2), %0 ;\n"
158 "adcl 12(%2), %0 ;\n"
159 "adcl %3, %0 ;\n"
160 "adcl %4, %0 ;\n"
161 "adcl $0, %0 ;\n"
162 : "=&r" (sum)
163 : "r" (saddr), "r" (daddr),
164 "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
165
166 return csum_fold(sum);
167}
168
169/*
170 * Copy and checksum to user
171 */
172#define HAVE_CSUM_COPY_USER
173static inline __wsum csum_and_copy_to_user(const void *src,
174 void __user *dst,
175 int len, __wsum sum,
176 int *err_ptr)
177{
178 might_sleep();
179 if (access_ok(VERIFY_WRITE, dst, len))
180 return csum_partial_copy_generic(src, (__force void *)dst,
181 len, sum, NULL, err_ptr);
182
183 if (len)
184 *err_ptr = -EFAULT;
185
186 return (__force __wsum)-1; /* invalid checksum */
187}
188
189#endif /* _ASM_X86_CHECKSUM_32_H */
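A quick illustration of the csum_fold() trick above: the asm folds the
32-bit one's-complement sum down to 16 bits with an end-around carry, then
inverts it. A minimal portable sketch of the same operation (user-space C,
for illustration only; not part of this patch):

    #include <stdint.h>

    static uint16_t fold_csum(uint32_t sum)
    {
            sum = (sum & 0xffff) + (sum >> 16); /* fold high half into low */
            sum = (sum & 0xffff) + (sum >> 16); /* absorb the possible carry */
            return (uint16_t)~sum;              /* one's complement */
    }

Feeding csum_partial()'s unfolded result through this yields the value that
goes into an IP or ICMP header checksum field.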
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
new file mode 100644
index 00000000000..9bfdc41629e
--- /dev/null
+++ b/arch/x86/include/asm/checksum_64.h
@@ -0,0 +1,191 @@
1#ifndef _ASM_X86_CHECKSUM_64_H
2#define _ASM_X86_CHECKSUM_64_H
3
4/*
5 * Checksums for x86-64
6 * Copyright 2002 by Andi Kleen, SuSE Labs
7 * with some code from asm-x86/checksum.h
8 */
9
10#include <linux/compiler.h>
11#include <asm/uaccess.h>
12#include <asm/byteorder.h>
13
14/**
15 * csum_fold - Fold and invert a 32bit checksum.
16 * sum: 32bit unfolded sum
17 *
18 * Fold a 32bit running checksum to 16bit and invert it. This is usually
19 * the last step before putting a checksum into a packet.
20 * Make sure not to mix with 64bit checksums.
21 */
22static inline __sum16 csum_fold(__wsum sum)
23{
24 asm(" addl %1,%0\n"
25 " adcl $0xffff,%0"
26 : "=r" (sum)
27 : "r" ((__force u32)sum << 16),
28 "0" ((__force u32)sum & 0xffff0000));
29 return (__force __sum16)(~(__force u32)sum >> 16);
30}
31
32/*
33 * This is a version of ip_compute_csum() optimized for IP headers,
34 * which are always checksummed on 4-octet boundaries.
35 *
36 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
37 * Arnt Gulbrandsen.
38 */
39
40/**
41 * ip_fast_csum - Compute the IPv4 header checksum efficiently.
42 * iph: ipv4 header
43 * ihl: length of header / 4
44 */
45static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
46{
47 unsigned int sum;
48
49 asm(" movl (%1), %0\n"
50 " subl $4, %2\n"
51 " jbe 2f\n"
52 " addl 4(%1), %0\n"
53 " adcl 8(%1), %0\n"
54 " adcl 12(%1), %0\n"
55 "1: adcl 16(%1), %0\n"
56 " lea 4(%1), %1\n"
57 " decl %2\n"
58 " jne 1b\n"
59 " adcl $0, %0\n"
60 " movl %0, %2\n"
61 " shrl $16, %0\n"
62 " addw %w2, %w0\n"
63 " adcl $0, %0\n"
64 " notl %0\n"
65 "2:"
66 /* Since the input registers which are loaded with iph and ihl
67 are modified, we must also specify them as outputs, or gcc
68 will assume they contain their original values. */
69 : "=r" (sum), "=r" (iph), "=r" (ihl)
70 : "1" (iph), "2" (ihl)
71 : "memory");
72 return (__force __sum16)sum;
73}
74
75/**
76 * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum.
77 * @saddr: source address
78 * @daddr: destination address
79 * @len: length of packet
80 * @proto: ip protocol of packet
81 * @sum: initial sum to be added in (32bit unfolded)
82 *
83 * Returns the pseudo header checksum of the input data. The result is
84 * 32bit unfolded.
85 */
86static inline __wsum
87csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
88 unsigned short proto, __wsum sum)
89{
90 asm(" addl %1, %0\n"
91 " adcl %2, %0\n"
92 " adcl %3, %0\n"
93 " adcl $0, %0\n"
94 : "=r" (sum)
95 : "g" (daddr), "g" (saddr),
96 "g" ((len + proto)<<8), "0" (sum));
97 return sum;
98}
99
100
101/**
102 * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum.
103 * @saddr: source address
104 * @daddr: destination address
105 * @len: length of packet
106 * @proto: ip protocol of packet
107 * @sum: initial sum to be added in (32bit unfolded)
108 *
109 * Returns the 16bit pseudo header checksum of the input data, already
110 * complemented and ready to be filled in.
111 */
112static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
113 unsigned short len,
114 unsigned short proto, __wsum sum)
115{
116 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
117}
118
119/**
120 * csum_partial - Compute an internet checksum.
121 * @buff: buffer to be checksummed
122 * @len: length of buffer.
123 * @sum: initial sum to be added in (32bit unfolded)
124 *
125 * Returns the 32bit unfolded internet checksum of the buffer.
126 * Before filling it in, it needs to be csum_fold()'ed.
127 * buff should be aligned to a 64bit boundary if possible.
128 */
129extern __wsum csum_partial(const void *buff, int len, __wsum sum);
130
131#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
132#define HAVE_CSUM_COPY_USER 1
133
134
135/* Do not call this directly. Use the wrappers below */
136extern __wsum csum_partial_copy_generic(const void *src, const void *dst,
137 int len, __wsum sum,
138 int *src_err_ptr, int *dst_err_ptr);
139
140
141extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
142 int len, __wsum isum, int *errp);
143extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
144 int len, __wsum isum, int *errp);
145extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
146 int len, __wsum sum);
147
148/* Old names. To be removed. */
149#define csum_and_copy_to_user csum_partial_copy_to_user
150#define csum_and_copy_from_user csum_partial_copy_from_user
151
152/**
153 * ip_compute_csum - Compute a 16bit IP checksum.
154 * @buff: buffer address.
155 * @len: length of buffer.
156 *
157 * Returns the 16bit folded/inverted checksum of the passed buffer.
158 * Ready to fill in.
159 */
160extern __sum16 ip_compute_csum(const void *buff, int len);
161
162/**
163 * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
164 * @saddr: source address
165 * @daddr: destination address
166 * @len: length of packet
167 * @proto: protocol of packet
168 * @sum: initial sum (32bit unfolded) to be added in
169 *
170 * Computes an IPv6 pseudo header checksum. This sum is added into the
171 * checksum of UDP/TCP packets and contains some link layer information.
172 * Returns the unfolded 32bit checksum.
173 */
174
175struct in6_addr;
176
177#define _HAVE_ARCH_IPV6_CSUM 1
178extern __sum16
179csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
180 __u32 len, unsigned short proto, __wsum sum);
181
182static inline unsigned add32_with_carry(unsigned a, unsigned b)
183{
184 asm("addl %2,%0\n\t"
185 "adcl $0,%0"
186 : "=r" (a)
187 : "0" (a), "r" (b));
188 return a;
189}
190
191#endif /* _ASM_X86_CHECKSUM_64_H */
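The add32_with_carry() helper above is the core one's-complement add: a
32-bit addition whose carry-out is wrapped back into bit 0. A portable
sketch of the same operation (illustration only, not part of this patch):

    #include <stdint.h>

    static uint32_t add32_with_carry_c(uint32_t a, uint32_t b)
    {
            uint64_t t = (uint64_t)a + b;             /* widen to keep the carry */
            return (uint32_t)t + (uint32_t)(t >> 32); /* end-around carry */
    }

csum_partial() on x86-64 accumulates with this kind of carry-wrapping add,
which is why its result can be folded with csum_fold() afterwards.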
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 00000000000..a460fa088d4
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "cmpxchg_32.h"
3#else
4# include "cmpxchg_64.h"
5#endif
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
new file mode 100644
index 00000000000..82ceb788a98
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -0,0 +1,344 @@
1#ifndef _ASM_X86_CMPXCHG_32_H
2#define _ASM_X86_CMPXCHG_32_H
3
4#include <linux/bitops.h> /* for LOCK_PREFIX */
5
6/*
7 * Note: if you use set64_bit(), __cmpxchg64(), or their variants,
8 * you need to test for the feature in boot_cpu_data.
9 */
10
11#define xchg(ptr, v) \
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
13
14struct __xchg_dummy {
15 unsigned long a[100];
16};
17#define __xg(x) ((struct __xchg_dummy *)(x))
18
19/*
20 * The semantics of CMPXCHG8B are a bit strange; this is why
21 * there is a loop and the loading of %%eax and %%edx has to
22 * be inside. This inlines well in most cases, the cached
23 * cost is around ~38 cycles. (in the future we might want
24 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
25 * might have an implicit FPU-save as a cost, so it's not
26 * clear which path to take.)
27 *
28 * cmpxchg8b must be used with the lock prefix here to allow
29 * the instruction to be executed atomically, see page 3-102
30 * of the instruction set reference 24319102.pdf. We need
31 * the reader side to see the coherent 64bit value.
32 */
33static inline void __set_64bit(unsigned long long *ptr,
34 unsigned int low, unsigned int high)
35{
36 asm volatile("\n1:\t"
37 "movl (%0), %%eax\n\t"
38 "movl 4(%0), %%edx\n\t"
39 LOCK_PREFIX "cmpxchg8b (%0)\n\t"
40 "jnz 1b"
41 : /* no outputs */
42 : "D"(ptr),
43 "b"(low),
44 "c"(high)
45 : "ax", "dx", "memory");
46}
47
48static inline void __set_64bit_constant(unsigned long long *ptr,
49 unsigned long long value)
50{
51 __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
52}
53
54#define ll_low(x) *(((unsigned int *)&(x)) + 0)
55#define ll_high(x) *(((unsigned int *)&(x)) + 1)
56
57static inline void __set_64bit_var(unsigned long long *ptr,
58 unsigned long long value)
59{
60 __set_64bit(ptr, ll_low(value), ll_high(value));
61}
62
63#define set_64bit(ptr, value) \
64 (__builtin_constant_p((value)) \
65 ? __set_64bit_constant((ptr), (value)) \
66 : __set_64bit_var((ptr), (value)))
67
68#define _set_64bit(ptr, value) \
69 (__builtin_constant_p(value) \
70 ? __set_64bit(ptr, (unsigned int)(value), \
71 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73
74/*
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has side effect, so that attribute volatile is necessary,
77 * but generally the primitive is invalid, *ptr is output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104
105/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD.
109 */
110
111#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
115 (unsigned long)(n), \
116 sizeof(*(ptr))))
117#define sync_cmpxchg(ptr, o, n) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
119 (unsigned long)(n), \
120 sizeof(*(ptr))))
121#define cmpxchg_local(ptr, o, n) \
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif
126
127#ifdef CONFIG_X86_CMPXCHG64
128#define cmpxchg64(ptr, o, n) \
129 ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
130 (unsigned long long)(n)))
131#define cmpxchg64_local(ptr, o, n) \
132 ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
133 (unsigned long long)(n)))
134#endif
135
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old,
226 unsigned long long new)
227{
228 unsigned long long prev;
229 asm volatile(LOCK_PREFIX "cmpxchg8b %3"
230 : "=A"(prev)
231 : "b"((unsigned long)new),
232 "c"((unsigned long)(new >> 32)),
233 "m"(*__xg(ptr)),
234 "0"(old)
235 : "memory");
236 return prev;
237}
238
239static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
240 unsigned long long old,
241 unsigned long long new)
242{
243 unsigned long long prev;
244 asm volatile("cmpxchg8b %3"
245 : "=A"(prev)
246 : "b"((unsigned long)new),
247 "c"((unsigned long)(new >> 32)),
248 "m"(*__xg(ptr)),
249 "0"(old)
250 : "memory");
251 return prev;
252}
253
254#ifndef CONFIG_X86_CMPXCHG
255/*
256 * Building a kernel capable of running on an 80386. It may be necessary to
257 * simulate the cmpxchg on the 80386 CPU. For that purpose we define
258 * a function for each of the sizes we support.
259 */
260
261extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
262extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
263extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
264
265static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
266 unsigned long new, int size)
267{
268 switch (size) {
269 case 1:
270 return cmpxchg_386_u8(ptr, old, new);
271 case 2:
272 return cmpxchg_386_u16(ptr, old, new);
273 case 4:
274 return cmpxchg_386_u32(ptr, old, new);
275 }
276 return old;
277}
278
279#define cmpxchg(ptr, o, n) \
280({ \
281 __typeof__(*(ptr)) __ret; \
282 if (likely(boot_cpu_data.x86 > 3)) \
283 __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \
284 (unsigned long)(o), (unsigned long)(n), \
285 sizeof(*(ptr))); \
286 else \
287 __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
288 (unsigned long)(o), (unsigned long)(n), \
289 sizeof(*(ptr))); \
290 __ret; \
291})
292#define cmpxchg_local(ptr, o, n) \
293({ \
294 __typeof__(*(ptr)) __ret; \
295 if (likely(boot_cpu_data.x86 > 3)) \
296 __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \
297 (unsigned long)(o), (unsigned long)(n), \
298 sizeof(*(ptr))); \
299 else \
300 __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
301 (unsigned long)(o), (unsigned long)(n), \
302 sizeof(*(ptr))); \
303 __ret; \
304})
305#endif
306
307#ifndef CONFIG_X86_CMPXCHG64
308/*
309 * Building a kernel capable of running on an 80386 or 80486. It may be necessary
310 * to simulate the cmpxchg8b on the 80386 and 80486 CPU.
311 */
312
313extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
314
315#define cmpxchg64(ptr, o, n) \
316({ \
317 __typeof__(*(ptr)) __ret; \
318 if (likely(boot_cpu_data.x86 > 4)) \
319 __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
320 (unsigned long long)(o), \
321 (unsigned long long)(n)); \
322 else \
323 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
324 (unsigned long long)(o), \
325 (unsigned long long)(n)); \
326 __ret; \
327})
328#define cmpxchg64_local(ptr, o, n) \
329({ \
330 __typeof__(*(ptr)) __ret; \
331 if (likely(boot_cpu_data.x86 > 4)) \
332 __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
333 (unsigned long long)(o), \
334 (unsigned long long)(n)); \
335 else \
336 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
337 (unsigned long long)(o), \
338 (unsigned long long)(n)); \
339 __ret; \
340})
341
342#endif
343
344#endif /* _ASM_X86_CMPXCHG_32_H */
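Typical use of the cmpxchg() defined above is a compare-and-swap retry
loop. A hypothetical kernel-style sketch (the shared counter and its
callers are invented for illustration):

    static inline void cas_add(unsigned long *counter, unsigned long n)
    {
            unsigned long old, new;

            do {
                    old = *counter;         /* snapshot the current value */
                    new = old + n;          /* compute the desired value */
                    /* retry if another CPU modified *counter meanwhile */
            } while (cmpxchg(counter, old, new) != old);
    }

cmpxchg() returns the value that was in memory at the moment of the
exchange, so the loop exits exactly when our snapshot was still current.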
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
new file mode 100644
index 00000000000..52de72e0de8
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -0,0 +1,185 @@
1#ifndef _ASM_X86_CMPXCHG_64_H
2#define _ASM_X86_CMPXCHG_64_H
3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5
6#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
7 (ptr), sizeof(*(ptr))))
8
9#define __xg(x) ((volatile long *)(x))
10
11static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
12{
13 *ptr = val;
14}
15
16#define _set_64bit set_64bit
17
18/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
20 * Note 2: xchg has side effect, so that attribute volatile is necessary,
21 * but generally the primitive is invalid, *ptr is output argument. --ANK
22 */
23static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
24 int size)
25{
26 switch (size) {
27 case 1:
28 asm volatile("xchgb %b0,%1"
29 : "=q" (x)
30 : "m" (*__xg(ptr)), "0" (x)
31 : "memory");
32 break;
33 case 2:
34 asm volatile("xchgw %w0,%1"
35 : "=r" (x)
36 : "m" (*__xg(ptr)), "0" (x)
37 : "memory");
38 break;
39 case 4:
40 asm volatile("xchgl %k0,%1"
41 : "=r" (x)
42 : "m" (*__xg(ptr)), "0" (x)
43 : "memory");
44 break;
45 case 8:
46 asm volatile("xchgq %0,%1"
47 : "=r" (x)
48 : "m" (*__xg(ptr)), "0" (x)
49 : "memory");
50 break;
51 }
52 return x;
53}
54
55/*
56 * Atomic compare and exchange. Compare OLD with MEM, if identical,
57 * store NEW in MEM. Return the initial value in MEM. Success is
58 * indicated by comparing RETURN with OLD.
59 */
60
61#define __HAVE_ARCH_CMPXCHG 1
62
63static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
64 unsigned long new, int size)
65{
66 unsigned long prev;
67 switch (size) {
68 case 1:
69 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
70 : "=a"(prev)
71 : "q"(new), "m"(*__xg(ptr)), "0"(old)
72 : "memory");
73 return prev;
74 case 2:
75 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
76 : "=a"(prev)
77 : "r"(new), "m"(*__xg(ptr)), "0"(old)
78 : "memory");
79 return prev;
80 case 4:
81 asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
82 : "=a"(prev)
83 : "r"(new), "m"(*__xg(ptr)), "0"(old)
84 : "memory");
85 return prev;
86 case 8:
87 asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
88 : "=a"(prev)
89 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90 : "memory");
91 return prev;
92 }
93 return old;
94}
95
96/*
97 * Always use locked operations when touching memory shared with a
98 * hypervisor, since the system may be SMP even if the guest kernel
99 * isn't.
100 */
101static inline unsigned long __sync_cmpxchg(volatile void *ptr,
102 unsigned long old,
103 unsigned long new, int size)
104{
105 unsigned long prev;
106 switch (size) {
107 case 1:
108 asm volatile("lock; cmpxchgb %b1,%2"
109 : "=a"(prev)
110 : "q"(new), "m"(*__xg(ptr)), "0"(old)
111 : "memory");
112 return prev;
113 case 2:
114 asm volatile("lock; cmpxchgw %w1,%2"
115 : "=a"(prev)
116 : "r"(new), "m"(*__xg(ptr)), "0"(old)
117 : "memory");
118 return prev;
119 case 4:
120 asm volatile("lock; cmpxchgl %1,%2"
121 : "=a"(prev)
122 : "r"(new), "m"(*__xg(ptr)), "0"(old)
123 : "memory");
124 return prev;
125 }
126 return old;
127}
128
129static inline unsigned long __cmpxchg_local(volatile void *ptr,
130 unsigned long old,
131 unsigned long new, int size)
132{
133 unsigned long prev;
134 switch (size) {
135 case 1:
136 asm volatile("cmpxchgb %b1,%2"
137 : "=a"(prev)
138 : "q"(new), "m"(*__xg(ptr)), "0"(old)
139 : "memory");
140 return prev;
141 case 2:
142 asm volatile("cmpxchgw %w1,%2"
143 : "=a"(prev)
144 : "r"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 4:
148 asm volatile("cmpxchgl %k1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 8:
154 asm volatile("cmpxchgq %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163#define cmpxchg(ptr, o, n) \
164 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
165 (unsigned long)(n), sizeof(*(ptr))))
166#define cmpxchg64(ptr, o, n) \
167({ \
168 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
169 cmpxchg((ptr), (o), (n)); \
170})
171#define cmpxchg_local(ptr, o, n) \
172 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
173 (unsigned long)(n), \
174 sizeof(*(ptr))))
175#define sync_cmpxchg(ptr, o, n) \
176 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
177 (unsigned long)(n), \
178 sizeof(*(ptr))))
179#define cmpxchg64_local(ptr, o, n) \
180({ \
181 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
182 cmpxchg_local((ptr), (o), (n)); \
183})
184
185#endif /* _ASM_X86_CMPXCHG_64_H */
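The xchg() above is always locked, which is enough to build a classic
test-and-set lock. A hypothetical sketch built only on the primitive
defined here (real kernels use the arch spinlock implementation):

    static inline void tas_lock(volatile unsigned long *lock)
    {
            while (xchg(lock, 1UL) != 0UL)
                    ;       /* spin until we observe the lock free */
    }

    static inline void tas_unlock(volatile unsigned long *lock)
    {
            *lock = 0UL;    /* plain store; real unlock code also wants
                               a compiler barrier before this */
    }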
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
new file mode 100644
index 00000000000..9a9c7bdc923
--- /dev/null
+++ b/arch/x86/include/asm/compat.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_COMPAT_H
2#define _ASM_X86_COMPAT_H
3
4/*
5 * Architecture specific compatibility types
6 */
7#include <linux/types.h>
8#include <linux/sched.h>
9#include <asm/user32.h>
10
11#define COMPAT_USER_HZ 100
12
13typedef u32 compat_size_t;
14typedef s32 compat_ssize_t;
15typedef s32 compat_time_t;
16typedef s32 compat_clock_t;
17typedef s32 compat_pid_t;
18typedef u16 __compat_uid_t;
19typedef u16 __compat_gid_t;
20typedef u32 __compat_uid32_t;
21typedef u32 __compat_gid32_t;
22typedef u16 compat_mode_t;
23typedef u32 compat_ino_t;
24typedef u16 compat_dev_t;
25typedef s32 compat_off_t;
26typedef s64 compat_loff_t;
27typedef u16 compat_nlink_t;
28typedef u16 compat_ipc_pid_t;
29typedef s32 compat_daddr_t;
30typedef u32 compat_caddr_t;
31typedef __kernel_fsid_t compat_fsid_t;
32typedef s32 compat_timer_t;
33typedef s32 compat_key_t;
34
35typedef s32 compat_int_t;
36typedef s32 compat_long_t;
37typedef s64 __attribute__((aligned(4))) compat_s64;
38typedef u32 compat_uint_t;
39typedef u32 compat_ulong_t;
40typedef u64 __attribute__((aligned(4))) compat_u64;
41
42struct compat_timespec {
43 compat_time_t tv_sec;
44 s32 tv_nsec;
45};
46
47struct compat_timeval {
48 compat_time_t tv_sec;
49 s32 tv_usec;
50};
51
52struct compat_stat {
53 compat_dev_t st_dev;
54 u16 __pad1;
55 compat_ino_t st_ino;
56 compat_mode_t st_mode;
57 compat_nlink_t st_nlink;
58 __compat_uid_t st_uid;
59 __compat_gid_t st_gid;
60 compat_dev_t st_rdev;
61 u16 __pad2;
62 u32 st_size;
63 u32 st_blksize;
64 u32 st_blocks;
65 u32 st_atime;
66 u32 st_atime_nsec;
67 u32 st_mtime;
68 u32 st_mtime_nsec;
69 u32 st_ctime;
70 u32 st_ctime_nsec;
71 u32 __unused4;
72 u32 __unused5;
73};
74
75struct compat_flock {
76 short l_type;
77 short l_whence;
78 compat_off_t l_start;
79 compat_off_t l_len;
80 compat_pid_t l_pid;
81};
82
83#define F_GETLK64 12 /* using 'struct flock64' */
84#define F_SETLK64 13
85#define F_SETLKW64 14
86
87/*
88 * IA32 uses 4 byte alignment for 64 bit quantities,
89 * so we need to pack this structure.
90 */
91struct compat_flock64 {
92 short l_type;
93 short l_whence;
94 compat_loff_t l_start;
95 compat_loff_t l_len;
96 compat_pid_t l_pid;
97} __attribute__((packed));
98
99struct compat_statfs {
100 int f_type;
101 int f_bsize;
102 int f_blocks;
103 int f_bfree;
104 int f_bavail;
105 int f_files;
106 int f_ffree;
107 compat_fsid_t f_fsid;
108 int f_namelen; /* SunOS ignores this field. */
109 int f_frsize;
110 int f_spare[5];
111};
112
113#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
114#define COMPAT_RLIM_INFINITY 0xffffffff
115
116typedef u32 compat_old_sigset_t; /* at least 32 bits */
117
118#define _COMPAT_NSIG 64
119#define _COMPAT_NSIG_BPW 32
120
121typedef u32 compat_sigset_word;
122
123#define COMPAT_OFF_T_MAX 0x7fffffff
124#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
125
126struct compat_ipc64_perm {
127 compat_key_t key;
128 __compat_uid32_t uid;
129 __compat_gid32_t gid;
130 __compat_uid32_t cuid;
131 __compat_gid32_t cgid;
132 unsigned short mode;
133 unsigned short __pad1;
134 unsigned short seq;
135 unsigned short __pad2;
136 compat_ulong_t unused1;
137 compat_ulong_t unused2;
138};
139
140struct compat_semid64_ds {
141 struct compat_ipc64_perm sem_perm;
142 compat_time_t sem_otime;
143 compat_ulong_t __unused1;
144 compat_time_t sem_ctime;
145 compat_ulong_t __unused2;
146 compat_ulong_t sem_nsems;
147 compat_ulong_t __unused3;
148 compat_ulong_t __unused4;
149};
150
151struct compat_msqid64_ds {
152 struct compat_ipc64_perm msg_perm;
153 compat_time_t msg_stime;
154 compat_ulong_t __unused1;
155 compat_time_t msg_rtime;
156 compat_ulong_t __unused2;
157 compat_time_t msg_ctime;
158 compat_ulong_t __unused3;
159 compat_ulong_t msg_cbytes;
160 compat_ulong_t msg_qnum;
161 compat_ulong_t msg_qbytes;
162 compat_pid_t msg_lspid;
163 compat_pid_t msg_lrpid;
164 compat_ulong_t __unused4;
165 compat_ulong_t __unused5;
166};
167
168struct compat_shmid64_ds {
169 struct compat_ipc64_perm shm_perm;
170 compat_size_t shm_segsz;
171 compat_time_t shm_atime;
172 compat_ulong_t __unused1;
173 compat_time_t shm_dtime;
174 compat_ulong_t __unused2;
175 compat_time_t shm_ctime;
176 compat_ulong_t __unused3;
177 compat_pid_t shm_cpid;
178 compat_pid_t shm_lpid;
179 compat_ulong_t shm_nattch;
180 compat_ulong_t __unused4;
181 compat_ulong_t __unused5;
182};
183
184/*
185 * The type of struct elf_prstatus.pr_reg in compatible core dumps.
186 */
187typedef struct user_regs_struct32 compat_elf_gregset_t;
188
189/*
190 * A pointer passed in from user mode. This should not
191 * be used for syscall parameters, just declare them
192 * as pointers because the syscall entry code will have
193 * appropriately converted them already.
194 */
195typedef u32 compat_uptr_t;
196
197static inline void __user *compat_ptr(compat_uptr_t uptr)
198{
199 return (void __user *)(unsigned long)uptr;
200}
201
202static inline compat_uptr_t ptr_to_compat(void __user *uptr)
203{
204 return (u32)(unsigned long)uptr;
205}
206
207static inline void __user *compat_alloc_user_space(long len)
208{
209 struct pt_regs *regs = task_pt_regs(current);
210 return (void __user *)regs->sp - len;
211}
212
213static inline int is_compat_task(void)
214{
215 return current_thread_info()->status & TS_COMPAT;
216}
217
218#endif /* _ASM_X86_COMPAT_H */
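The compat_ptr()/ptr_to_compat() pair above is how 32-bit user pointers
cross the boundary: they travel as plain u32 values and are zero-extended
before any uaccess call. A hypothetical compat handler, for illustration
only:

    static int sketch_compat_read(compat_uptr_t uptr, compat_size_t len)
    {
            void __user *p = compat_ptr(uptr);      /* widen to 64 bits */

            if (!access_ok(VERIFY_READ, p, len))
                    return -EFAULT;
            /* ... copy_from_user(kbuf, p, len) etc. ... */
            return 0;
    }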
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
new file mode 100644
index 00000000000..bae482df603
--- /dev/null
+++ b/arch/x86/include/asm/cpu.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_CPU_H
2#define _ASM_X86_CPU_H
3
4#include <linux/device.h>
5#include <linux/cpu.h>
6#include <linux/topology.h>
7#include <linux/nodemask.h>
8#include <linux/percpu.h>
9
10struct x86_cpu {
11 struct cpu cpu;
12};
13
14#ifdef CONFIG_HOTPLUG_CPU
15extern int arch_register_cpu(int num);
16extern void arch_unregister_cpu(int);
17#endif
18
19DECLARE_PER_CPU(int, cpu_state);
20#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
new file mode 100644
index 00000000000..f73e95d75b4
--- /dev/null
+++ b/arch/x86/include/asm/cpufeature.h
@@ -0,0 +1,271 @@
1/*
2 * Defines x86 CPU feature bits
3 */
4#ifndef _ASM_X86_CPUFEATURE_H
5#define _ASM_X86_CPUFEATURE_H
6
7#include <asm/required-features.h>
8
9#define NCAPINTS 9 /* N 32-bit words worth of info */
10
11/*
12 * Note: If the comment begins with a quoted string, that string is used
13 * in /proc/cpuinfo instead of the macro name. If the string is "",
14 * this feature bit is not displayed in /proc/cpuinfo at all.
15 */
16
17/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
18#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
19#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
20#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
21#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
29#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
30#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
31#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
32#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
33 /* (plus FCMOVcc, FCOMI with FPU) */
34#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
35#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
36#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
37#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */
38#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
39#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
40#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
41#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
42#define X86_FEATURE_XMM (0*32+25) /* "sse" */
43#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
44#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
45#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
46#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
47#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
48#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
49
50/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
51/* Don't duplicate feature flags which are redundant with Intel! */
52#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
53#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
54#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
55#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
56#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
57#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
58#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
59#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
60#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
61#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
62
63/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
64#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
65#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
66#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
67
68/* Other features, Linux-defined mapping, word 3 */
69/* This range is used for feature bits which conflict or are synthesized */
70#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
71#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
72#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
73#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
74/* cpu types for specific tunings: */
75#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
76#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
77#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
78#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
79#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
84#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
85#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
86#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
87#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
88#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
89#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
90#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
91#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
92#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
93#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
94#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
95
96/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
97#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
98#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
99#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
100#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
101#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
102#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
103#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
104#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
105#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
106#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
107#define X86_FEATURE_CID (4*32+10) /* Context ID */
108#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
109#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
110#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
111#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
112#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
113#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
114#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
115#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
116#define X86_FEATURE_AES (4*32+25) /* AES instructions */
117#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
118#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
119#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
120
121/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
122#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
123#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
124#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
125#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
126#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
127#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
128#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
129#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
130#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
131#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
132
133/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
134#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
135#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
136#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
137#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
138#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
139#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
140#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
141#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
142#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
143#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
144#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
145#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
146#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
147#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
148
149/*
150 * Auxiliary flags: Linux defined - For features scattered in various
151 * CPUID levels like 0x6, 0xA etc
152 */
153#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
154
155/* Virtualization flags: Linux defined */
156#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
157#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
158#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
159#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
160#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
161
162#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
163
164#include <linux/bitops.h>
165
166extern const char * const x86_cap_flags[NCAPINTS*32];
167extern const char * const x86_power_flags[32];
168
169#define test_cpu_cap(c, bit) \
170 test_bit(bit, (unsigned long *)((c)->x86_capability))
171
172#define cpu_has(c, bit) \
173 (__builtin_constant_p(bit) && \
174 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
175 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
176 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
177 (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
178 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
179 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
180 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
181 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
182 ? 1 : \
183 test_cpu_cap(c, bit))
184
185#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
186
187#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
188#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
189#define setup_clear_cpu_cap(bit) do { \
190 clear_cpu_cap(&boot_cpu_data, bit); \
191 set_bit(bit, (unsigned long *)cleared_cpu_caps); \
192} while (0)
193#define setup_force_cpu_cap(bit) do { \
194 set_cpu_cap(&boot_cpu_data, bit); \
195 clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
196} while (0)
197
198#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
199#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
200#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
201#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
202#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
203#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
204#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
205#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
206#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
207#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
208#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
209#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
210#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
211#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
212#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
213#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
214#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
215#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
216#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
217#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
218#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
219#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
220#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
221#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
222#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
223#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
224#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
225#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
226#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
227#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
228#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
229#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
230#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
231#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
232#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
233#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
234#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
235#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
236#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
237#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
238#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
239#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
240
241#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
242# define cpu_has_invlpg 1
243#else
244# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
245#endif
246
247#ifdef CONFIG_X86_64
248
249#undef cpu_has_vme
250#define cpu_has_vme 0
251
252#undef cpu_has_pae
253#define cpu_has_pae ___BUG___
254
255#undef cpu_has_mp
256#define cpu_has_mp 1
257
258#undef cpu_has_k6_mtrr
259#define cpu_has_k6_mtrr 0
260
261#undef cpu_has_cyrix_arr
262#define cpu_has_cyrix_arr 0
263
264#undef cpu_has_centaur_mcr
265#define cpu_has_centaur_mcr 0
266
267#endif /* CONFIG_X86_64 */
268
269#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
270
271#endif /* _ASM_X86_CPUFEATURE_H */
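The point of the cpu_has() construction above: for any feature bit that is
in the build's REQUIRED_MASKn, the whole test constant-folds to 1 and the
else-branch is discarded at compile time; only genuinely optional features
fall through to the runtime test_cpu_cap() bit test. A hypothetical caller
(use_sse2_path()/use_generic_path() are invented names):

    static void sketch_init(void)
    {
            if (cpu_has_xmm2)               /* folds to 1 on x86-64, where
                                               SSE2 is architectural */
                    use_sse2_path();
            else
                    use_generic_path();
    }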
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
new file mode 100644
index 00000000000..6d68ad7e0ea
--- /dev/null
+++ b/arch/x86/include/asm/cputime.h
@@ -0,0 +1 @@
#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
new file mode 100644
index 00000000000..0930b4f8d67
--- /dev/null
+++ b/arch/x86/include/asm/current.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_CURRENT_H
2#define _ASM_X86_CURRENT_H
3
4#ifdef CONFIG_X86_32
5#include <linux/compiler.h>
6#include <asm/percpu.h>
7
8struct task_struct;
9
10DECLARE_PER_CPU(struct task_struct *, current_task);
11static __always_inline struct task_struct *get_current(void)
12{
13 return x86_read_percpu(current_task);
14}
15
16#else /* X86_32 */
17
18#ifndef __ASSEMBLY__
19#include <asm/pda.h>
20
21struct task_struct;
22
23static __always_inline struct task_struct *get_current(void)
24{
25 return read_pda(pcurrent);
26}
27
28#else /* __ASSEMBLY__ */
29
30#include <asm/asm-offsets.h>
31#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
32
33#endif /* __ASSEMBLY__ */
34
35#endif /* X86_32 */
36
37#define current get_current()
38
39#endif /* _ASM_X86_CURRENT_H */
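Both variants above make 'current' a cheap per-CPU (or PDA) load rather
than the old stack-pointer masking trick, so it can be used freely in hot
paths. A minimal illustrative sketch:

    static void sketch_log_caller(void)
    {
            printk(KERN_DEBUG "%s: pid %d (%s)\n",
                   __func__, current->pid, current->comm);
    }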
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
new file mode 100644
index 00000000000..3ea6f37be9e
--- /dev/null
+++ b/arch/x86/include/asm/debugreg.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_DEBUGREG_H
2#define _ASM_X86_DEBUGREG_H
3
4
5/* Indicate the register numbers for a number of the specific
6 debug registers. Registers 0-3 contain the addresses we wish to trap on */
7#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
8#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
9
10#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
11#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
12
13/* Define a few things for the status register. We can use this to determine
14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */
16
17#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */
20#define DR_TRAP3 (0x8) /* db3 */
21
22#define DR_STEP (0x4000) /* single-step */
23#define DR_SWITCH (0x8000) /* task switch */
24
25/* Now define a bunch of things for manipulating the control register.
26 The top two bytes of the control register consist of 4 fields of 4
27 bits - each field corresponds to one of the four debug registers,
28 and indicates what types of access we trap on, and how large the data
29 field is that we are looking at */
30
31#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
32#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
33
34#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
35#define DR_RW_WRITE (0x1)
36#define DR_RW_READ (0x3)
37
38#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
39#define DR_LEN_2 (0x4)
40#define DR_LEN_4 (0xC)
41#define DR_LEN_8 (0x8)
42
43/* The low byte of the control register determines which registers are
44 enabled. There are 4 fields of two bits. One bit is "local", meaning
45 that the processor will reset the bit after a task switch and the other
46 is global meaning that we have to explicitly reset the bit. With linux,
47 you can use either one, since we explicitly zero the register when we enter
48 kernel mode. */
49
50#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
51#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
52#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
53
54#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
55#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
56
57/* The second byte of the control register has a few special things.
58 We can slow the instruction pipeline for instructions coming via the
59 gdt or the ldt if we want to. I am not sure why this is an advantage */
60
61#ifdef __i386__
62#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
63#else
64#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
65#endif
66
67#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
68#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
69
70#endif /* _ASM_X86_DEBUGREG_H */
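Putting the masks above together: to arm debug register 0 as a local,
4-byte write watchpoint, the DR7 image is built from the type/len nibble
at bit 16 and the enable bit in the low byte. A sketch that only computes
the value (actually loading DR7 is omitted; illustration only):

    static unsigned long sketch_dr7_write_watch(void)
    {
            unsigned long dr7 = 0;

            /* type/len nibble for DR0 sits at bits 16-19 */
            dr7 |= (DR_RW_WRITE | DR_LEN_4)
                    << (DR_CONTROL_SHIFT + 0 * DR_CONTROL_SIZE);
            /* local-enable bit for DR0 sits at bit 0 */
            dr7 |= 1UL << (DR_LOCAL_ENABLE_SHIFT + 0 * DR_ENABLE_SIZE);
            return dr7;
    }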
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
new file mode 100644
index 00000000000..409a649204a
--- /dev/null
+++ b/arch/x86/include/asm/delay.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_DELAY_H
2#define _ASM_X86_DELAY_H
3
4/*
5 * Copyright (C) 1993 Linus Torvalds
6 *
7 * Delay routines calling functions in arch/x86/lib/delay.c
8 */
9
10/* Undefined functions to get compile-time errors */
11extern void __bad_udelay(void);
12extern void __bad_ndelay(void);
13
14extern void __udelay(unsigned long usecs);
15extern void __ndelay(unsigned long nsecs);
16extern void __const_udelay(unsigned long xloops);
17extern void __delay(unsigned long loops);
18
19/* 0x10c7 is 2**32 / 1000000 (rounded up) */
20#define udelay(n) (__builtin_constant_p(n) ? \
21 ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
22 __udelay(n))
23
24/* 0x5 is 2**32 / 1000000000 (rounded up) */
25#define ndelay(n) (__builtin_constant_p(n) ? \
26 ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
27 __ndelay(n))
28
29void use_tsc_delay(void);
30
31#endif /* _ASM_X86_DELAY_H */
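How the udelay() macro above resolves: a constant argument becomes a
__const_udelay() call computed at compile time, an oversized constant
becomes a link failure through the deliberately undefined __bad_udelay(),
and a variable argument goes to __udelay(). A short illustrative sketch:

    static void sketch_delays(unsigned int n)
    {
            udelay(10);     /* constant: __const_udelay(10 * 0x10c7ul) */
            udelay(n);      /* variable: __udelay(n) */
            /* udelay(30000) would not link: __bad_udelay() is undefined */
    }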
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
new file mode 100644
index 00000000000..e6b82b17b07
--- /dev/null
+++ b/arch/x86/include/asm/desc.h
@@ -0,0 +1,409 @@
1#ifndef _ASM_X86_DESC_H
2#define _ASM_X86_DESC_H
3
4#ifndef __ASSEMBLY__
5#include <asm/desc_defs.h>
6#include <asm/ldt.h>
7#include <asm/mmu.h>
8#include <linux/smp.h>
9
10static inline void fill_ldt(struct desc_struct *desc,
11 const struct user_desc *info)
12{
13 desc->limit0 = info->limit & 0x0ffff;
14 desc->base0 = info->base_addr & 0x0000ffff;
15
16 desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
17 desc->type = (info->read_exec_only ^ 1) << 1;
18 desc->type |= info->contents << 2;
19 desc->s = 1;
20 desc->dpl = 0x3;
21 desc->p = info->seg_not_present ^ 1;
22 desc->limit = (info->limit & 0xf0000) >> 16;
23 desc->avl = info->useable;
24 desc->d = info->seg_32bit;
25 desc->g = info->limit_in_pages;
26 desc->base2 = (info->base_addr & 0xff000000) >> 24;
27 /*
28 * Don't allow setting of the lm bit. It is useless anyway
29 * because 64bit system calls require __USER_CS:
30 */
31 desc->l = 0;
32}
33
34extern struct desc_ptr idt_descr;
35extern gate_desc idt_table[];
36
37struct gdt_page {
38 struct desc_struct gdt[GDT_ENTRIES];
39} __attribute__((aligned(PAGE_SIZE)));
40DECLARE_PER_CPU(struct gdt_page, gdt_page);
41
42static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
43{
44 return per_cpu(gdt_page, cpu).gdt;
45}
46
47#ifdef CONFIG_X86_64
48
49static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
50 unsigned dpl, unsigned ist, unsigned seg)
51{
52 gate->offset_low = PTR_LOW(func);
53 gate->segment = __KERNEL_CS;
54 gate->ist = ist;
55 gate->p = 1;
56 gate->dpl = dpl;
57 gate->zero0 = 0;
58 gate->zero1 = 0;
59 gate->type = type;
60 gate->offset_middle = PTR_MIDDLE(func);
61 gate->offset_high = PTR_HIGH(func);
62}
63
64#else
65static inline void pack_gate(gate_desc *gate, unsigned char type,
66 unsigned long base, unsigned dpl, unsigned flags,
67 unsigned short seg)
68{
69 gate->a = (seg << 16) | (base & 0xffff);
70 gate->b = (base & 0xffff0000) |
71 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
72}
73
74#endif
75
76static inline int desc_empty(const void *ptr)
77{
78 const u32 *desc = ptr;
79 return !(desc[0] | desc[1]);
80}
81
82#ifdef CONFIG_PARAVIRT
83#include <asm/paravirt.h>
84#else
85#define load_TR_desc() native_load_tr_desc()
86#define load_gdt(dtr) native_load_gdt(dtr)
87#define load_idt(dtr) native_load_idt(dtr)
88#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
89#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
90
91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95
96#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt
98
99#define write_ldt_entry(dt, entry, desc) \
100 native_write_ldt_entry(dt, entry, desc)
101#define write_gdt_entry(dt, entry, desc, type) \
102 native_write_gdt_entry(dt, entry, desc, type)
103#define write_idt_entry(dt, entry, g) \
104 native_write_idt_entry(dt, entry, g)
105
106static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
107{
108}
109
110static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
111{
112}
113#endif /* CONFIG_PARAVIRT */
114
115static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate)
117{
118 memcpy(&idt[entry], gate, sizeof(*gate));
119}
120
121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
122 const void *desc)
123{
124 memcpy(&ldt[entry], desc, 8);
125}
126
127static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
128 const void *desc, int type)
129{
130 unsigned int size;
131 switch (type) {
132 case DESC_TSS:
133 size = sizeof(tss_desc);
134 break;
135 case DESC_LDT:
136 size = sizeof(ldt_desc);
137 break;
138 default:
139 size = sizeof(struct desc_struct);
140 break;
141 }
142 memcpy(&gdt[entry], desc, size);
143}
144
145static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
146 unsigned long limit, unsigned char type,
147 unsigned char flags)
148{
149 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
150 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
151 (limit & 0x000f0000) | ((type & 0xff) << 8) |
152 ((flags & 0xf) << 20);
153 desc->p = 1;
154}
155
156
157static inline void set_tssldt_descriptor(void *d, unsigned long addr,
158 unsigned type, unsigned size)
159{
160#ifdef CONFIG_X86_64
161 struct ldttss_desc64 *desc = d;
162 memset(desc, 0, sizeof(*desc));
163 desc->limit0 = size & 0xFFFF;
164 desc->base0 = PTR_LOW(addr);
165 desc->base1 = PTR_MIDDLE(addr) & 0xFF;
166 desc->type = type;
167 desc->p = 1;
168 desc->limit1 = (size >> 16) & 0xF;
169 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
170 desc->base3 = PTR_HIGH(addr);
171#else
172 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
173#endif
174}
175
176static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
177{
178 struct desc_struct *d = get_cpu_gdt_table(cpu);
179 tss_desc tss;
180
181 /*
182 * sizeof(unsigned long) coming from an extra "long" at the end
183 * of the iobitmap. See tss_struct definition in processor.h
184 *
185 * -1? seg base+limit should be pointing to the address of the
186 * last valid byte
187 */
188 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
189 IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
190 sizeof(unsigned long) - 1);
191 write_gdt_entry(d, entry, &tss, DESC_TSS);
192}
193
194#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
195
196static inline void native_set_ldt(const void *addr, unsigned int entries)
197{
198 if (likely(entries == 0))
199 asm volatile("lldt %w0"::"q" (0));
200 else {
201 unsigned cpu = smp_processor_id();
202 ldt_desc ldt;
203
204 set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
205 entries * LDT_ENTRY_SIZE - 1);
206 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
207 &ldt, DESC_LDT);
208 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
209 }
210}
211
212static inline void native_load_tr_desc(void)
213{
214 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
215}
216
217static inline void native_load_gdt(const struct desc_ptr *dtr)
218{
219 asm volatile("lgdt %0"::"m" (*dtr));
220}
221
222static inline void native_load_idt(const struct desc_ptr *dtr)
223{
224 asm volatile("lidt %0"::"m" (*dtr));
225}
226
227static inline void native_store_gdt(struct desc_ptr *dtr)
228{
229 asm volatile("sgdt %0":"=m" (*dtr));
230}
231
232static inline void native_store_idt(struct desc_ptr *dtr)
233{
234 asm volatile("sidt %0":"=m" (*dtr));
235}
236
237static inline unsigned long native_store_tr(void)
238{
239 unsigned long tr;
240 asm volatile("str %0":"=r" (tr));
241 return tr;
242}
243
244static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
245{
246 unsigned int i;
247 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
248
249 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
250 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
251}
252
253#define _LDT_empty(info) \
254 ((info)->base_addr == 0 && \
255 (info)->limit == 0 && \
256 (info)->contents == 0 && \
257 (info)->read_exec_only == 1 && \
258 (info)->seg_32bit == 0 && \
259 (info)->limit_in_pages == 0 && \
260 (info)->seg_not_present == 1 && \
261 (info)->useable == 0)
262
263#ifdef CONFIG_X86_64
264#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
265#else
266#define LDT_empty(info) (_LDT_empty(info))
267#endif
268
269static inline void clear_LDT(void)
270{
271 set_ldt(NULL, 0);
272}
273
274/*
275 * load one particular LDT into the current CPU
276 */
277static inline void load_LDT_nolock(mm_context_t *pc)
278{
279 set_ldt(pc->ldt, pc->size);
280}
281
282static inline void load_LDT(mm_context_t *pc)
283{
284 preempt_disable();
285 load_LDT_nolock(pc);
286 preempt_enable();
287}
288
289static inline unsigned long get_desc_base(const struct desc_struct *desc)
290{
291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
292}
293
294static inline unsigned long get_desc_limit(const struct desc_struct *desc)
295{
296 return desc->limit0 | (desc->limit << 16);
297}
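
/*
 * Worked example (editor's note): the three base fields simply glue
 * back together -- base0 = 0x0000, base1 = 0x10, base2 = 0x00 yields
 * a linear base of 0x00100000; likewise limit0 = 0xffff combined with
 * limit = 0xf gives 0xfffff.
 */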
298
299static inline void _set_gate(int gate, unsigned type, void *addr,
300 unsigned dpl, unsigned ist, unsigned seg)
301{
302 gate_desc s;
303 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
304 /*
305 * does not need to be atomic because it is only done once at
306 * setup time
307 */
308 write_idt_entry(idt_table, gate, &s);
309}
310
311/*
312 * This needs to use 'idt_table' rather than 'idt', and
313 * thus use the _nonmapped_ version of the IDT, as the
314 * Pentium F0 0F bugfix may have resulted in the mapped
315 * IDT being write-protected.
316 */
317static inline void set_intr_gate(unsigned int n, void *addr)
318{
319 BUG_ON((unsigned)n > 0xFF);
320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
321}
322
323#define SYS_VECTOR_FREE 0
324#define SYS_VECTOR_ALLOCED 1
325
326extern int first_system_vector;
327extern char system_vectors[];
328
329static inline void alloc_system_vector(int vector)
330{
331 if (system_vectors[vector] == SYS_VECTOR_FREE) {
332 system_vectors[vector] = SYS_VECTOR_ALLOCED;
333 if (first_system_vector > vector)
334 first_system_vector = vector;
335 } else
336 BUG();
337}
338
339static inline void alloc_intr_gate(unsigned int n, void *addr)
340{
341 alloc_system_vector(n);
342 set_intr_gate(n, addr);
343}
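
/*
 * Typical use (editor's sketch): wire a system vector to its handler
 * at init time, e.g.
 *
 *	alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
 */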
344
345/*
346 * This routine sets up an interrupt gate at descriptor privilege level 3.
347 */
348static inline void set_system_intr_gate(unsigned int n, void *addr)
349{
350 BUG_ON((unsigned)n > 0xFF);
351 _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
352}
353
354static inline void set_system_trap_gate(unsigned int n, void *addr)
355{
356 BUG_ON((unsigned)n > 0xFF);
357 _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
358}
359
360static inline void set_trap_gate(unsigned int n, void *addr)
361{
362 BUG_ON((unsigned)n > 0xFF);
363 _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
364}
365
366static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
367{
368 BUG_ON((unsigned)n > 0xFF);
369 _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
370}
371
372static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
373{
374 BUG_ON((unsigned)n > 0xFF);
375 _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
376}
377
378static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
379{
380 BUG_ON((unsigned)n > 0xFF);
381 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
382}
383
384#else
385/*
386 * GET_DESC_BASE reads the descriptor base of the specified segment.
387 *
388 * Args:
389 * idx - descriptor index
390 * gdt - GDT pointer
391 * base - 32bit register to which the base will be written
392 * lo_w - lo word of the "base" register
393 * lo_b - lo byte of the "base" register
394 * hi_b - hi byte of the low word of the "base" register
395 *
396 * Example:
397 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
398 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
399 */
400#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
401 movb idx * 8 + 4(gdt), lo_b; \
402 movb idx * 8 + 7(gdt), hi_b; \
403 shll $16, base; \
404 movw idx * 8 + 2(gdt), lo_w;
405
406
407#endif /* __ASSEMBLY__ */
408
409#endif /* _ASM_X86_DESC_H */
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
new file mode 100644
index 00000000000..a6adefa28b9
--- /dev/null
+++ b/arch/x86/include/asm/desc_defs.h
@@ -0,0 +1,95 @@
1/* Written 2000 by Andi Kleen */
2#ifndef _ASM_X86_DESC_DEFS_H
3#define _ASM_X86_DESC_DEFS_H
4
5/*
6 * Segment descriptor structure definitions, usable from both x86_64 and i386
7 * archs.
8 */
9
10#ifndef __ASSEMBLY__
11
12#include <linux/types.h>
13
14/*
15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b accessors, and doing it this way allows us
18 * to convert incrementally. We keep the signature as a struct, rather
19 * than a union, so we can get rid of it transparently in the future -- glommer
20 */
21/* 8 byte segment descriptor */
22struct desc_struct {
23 union {
24 struct {
25 unsigned int a;
26 unsigned int b;
27 };
28 struct {
29 u16 limit0;
30 u16 base0;
31 unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
32 unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
33 };
34 };
35} __attribute__((packed));
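
/*
 * Worked example (editor's note): the classic flat 4 GB code segment
 * 0x00cf9a000000ffff reads as a = 0x0000ffff, b = 0x00cf9a00 through
 * the first view; through the second view that is limit0 = 0xffff,
 * base0 = base1 = base2 = 0, type = 0xa, s = 1, dpl = 0, p = 1,
 * limit = 0xf, avl = 0, l = 0, d = 1, g = 1.
 */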
36
37enum {
38 GATE_INTERRUPT = 0xE,
39 GATE_TRAP = 0xF,
40 GATE_CALL = 0xC,
41 GATE_TASK = 0x5,
42};
43
44/* 16byte gate */
45struct gate_struct64 {
46 u16 offset_low;
47 u16 segment;
48 unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
49 u16 offset_middle;
50 u32 offset_high;
51 u32 zero1;
52} __attribute__((packed));
53
54#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
55#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
56#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
57
58enum {
59 DESC_TSS = 0x9,
60 DESC_LDT = 0x2,
61 DESCTYPE_S = 0x10, /* !system */
62};
63
64/* LDT or TSS descriptor in the GDT. 16 bytes. */
65struct ldttss_desc64 {
66 u16 limit0;
67 u16 base0;
68 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
69 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
70 u32 base3;
71 u32 zero1;
72} __attribute__((packed));
73
74#ifdef CONFIG_X86_64
75typedef struct gate_struct64 gate_desc;
76typedef struct ldttss_desc64 ldt_desc;
77typedef struct ldttss_desc64 tss_desc;
78#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
79#define gate_segment(g) ((g).segment)
80#else
81typedef struct desc_struct gate_desc;
82typedef struct desc_struct ldt_desc;
83typedef struct desc_struct tss_desc;
84#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
85#define gate_segment(g) ((g).a >> 16)
86#endif
87
88struct desc_ptr {
89 unsigned short size;
90 unsigned long address;
91} __attribute__((packed)) ;
92
93#endif /* !__ASSEMBLY__ */
94
95#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
new file mode 100644
index 00000000000..3c034f48fdb
--- /dev/null
+++ b/arch/x86/include/asm/device.h
@@ -0,0 +1,16 @@
1#ifndef _ASM_X86_DEVICE_H
2#define _ASM_X86_DEVICE_H
3
4struct dev_archdata {
5#ifdef CONFIG_ACPI
6 void *acpi_handle;
7#endif
8#ifdef CONFIG_X86_64
9	struct dma_mapping_ops *dma_ops;
10#endif
11#ifdef CONFIG_DMAR
12 void *iommu; /* hook for IOMMU specific extension */
13#endif
14};
15
16#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
new file mode 100644
index 00000000000..9a2d644c08e
--- /dev/null
+++ b/arch/x86/include/asm/div64.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_DIV64_H
2#define _ASM_X86_DIV64_H
3
4#ifdef CONFIG_X86_32
5
6#include <linux/types.h>
7
8/*
9 * do_div() is NOT a C function. It wants to return
10 * two values (the quotient and the remainder), but
11 * since that doesn't work very well in C, what it
12 * does is:
13 *
14 * - modifies the 64-bit dividend _in_place_
15 * - returns the 32-bit remainder
16 *
17 * This ends up being the most efficient "calling
18 * convention" on x86.
19 */
20#define do_div(n, base) \
21({ \
22 unsigned long __upper, __low, __high, __mod, __base; \
23 __base = (base); \
24 asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
25 __upper = __high; \
26 if (__high) { \
27 __upper = __high % (__base); \
28 __high = __high / (__base); \
29 } \
30 asm("divl %2":"=a" (__low), "=d" (__mod) \
31 : "rm" (__base), "0" (__low), "1" (__upper)); \
32 asm("":"=A" (n) : "a" (__low), "d" (__high)); \
33 __mod; \
34})
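
/*
 * Usage sketch (editor's illustration): split a nanosecond count into
 * whole seconds and a remainder. do_div() needs an lvalue, since the
 * quotient replaces the dividend in place:
 *
 *	u64 ns = 1000000123ULL;
 *	u32 rem = do_div(ns, 1000000000);
 *	(ns is now 1, rem is 123)
 */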
35
36static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
37{
38 union {
39 u64 v64;
40 u32 v32[2];
41 } d = { dividend };
42 u32 upper;
43
44 upper = d.v32[1];
45 d.v32[1] = 0;
46 if (upper >= divisor) {
47 d.v32[1] = upper / divisor;
48 upper %= divisor;
49 }
50 asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
51 "rm" (divisor), "0" (d.v32[0]), "1" (upper));
52 return d.v64;
53}
54#define div_u64_rem div_u64_rem
55
56#else
57# include <asm-generic/div64.h>
58#endif /* CONFIG_X86_32 */
59
60#endif /* _ASM_X86_DIV64_H */
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
new file mode 100644
index 00000000000..4a5397bfce2
--- /dev/null
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -0,0 +1,308 @@
1#ifndef _ASM_X86_DMA_MAPPING_H
2#define _ASM_X86_DMA_MAPPING_H
3
4/*
5 * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
6 * documentation.
7 */
8
9#include <linux/scatterlist.h>
10#include <asm/io.h>
11#include <asm/swiotlb.h>
12#include <asm-generic/dma-coherent.h>
13
14extern dma_addr_t bad_dma_address;
15extern int iommu_merge;
16extern struct device x86_dma_fallback_dev;
17extern int panic_on_overflow;
18
19struct dma_mapping_ops {
20 int (*mapping_error)(struct device *dev,
21 dma_addr_t dma_addr);
22 void* (*alloc_coherent)(struct device *dev, size_t size,
23 dma_addr_t *dma_handle, gfp_t gfp);
24 void (*free_coherent)(struct device *dev, size_t size,
25 void *vaddr, dma_addr_t dma_handle);
26 dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr,
27 size_t size, int direction);
28 void (*unmap_single)(struct device *dev, dma_addr_t addr,
29 size_t size, int direction);
30 void (*sync_single_for_cpu)(struct device *hwdev,
31 dma_addr_t dma_handle, size_t size,
32 int direction);
33 void (*sync_single_for_device)(struct device *hwdev,
34 dma_addr_t dma_handle, size_t size,
35 int direction);
36 void (*sync_single_range_for_cpu)(struct device *hwdev,
37 dma_addr_t dma_handle, unsigned long offset,
38 size_t size, int direction);
39 void (*sync_single_range_for_device)(struct device *hwdev,
40 dma_addr_t dma_handle, unsigned long offset,
41 size_t size, int direction);
42 void (*sync_sg_for_cpu)(struct device *hwdev,
43 struct scatterlist *sg, int nelems,
44 int direction);
45 void (*sync_sg_for_device)(struct device *hwdev,
46 struct scatterlist *sg, int nelems,
47 int direction);
48 int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
49 int nents, int direction);
50 void (*unmap_sg)(struct device *hwdev,
51 struct scatterlist *sg, int nents,
52 int direction);
53 int (*dma_supported)(struct device *hwdev, u64 mask);
54 int is_phys;
55};
56
57extern struct dma_mapping_ops *dma_ops;
58
59static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
60{
61#ifdef CONFIG_X86_32
62 return dma_ops;
63#else
64 if (unlikely(!dev) || !dev->archdata.dma_ops)
65 return dma_ops;
66 else
67 return dev->archdata.dma_ops;
68#endif /* CONFIG_X86_32 */
69}
70
71/* Make sure we keep the same behaviour */
72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
73{
74#ifdef CONFIG_X86_32
75 return 0;
76#else
77 struct dma_mapping_ops *ops = get_dma_ops(dev);
78 if (ops->mapping_error)
79 return ops->mapping_error(dev, dma_addr);
80
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
86#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
87#define dma_is_consistent(d, h) (1)
88
89extern int dma_supported(struct device *hwdev, u64 mask);
90extern int dma_set_mask(struct device *dev, u64 mask);
91
92extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
93 dma_addr_t *dma_addr, gfp_t flag);
94
95static inline dma_addr_t
96dma_map_single(struct device *hwdev, void *ptr, size_t size,
97 int direction)
98{
99 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
100
101 BUG_ON(!valid_dma_direction(direction));
102 return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
103}
104
105static inline void
106dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
107 int direction)
108{
109 struct dma_mapping_ops *ops = get_dma_ops(dev);
110
111 BUG_ON(!valid_dma_direction(direction));
112 if (ops->unmap_single)
113 ops->unmap_single(dev, addr, size, direction);
114}
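
/*
 * Typical driver round trip (editor's sketch; "dev", "buf" and "len"
 * are hypothetical, DMA_TO_DEVICE comes from <linux/dma-mapping.h>):
 *
 *	dma_addr_t bus = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, bus))
 *		return -EIO;
 *	... program the device with "bus", wait for completion ...
 *	dma_unmap_single(dev, bus, len, DMA_TO_DEVICE);
 */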
115
116static inline int
117dma_map_sg(struct device *hwdev, struct scatterlist *sg,
118 int nents, int direction)
119{
120 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
121
122 BUG_ON(!valid_dma_direction(direction));
123 return ops->map_sg(hwdev, sg, nents, direction);
124}
125
126static inline void
127dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
128 int direction)
129{
130 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
131
132 BUG_ON(!valid_dma_direction(direction));
133 if (ops->unmap_sg)
134 ops->unmap_sg(hwdev, sg, nents, direction);
135}
136
137static inline void
138dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
139 size_t size, int direction)
140{
141 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
142
143 BUG_ON(!valid_dma_direction(direction));
144 if (ops->sync_single_for_cpu)
145 ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
146 flush_write_buffers();
147}
148
149static inline void
150dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
151 size_t size, int direction)
152{
153 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
154
155 BUG_ON(!valid_dma_direction(direction));
156 if (ops->sync_single_for_device)
157 ops->sync_single_for_device(hwdev, dma_handle, size, direction);
158 flush_write_buffers();
159}
160
161static inline void
162dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
163 unsigned long offset, size_t size, int direction)
164{
165 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
166
167 BUG_ON(!valid_dma_direction(direction));
168 if (ops->sync_single_range_for_cpu)
169 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
170 size, direction);
171 flush_write_buffers();
172}
173
174static inline void
175dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
176 unsigned long offset, size_t size,
177 int direction)
178{
179 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
180
181 BUG_ON(!valid_dma_direction(direction));
182 if (ops->sync_single_range_for_device)
183 ops->sync_single_range_for_device(hwdev, dma_handle,
184 offset, size, direction);
185 flush_write_buffers();
186}
187
188static inline void
189dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
190 int nelems, int direction)
191{
192 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
193
194 BUG_ON(!valid_dma_direction(direction));
195 if (ops->sync_sg_for_cpu)
196 ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
197 flush_write_buffers();
198}
199
200static inline void
201dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
202 int nelems, int direction)
203{
204 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
205
206 BUG_ON(!valid_dma_direction(direction));
207 if (ops->sync_sg_for_device)
208 ops->sync_sg_for_device(hwdev, sg, nelems, direction);
209
210 flush_write_buffers();
211}
212
213static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
214 size_t offset, size_t size,
215 int direction)
216{
217 struct dma_mapping_ops *ops = get_dma_ops(dev);
218
219 BUG_ON(!valid_dma_direction(direction));
220 return ops->map_single(dev, page_to_phys(page) + offset,
221 size, direction);
222}
223
224static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
225 size_t size, int direction)
226{
227 dma_unmap_single(dev, addr, size, direction);
228}
229
230static inline void
231dma_cache_sync(struct device *dev, void *vaddr, size_t size,
232 enum dma_data_direction dir)
233{
234 flush_write_buffers();
235}
236
237static inline int dma_get_cache_alignment(void)
238{
239 /* no easy way to get cache size on all x86, so return the
240 * maximum possible, to be safe */
241 return boot_cpu_data.x86_clflush_size;
242}
243
244static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
245 gfp_t gfp)
246{
247	unsigned long dma_mask;
248
249 dma_mask = dev->coherent_dma_mask;
250 if (!dma_mask)
251 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
252
253 return dma_mask;
254}
255
256static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
257{
258#ifdef CONFIG_X86_64
259 unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp);
260
261 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA))
262 gfp |= GFP_DMA32;
263#endif
264 return gfp;
265}
266
267static inline void *
268dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
269 gfp_t gfp)
270{
271 struct dma_mapping_ops *ops = get_dma_ops(dev);
272 void *memory;
273
274 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
275
276 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
277 return memory;
278
279 if (!dev) {
280 dev = &x86_dma_fallback_dev;
281 gfp |= GFP_DMA;
282 }
283
284 if (!is_device_dma_capable(dev))
285 return NULL;
286
287 if (!ops->alloc_coherent)
288 return NULL;
289
290 return ops->alloc_coherent(dev, size, dma_handle,
291 dma_alloc_coherent_gfp_flags(dev, gfp));
292}
293
294static inline void dma_free_coherent(struct device *dev, size_t size,
295 void *vaddr, dma_addr_t bus)
296{
297 struct dma_mapping_ops *ops = get_dma_ops(dev);
298
299 WARN_ON(irqs_disabled()); /* for portability */
300
301 if (dma_release_from_coherent(dev, get_order(size), vaddr))
302 return;
303
304 if (ops->free_coherent)
305 ops->free_coherent(dev, size, vaddr, bus);
306}
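
/*
 * Usage sketch (editor's illustration; "dev" and "size" are
 * hypothetical): a matched coherent allocation and release:
 *
 *	dma_addr_t bus;
 *	void *cpu = dma_alloc_coherent(dev, size, &bus, GFP_KERNEL);
 *	if (!cpu)
 *		return -ENOMEM;
 *	...
 *	dma_free_coherent(dev, size, cpu, bus);
 */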
307
308#endif
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
new file mode 100644
index 00000000000..ca1098a7e58
--- /dev/null
+++ b/arch/x86/include/asm/dma.h
@@ -0,0 +1,318 @@
1/*
2 * linux/include/asm/dma.h: Defines for using and allocating dma channels.
3 * Written by Hennus Bergman, 1992.
4 * High DMA channel support & info by Hannu Savolainen
5 * and John Boyd, Nov. 1992.
6 */
7
8#ifndef _ASM_X86_DMA_H
9#define _ASM_X86_DMA_H
10
11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14
15#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
16#define dma_outb outb_p
17#else
18#define dma_outb outb
19#endif
20
21#define dma_inb inb
22
23/*
24 * NOTES about DMA transfers:
25 *
26 * controller 1: channels 0-3, byte operations, ports 00-1F
27 * controller 2: channels 4-7, word operations, ports C0-DF
28 *
29 * - ALL registers are 8 bits only, regardless of transfer size
30 * - channel 4 is not used - cascades 1 into 2.
31 * - channels 0-3 are byte - addresses/counts are for physical bytes
32 * - channels 5-7 are word - addresses/counts are for physical words
33 * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
34 * - transfer count loaded to registers is 1 less than actual count
35 * - controller 2 offsets are all even (2x offsets for controller 1)
36 * - page registers for 5-7 don't use data bit 0, represent 128K pages
37 * - page registers for 0-3 use bit 0, represent 64K pages
38 *
39 * DMA transfers are limited to the lower 16MB of _physical_ memory.
40 * Note that addresses loaded into registers must be _physical_ addresses,
41 * not logical addresses (which may differ if paging is active).
42 *
43 * Address mapping for channels 0-3:
44 *
45 * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
46 * | ... | | ... | | ... |
47 * | ... | | ... | | ... |
48 * | ... | | ... | | ... |
49 * P7 ... P0 A7 ... A0 A7 ... A0
50 * | Page | Addr MSB | Addr LSB | (DMA registers)
51 *
52 * Address mapping for channels 5-7:
53 *
54 * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
55 * | ... | \ \ ... \ \ \ ... \ \
56 * | ... | \ \ ... \ \ \ ... \ (not used)
57 * | ... | \ \ ... \ \ \ ... \
58 * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
59 * | Page | Addr MSB | Addr LSB | (DMA registers)
60 *
61 * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
62 * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
63 * the hardware level, so odd-byte transfers aren't possible).
64 *
65 * Transfer count (_not # bytes_) is limited to 64K, represented as actual
66 * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
67 * and up to 128K bytes may be transferred on channels 5-7 in one operation.
68 *
69 */
70
71#define MAX_DMA_CHANNELS 8
72
73#ifdef CONFIG_X86_32
74
75/* The maximum address that we can perform a DMA transfer to on this platform */
76#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
77
78#else
79
80/* 16MB ISA DMA zone */
81#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
82
83/* 4GB broken PCI/AGP hardware bus master zone */
84#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
85
86/* Compat define for old dma zone */
87#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
88
89#endif
90
91/* 8237 DMA controllers */
92#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
93#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
94
95/* DMA controller registers */
96#define DMA1_CMD_REG 0x08 /* command register (w) */
97#define DMA1_STAT_REG 0x08 /* status register (r) */
98#define DMA1_REQ_REG 0x09 /* request register (w) */
99#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
100#define DMA1_MODE_REG 0x0B /* mode register (w) */
101#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
102#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
103#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
104#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
105#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
106
107#define DMA2_CMD_REG 0xD0 /* command register (w) */
108#define DMA2_STAT_REG 0xD0 /* status register (r) */
109#define DMA2_REQ_REG 0xD2 /* request register (w) */
110#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
111#define DMA2_MODE_REG 0xD6 /* mode register (w) */
112#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
113#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
114#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
115#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
116#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
117
118#define DMA_ADDR_0 0x00 /* DMA address registers */
119#define DMA_ADDR_1 0x02
120#define DMA_ADDR_2 0x04
121#define DMA_ADDR_3 0x06
122#define DMA_ADDR_4 0xC0
123#define DMA_ADDR_5 0xC4
124#define DMA_ADDR_6 0xC8
125#define DMA_ADDR_7 0xCC
126
127#define DMA_CNT_0 0x01 /* DMA count registers */
128#define DMA_CNT_1 0x03
129#define DMA_CNT_2 0x05
130#define DMA_CNT_3 0x07
131#define DMA_CNT_4 0xC2
132#define DMA_CNT_5 0xC6
133#define DMA_CNT_6 0xCA
134#define DMA_CNT_7 0xCE
135
136#define DMA_PAGE_0 0x87 /* DMA page registers */
137#define DMA_PAGE_1 0x83
138#define DMA_PAGE_2 0x81
139#define DMA_PAGE_3 0x82
140#define DMA_PAGE_5 0x8B
141#define DMA_PAGE_6 0x89
142#define DMA_PAGE_7 0x8A
143
144/* I/O to memory, no autoinit, increment, single mode */
145#define DMA_MODE_READ 0x44
146/* memory to I/O, no autoinit, increment, single mode */
147#define DMA_MODE_WRITE 0x48
148/* pass thru DREQ->HRQ, DACK<-HLDA only */
149#define DMA_MODE_CASCADE 0xC0
150
151#define DMA_AUTOINIT 0x10
152
153
154extern spinlock_t dma_spin_lock;
155
156static inline unsigned long claim_dma_lock(void)
157{
158 unsigned long flags;
159 spin_lock_irqsave(&dma_spin_lock, flags);
160 return flags;
161}
162
163static inline void release_dma_lock(unsigned long flags)
164{
165 spin_unlock_irqrestore(&dma_spin_lock, flags);
166}
167
168/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr)
170{
171 if (dmanr <= 3)
172 dma_outb(dmanr, DMA1_MASK_REG);
173 else
174 dma_outb(dmanr & 3, DMA2_MASK_REG);
175}
176
177static inline void disable_dma(unsigned int dmanr)
178{
179 if (dmanr <= 3)
180 dma_outb(dmanr | 4, DMA1_MASK_REG);
181 else
182 dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
183}
184
185/* Clear the 'DMA Pointer Flip Flop'.
186 * Write 0 for LSB/MSB, 1 for MSB/LSB access.
187 * Use this once to initialize the FF to a known state.
188 * After that, keep track of it. :-)
189 * --- In order to do that, the DMA routines below should ---
190 * --- only be used while holding the DMA lock ! ---
191 */
192static inline void clear_dma_ff(unsigned int dmanr)
193{
194 if (dmanr <= 3)
195 dma_outb(0, DMA1_CLEAR_FF_REG);
196 else
197 dma_outb(0, DMA2_CLEAR_FF_REG);
198}
199
200/* set mode (above) for a specific DMA channel */
201static inline void set_dma_mode(unsigned int dmanr, char mode)
202{
203 if (dmanr <= 3)
204 dma_outb(mode | dmanr, DMA1_MODE_REG);
205 else
206 dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
207}
208
209/* Set only the page register bits of the transfer address.
210 * This is used for successive transfers when we know the contents of
211 * the lower 16 bits of the DMA current address register, but a 64k boundary
212 * may have been crossed.
213 */
214static inline void set_dma_page(unsigned int dmanr, char pagenr)
215{
216 switch (dmanr) {
217 case 0:
218 dma_outb(pagenr, DMA_PAGE_0);
219 break;
220 case 1:
221 dma_outb(pagenr, DMA_PAGE_1);
222 break;
223 case 2:
224 dma_outb(pagenr, DMA_PAGE_2);
225 break;
226 case 3:
227 dma_outb(pagenr, DMA_PAGE_3);
228 break;
229 case 5:
230 dma_outb(pagenr & 0xfe, DMA_PAGE_5);
231 break;
232 case 6:
233 dma_outb(pagenr & 0xfe, DMA_PAGE_6);
234 break;
235 case 7:
236 dma_outb(pagenr & 0xfe, DMA_PAGE_7);
237 break;
238 }
239}
240
241
242/* Set transfer address & page bits for specific DMA channel.
243 * Assumes dma flipflop is clear.
244 */
245static inline void set_dma_addr(unsigned int dmanr, unsigned int a)
246{
247 set_dma_page(dmanr, a>>16);
248 if (dmanr <= 3) {
249 dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
250 dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
251 } else {
252 dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
253 dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
254 }
255}
256
257
258/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
259 * a specific DMA channel.
260 * You must ensure the parameters are valid.
261 * NOTE: from a manual: "the number of transfers is one more
262 * than the initial word count"! This is taken into account.
263 * Assumes dma flip-flop is clear.
264 * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
265 */
266static inline void set_dma_count(unsigned int dmanr, unsigned int count)
267{
268 count--;
269 if (dmanr <= 3) {
270 dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
271 dma_outb((count >> 8) & 0xff,
272 ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
273 } else {
274 dma_outb((count >> 1) & 0xff,
275 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
276 dma_outb((count >> 9) & 0xff,
277 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
278 }
279}
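
/*
 * The canonical programming sequence (editor's sketch; see
 * Documentation/DMA-ISA-LPC.txt) combines the helpers above under
 * the DMA lock:
 *
 *	unsigned long flags = claim_dma_lock();
 *	disable_dma(chan);
 *	clear_dma_ff(chan);
 *	set_dma_mode(chan, DMA_MODE_READ);
 *	set_dma_addr(chan, phys);
 *	set_dma_count(chan, len);
 *	enable_dma(chan);
 *	release_dma_lock(flags);
 */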
280
281
282/* Get DMA residue count. After a DMA transfer, this
283 * should return zero. Reading this while a DMA transfer is
284 * still in progress will return unpredictable results.
285 * If called before the channel has been used, it may return 1.
286 * Otherwise, it returns the number of _bytes_ left to transfer.
287 *
288 * Assumes DMA flip-flop is clear.
289 */
290static inline int get_dma_residue(unsigned int dmanr)
291{
292 unsigned int io_port;
293 /* using short to get 16-bit wrap around */
294 unsigned short count;
295
296 io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
297 : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
298
299 count = 1 + dma_inb(io_port);
300 count += dma_inb(io_port) << 8;
301
302 return (dmanr <= 3) ? count : (count << 1);
303}
304
305
306/* These are in kernel/dma.c: */
307extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr);
309
310/* From PCI */
311
312#ifdef CONFIG_PCI
313extern int isa_dma_bridge_buggy;
314#else
315#define isa_dma_bridge_buggy (0)
316#endif
317
318#endif /* _ASM_X86_DMA_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
new file mode 100644
index 00000000000..bc68212c6bc
--- /dev/null
+++ b/arch/x86/include/asm/dmi.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_DMI_H
2#define _ASM_X86_DMI_H
3
4#include <asm/io.h>
5
6#define DMI_MAX_DATA 2048
7
8extern int dmi_alloc_index;
9extern char dmi_alloc_data[DMI_MAX_DATA];
10
11/* This is so early that there is no good way to allocate dynamic memory.
12 Allocate data in a BSS array. */
13static inline void *dmi_alloc(unsigned len)
14{
15 int idx = dmi_alloc_index;
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20}
21
22/* Use early IO mappings for DMI because it's initialized early */
23#define dmi_ioremap early_ioremap
24#define dmi_iounmap early_iounmap
25
26#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
new file mode 100644
index 00000000000..72c5a190bf4
--- /dev/null
+++ b/arch/x86/include/asm/ds.h
@@ -0,0 +1,238 @@
1/*
2 * Debug Store (DS) support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
17 *
18 *
19 * Copyright (C) 2007-2008 Intel Corporation.
20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
21 */
22
23#ifndef _ASM_X86_DS_H
24#define _ASM_X86_DS_H
25
26#ifdef CONFIG_X86_DS
27
28#include <linux/types.h>
29#include <linux/init.h>
30
31
32struct task_struct;
33
34/*
35 * Request BTS or PEBS
36 *
37 * Due to alignment constraints, the actual buffer may be slightly
38 * smaller than the requested or provided buffer.
39 *
40 * Returns 0 on success; -Eerrno otherwise
41 *
42 * task: the task to request recording for;
43 * NULL for per-cpu recording on the current cpu
44 * base: the base pointer for the (non-pageable) buffer;
45 * NULL if buffer allocation requested
46 * size: the size of the requested or provided buffer
47 * ovfl: pointer to a function to be called on buffer overflow;
48 * NULL if cyclic buffer requested
49 */
50typedef void (*ds_ovfl_callback_t)(struct task_struct *);
51extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
52 ds_ovfl_callback_t ovfl);
53extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
54 ds_ovfl_callback_t ovfl);
55
56/*
57 * Release BTS or PEBS resources
58 *
59 * Frees buffers allocated on ds_request.
60 *
61 * Returns 0 on success; -Eerrno otherwise
62 *
63 * task: the task to release resources for;
64 * NULL to release resources for the current cpu
65 */
66extern int ds_release_bts(struct task_struct *task);
67extern int ds_release_pebs(struct task_struct *task);
68
69/*
70 * Return the (array) index of the write pointer.
71 * (assuming an array of BTS/PEBS records)
72 *
73 * Returns -Eerrno on error
74 *
75 * task: the task to access;
76 * NULL to access the current cpu
77 * pos (out): if not NULL, will hold the result
78 */
79extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
80extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
81
82/*
83 * Return the (array) index one record beyond the end of the array.
84 * (assuming an array of BTS/PEBS records)
85 *
86 * Returns -Eerrno on error
87 *
88 * task: the task to access;
89 * NULL to access the current cpu
90 * pos (out): if not NULL, will hold the result
91 */
92extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
93extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
94
95/*
96 * Provide a pointer to the BTS/PEBS record at parameter index.
97 * (assuming an array of BTS/PEBS records)
98 *
99 * The pointer points directly into the buffer. The user is
100 * responsible for copying the record.
101 *
102 * Returns the size of a single record on success; -Eerrno on error
103 *
104 * task: the task to access;
105 * NULL to access the current cpu
106 * index: the index of the requested record
107 * record (out): pointer to the requested record
108 */
109extern int ds_access_bts(struct task_struct *task,
110 size_t index, const void **record);
111extern int ds_access_pebs(struct task_struct *task,
112 size_t index, const void **record);
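
/*
 * Illustrative BTS round trip (editor's sketch; SZ is a hypothetical
 * buffer size): request per-task tracing with a kernel-allocated
 * cyclic buffer, peek at the most recently written record, then
 * release the resources:
 *
 *	size_t idx;
 *	const void *rec;
 *	if (!ds_request_bts(task, NULL, SZ, NULL) &&
 *	    !ds_get_bts_index(task, &idx) && idx > 0)
 *		ds_access_bts(task, idx - 1, &rec);
 *	ds_release_bts(task);
 */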
113
114/*
115 * Write one or more BTS/PEBS records at the write pointer index and
116 * advance the write pointer.
117 *
118 * If size is not a multiple of the record size, trailing bytes are
119 * zeroed out.
120 *
121 * May result in one or more overflow notifications.
122 *
123 * If called during overflow handling, that is, with index >=
124 * interrupt threshold, the write will wrap around.
125 *
126 * An overflow notification is given if and when the interrupt
127 * threshold is reached during or after the write.
128 *
129 * Returns the number of bytes written or -Eerrno.
130 *
131 * task: the task to access;
132 * NULL to access the current cpu
133 * buffer: the buffer to write
134 * size: the size of the buffer
135 */
136extern int ds_write_bts(struct task_struct *task,
137 const void *buffer, size_t size);
138extern int ds_write_pebs(struct task_struct *task,
139 const void *buffer, size_t size);
140
141/*
142 * Same as ds_write_bts/pebs, but omit ownership checks.
143 *
144 * This is needed to have some other task than the owner of the
145 * BTS/PEBS buffer or the parameter task itself write into the
146 * respective buffer.
147 */
148extern int ds_unchecked_write_bts(struct task_struct *task,
149 const void *buffer, size_t size);
150extern int ds_unchecked_write_pebs(struct task_struct *task,
151 const void *buffer, size_t size);
152
153/*
154 * Reset the write pointer of the BTS/PEBS buffer.
155 *
156 * Returns 0 on success; -Eerrno on error
157 *
158 * task: the task to access;
159 * NULL to access the current cpu
160 */
161extern int ds_reset_bts(struct task_struct *task);
162extern int ds_reset_pebs(struct task_struct *task);
163
164/*
165 * Clear the BTS/PEBS buffer and reset the write pointer.
166 * The entire buffer will be zeroed out.
167 *
168 * Returns 0 on success; -Eerrno on error
169 *
170 * task: the task to access;
171 * NULL to access the current cpu
172 */
173extern int ds_clear_bts(struct task_struct *task);
174extern int ds_clear_pebs(struct task_struct *task);
175
176/*
177 * Provide the PEBS counter reset value.
178 *
179 * Returns 0 on success; -Eerrno on error
180 *
181 * task: the task to access;
182 * NULL to access the current cpu
183 * value (out): the counter reset value
184 */
185extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
186
187/*
188 * Set the PEBS counter reset value.
189 *
190 * Returns 0 on success; -Eerrno on error
191 *
192 * task: the task to access;
193 * NULL to access the current cpu
194 * value: the new counter reset value
195 */
196extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
197
198/*
199 * Initialization
200 */
201struct cpuinfo_x86;
202extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
203
204
205
206/*
207 * The DS context - part of struct thread_struct.
208 */
209struct ds_context {
210 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
211 unsigned char *ds;
212 /* the owner of the BTS and PEBS configuration, respectively */
213 struct task_struct *owner[2];
214 /* buffer overflow notification function for BTS and PEBS */
215 ds_ovfl_callback_t callback[2];
216 /* the original buffer address */
217 void *buffer[2];
218 /* the number of allocated pages for on-request allocated buffers */
219 unsigned int pages[2];
220 /* use count */
221 unsigned long count;
222 /* a pointer to the context location inside the thread_struct
223 * or the per_cpu context array */
224 struct ds_context **this;
225 /* a pointer to the task owning this context, or NULL, if the
226 * context is owned by a cpu */
227 struct task_struct *task;
228};
229
230/* called by exit_thread() to free leftover contexts */
231extern void ds_free(struct ds_context *context);
232
233#else /* CONFIG_X86_DS */
234
235#define ds_init_intel(config) do {} while (0)
236
237#endif /* CONFIG_X86_DS */
238#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 00000000000..804b6e6be92
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_DWARF2_H
2#define _ASM_X86_DWARF2_H
3
4#ifndef __ASSEMBLY__
5#warning "asm/dwarf2.h should only be included in pure assembly files"
6#endif
7
8/*
9 Macros for dwarf2 CFI unwind table entries.
10 See "as.info" for details on these pseudo ops. Unfortunately
11 they are only supported in very new binutils, so define them
12 away for older versions.
13 */
14
15#ifdef CONFIG_AS_CFI
16
17#define CFI_STARTPROC .cfi_startproc
18#define CFI_ENDPROC .cfi_endproc
19#define CFI_DEF_CFA .cfi_def_cfa
20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
23#define CFI_OFFSET .cfi_offset
24#define CFI_REL_OFFSET .cfi_rel_offset
25#define CFI_REGISTER .cfi_register
26#define CFI_RESTORE .cfi_restore
27#define CFI_REMEMBER_STATE .cfi_remember_state
28#define CFI_RESTORE_STATE .cfi_restore_state
29#define CFI_UNDEFINED .cfi_undefined
30
31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
32#define CFI_SIGNAL_FRAME .cfi_signal_frame
33#else
34#define CFI_SIGNAL_FRAME
35#endif
36
37#else
38
39/* Due to the structure of pre-existing code, don't use assembler line
40 comment character # to ignore the arguments. Instead, use a dummy macro. */
41.macro cfi_ignore a=0, b=0, c=0, d=0
42.endm
43
44#define CFI_STARTPROC cfi_ignore
45#define CFI_ENDPROC cfi_ignore
46#define CFI_DEF_CFA cfi_ignore
47#define CFI_DEF_CFA_REGISTER cfi_ignore
48#define CFI_DEF_CFA_OFFSET cfi_ignore
49#define CFI_ADJUST_CFA_OFFSET cfi_ignore
50#define CFI_OFFSET cfi_ignore
51#define CFI_REL_OFFSET cfi_ignore
52#define CFI_REGISTER cfi_ignore
53#define CFI_RESTORE cfi_ignore
54#define CFI_REMEMBER_STATE cfi_ignore
55#define CFI_RESTORE_STATE cfi_ignore
56#define CFI_UNDEFINED cfi_ignore
57#define CFI_SIGNAL_FRAME cfi_ignore
58
59#endif
60
61#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
new file mode 100644
index 00000000000..3d8ceddbd40
--- /dev/null
+++ b/arch/x86/include/asm/e820.h
@@ -0,0 +1,146 @@
1#ifndef _ASM_X86_E820_H
2#define _ASM_X86_E820_H
3#define E820MAP 0x2d0 /* our map */
4#define E820MAX 128 /* number of entries in E820MAP */
5
6/*
7 * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
8 * constrained space in the zeropage. If we have more nodes than
9 * that, and if we've booted off EFI firmware, then the EFI tables
10 * passed us from the EFI firmware can list more nodes. Size our
11 * internal memory map tables to have room for these additional
12 * nodes, based on up to three entries per node for which the
13 * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
14 * plus E820MAX, allowing space for the possible duplicate E820
15 * entries that might need room in the same arrays, prior to the
16 * call to sanitize_e820_map() to remove duplicates. The allowance
17 * of three memory map entries per node is "enough" entries for
18 * the initial hardware platform motivating this mechanism to make
19 * use of additional EFI map entries. Future platforms may want
20 * to allow more than three entries per node or otherwise refine
21 * this size.
22 */
23
24/*
25 * Odd: 'make headers_check' complains about numa.h if I try
26 * to collapse the next two #ifdef lines to a single line:
27 * #if defined(__KERNEL__) && defined(CONFIG_EFI)
28 */
29#ifdef __KERNEL__
30#ifdef CONFIG_EFI
31#include <linux/numa.h>
32#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
33#else /* ! CONFIG_EFI */
34#define E820_X_MAX E820MAX
35#endif
36#else /* ! __KERNEL__ */
37#define E820_X_MAX E820MAX
38#endif
39
40#define E820NR 0x1e8 /* # entries in E820MAP */
41
42#define E820_RAM 1
43#define E820_RESERVED 2
44#define E820_ACPI 3
45#define E820_NVS 4
46#define E820_UNUSABLE 5
47
48/* reserved RAM used by kernel itself */
49#define E820_RESERVED_KERN 128
50
51#ifndef __ASSEMBLY__
52struct e820entry {
53 __u64 addr; /* start of memory segment */
54 __u64 size; /* size of memory segment */
55 __u32 type; /* type of memory segment */
56} __attribute__((packed));
57
58struct e820map {
59 __u32 nr_map;
60 struct e820entry map[E820_X_MAX];
61};
62
63#ifdef __KERNEL__
64/* see comment in arch/x86/kernel/e820.c */
65extern struct e820map e820;
66extern struct e820map e820_saved;
67
68extern unsigned long pci_mem_start;
69extern int e820_any_mapped(u64 start, u64 end, unsigned type);
70extern int e820_all_mapped(u64 start, u64 end, unsigned type);
71extern void e820_add_region(u64 start, u64 size, int type);
72extern void e820_print_map(char *who);
73extern int
74sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
75extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
76 unsigned new_type);
77extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
78 int checktype);
79extern void update_e820(void);
80extern void e820_setup_gap(void);
81extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
82 unsigned long start_addr, unsigned long long end_addr);
83struct setup_data;
84extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
85
86#if defined(CONFIG_X86_64) || \
87 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
88extern void e820_mark_nosave_regions(unsigned long limit_pfn);
89#else
90static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
91{
92}
93#endif
94
95#ifdef CONFIG_MEMTEST
96extern void early_memtest(unsigned long start, unsigned long end);
97#else
98static inline void early_memtest(unsigned long start, unsigned long end)
99{
100}
101#endif
102
103extern unsigned long end_user_pfn;
104
105extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
106extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
107extern void reserve_early(u64 start, u64 end, char *name);
108extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
109extern void free_early(u64 start, u64 end);
110extern void early_res_to_bootmem(u64 start, u64 end);
111extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
112
113extern unsigned long e820_end_of_ram_pfn(void);
114extern unsigned long e820_end_of_low_ram_pfn(void);
115extern int e820_find_active_region(const struct e820entry *ei,
116 unsigned long start_pfn,
117 unsigned long last_pfn,
118 unsigned long *ei_startpfn,
119 unsigned long *ei_endpfn);
120extern void e820_register_active_regions(int nid, unsigned long start_pfn,
121 unsigned long end_pfn);
122extern u64 e820_hole_size(u64 start, u64 end);
123extern void finish_e820_parsing(void);
124extern void e820_reserve_resources(void);
125extern void e820_reserve_resources_late(void);
126extern void setup_memory_map(void);
127extern char *default_machine_specific_memory_setup(void);
128extern char *machine_specific_memory_setup(void);
129extern char *memory_setup(void);
130#endif /* __KERNEL__ */
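
/*
 * Example (editor's sketch) combining the helpers above: verify that
 * a candidate range is entirely usable RAM before claiming it; the
 * "MYDEV" label is hypothetical:
 *
 *	if (e820_all_mapped(start, start + size, E820_RAM))
 *		reserve_early(start, start + size, "MYDEV");
 */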
131#endif /* __ASSEMBLY__ */
132
133#define ISA_START_ADDRESS 0xa0000
134#define ISA_END_ADDRESS 0x100000
135#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
136
137#define BIOS_BEGIN 0x000a0000
138#define BIOS_END 0x00100000
139
140#ifdef __KERNEL__
141#include <linux/ioport.h>
142
143#define HIGH_MEMORY (1024*1024)
144#endif /* __KERNEL__ */
145
146#endif /* _ASM_X86_E820_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
new file mode 100644
index 00000000000..e9b57ecc70c
--- /dev/null
+++ b/arch/x86/include/asm/edac.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_EDAC_H
2#define _ASM_X86_EDAC_H
3
4/* ECC atomic, DMA, SMP and interrupt safe scrub function */
5
6static inline void atomic_scrub(void *va, u32 size)
7{
8 u32 i, *virt_addr = va;
9
10 /*
11 * "lock; addl $0" is an arithmetic no-op that forces an atomic
12 * read-modify-write of each word, so we are interrupt, DMA and SMP safe.
13 */
14 for (i = 0; i < size / 4; i++, virt_addr++)
15 asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
16}
17
18#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
new file mode 100644
index 00000000000..a2e545c91c3
--- /dev/null
+++ b/arch/x86/include/asm/efi.h
@@ -0,0 +1,110 @@
1#ifndef _ASM_X86_EFI_H
2#define _ASM_X86_EFI_H
3
4#ifdef CONFIG_X86_32
5
6extern unsigned long asmlinkage efi_call_phys(void *, ...);
7
8#define efi_call_phys0(f) efi_call_phys(f)
9#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
10#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
11#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
12#define efi_call_phys4(f, a1, a2, a3, a4) \
13 efi_call_phys(f, a1, a2, a3, a4)
14#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
15 efi_call_phys(f, a1, a2, a3, a4, a5)
16#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
17 efi_call_phys(f, a1, a2, a3, a4, a5, a6)
18/*
19 * Wrap all the virtual calls in a way that forces the parameters on the stack.
20 */
21
22#define efi_call_virt(f, args...) \
23 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
24
25#define efi_call_virt0(f) efi_call_virt(f)
26#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
27#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
28#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
29#define efi_call_virt4(f, a1, a2, a3, a4) \
30 efi_call_virt(f, a1, a2, a3, a4)
31#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
32 efi_call_virt(f, a1, a2, a3, a4, a5)
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size)
37
38#else /* !CONFIG_X86_32 */
39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
45extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
46extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
47extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
48 u64 arg4, u64 arg5);
49extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
50 u64 arg4, u64 arg5, u64 arg6);
51
52#define efi_call_phys0(f) \
53 efi_call0((void *)(f))
54#define efi_call_phys1(f, a1) \
55 efi_call1((void *)(f), (u64)(a1))
56#define efi_call_phys2(f, a1, a2) \
57 efi_call2((void *)(f), (u64)(a1), (u64)(a2))
58#define efi_call_phys3(f, a1, a2, a3) \
59 efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
60#define efi_call_phys4(f, a1, a2, a3, a4) \
61 efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
62 (u64)(a4))
63#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
64 efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
65 (u64)(a4), (u64)(a5))
66#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
67 efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
68 (u64)(a4), (u64)(a5), (u64)(a6))
69
70#define efi_call_virt0(f) \
71 efi_call0((void *)(efi.systab->runtime->f))
72#define efi_call_virt1(f, a1) \
73 efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
74#define efi_call_virt2(f, a1, a2) \
75 efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
76#define efi_call_virt3(f, a1, a2, a3) \
77 efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
78 (u64)(a3))
79#define efi_call_virt4(f, a1, a2, a3, a4) \
80 efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
81 (u64)(a3), (u64)(a4))
82#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
83 efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
84 (u64)(a3), (u64)(a4), (u64)(a5))
85#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
86 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
87 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
88
89extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
90
91#endif /* CONFIG_X86_32 */
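
/*
 * Example expansion (editor's sketch): the runtime get_time service
 * takes two pointer arguments, so a caller picks the arity-2 wrapper,
 * which resolves to efi.systab->runtime->get_time on either arch:
 *
 *	efi_status_t status = efi_call_virt2(get_time, tm, tc);
 */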
92
93extern void efi_reserve_early(void);
94extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void);
96
97#ifndef CONFIG_EFI
98/*
99 * If EFI is not configured, have the EFI calls return -ENOSYS.
100 */
101#define efi_call0(_f) (-ENOSYS)
102#define efi_call1(_f, _a1) (-ENOSYS)
103#define efi_call2(_f, _a1, _a2) (-ENOSYS)
104#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
105#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
106#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
107#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
108#endif /* CONFIG_EFI */
109
110#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
new file mode 100644
index 00000000000..40ca1bea791
--- /dev/null
+++ b/arch/x86/include/asm/elf.h
@@ -0,0 +1,336 @@
1#ifndef _ASM_X86_ELF_H
2#define _ASM_X86_ELF_H
3
4/*
5 * ELF register definitions..
6 */
7
8#include <asm/ptrace.h>
9#include <asm/user.h>
10#include <asm/auxvec.h>
11
12typedef unsigned long elf_greg_t;
13
14#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
15typedef elf_greg_t elf_gregset_t[ELF_NGREG];
16
17typedef struct user_i387_struct elf_fpregset_t;
18
19#ifdef __i386__
20
21typedef struct user_fxsr_struct elf_fpxregset_t;
22
23#define R_386_NONE 0
24#define R_386_32 1
25#define R_386_PC32 2
26#define R_386_GOT32 3
27#define R_386_PLT32 4
28#define R_386_COPY 5
29#define R_386_GLOB_DAT 6
30#define R_386_JMP_SLOT 7
31#define R_386_RELATIVE 8
32#define R_386_GOTOFF 9
33#define R_386_GOTPC 10
34#define R_386_NUM 11
35
36/*
37 * These are used to set parameters in the core dumps.
38 */
39#define ELF_CLASS ELFCLASS32
40#define ELF_DATA ELFDATA2LSB
41#define ELF_ARCH EM_386
42
43#else
44
45/* x86-64 relocation types */
46#define R_X86_64_NONE 0 /* No reloc */
47#define R_X86_64_64 1 /* Direct 64 bit */
48#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
49#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
50#define R_X86_64_PLT32 4 /* 32 bit PLT address */
51#define R_X86_64_COPY 5 /* Copy symbol at runtime */
52#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
53#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
54#define R_X86_64_RELATIVE 8 /* Adjust by program base */
55#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
56 offset to GOT */
57#define R_X86_64_32 10 /* Direct 32 bit zero extended */
58#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
59#define R_X86_64_16 12 /* Direct 16 bit zero extended */
60#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
61#define R_X86_64_8 14 /* Direct 8 bit sign extended */
62#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
63
64#define R_X86_64_NUM 16
65
66/*
67 * These are used to set parameters in the core dumps.
68 */
69#define ELF_CLASS ELFCLASS64
70#define ELF_DATA ELFDATA2LSB
71#define ELF_ARCH EM_X86_64
72
73#endif
74
75#include <asm/vdso.h>
76
77extern unsigned int vdso_enabled;
78
79/*
80 * This is used to ensure we don't load something for the wrong architecture.
81 */
82#define elf_check_arch_ia32(x) \
83 (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
84
85#include <asm/processor.h>
86#include <asm/system.h>
87
88#ifdef CONFIG_X86_32
89#include <asm/desc.h>
90
91#define elf_check_arch(x) elf_check_arch_ia32(x)
92
93/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
94 contains a pointer to a function which might be registered using `atexit'.
95 This provides a mean for the dynamic linker to call DT_FINI functions for
96 This provides a means for the dynamic linker to call DT_FINI functions for
97
98 A value of 0 tells us that there is no such handler.
99
100 We might as well make sure everything else is cleared too (except for %esp),
101 just to make things more deterministic.
102 */
103#define ELF_PLAT_INIT(_r, load_addr) \
104 do { \
105 _r->bx = 0; _r->cx = 0; _r->dx = 0; \
106 _r->si = 0; _r->di = 0; _r->bp = 0; \
107 _r->ax = 0; \
108} while (0)
109
110/*
111 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
112 * now struct user_regs_struct, they are different)
113 */
114
115#define ELF_CORE_COPY_REGS(pr_reg, regs) \
116do { \
117 pr_reg[0] = regs->bx; \
118 pr_reg[1] = regs->cx; \
119 pr_reg[2] = regs->dx; \
120 pr_reg[3] = regs->si; \
121 pr_reg[4] = regs->di; \
122 pr_reg[5] = regs->bp; \
123 pr_reg[6] = regs->ax; \
124 pr_reg[7] = regs->ds & 0xffff; \
125 pr_reg[8] = regs->es & 0xffff; \
126 pr_reg[9] = regs->fs & 0xffff; \
127 savesegment(gs, pr_reg[10]); \
128 pr_reg[11] = regs->orig_ax; \
129 pr_reg[12] = regs->ip; \
130 pr_reg[13] = regs->cs & 0xffff; \
131 pr_reg[14] = regs->flags; \
132 pr_reg[15] = regs->sp; \
133 pr_reg[16] = regs->ss & 0xffff; \
134} while (0)
135
136#define ELF_PLATFORM (utsname()->machine)
137#define set_personality_64bit() do { } while (0)
138
139#else /* CONFIG_X86_32 */
140
141/*
142 * This is used to ensure we don't load something for the wrong architecture.
143 */
144#define elf_check_arch(x) \
145 ((x)->e_machine == EM_X86_64)
146
147#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
148
149static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
150{
151 loadsegment(fs, 0);
152 loadsegment(ds, __USER32_DS);
153 loadsegment(es, __USER32_DS);
154 load_gs_index(0);
155 regs->ip = ip;
156 regs->sp = sp;
157 regs->flags = X86_EFLAGS_IF;
158 regs->cs = __USER32_CS;
159 regs->ss = __USER32_DS;
160}
161
162static inline void elf_common_init(struct thread_struct *t,
163 struct pt_regs *regs, const u16 ds)
164{
165 regs->ax = regs->bx = regs->cx = regs->dx = 0;
166 regs->si = regs->di = regs->bp = 0;
167 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
168 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
169 t->fs = t->gs = 0;
170 t->fsindex = t->gsindex = 0;
171 t->ds = t->es = ds;
172}
173
174#define ELF_PLAT_INIT(_r, load_addr) \
175do { \
176 elf_common_init(&current->thread, _r, 0); \
177 clear_thread_flag(TIF_IA32); \
178} while (0)
179
180#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
181 elf_common_init(&current->thread, regs, __USER_DS)
182
183#define compat_start_thread(regs, ip, sp) \
184do { \
185 start_ia32_thread(regs, ip, sp); \
186 set_fs(USER_DS); \
187} while (0)
188
189#define COMPAT_SET_PERSONALITY(ex) \
190do { \
191 if (test_thread_flag(TIF_IA32)) \
192 clear_thread_flag(TIF_ABI_PENDING); \
193 else \
194 set_thread_flag(TIF_ABI_PENDING); \
195 current->personality |= force_personality32; \
196} while (0)
197
198#define COMPAT_ELF_PLATFORM ("i686")
199
200/*
201 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
202 * now struct user_regs_struct, they are different). Assumes current is the process
203 * getting dumped.
204 */
205
206#define ELF_CORE_COPY_REGS(pr_reg, regs) \
207do { \
208 unsigned v; \
209 (pr_reg)[0] = (regs)->r15; \
210 (pr_reg)[1] = (regs)->r14; \
211 (pr_reg)[2] = (regs)->r13; \
212 (pr_reg)[3] = (regs)->r12; \
213 (pr_reg)[4] = (regs)->bp; \
214 (pr_reg)[5] = (regs)->bx; \
215 (pr_reg)[6] = (regs)->r11; \
216 (pr_reg)[7] = (regs)->r10; \
217 (pr_reg)[8] = (regs)->r9; \
218 (pr_reg)[9] = (regs)->r8; \
219 (pr_reg)[10] = (regs)->ax; \
220 (pr_reg)[11] = (regs)->cx; \
221 (pr_reg)[12] = (regs)->dx; \
222 (pr_reg)[13] = (regs)->si; \
223 (pr_reg)[14] = (regs)->di; \
224 (pr_reg)[15] = (regs)->orig_ax; \
225 (pr_reg)[16] = (regs)->ip; \
226 (pr_reg)[17] = (regs)->cs; \
227 (pr_reg)[18] = (regs)->flags; \
228 (pr_reg)[19] = (regs)->sp; \
229 (pr_reg)[20] = (regs)->ss; \
230 (pr_reg)[21] = current->thread.fs; \
231 (pr_reg)[22] = current->thread.gs; \
232 asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
233 asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
234 asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
235 asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
236} while (0)
237
238/* I'm not sure if we can use '-' here */
239#define ELF_PLATFORM ("x86_64")
240extern void set_personality_64bit(void);
241extern unsigned int sysctl_vsyscall32;
242extern int force_personality32;
243
244#endif /* !CONFIG_X86_32 */
245
246#define CORE_DUMP_USE_REGSET
247#define USE_ELF_CORE_DUMP
248#define ELF_EXEC_PAGESIZE 4096
249
250/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
251 use of this is to invoke "./ld.so someprog" to test out a new version of
252 the loader. We need to make sure that it is out of the way of the program
253 that it will "exec", and that there is sufficient room for the brk. */
254
255#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
256
257/* This yields a mask that user programs can use to figure out what
258 instruction set this CPU supports. This could be done in user space,
259 but it's not easy, and we've already done it here. */
260
261#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
262
263/* This yields a string that ld.so will use to load implementation
264 specific libraries for optimization. This is more specific in
265 intent than poking at uname or /proc/cpuinfo.
266
267 For the moment, we have only optimizations for the Intel generations,
268 but that could change... */
269
270#define SET_PERSONALITY(ex) set_personality_64bit()
271
272/*
273 * An executable for which elf_read_implies_exec() returns TRUE will
274 * have the READ_IMPLIES_EXEC personality flag set automatically.
275 */
276#define elf_read_implies_exec(ex, executable_stack) \
277 (executable_stack != EXSTACK_DISABLE_X)
278
279struct task_struct;
280
281#define ARCH_DLINFO_IA32(vdso_enabled) \
282do { \
283 if (vdso_enabled) { \
284 NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
285 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
286 } \
287} while (0)
288
289#ifdef CONFIG_X86_32
290
291#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
292
293#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
294
295/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
296
297#else /* CONFIG_X86_32 */
298
299#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
300
301/* 1GB for 64bit, 8MB for 32bit */
302#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
303
304#define ARCH_DLINFO \
305do { \
306 if (vdso_enabled) \
307 NEW_AUX_ENT(AT_SYSINFO_EHDR, \
308 (unsigned long)current->mm->context.vdso); \
309} while (0)
310
311#define AT_SYSINFO 32
312
313#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
314
315#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
316
317#endif /* !CONFIG_X86_32 */
318
319#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
320
321#define VDSO_ENTRY \
322 ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
323
324struct linux_binprm;
325
326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
327extern int arch_setup_additional_pages(struct linux_binprm *bprm,
328 int executable_stack);
329
330extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
331#define compat_arch_setup_additional_pages syscall32_setup_pages
332
333extern unsigned long arch_randomize_brk(struct mm_struct *mm);
334#define arch_randomize_brk arch_randomize_brk
335
336#endif /* _ASM_X86_ELF_H */
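The ARCH_DLINFO/ARCH_DLINFO_IA32 macros above are what publish the vDSO location to user space through the ELF auxiliary vector. As a minimal user-space sketch (assuming a Linux/glibc environment; getauxval() needs glibc 2.16 or later), a program can read the entry back:

/* Read the vDSO base exported via the AT_SYSINFO_EHDR aux entry. */
#include <stdio.h>
#include <sys/auxv.h>

int main(void)
{
        unsigned long vdso = getauxval(AT_SYSINFO_EHDR);

        if (vdso)
                printf("vDSO ELF header mapped at %#lx\n", vdso);
        else
                printf("no vDSO advertised in the aux vector\n");
        return 0;
}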
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
new file mode 100644
index 00000000000..94826cf8745
--- /dev/null
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_EMERGENCY_RESTART_H
2#define _ASM_X86_EMERGENCY_RESTART_H
3
4enum reboot_type {
5 BOOT_TRIPLE = 't',
6 BOOT_KBD = 'k',
7#ifdef CONFIG_X86_32
8 BOOT_BIOS = 'b',
9#endif
10 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e'
12};
13
14extern enum reboot_type reboot_type;
15
16extern void machine_emergency_restart(void);
17
18#endif /* _ASM_X86_EMERGENCY_RESTART_H */
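The single-character enum values above correspond to the letters accepted by the kernel's reboot= command line option. A standalone illustration of that mapping (the parse helper is hypothetical, and BOOT_BIOS is taken unconditionally here, unlike the CONFIG_X86_32 guard above):

/* Illustration only: map a reboot= option letter onto the enum. */
#include <stdio.h>

enum reboot_type {
        BOOT_TRIPLE = 't',
        BOOT_KBD = 'k',
        BOOT_BIOS = 'b',
        BOOT_ACPI = 'a',
        BOOT_EFI = 'e'
};

static enum reboot_type parse_reboot_letter(char c)
{
        switch (c) {
        case 't': case 'k': case 'b': case 'a': case 'e':
                return (enum reboot_type)c;
        default:
                return BOOT_KBD;        /* fall back to keyboard reset */
        }
}

int main(void)
{
        printf("%c %c\n", parse_reboot_letter('a'), parse_reboot_letter('x'));
        return 0;
}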
diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h
new file mode 100644
index 00000000000..4c82b503d92
--- /dev/null
+++ b/arch/x86/include/asm/errno.h
@@ -0,0 +1 @@
#include <asm-generic/errno.h>
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
new file mode 100644
index 00000000000..380f0b4f17e
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -0,0 +1,193 @@
1#ifndef __ASM_ES7000_APIC_H
2#define __ASM_ES7000_APIC_H
3
4#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
5#define esr_disable (1)
6
7static inline int apic_id_registered(void)
8{
9 return (1);
10}
11
12static inline cpumask_t target_cpus(void)
13{
14#if defined CONFIG_ES7000_CLUSTERED_APIC
15 return CPU_MASK_ALL;
16#else
17 return cpumask_of_cpu(smp_processor_id());
18#endif
19}
20
21#if defined CONFIG_ES7000_CLUSTERED_APIC
22#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
23#define INT_DELIVERY_MODE (dest_LowestPrio)
24#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */
25#define NO_BALANCE_IRQ (1)
26#undef WAKE_SECONDARY_VIA_INIT
27#define WAKE_SECONDARY_VIA_MIP
28#else
29#define APIC_DFR_VALUE (APIC_DFR_FLAT)
30#define INT_DELIVERY_MODE (dest_Fixed)
31#define INT_DEST_MODE (0) /* phys delivery to target procs */
32#define NO_BALANCE_IRQ (0)
33#undef APIC_DEST_LOGICAL
34#define APIC_DEST_LOGICAL 0x0
35#define WAKE_SECONDARY_VIA_INIT
36#endif
37
38static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
39{
40 return 0;
41}
42static inline unsigned long check_apicid_present(int bit)
43{
44 return physid_isset(bit, phys_cpu_present_map);
45}
46
47#define apicid_cluster(apicid) (apicid & 0xF0)
48
49static inline unsigned long calculate_ldr(int cpu)
50{
51 unsigned long id;
52 id = xapic_phys_to_log_apicid(cpu);
53 return (SET_APIC_LOGICAL_ID(id));
54}
55
56/*
57 * Set up the logical destination ID.
58 *
59 * Intel recommends setting DFR, LDR and TPR before enabling
60 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
61 * document number 292116). So here it goes...
62 */
63static inline void init_apic_ldr(void)
64{
65 unsigned long val;
66 int cpu = smp_processor_id();
67
68 apic_write(APIC_DFR, APIC_DFR_VALUE);
69 val = calculate_ldr(cpu);
70 apic_write(APIC_LDR, val);
71}
72
73#ifndef CONFIG_X86_GENERICARCH
74extern void enable_apic_mode(void);
75#endif
76
77extern int apic_version [MAX_APICS];
78static inline void setup_apic_routing(void)
79{
80 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
81 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
82 (apic_version[apic] == 0x14) ?
83 "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
84}
85
86static inline int multi_timer_check(int apic, int irq)
87{
88 return 0;
89}
90
91static inline int apicid_to_node(int logical_apicid)
92{
93 return 0;
94}
95
96
97static inline int cpu_present_to_apicid(int mps_cpu)
98{
99 if (!mps_cpu)
100 return boot_cpu_physical_apicid;
101 else if (mps_cpu < NR_CPUS)
102 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
103 else
104 return BAD_APICID;
105}
106
107static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
108{
109 static int id = 0;
110 physid_mask_t mask;
111 mask = physid_mask_of_physid(id);
112 ++id;
113 return mask;
114}
115
116extern u8 cpu_2_logical_apicid[];
117/* Mapping from cpu number to logical apicid */
118static inline int cpu_to_logical_apicid(int cpu)
119{
120#ifdef CONFIG_SMP
121 if (cpu >= NR_CPUS)
122 return BAD_APICID;
123 return (int)cpu_2_logical_apicid[cpu];
124#else
125 return logical_smp_processor_id();
126#endif
127}
128
129static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
130{
131 /* For clustered we don't have a good way to do this yet - hack */
132 return physids_promote(0xff);
133}
134
135
136static inline void setup_portio_remap(void)
137{
138}
139
140extern unsigned int boot_cpu_physical_apicid;
141static inline int check_phys_apicid_present(int cpu_physical_apicid)
142{
143 boot_cpu_physical_apicid = read_apic_id();
144 return (1);
145}
146
147static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
148{
149 int num_bits_set;
150 int cpus_found = 0;
151 int cpu;
152 int apicid;
153
154 num_bits_set = cpus_weight(cpumask);
155 /* Return id to all */
156 if (num_bits_set == NR_CPUS)
157#if defined CONFIG_ES7000_CLUSTERED_APIC
158 return 0xFF;
159#else
160 return cpu_to_logical_apicid(0);
161#endif
162 /*
163	 * The cpus in the mask must all be in the same APIC cluster.  If they
164	 * are not on the same apicid cluster, return the default value of TARGET_CPUS.
165 */
166 cpu = first_cpu(cpumask);
167 apicid = cpu_to_logical_apicid(cpu);
168 while (cpus_found < num_bits_set) {
169 if (cpu_isset(cpu, cpumask)) {
170 int new_apicid = cpu_to_logical_apicid(cpu);
171 if (apicid_cluster(apicid) !=
172 apicid_cluster(new_apicid)){
173 printk ("%s: Not a valid mask!\n", __func__);
174#if defined CONFIG_ES7000_CLUSTERED_APIC
175 return 0xFF;
176#else
177 return cpu_to_logical_apicid(0);
178#endif
179 }
180 apicid = new_apicid;
181 cpus_found++;
182 }
183 cpu++;
184 }
185 return apicid;
186}
187
188static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
189{
190 return cpuid_apic >> index_msb;
191}
192
193#endif /* __ASM_ES7000_APIC_H */
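cpu_mask_to_apicid() above enforces that every CPU in the mask sits in the same APIC cluster, where apicid_cluster() takes bits 7:4 of the logical ID. A standalone sketch of that consistency rule (toy data, not kernel code):

/* All logical APIC IDs in a set must share cluster bits 7:4. */
#include <stdio.h>

#define apicid_cluster(apicid)  ((apicid) & 0xF0)

static int same_cluster(const int *apicids, int n)
{
        int i;

        for (i = 1; i < n; i++)
                if (apicid_cluster(apicids[i]) != apicid_cluster(apicids[0]))
                        return 0;
        return 1;
}

int main(void)
{
        int ok[]  = { 0x31, 0x32, 0x3e };       /* all in cluster 0x30 */
        int bad[] = { 0x31, 0x42 };             /* clusters 0x30 and 0x40 */

        printf("ok:  %d\n", same_cluster(ok, 3));
        printf("bad: %d\n", same_cluster(bad, 2));
        return 0;
}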
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
new file mode 100644
index 00000000000..8b234a3cb85
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_ES7000_APICDEF_H
2#define __ASM_ES7000_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
new file mode 100644
index 00000000000..632a955fcc0
--- /dev/null
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -0,0 +1,24 @@
1#ifndef __ASM_ES7000_IPI_H
2#define __ASM_ES7000_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15 if (!cpus_empty(mask))
16 send_IPI_mask(mask, vector);
17}
18
19static inline void send_IPI_all(int vector)
20{
21 send_IPI_mask(cpu_online_map, vector);
22}
23
24#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
new file mode 100644
index 00000000000..ed5a3caae14
--- /dev/null
+++ b/arch/x86/include/asm/es7000/mpparse.h
@@ -0,0 +1,30 @@
1#ifndef __ASM_ES7000_MPPARSE_H
2#define __ASM_ES7000_MPPARSE_H
3
4#include <linux/acpi.h>
5
6extern int parse_unisys_oem (char *oemptr);
7extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
8extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
9extern void setup_unisys(void);
10
11#ifndef CONFIG_X86_GENERICARCH
12extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
13extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
14 char *productid);
15#endif
16
17#ifdef CONFIG_ACPI
18
19static inline int es7000_check_dsdt(void)
20{
21 struct acpi_table_header header;
22
23 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
24 !strncmp(header.oem_id, "UNISYS", 6))
25 return 1;
26 return 0;
27}
28#endif
29
30#endif /* __ASM_ES7000_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
new file mode 100644
index 00000000000..3ffc5a7bf66
--- /dev/null
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -0,0 +1,59 @@
1#ifndef __ASM_ES7000_WAKECPU_H
2#define __ASM_ES7000_WAKECPU_H
3
4/*
5 * This file copes with machines that wake up secondary CPUs by the
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#ifdef CONFIG_ES7000_CLUSTERED_APIC
10#define WAKE_SECONDARY_VIA_MIP
11#else
12#define WAKE_SECONDARY_VIA_INIT
13#endif
14
15#ifdef WAKE_SECONDARY_VIA_MIP
16extern int es7000_start_cpu(int cpu, unsigned long eip);
17static inline int
18wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
19{
20 int boot_error = 0;
21 boot_error = es7000_start_cpu(phys_apicid, start_eip);
22 return boot_error;
23}
24#endif
25
26#define TRAMPOLINE_LOW phys_to_virt(0x467)
27#define TRAMPOLINE_HIGH phys_to_virt(0x469)
28
29#define boot_cpu_apicid boot_cpu_physical_apicid
30
31static inline void wait_for_init_deassert(atomic_t *deassert)
32{
33#ifdef WAKE_SECONDARY_VIA_INIT
34 while (!atomic_read(deassert))
35 cpu_relax();
36#endif
37 return;
38}
39
40/* Nothing to do for most platforms, since cleared by the INIT cycle */
41static inline void smp_callin_clear_local_apic(void)
42{
43}
44
45static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
46{
47}
48
49static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
50{
51}
52
53#if APIC_DEBUG
54 #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
55#else
56 #define inquire_remote_apic(apicid) {}
57#endif
58
59#endif /* __ASM_ES7000_WAKECPU_H */
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
new file mode 100644
index 00000000000..53018464aea
--- /dev/null
+++ b/arch/x86/include/asm/fb.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_FB_H
2#define _ASM_X86_FB_H
3
4#include <linux/fb.h>
5#include <linux/fs.h>
6#include <asm/page.h>
7
8static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
9 unsigned long off)
10{
11 if (boot_cpu_data.x86 > 3)
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13}
14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20
21#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h
new file mode 100644
index 00000000000..46ab12db573
--- /dev/null
+++ b/arch/x86/include/asm/fcntl.h
@@ -0,0 +1 @@
#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
new file mode 100644
index 00000000000..8668a94f850
--- /dev/null
+++ b/arch/x86/include/asm/fixmap.h
@@ -0,0 +1,68 @@
1#ifndef _ASM_X86_FIXMAP_H
2#define _ASM_X86_FIXMAP_H
3
4#ifdef CONFIG_X86_32
5# include "fixmap_32.h"
6#else
7# include "fixmap_64.h"
8#endif
9
10extern int fixmaps_set;
11
12void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
13void native_set_fixmap(enum fixed_addresses idx,
14 unsigned long phys, pgprot_t flags);
15
16#ifndef CONFIG_PARAVIRT
17static inline void __set_fixmap(enum fixed_addresses idx,
18 unsigned long phys, pgprot_t flags)
19{
20 native_set_fixmap(idx, phys, flags);
21}
22#endif
23
24#define set_fixmap(idx, phys) \
25 __set_fixmap(idx, phys, PAGE_KERNEL)
26
27/*
28 * Some hardware wants to get fixmapped without caching.
29 */
30#define set_fixmap_nocache(idx, phys) \
31 __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
32
33#define clear_fixmap(idx) \
34 __set_fixmap(idx, 0, __pgprot(0))
35
36#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
37#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
38
39extern void __this_fixmap_does_not_exist(void);
40
41/*
42 * 'index to address' translation. If anyone tries to use the idx
43 * directly without translation, we catch the bug with a NULL-dereference
44 * kernel oops. Illegal ranges of incoming indices are caught too.
45 */
46static __always_inline unsigned long fix_to_virt(const unsigned int idx)
47{
48 /*
49 * this branch gets completely eliminated after inlining,
50 * except when someone tries to use fixaddr indices in an
52 * illegal way (such as mixing up address types or using
52 * out-of-range indices).
53 *
54 * If it doesn't get removed, the linker will complain
56 * loudly with a reasonably clear error message.
56 */
57 if (idx >= __end_of_fixed_addresses)
58 __this_fixmap_does_not_exist();
59
60 return __fix_to_virt(idx);
61}
62
63static inline unsigned long virt_to_fix(const unsigned long vaddr)
64{
65 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
66 return __virt_to_fix(vaddr);
67}
68#endif /* _ASM_X86_FIXMAP_H */
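The index-to-address translation above is plain arithmetic off FIXADDR_TOP. A toy demonstration with made-up FIXADDR_TOP and PAGE_SHIFT values (illustration only; the real values come from the per-configuration fixmap headers):

/* Round-trip the fixmap index/address arithmetic with toy constants. */
#include <stdio.h>

#define PAGE_SHIFT      12
#define FIXADDR_TOP     0xfffff000UL

#define __fix_to_virt(x)  (FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))
#define __virt_to_fix(x)  ((FIXADDR_TOP - ((x) & ~((1UL << PAGE_SHIFT) - 1))) \
                           >> PAGE_SHIFT)

int main(void)
{
        unsigned int idx;

        for (idx = 0; idx < 4; idx++)
                printf("fixmap idx %u -> %#lx (back to idx %lu)\n",
                       idx, __fix_to_virt(idx),
                       __virt_to_fix(__fix_to_virt(idx)));
        return 0;
}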
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
new file mode 100644
index 00000000000..09f29ab5c13
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -0,0 +1,123 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 */
12
13#ifndef _ASM_X86_FIXMAP_32_H
14#define _ASM_X86_FIXMAP_32_H
15
16
17/* used by vmalloc.c, vsyscall.lds.S.
18 *
19 * Leave one empty page between vmalloc'ed areas and
20 * the start of the fixmap.
21 */
22extern unsigned long __FIXADDR_TOP;
23#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
24#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
25
26#ifndef __ASSEMBLY__
27#include <linux/kernel.h>
28#include <asm/acpi.h>
29#include <asm/apicdef.h>
30#include <asm/page.h>
31#ifdef CONFIG_HIGHMEM
32#include <linux/threads.h>
33#include <asm/kmap_types.h>
34#endif
35
36/*
37 * Here we define all the compile-time 'special' virtual
38 * addresses. The point is to have a constant address at
39 * compile time, but to set the physical address only
40 * in the boot process. We allocate these special addresses
41 * from the end of virtual memory (0xfffff000) backwards.
42 * Also this lets us do fail-safe vmalloc(), we
43 * can guarantee that these special addresses and
44 * vmalloc()-ed addresses never overlap.
45 *
46 * these 'compile-time allocated' memory buffers are
47 * fixed-size 4k pages (or larger if used with an increment
48 * higher than 1). Use set_fixmap(idx,phys) to associate
49 * physical memory with fixmap indices.
50 *
51 * TLB entries of such buffers will not be flushed across
52 * task switches.
53 */
54enum fixed_addresses {
55 FIX_HOLE,
56 FIX_VDSO,
57 FIX_DBGP_BASE,
58 FIX_EARLYCON_MEM_BASE,
59#ifdef CONFIG_X86_LOCAL_APIC
60	FIX_APIC_BASE,	/* local (CPU) APIC -- required for SMP or not */
61#endif
62#ifdef CONFIG_X86_IO_APIC
63 FIX_IO_APIC_BASE_0,
64 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
65#endif
66#ifdef CONFIG_X86_VISWS_APIC
67 FIX_CO_CPU, /* Cobalt timer */
68 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
69 FIX_LI_PCIA, /* Lithium PCI Bridge A */
70 FIX_LI_PCIB, /* Lithium PCI Bridge B */
71#endif
72#ifdef CONFIG_X86_F00F_BUG
73 FIX_F00F_IDT, /* Virtual mapping for IDT */
74#endif
75#ifdef CONFIG_X86_CYCLONE_TIMER
76 FIX_CYCLONE_TIMER, /*cyclone timer register*/
77#endif
78#ifdef CONFIG_HIGHMEM
79 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
80 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
81#endif
82#ifdef CONFIG_PCI_MMCONFIG
83 FIX_PCIE_MCFG,
84#endif
85#ifdef CONFIG_PARAVIRT
86 FIX_PARAVIRT_BOOTMAP,
87#endif
88 __end_of_permanent_fixed_addresses,
89 /*
90 * 256 temporary boot-time mappings, used by early_ioremap(),
91 * before ioremap() is functional.
92 *
93 * We round it up to the next 256 pages boundary so that we
94 * can have a single pgd entry and a single pte table:
95 */
96#define NR_FIX_BTMAPS 64
97#define FIX_BTMAPS_SLOTS 4
98 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
99 (__end_of_permanent_fixed_addresses & 255),
100 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
101 FIX_WP_TEST,
102#ifdef CONFIG_ACPI
103 FIX_ACPI_BEGIN,
104 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
105#endif
106#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
107 FIX_OHCI1394_BASE,
108#endif
109 __end_of_fixed_addresses
110};
111
112extern void reserve_top_address(unsigned long reserve);
113
114
115#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
116
117#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
118#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
119#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
120#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
121
122#endif /* !__ASSEMBLY__ */
123#endif /* _ASM_X86_FIXMAP_32_H */
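The FIX_BTMAP_END expression above implements the "round up to the next 256 pages boundary" rule, and it advances even when the permanent entries already end exactly on a boundary. A small sketch of that arithmetic with sample values:

/* FIX_BTMAP_END placement: next 256-slot boundary past the
 * permanent fixmap entries, so the temporary boot-time mappings
 * can share a single pgd entry and pte table. */
#include <stdio.h>

static unsigned int btmap_end(unsigned int end_of_permanent)
{
        return end_of_permanent + 256 - (end_of_permanent & 255);
}

int main(void)
{
        unsigned int samples[] = { 1, 37, 255, 256 };
        unsigned int i;

        for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
                printf("permanent end %3u -> FIX_BTMAP_END %u\n",
                       samples[i], btmap_end(samples[i]));
        return 0;
}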
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
new file mode 100644
index 00000000000..00a30ab9b1a
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -0,0 +1,83 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 */
10
11#ifndef _ASM_X86_FIXMAP_64_H
12#define _ASM_X86_FIXMAP_64_H
13
14#include <linux/kernel.h>
15#include <asm/acpi.h>
16#include <asm/apicdef.h>
17#include <asm/page.h>
18#include <asm/vsyscall.h>
19#include <asm/efi.h>
20
21/*
22 * Here we define all the compile-time 'special' virtual
23 * addresses. The point is to have a constant address at
24 * compile time, but to set the physical address only
25 * in the boot process.
26 *
27 * These 'compile-time allocated' memory buffers are
28 * fixed-size 4k pages (or larger if used with an increment
29 * higher than 1). Use set_fixmap(idx,phys) to associate
30 * physical memory with fixmap indices.
31 *
32 * TLB entries of such buffers will not be flushed across
33 * task switches.
34 */
35
36enum fixed_addresses {
37 VSYSCALL_LAST_PAGE,
38 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
39 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
40 VSYSCALL_HPET,
41 FIX_DBGP_BASE,
42 FIX_EARLYCON_MEM_BASE,
43 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
44 FIX_IO_APIC_BASE_0,
45 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
46 FIX_EFI_IO_MAP_LAST_PAGE,
47 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
48 + MAX_EFI_IO_PAGES - 1,
49#ifdef CONFIG_PARAVIRT
50 FIX_PARAVIRT_BOOTMAP,
51#endif
52 __end_of_permanent_fixed_addresses,
53#ifdef CONFIG_ACPI
54 FIX_ACPI_BEGIN,
55 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
56#endif
57#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
58 FIX_OHCI1394_BASE,
59#endif
60 /*
61 * 256 temporary boot-time mappings, used by early_ioremap(),
62 * before ioremap() is functional.
63 *
64 * We round it up to the next 256 pages boundary so that we
65 * can have a single pgd entry and a single pte table:
66 */
67#define NR_FIX_BTMAPS 64
68#define FIX_BTMAPS_SLOTS 4
69 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
70 (__end_of_permanent_fixed_addresses & 255),
71 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
72 __end_of_fixed_addresses
73};
74
75#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
76#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
77#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
78
79/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
80#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
81#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
82
83#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
new file mode 100644
index 00000000000..dbe82a5c5ea
--- /dev/null
+++ b/arch/x86/include/asm/floppy.h
@@ -0,0 +1,281 @@
1/*
2 * Architecture specific parts of the Floppy driver
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1995
9 */
10#ifndef _ASM_X86_FLOPPY_H
11#define _ASM_X86_FLOPPY_H
12
13#include <linux/vmalloc.h>
14
15/*
16 * The DMA channel used by the floppy controller cannot access data at
17 * addresses >= 16MB
18 *
19 * Went back to the 1MB limit, as some people had problems with the floppy
20 * driver otherwise. It doesn't matter much for performance anyway, as most
21 * floppy accesses go through the track buffer.
22 */
23#define _CROSS_64KB(a, s, vdma) \
24 (!(vdma) && \
25 ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
26
27#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
28
29
30#define SW fd_routine[use_virtual_dma & 1]
31#define CSW fd_routine[can_use_virtual_dma & 1]
32
33
34#define fd_inb(port) inb_p(port)
35#define fd_outb(value, port) outb_p(value, port)
36
37#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
38#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
39#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
40#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
41#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
42#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA)
43#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size)
44#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io)
45
46#define FLOPPY_CAN_FALLBACK_ON_NODMA
47
48static int virtual_dma_count;
49static int virtual_dma_residue;
50static char *virtual_dma_addr;
51static int virtual_dma_mode;
52static int doing_pdma;
53
54static irqreturn_t floppy_hardint(int irq, void *dev_id)
55{
56 unsigned char st;
57
58#undef TRACE_FLPY_INT
59
60#ifdef TRACE_FLPY_INT
61 static int calls;
62 static int bytes;
63 static int dma_wait;
64#endif
65 if (!doing_pdma)
66 return floppy_interrupt(irq, dev_id);
67
68#ifdef TRACE_FLPY_INT
69 if (!calls)
70 bytes = virtual_dma_count;
71#endif
72
73 {
74 int lcount;
75 char *lptr;
76
77 st = 1;
78 for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
79 lcount; lcount--, lptr++) {
80 st = inb(virtual_dma_port + 4) & 0xa0;
81 if (st != 0xa0)
82 break;
83 if (virtual_dma_mode)
84 outb_p(*lptr, virtual_dma_port + 5);
85 else
86 *lptr = inb_p(virtual_dma_port + 5);
87 }
88 virtual_dma_count = lcount;
89 virtual_dma_addr = lptr;
90 st = inb(virtual_dma_port + 4);
91 }
92
93#ifdef TRACE_FLPY_INT
94 calls++;
95#endif
96 if (st == 0x20)
97 return IRQ_HANDLED;
98 if (!(st & 0x20)) {
99 virtual_dma_residue += virtual_dma_count;
100 virtual_dma_count = 0;
101#ifdef TRACE_FLPY_INT
102 printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
103 virtual_dma_count, virtual_dma_residue, calls, bytes,
104 dma_wait);
105 calls = 0;
106 dma_wait = 0;
107#endif
108 doing_pdma = 0;
109 floppy_interrupt(irq, dev_id);
110 return IRQ_HANDLED;
111 }
112#ifdef TRACE_FLPY_INT
113 if (!virtual_dma_count)
114 dma_wait++;
115#endif
116 return IRQ_HANDLED;
117}
118
119static void fd_disable_dma(void)
120{
121 if (!(can_use_virtual_dma & 1))
122 disable_dma(FLOPPY_DMA);
123 doing_pdma = 0;
124 virtual_dma_residue += virtual_dma_count;
125 virtual_dma_count = 0;
126}
127
128static int vdma_request_dma(unsigned int dmanr, const char *device_id)
129{
130 return 0;
131}
132
133static void vdma_nop(unsigned int dummy)
134{
135}
136
137
138static int vdma_get_dma_residue(unsigned int dummy)
139{
140 return virtual_dma_count + virtual_dma_residue;
141}
142
143
144static int fd_request_irq(void)
145{
146 if (can_use_virtual_dma)
147 return request_irq(FLOPPY_IRQ, floppy_hardint,
148 IRQF_DISABLED, "floppy", NULL);
149 else
150 return request_irq(FLOPPY_IRQ, floppy_interrupt,
151 IRQF_DISABLED, "floppy", NULL);
152}
153
154static unsigned long dma_mem_alloc(unsigned long size)
155{
156 return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size));
157}
158
159
160static unsigned long vdma_mem_alloc(unsigned long size)
161{
162 return (unsigned long)vmalloc(size);
163
164}
165
166#define nodma_mem_alloc(size) vdma_mem_alloc(size)
167
168static void _fd_dma_mem_free(unsigned long addr, unsigned long size)
169{
170 if ((unsigned long)addr >= (unsigned long)high_memory)
171 vfree((void *)addr);
172 else
173 free_pages(addr, get_order(size));
174}
175
176#define fd_dma_mem_free(addr, size) _fd_dma_mem_free(addr, size)
177
178static void _fd_chose_dma_mode(char *addr, unsigned long size)
179{
180 if (can_use_virtual_dma == 2) {
181 if ((unsigned long)addr >= (unsigned long)high_memory ||
182 isa_virt_to_bus(addr) >= 0x1000000 ||
183 _CROSS_64KB(addr, size, 0))
184 use_virtual_dma = 1;
185 else
186 use_virtual_dma = 0;
187 } else {
188 use_virtual_dma = can_use_virtual_dma & 1;
189 }
190}
191
192#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size)
193
194
195static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
196{
197 doing_pdma = 1;
198 virtual_dma_port = io;
199 virtual_dma_mode = (mode == DMA_MODE_WRITE);
200 virtual_dma_addr = addr;
201 virtual_dma_count = size;
202 virtual_dma_residue = 0;
203 return 0;
204}
205
206static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
207{
208#ifdef FLOPPY_SANITY_CHECK
209 if (CROSS_64KB(addr, size)) {
210 printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
211 return -1;
212 }
213#endif
214 /* actual, physical DMA */
215 doing_pdma = 0;
216 clear_dma_ff(FLOPPY_DMA);
217 set_dma_mode(FLOPPY_DMA, mode);
218 set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr));
219 set_dma_count(FLOPPY_DMA, size);
220 enable_dma(FLOPPY_DMA);
221 return 0;
222}
223
224static struct fd_routine_l {
225 int (*_request_dma)(unsigned int dmanr, const char *device_id);
226 void (*_free_dma)(unsigned int dmanr);
227 int (*_get_dma_residue)(unsigned int dummy);
228 unsigned long (*_dma_mem_alloc)(unsigned long size);
229 int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
230} fd_routine[] = {
231 {
232 request_dma,
233 free_dma,
234 get_dma_residue,
235 dma_mem_alloc,
236 hard_dma_setup
237 },
238 {
239 vdma_request_dma,
240 vdma_nop,
241 vdma_get_dma_residue,
242 vdma_mem_alloc,
243 vdma_dma_setup
244 }
245};
246
247
248static int FDC1 = 0x3f0;
249static int FDC2 = -1;
250
251/*
252 * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
253 * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
254 * coincides with another rtc CMOS user. Paul G.
255 */
256#define FLOPPY0_TYPE \
257({ \
258 unsigned long flags; \
259 unsigned char val; \
260 spin_lock_irqsave(&rtc_lock, flags); \
261 val = (CMOS_READ(0x10) >> 4) & 15; \
262 spin_unlock_irqrestore(&rtc_lock, flags); \
263 val; \
264})
265
266#define FLOPPY1_TYPE \
267({ \
268 unsigned long flags; \
269 unsigned char val; \
270 spin_lock_irqsave(&rtc_lock, flags); \
271 val = CMOS_READ(0x10) & 15; \
272 spin_unlock_irqrestore(&rtc_lock, flags); \
273 val; \
274})
275
276#define N_FDC 2
277#define N_DRIVE 8
278
279#define EXTRA_FLOPPY_PARAMS
280
281#endif /* _ASM_X86_FLOPPY_H */
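The _CROSS_64KB() test above guards against buffers that straddle a 64 KB boundary, which the ISA DMA controller cannot cross. A standalone sketch of the same check (toy addresses for illustration):

/* Does a 'size'-byte buffer at 'addr' straddle a 64 KB boundary? */
#include <stdio.h>

#define K_64    0x10000UL

static int crosses_64kb(unsigned long addr, unsigned long size)
{
        return addr / K_64 != (addr + size - 1) / K_64;
}

int main(void)
{
        printf("%d\n", crosses_64kb(0x0f000, 0x0800));  /* 0: fits below 64K */
        printf("%d\n", crosses_64kb(0x0f000, 0x2000));  /* 1: straddles 64K */
        return 0;
}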
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
new file mode 100644
index 00000000000..06850a7194e
--- /dev/null
+++ b/arch/x86/include/asm/frame.h
@@ -0,0 +1,27 @@
1#ifdef __ASSEMBLY__
2
3#include <asm/dwarf2.h>
4
5/* The annotation hides the frame from the unwinder and makes it look
6 like an ordinary ebp save/restore. This avoids some special cases for
7 the frame pointer later. */
8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME
10 pushl %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp
14 .endm
15 .macro ENDFRAME
16 popl %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp
19 .endm
20#else
21 .macro FRAME
22 .endm
23 .macro ENDFRAME
24 .endm
25#endif
26
27#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
new file mode 100644
index 00000000000..47f7e65e6c1
--- /dev/null
+++ b/arch/x86/include/asm/ftrace.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_FTRACE_H
2#define _ASM_X86_FTRACE_H
3
4#ifdef CONFIG_FTRACE
5#define MCOUNT_ADDR ((long)(mcount))
6#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
7
8#ifndef __ASSEMBLY__
9extern void mcount(void);
10
11static inline unsigned long ftrace_call_adjust(unsigned long addr)
12{
13 /*
14 * call mcount is "e8 <4 byte offset>"
15 * The addr points to the 4 byte offset and the caller of this
16 * function wants the pointer to e8. Simply subtract one.
17 */
18 return addr - 1;
19}
20#endif
21
22#endif /* CONFIG_FTRACE */
23
24#endif /* _ASM_X86_FTRACE_H */
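ftrace_call_adjust() above relies on the layout of an x86 near call: one 0xe8 opcode byte followed by a 4-byte relative offset, so the recorded address minus one is the instruction start, and the call target is start + 5 + rel32. A synthetic sketch of that decoding (made-up addresses, not real trace data):

/* Decode a fabricated "call rel32" the way the comment describes. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
        uint8_t insn[5] = { 0xe8, 0, 0, 0, 0 };     /* call rel32 */
        int32_t rel = 0x100;            /* pretend mcount is +0x100 away */
        uintptr_t addr_of_rel32 = 0x1001;           /* what gets recorded */
        uintptr_t insn_start;
        int32_t rel32;

        memcpy(&insn[1], &rel, sizeof(rel));
        memcpy(&rel32, &insn[1], sizeof(rel32));

        insn_start = addr_of_rel32 - 1;             /* ftrace_call_adjust() */
        printf("call at %#lx targets %#lx\n",
               (unsigned long)insn_start,
               (unsigned long)(insn_start + 5 + rel32));
        return 0;
}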
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
new file mode 100644
index 00000000000..1f11ce44e95
--- /dev/null
+++ b/arch/x86/include/asm/futex.h
@@ -0,0 +1,140 @@
1#ifndef _ASM_X86_FUTEX_H
2#define _ASM_X86_FUTEX_H
3
4#ifdef __KERNEL__
5
6#include <linux/futex.h>
7#include <linux/uaccess.h>
8
9#include <asm/asm.h>
10#include <asm/errno.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13
14#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
15 asm volatile("1:\t" insn "\n" \
16 "2:\t.section .fixup,\"ax\"\n" \
17 "3:\tmov\t%3, %1\n" \
18 "\tjmp\t2b\n" \
19 "\t.previous\n" \
20 _ASM_EXTABLE(1b, 3b) \
21 : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
22 : "i" (-EFAULT), "0" (oparg), "1" (0))
23
24#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
25 asm volatile("1:\tmovl %2, %0\n" \
26 "\tmovl\t%0, %3\n" \
27 "\t" insn "\n" \
28 "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
29 "\tjnz\t1b\n" \
30 "3:\t.section .fixup,\"ax\"\n" \
31 "4:\tmov\t%5, %1\n" \
32 "\tjmp\t3b\n" \
33 "\t.previous\n" \
34 _ASM_EXTABLE(1b, 4b) \
35 _ASM_EXTABLE(2b, 4b) \
36 : "=&a" (oldval), "=&r" (ret), \
37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
41{
42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15;
44 int oparg = (encoded_op << 8) >> 20;
45 int cmparg = (encoded_op << 20) >> 20;
46 int oldval = 0, ret, tem;
47
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg;
50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
52 return -EFAULT;
53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
55 /* Real i386 machines can only support FUTEX_OP_SET */
56 if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
57 return -ENOSYS;
58#endif
59
60 pagefault_disable();
61
62 switch (op) {
63 case FUTEX_OP_SET:
64 __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
65 break;
66 case FUTEX_OP_ADD:
67 __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
68 uaddr, oparg);
69 break;
70 case FUTEX_OP_OR:
71 __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
72 break;
73 case FUTEX_OP_ANDN:
74 __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
75 break;
76 case FUTEX_OP_XOR:
77 __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
78 break;
79 default:
80 ret = -ENOSYS;
81 }
82
83 pagefault_enable();
84
85 if (!ret) {
86 switch (cmp) {
87 case FUTEX_OP_CMP_EQ:
88 ret = (oldval == cmparg);
89 break;
90 case FUTEX_OP_CMP_NE:
91 ret = (oldval != cmparg);
92 break;
93 case FUTEX_OP_CMP_LT:
94 ret = (oldval < cmparg);
95 break;
96 case FUTEX_OP_CMP_GE:
97 ret = (oldval >= cmparg);
98 break;
99 case FUTEX_OP_CMP_LE:
100 ret = (oldval <= cmparg);
101 break;
102 case FUTEX_OP_CMP_GT:
103 ret = (oldval > cmparg);
104 break;
105 default:
106 ret = -ENOSYS;
107 }
108 }
109 return ret;
110}
111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
113 int newval)
114{
115
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */
118 if (boot_cpu_data.x86 == 3)
119 return -ENOSYS;
120#endif
121
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
123 return -EFAULT;
124
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
126 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n"
128 "\tjmp 2b\n"
129 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval)
133 : "memory"
134 );
135
136 return oldval;
137}
138
139#endif /* __KERNEL__ */
140#endif /* _ASM_X86_FUTEX_H */
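futex_atomic_op_inuser() above unpacks encoded_op as: op in bits 31:28, cmp in 27:24, a 12-bit signed oparg in 23:12 and a 12-bit signed cmparg in 11:0, sign-extended via the shift pairs. A standalone pack/unpack sketch (the pack() helper is hypothetical; the unpack expressions mirror the kernel's, including its sign-extending left/right shifts):

/* Pack and unpack the futex encoded_op bit layout. */
#include <stdio.h>

static int pack(int op, int cmp, int oparg, int cmparg)
{
        return (op << 28) | (cmp << 24) | ((oparg & 0xfff) << 12) |
               (cmparg & 0xfff);
}

int main(void)
{
        int encoded_op = pack(1 /* FUTEX_OP_ADD */, 2 /* FUTEX_OP_CMP_LT */,
                              5, -1);
        int op = (encoded_op >> 28) & 7;
        int cmp = (encoded_op >> 24) & 15;
        int oparg = (encoded_op << 8) >> 20;        /* bits 23:12, signed */
        int cmparg = (encoded_op << 20) >> 20;      /* bits 11:0, signed */

        printf("op=%d cmp=%d oparg=%d cmparg=%d\n", op, cmp, oparg, cmparg);
        return 0;
}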
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
new file mode 100644
index 00000000000..74252264433
--- /dev/null
+++ b/arch/x86/include/asm/gart.h
@@ -0,0 +1,73 @@
1#ifndef _ASM_X86_GART_H
2#define _ASM_X86_GART_H
3
4#include <asm/e820.h>
5
6extern void set_up_gart_resume(u32, u32);
7
8extern int fallback_aper_order;
9extern int fallback_aper_force;
10extern int fix_aperture;
11
12/* PTE bits. */
13#define GPTE_VALID 1
14#define GPTE_COHERENT 2
15
16/* Aperture control register bits. */
17#define GARTEN (1<<0)
18#define DISGARTCPU (1<<4)
19#define DISGARTIO (1<<5)
20
21/* GART cache control register bits. */
22#define INVGART (1<<0)
23#define GARTPTEERR (1<<1)
24
25/* K8 On-cpu GART registers */
26#define AMD64_GARTAPERTURECTL 0x90
27#define AMD64_GARTAPERTUREBASE 0x94
28#define AMD64_GARTTABLEBASE 0x98
29#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0)
31
32extern int agp_amd64_init(void);
33
34static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
35{
36 u32 tmp, ctl;
37
38 /* address of the mappings table */
39 addr >>= 12;
40 tmp = (u32) addr<<4;
41 tmp &= ~0xf;
42 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
43
44 /* Enable GART translation for this hammer. */
45 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
46 ctl |= GARTEN;
47 ctl &= ~(DISGARTCPU | DISGARTIO);
48 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
49}
50
51static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
52{
53 if (!aper_base)
54 return 0;
55
56 if (aper_base + aper_size > 0x100000000ULL) {
57 printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
58 return 0;
59 }
60 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
61 printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
62 return 0;
63 }
64 if (aper_size < min_size) {
65		printk(KERN_INFO "Aperture too small (%d MB), minimum %d MB\n",
66 aper_size>>20, min_size>>20);
67 return 0;
68 }
69
70 return 1;
71}
72
73#endif /* _ASM_X86_GART_H */
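enable_gart_translation() above encodes the 4 KB-aligned physical address of the GART table into AMD64_GARTTABLEBASE by dropping the page offset and placing address bits 39:12 into register bits 31:4. A sketch of just that encoding:

/* Compute the AMD64_GARTTABLEBASE register value for a table address. */
#include <stdio.h>

static unsigned int gart_table_base_reg(unsigned long long addr)
{
        addr >>= 12;                    /* drop the 4K page offset */
        return ((unsigned int)addr << 4) & ~0xfu;
}

int main(void)
{
        printf("%#x\n", gart_table_base_reg(0x12345000ULL));
        return 0;
}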
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 00000000000..d48bee663a6
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "genapic_32.h"
3#else
4# include "genapic_64.h"
5#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
new file mode 100644
index 00000000000..5cbd4fcc06f
--- /dev/null
+++ b/arch/x86/include/asm/genapic_32.h
@@ -0,0 +1,126 @@
1#ifndef _ASM_X86_GENAPIC_32_H
2#define _ASM_X86_GENAPIC_32_H
3
4#include <asm/mpspec.h>
5
6/*
7 * Generic APIC driver interface.
8 *
9 * A straightforward mapping of the APIC-related parts of the
10 * x86 subarchitecture interface to a dynamic object.
11 *
12 * This is used by the "generic" x86 subarchitecture.
13 *
14 * Copyright 2003 Andi Kleen, SuSE Labs.
15 */
16
17struct mpc_config_bus;
18struct mp_config_table;
19struct mpc_config_processor;
20
21struct genapic {
22 char *name;
23 int (*probe)(void);
24
25 int (*apic_id_registered)(void);
26 cpumask_t (*target_cpus)(void);
27 int int_delivery_mode;
28 int int_dest_mode;
29 int ESR_DISABLE;
30 int apic_destination_logical;
31 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
32 unsigned long (*check_apicid_present)(int apicid);
33 int no_balance_irq;
34 int no_ioapic_check;
35 void (*init_apic_ldr)(void);
36 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
37
38 void (*setup_apic_routing)(void);
39 int (*multi_timer_check)(int apic, int irq);
40 int (*apicid_to_node)(int logical_apicid);
41 int (*cpu_to_logical_apicid)(int cpu);
42 int (*cpu_present_to_apicid)(int mps_cpu);
43 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
44 void (*setup_portio_remap)(void);
45 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
46 void (*enable_apic_mode)(void);
47 u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
48
49 /* mpparse */
50	/* When one of the next two hooks returns 1, the genapic
51	   is switched to this one. Essentially they are additional
52	   probe functions. */
53 int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
54 char *productid);
55 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
56
57 unsigned (*get_apic_id)(unsigned long x);
58 unsigned long apic_id_mask;
59 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
60 cpumask_t (*vector_allocation_domain)(int cpu);
61
62#ifdef CONFIG_SMP
63 /* ipi */
64 void (*send_IPI_mask)(cpumask_t mask, int vector);
65 void (*send_IPI_allbutself)(int vector);
66 void (*send_IPI_all)(int vector);
67#endif
68};
69
70#define APICFUNC(x) .x = x,
71
72/* More functions could probably be marked IPIFUNC and save some space
73 in UP GENERICARCH kernels, but I don't have the nerve right now
74 to untangle this mess. -AK */
75#ifdef CONFIG_SMP
76#define IPIFUNC(x) APICFUNC(x)
77#else
78#define IPIFUNC(x)
79#endif
80
81#define APIC_INIT(aname, aprobe) \
82{ \
83 .name = aname, \
84 .probe = aprobe, \
85 .int_delivery_mode = INT_DELIVERY_MODE, \
86 .int_dest_mode = INT_DEST_MODE, \
87 .no_balance_irq = NO_BALANCE_IRQ, \
88 .ESR_DISABLE = esr_disable, \
89 .apic_destination_logical = APIC_DEST_LOGICAL, \
90 APICFUNC(apic_id_registered) \
91 APICFUNC(target_cpus) \
92 APICFUNC(check_apicid_used) \
93 APICFUNC(check_apicid_present) \
94 APICFUNC(init_apic_ldr) \
95 APICFUNC(ioapic_phys_id_map) \
96 APICFUNC(setup_apic_routing) \
97 APICFUNC(multi_timer_check) \
98 APICFUNC(apicid_to_node) \
99 APICFUNC(cpu_to_logical_apicid) \
100 APICFUNC(cpu_present_to_apicid) \
101 APICFUNC(apicid_to_cpu_present) \
102 APICFUNC(setup_portio_remap) \
103 APICFUNC(check_phys_apicid_present) \
104 APICFUNC(mps_oem_check) \
105 APICFUNC(get_apic_id) \
106 .apic_id_mask = APIC_ID_MASK, \
107 APICFUNC(cpu_mask_to_apicid) \
108 APICFUNC(vector_allocation_domain) \
109 APICFUNC(acpi_madt_oem_check) \
110 IPIFUNC(send_IPI_mask) \
111 IPIFUNC(send_IPI_allbutself) \
112 IPIFUNC(send_IPI_all) \
113 APICFUNC(enable_apic_mode) \
114 APICFUNC(phys_pkg_id) \
115}
116
117extern struct genapic *genapic;
118
119enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
120#define get_uv_system_type() UV_NONE
121#define is_uv_system() 0
122#define uv_wakeup_secondary(a, b) 1
123#define uv_system_init() do {} while (0)
124
125
126#endif /* _ASM_X86_GENAPIC_32_H */
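APIC_INIT() above leans on APICFUNC(x) expanding to the designated initializer ".x = x,", wiring each struct genapic member to a same-named function. A toy reduction of the trick:

/* Same-name designated-initializer pattern, reduced to a toy struct. */
#include <stdio.h>

struct ops {
        int (*probe)(void);
        int (*id)(void);
};

static int probe(void) { return 1; }
static int id(void)    { return 42; }

#define OPFUNC(x) .x = x,

static struct ops toy = {
        OPFUNC(probe)
        OPFUNC(id)
};

int main(void)
{
        printf("probe=%d id=%d\n", toy.probe(), toy.id());
        return 0;
}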
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
new file mode 100644
index 00000000000..13c4e96199e
--- /dev/null
+++ b/arch/x86/include/asm/genapic_64.h
@@ -0,0 +1,58 @@
1#ifndef _ASM_X86_GENAPIC_64_H
2#define _ASM_X86_GENAPIC_64_H
3
4/*
5 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2
7 *
8 * Generic APIC sub-arch data struct.
9 *
10 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
11 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
12 * James Cleverdon.
13 */
14
15struct genapic {
16 char *name;
17 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
18 u32 int_delivery_mode;
19 u32 int_dest_mode;
20 int (*apic_id_registered)(void);
21 cpumask_t (*target_cpus)(void);
22 cpumask_t (*vector_allocation_domain)(int cpu);
23 void (*init_apic_ldr)(void);
24 /* ipi */
25 void (*send_IPI_mask)(cpumask_t mask, int vector);
26 void (*send_IPI_allbutself)(int vector);
27 void (*send_IPI_all)(int vector);
28 void (*send_IPI_self)(int vector);
29 /* */
30 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
31 unsigned int (*phys_pkg_id)(int index_msb);
32 unsigned int (*get_apic_id)(unsigned long x);
33 unsigned long (*set_apic_id)(unsigned int id);
34 unsigned long apic_id_mask;
35};
36
37extern struct genapic *genapic;
38
39extern struct genapic apic_flat;
40extern struct genapic apic_physflat;
41extern struct genapic apic_x2apic_cluster;
42extern struct genapic apic_x2apic_phys;
43extern int acpi_madt_oem_check(char *, char *);
44
45extern void apic_send_IPI_self(int vector);
46enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
47extern enum uv_system_type get_uv_system_type(void);
48extern int is_uv_system(void);
49
50extern struct genapic apic_x2apic_uv_x;
51DECLARE_PER_CPU(int, x2apic_extra_bits);
52extern void uv_cpu_init(void);
53extern void uv_system_init(void);
54extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
55
56extern void setup_apic_routing(void);
57
58#endif /* _ASM_X86_GENAPIC_64_H */
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
new file mode 100644
index 00000000000..ad3c2ed7548
--- /dev/null
+++ b/arch/x86/include/asm/geode.h
@@ -0,0 +1,253 @@
1/*
2 * AMD Geode definitions
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License
7 * as published by the Free Software Foundation.
8 */
9
10#ifndef _ASM_X86_GEODE_H
11#define _ASM_X86_GEODE_H
12
13#include <asm/processor.h>
14#include <linux/io.h>
15
16/* Generic southbridge functions */
17
18#define GEODE_DEV_PMS 0
19#define GEODE_DEV_ACPI 1
20#define GEODE_DEV_GPIO 2
21#define GEODE_DEV_MFGPT 3
22
23extern int geode_get_dev_base(unsigned int dev);
24
25/* Useful macros */
26#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
27#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
28#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
29#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
30
31/* MSRS */
32
33#define MSR_GLIU_P2D_RO0 0x10000029
34
35#define MSR_LX_GLD_MSR_CONFIG 0x48002001
36#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
37 * sheet has the wrong value */
38#define MSR_GLCP_SYS_RSTPLL 0x4C000014
39#define MSR_GLCP_DOTPLL 0x4C000015
40
41#define MSR_LBAR_SMB 0x5140000B
42#define MSR_LBAR_GPIO 0x5140000C
43#define MSR_LBAR_MFGPT 0x5140000D
44#define MSR_LBAR_ACPI 0x5140000E
45#define MSR_LBAR_PMS 0x5140000F
46
47#define MSR_DIVIL_SOFT_RESET 0x51400017
48
49#define MSR_PIC_YSEL_LOW 0x51400020
50#define MSR_PIC_YSEL_HIGH 0x51400021
51#define MSR_PIC_ZSEL_LOW 0x51400022
52#define MSR_PIC_ZSEL_HIGH 0x51400023
53#define MSR_PIC_IRQM_LPC 0x51400025
54
55#define MSR_MFGPT_IRQ 0x51400028
56#define MSR_MFGPT_NR 0x51400029
57#define MSR_MFGPT_SETUP 0x5140002B
58
59#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
60
61#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
62#define MSR_GX_MSR_PADSEL 0xC0002011
63
64/* Resource Sizes */
65
66#define LBAR_GPIO_SIZE 0xFF
67#define LBAR_MFGPT_SIZE 0x40
68#define LBAR_ACPI_SIZE 0x40
69#define LBAR_PMS_SIZE 0x80
70
71/* ACPI registers (PMS block) */
72
73/*
74 * PM1_EN is only valid when VSA is enabled for 16 bit reads.
75 * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
76 * with a 32 bit read at offset 0x0
77 */
78
79#define PM1_STS 0x00
80#define PM1_EN 0x02
81#define PM1_CNT 0x08
82#define PM2_CNT 0x0C
83#define PM_TMR 0x10
84#define PM_GPE0_STS 0x18
85#define PM_GPE0_EN 0x1C
86
87/* PMC registers (PMS block) */
88
89#define PM_SSD 0x00
90#define PM_SCXA 0x04
91#define PM_SCYA 0x08
92#define PM_OUT_SLPCTL 0x0C
93#define PM_SCLK 0x10
94#define PM_SED 0x1
95#define PM_SCXD 0x18
96#define PM_SCYD 0x1C
97#define PM_IN_SLPCTL 0x20
98#define PM_WKD 0x30
99#define PM_WKXD 0x34
100#define PM_RD 0x38
101#define PM_WKXA 0x3C
102#define PM_FSD 0x40
103#define PM_TSD 0x44
104#define PM_PSD 0x48
105#define PM_NWKD 0x4C
106#define PM_AWKD 0x50
107#define PM_SSC 0x54
108
109/* VSA2 magic values */
110
111#define VSA_VRC_INDEX 0xAC1C
112#define VSA_VRC_DATA 0xAC1E
113#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
114#define VSA_VR_SIGNATURE 0x0003
115#define VSA_VR_MEM_SIZE 0x0200
116#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
117#define GSW_VSA_SIG 0x534d /* General Software signature */
118/* GPIO */
119
120#define GPIO_OUTPUT_VAL 0x00
121#define GPIO_OUTPUT_ENABLE 0x04
122#define GPIO_OUTPUT_OPEN_DRAIN 0x08
123#define GPIO_OUTPUT_INVERT 0x0C
124#define GPIO_OUTPUT_AUX1 0x10
125#define GPIO_OUTPUT_AUX2 0x14
126#define GPIO_PULL_UP 0x18
127#define GPIO_PULL_DOWN 0x1C
128#define GPIO_INPUT_ENABLE 0x20
129#define GPIO_INPUT_INVERT 0x24
130#define GPIO_INPUT_FILTER 0x28
131#define GPIO_INPUT_EVENT_COUNT 0x2C
132#define GPIO_READ_BACK 0x30
133#define GPIO_INPUT_AUX1 0x34
134#define GPIO_EVENTS_ENABLE 0x38
135#define GPIO_LOCK_ENABLE 0x3C
136#define GPIO_POSITIVE_EDGE_EN 0x40
137#define GPIO_NEGATIVE_EDGE_EN 0x44
138#define GPIO_POSITIVE_EDGE_STS 0x48
139#define GPIO_NEGATIVE_EDGE_STS 0x4C
140
141#define GPIO_MAP_X 0xE0
142#define GPIO_MAP_Y 0xE4
143#define GPIO_MAP_Z 0xE8
144#define GPIO_MAP_W 0xEC
145
146static inline u32 geode_gpio(unsigned int nr)
147{
148 BUG_ON(nr > 28);
149 return 1 << nr;
150}
151
152extern void geode_gpio_set(u32, unsigned int);
153extern void geode_gpio_clear(u32, unsigned int);
154extern int geode_gpio_isset(u32, unsigned int);
155extern void geode_gpio_setup_event(unsigned int, int, int);
156extern void geode_gpio_set_irq(unsigned int, unsigned int);
157
158static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
159{
160 geode_gpio_setup_event(gpio, pair, 0);
161}
162
163static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
164{
165 geode_gpio_setup_event(gpio, pair, 1);
166}
167
168/* Specific geode tests */
169
170static inline int is_geode_gx(void)
171{
172 return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) &&
173 (boot_cpu_data.x86 == 5) &&
174 (boot_cpu_data.x86_model == 5));
175}
176
177static inline int is_geode_lx(void)
178{
179 return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
180 (boot_cpu_data.x86 == 5) &&
181 (boot_cpu_data.x86_model == 10));
182}
183
184static inline int is_geode(void)
185{
186 return (is_geode_gx() || is_geode_lx());
187}
188
189#ifdef CONFIG_MGEODE_LX
190extern int geode_has_vsa2(void);
191#else
192static inline int geode_has_vsa2(void)
193{
194 return 0;
195}
196#endif
197
198/* MFGPTs */
199
200#define MFGPT_MAX_TIMERS 8
201#define MFGPT_TIMER_ANY (-1)
202
203#define MFGPT_DOMAIN_WORKING 1
204#define MFGPT_DOMAIN_STANDBY 2
205#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
206
207#define MFGPT_CMP1 0
208#define MFGPT_CMP2 1
209
210#define MFGPT_EVENT_IRQ 0
211#define MFGPT_EVENT_NMI 1
212#define MFGPT_EVENT_RESET 3
213
214#define MFGPT_REG_CMP1 0
215#define MFGPT_REG_CMP2 2
216#define MFGPT_REG_COUNTER 4
217#define MFGPT_REG_SETUP 6
218
219#define MFGPT_SETUP_CNTEN (1 << 15)
220#define MFGPT_SETUP_CMP2 (1 << 14)
221#define MFGPT_SETUP_CMP1 (1 << 13)
222#define MFGPT_SETUP_SETUP (1 << 12)
223#define MFGPT_SETUP_STOPEN (1 << 11)
224#define MFGPT_SETUP_EXTEN (1 << 10)
225#define MFGPT_SETUP_REVEN (1 << 5)
226#define MFGPT_SETUP_CLKSEL (1 << 4)
227
228static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
229{
230 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
231 outw(value, base + reg + (timer * 8));
232}
233
234static inline u16 geode_mfgpt_read(int timer, u16 reg)
235{
236 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
237 return inw(base + reg + (timer * 8));
238}
239
240extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
241extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
242extern int geode_mfgpt_alloc_timer(int timer, int domain);
243
244#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
245#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
246
247#ifdef CONFIG_GEODE_MFGPT_TIMER
248extern int __init mfgpt_timer_setup(void);
249#else
250static inline int mfgpt_timer_setup(void) { return 0; }
251#endif
252
253#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
new file mode 100644
index 00000000000..49dbfdfa50f
--- /dev/null
+++ b/arch/x86/include/asm/gpio.h
@@ -0,0 +1,56 @@
1/*
2 * Generic GPIO API implementation for x86.
3 *
4 * Derived from the generic GPIO API for powerpc:
5 *
6 * Copyright (c) 2007-2008 MontaVista Software, Inc.
7 *
8 * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 */
15
16#ifndef _ASM_X86_GPIO_H
17#define _ASM_X86_GPIO_H
18
19#include <asm-generic/gpio.h>
20
21#ifdef CONFIG_GPIOLIB
22
23/*
24 * Just call gpiolib.
25 */
26static inline int gpio_get_value(unsigned int gpio)
27{
28 return __gpio_get_value(gpio);
29}
30
31static inline void gpio_set_value(unsigned int gpio, int value)
32{
33 __gpio_set_value(gpio, value);
34}
35
36static inline int gpio_cansleep(unsigned int gpio)
37{
38 return __gpio_cansleep(gpio);
39}
40
41/*
42 * Not implemented, yet.
43 */
44static inline int gpio_to_irq(unsigned int gpio)
45{
46 return -ENOSYS;
47}
48
49static inline int irq_to_gpio(unsigned int irq)
50{
51 return -EINVAL;
52}
53
54#endif /* CONFIG_GPIOLIB */
55
56#endif /* _ASM_X86_GPIO_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
new file mode 100644
index 00000000000..000787df66e
--- /dev/null
+++ b/arch/x86/include/asm/hardirq.h
@@ -0,0 +1,11 @@
1#ifdef CONFIG_X86_32
2# include "hardirq_32.h"
3#else
4# include "hardirq_64.h"
5#endif
6
7extern u64 arch_irq_stat_cpu(unsigned int cpu);
8#define arch_irq_stat_cpu arch_irq_stat_cpu
9
10extern u64 arch_irq_stat(void);
11#define arch_irq_stat arch_irq_stat
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
new file mode 100644
index 00000000000..5ca135e72f2
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_HARDIRQ_32_H
2#define _ASM_X86_HARDIRQ_32_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int irq0_irqs;
13 unsigned int irq_resched_count;
14 unsigned int irq_call_count;
15 unsigned int irq_tlb_count;
16 unsigned int irq_thermal_count;
17 unsigned int irq_spurious_count;
18} ____cacheline_aligned irq_cpustat_t;
19
20DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
21
22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24
25void ack_bad_irq(unsigned int irq);
26#include <linux/irq_cpustat.h>
27
28#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
new file mode 100644
index 00000000000..1ba381fc51d
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -0,0 +1,23 @@
1#ifndef _ASM_X86_HARDIRQ_64_H
2#define _ASM_X86_HARDIRQ_64_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6#include <asm/pda.h>
7#include <asm/apic.h>
8
9/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
10#define MAX_HARDIRQS_PER_CPU NR_VECTORS
11
12#define __ARCH_IRQ_STAT 1
13
14#define local_softirq_pending() read_pda(__softirq_pending)
15
16#define __ARCH_SET_SOFTIRQ_PENDING 1
17
18#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
19#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
20
21extern void ack_bad_irq(unsigned int irq);
22
23#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
new file mode 100644
index 00000000000..a3b3b7c3027
--- /dev/null
+++ b/arch/x86/include/asm/highmem.h
@@ -0,0 +1,82 @@
1/*
2 * highmem.h: virtual kernel memory mappings for high memory
3 *
4 * Used in CONFIG_HIGHMEM systems for memory pages which
5 * are not addressable by direct kernel virtual addresses.
6 *
7 * Copyright (C) 1999 Gerhard Wichert, Siemens AG
8 * Gerhard.Wichert@pdb.siemens.de
9 *
10 *
11 * Redesigned the x86 32-bit VM architecture to deal with
12 * up to 16 Terabyte physical memory. With current x86 CPUs
13 * we now support up to 64 Gigabytes physical RAM.
14 *
15 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
16 */
17
18#ifndef _ASM_X86_HIGHMEM_H
19#define _ASM_X86_HIGHMEM_H
20
21#ifdef __KERNEL__
22
23#include <linux/interrupt.h>
24#include <linux/threads.h>
25#include <asm/kmap_types.h>
26#include <asm/tlbflush.h>
27#include <asm/paravirt.h>
28
29/* declarations for highmem.c */
30extern unsigned long highstart_pfn, highend_pfn;
31
32extern pte_t *kmap_pte;
33extern pgprot_t kmap_prot;
34extern pte_t *pkmap_page_table;
35
36/*
37 * Right now we initialize only a single pte table. It can be extended
38 * easily, subsequent pte tables have to be allocated in one physical
39 * chunk of RAM.
40 */
41/*
42 * Ordering is:
43 *
44 * FIXADDR_TOP
45 * fixed_addresses
46 * FIXADDR_START
47 * temp fixed addresses
48 * FIXADDR_BOOT_START
49 * Persistent kmap area
50 * PKMAP_BASE
51 * VMALLOC_END
52 * Vmalloc area
53 * VMALLOC_START
54 * high_memory
55 */
56#define LAST_PKMAP_MASK (LAST_PKMAP-1)
57#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
58#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
59
60extern void *kmap_high(struct page *page);
61extern void kunmap_high(struct page *page);
62
63void *kmap(struct page *page);
64void kunmap(struct page *page);
65void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
66void *kmap_atomic(struct page *page, enum km_type type);
67void kunmap_atomic(void *kvaddr, enum km_type type);
68void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
69struct page *kmap_atomic_to_page(void *ptr);
70
71#ifndef CONFIG_PARAVIRT
72#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
73#endif
74
75#define flush_cache_kmaps() do { } while (0)
76
77extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
78 unsigned long end_pfn);
79
80#endif /* __KERNEL__ */
81
82#endif /* _ASM_X86_HIGHMEM_H */
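
A minimal sketch of the atomic-kmap interface declared above (assuming KM_USER0 from <asm/kmap_types.h>; the mapping must not be held across anything that can sleep):

static void copy_from_highpage(void *dst, struct page *page)
{
	/* map the page into this CPU's fixed KM_USER0 slot */
	void *src = kmap_atomic(page, KM_USER0);

	memcpy(dst, src, PAGE_SIZE);
	kunmap_atomic(src, KM_USER0);
}
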
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
new file mode 100644
index 00000000000..1c22cb05ad6
--- /dev/null
+++ b/arch/x86/include/asm/hpet.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_HPET_H
2#define _ASM_X86_HPET_H
3
4#include <linux/msi.h>
5
6#ifdef CONFIG_HPET_TIMER
7
8#define HPET_MMAP_SIZE 1024
9
10#define HPET_ID 0x000
11#define HPET_PERIOD 0x004
12#define HPET_CFG 0x010
13#define HPET_STATUS 0x020
14#define HPET_COUNTER 0x0f0
15
16#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
17#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
18#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
19
20#define HPET_T0_CFG 0x100
21#define HPET_T0_CMP 0x108
22#define HPET_T0_ROUTE 0x110
23#define HPET_T1_CFG 0x120
24#define HPET_T1_CMP 0x128
25#define HPET_T1_ROUTE 0x130
26#define HPET_T2_CFG 0x140
27#define HPET_T2_CMP 0x148
28#define HPET_T2_ROUTE 0x150
29
30#define HPET_ID_REV 0x000000ff
31#define HPET_ID_NUMBER 0x00001f00
32#define HPET_ID_64BIT 0x00002000
33#define HPET_ID_LEGSUP 0x00008000
34#define HPET_ID_VENDOR 0xffff0000
35#define HPET_ID_NUMBER_SHIFT 8
36#define HPET_ID_VENDOR_SHIFT 16
37
38#define HPET_ID_VENDOR_8086 0x8086
39
40#define HPET_CFG_ENABLE 0x001
41#define HPET_CFG_LEGACY 0x002
42#define HPET_LEGACY_8254 2
43#define HPET_LEGACY_RTC 8
44
45#define HPET_TN_LEVEL 0x0002
46#define HPET_TN_ENABLE 0x0004
47#define HPET_TN_PERIODIC 0x0008
48#define HPET_TN_PERIODIC_CAP 0x0010
49#define HPET_TN_64BIT_CAP 0x0020
50#define HPET_TN_SETVAL 0x0040
51#define HPET_TN_32BIT 0x0100
52#define HPET_TN_ROUTE 0x3e00
53#define HPET_TN_FSB 0x4000
54#define HPET_TN_FSB_CAP 0x8000
55#define HPET_TN_ROUTE_SHIFT 9
56
57/* Max HPET Period is 10^8 femto sec as in HPET spec */
58#define HPET_MAX_PERIOD 100000000UL
59/*
60 * Min HPET period is 10^5 femto sec just for safety. If it is less than this,
61 * then the 32-bit HPET counter wraps around in less than 0.5 sec.
62 */
63#define HPET_MIN_PERIOD 100000UL
64
65/* hpet memory map physical address */
66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address;
68extern int hpet_force_user;
69extern int is_hpet_enabled(void);
70extern int hpet_enable(void);
71extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a);
73extern void force_hpet_resume(void);
74
75extern void hpet_msi_unmask(unsigned int irq);
76extern void hpet_msi_mask(unsigned int irq);
77extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79
80#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq);
82#else
83static inline int arch_setup_hpet_msi(unsigned int irq)
84{
85 return -EINVAL;
86}
87#endif
88
89#ifdef CONFIG_HPET_EMULATE_RTC
90
91#include <linux/interrupt.h>
92
93typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
94extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
95extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
96extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
97 unsigned char sec);
98extern int hpet_set_periodic_freq(unsigned long freq);
99extern int hpet_rtc_dropped_irq(void);
100extern int hpet_rtc_timer_init(void);
101extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
102extern int hpet_register_irq_handler(rtc_irq_handler handler);
103extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
104
105#endif /* CONFIG_HPET_EMULATE_RTC */
106
107#else /* CONFIG_HPET_TIMER */
108
109static inline int hpet_enable(void) { return 0; }
110static inline int is_hpet_enabled(void) { return 0; }
111#define hpet_readl(a) 0
112
113#endif /* CONFIG_HPET_TIMER */
114#endif /* _ASM_X86_HPET_H */
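
The HPET_Tn_*() macros generalize the fixed T0/T1/T2 offsets above; a sketch (hypothetical helper) of locating and reading a timer's configuration register through hpet_readl():

static unsigned long read_hpet_timer_cfg(int n)
{
	/* e.g. HPET_Tn_CFG(2) == 0x100 + 0x20 * 2 == 0x140 == HPET_T2_CFG */
	return hpet_readl(HPET_Tn_CFG(n));
}
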
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 00000000000..439a9acc132
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
1#ifndef _ASM_X86_HUGETLB_H
2#define _ASM_X86_HUGETLB_H
3
4#include <asm/page.h>
5
6
7static inline int is_hugepage_only_range(struct mm_struct *mm,
8 unsigned long addr,
9 unsigned long len) {
10 return 0;
11}
12
13/*
14 * If the arch doesn't supply something else, assume that hugepage-
15 * size-aligned regions are ok without further preparation.
16 */
17static inline int prepare_hugepage_range(struct file *file,
18 unsigned long addr, unsigned long len)
19{
20 struct hstate *h = hstate_file(file);
21 if (len & ~huge_page_mask(h))
22 return -EINVAL;
23 if (addr & ~huge_page_mask(h))
24 return -EINVAL;
25 return 0;
26}
27
28static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
29}
30
31static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
32 unsigned long addr, unsigned long end,
33 unsigned long floor,
34 unsigned long ceiling)
35{
36 free_pgd_range(tlb, addr, end, floor, ceiling);
37}
38
39static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
40 pte_t *ptep, pte_t pte)
41{
42 set_pte_at(mm, addr, ptep, pte);
43}
44
45static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
46 unsigned long addr, pte_t *ptep)
47{
48 return ptep_get_and_clear(mm, addr, ptep);
49}
50
51static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
52 unsigned long addr, pte_t *ptep)
53{
54}
55
56static inline int huge_pte_none(pte_t pte)
57{
58 return pte_none(pte);
59}
60
61static inline pte_t huge_pte_wrprotect(pte_t pte)
62{
63 return pte_wrprotect(pte);
64}
65
66static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
67 unsigned long addr, pte_t *ptep)
68{
69 ptep_set_wrprotect(mm, addr, ptep);
70}
71
72static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
73 unsigned long addr, pte_t *ptep,
74 pte_t pte, int dirty)
75{
76 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
77}
78
79static inline pte_t huge_ptep_get(pte_t *ptep)
80{
81 return *ptep;
82}
83
84static inline int arch_prepare_hugepage(struct page *page)
85{
86 return 0;
87}
88
89static inline void arch_release_hugepage(struct page *page)
90{
91}
92
93#endif /* _ASM_X86_HUGETLB_H */
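
The checks in prepare_hugepage_range() are pure mask arithmetic: for a 2 MB huge page, huge_page_mask(h) is ~(2 MB - 1), so addr & ~huge_page_mask(h) is non-zero exactly when addr is not 2 MB aligned. A hypothetical helper spelling that out:

static int hugepage_region_aligned(struct hstate *h, unsigned long addr,
				   unsigned long len)
{
	/* any low bits under ~huge_page_mask(h) mean misalignment */
	return !(addr & ~huge_page_mask(h)) && !(len & ~huge_page_mask(h));
}
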
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
new file mode 100644
index 00000000000..b97aecb0b61
--- /dev/null
+++ b/arch/x86/include/asm/hw_irq.h
@@ -0,0 +1,131 @@
1#ifndef _ASM_X86_HW_IRQ_H
2#define _ASM_X86_HW_IRQ_H
3
4/*
5 * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
6 *
7 * moved some of the old arch/i386/kernel/irq.h to here. VY
8 *
9 * IRQ/IPI changes taken from work by Thomas Radke
10 * <tomsoft@informatik.tu-chemnitz.de>
11 *
12 * hacked by Andi Kleen for x86-64.
13 * unified by tglx
14 */
15
16#include <asm/irq_vectors.h>
17
18#ifndef __ASSEMBLY__
19
20#include <linux/percpu.h>
21#include <linux/profile.h>
22#include <linux/smp.h>
23
24#include <asm/atomic.h>
25#include <asm/irq.h>
26#include <asm/sections.h>
27
28#define platform_legacy_irq(irq) ((irq) < 16)
29
30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void);
33extern void spurious_interrupt(void);
34extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void);
36
37extern void invalidate_interrupt(void);
38extern void invalidate_interrupt0(void);
39extern void invalidate_interrupt1(void);
40extern void invalidate_interrupt2(void);
41extern void invalidate_interrupt3(void);
42extern void invalidate_interrupt4(void);
43extern void invalidate_interrupt5(void);
44extern void invalidate_interrupt6(void);
45extern void invalidate_interrupt7(void);
46
47extern void irq_move_cleanup_interrupt(void);
48extern void threshold_interrupt(void);
49
50extern void call_function_interrupt(void);
51extern void call_function_single_interrupt(void);
52
53/* PIC specific functions */
54extern void disable_8259A_irq(unsigned int irq);
55extern void enable_8259A_irq(unsigned int irq);
56extern int i8259A_irq_pending(unsigned int irq);
57extern void make_8259A_irq(unsigned int irq);
58extern void init_8259A(int aeoi);
59
60/* IOAPIC */
61#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
62extern unsigned long io_apic_irqs;
63
64extern void init_VISWS_APIC_irqs(void);
65extern void setup_IO_APIC(void);
66extern void disable_IO_APIC(void);
67extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
68extern void setup_ioapic_dest(void);
69
70#ifdef CONFIG_X86_64
71extern void enable_IO_APIC(void);
72#endif
73
74/* IPI functions */
75#ifdef CONFIG_X86_32
76extern void send_IPI_self(int vector);
77#endif
78extern void send_IPI(int dest, int vector);
79
80/* Statistics */
81extern atomic_t irq_err_count;
82extern atomic_t irq_mis_count;
83
84/* EISA */
85extern void eisa_set_level_irq(unsigned int irq);
86
87/* Voyager functions */
88extern asmlinkage void vic_cpi_interrupt(void);
89extern asmlinkage void vic_sys_interrupt(void);
90extern asmlinkage void vic_cmn_interrupt(void);
91extern asmlinkage void qic_timer_interrupt(void);
92extern asmlinkage void qic_invalidate_interrupt(void);
93extern asmlinkage void qic_reschedule_interrupt(void);
94extern asmlinkage void qic_enable_irq_interrupt(void);
95extern asmlinkage void qic_call_function_interrupt(void);
96
97/* SMP */
98extern void smp_apic_timer_interrupt(struct pt_regs *);
99extern void smp_spurious_interrupt(struct pt_regs *);
100extern void smp_error_interrupt(struct pt_regs *);
101#ifdef CONFIG_X86_SMP
102extern void smp_reschedule_interrupt(struct pt_regs *);
103extern void smp_call_function_interrupt(struct pt_regs *);
104extern void smp_call_function_single_interrupt(struct pt_regs *);
105#ifdef CONFIG_X86_32
106extern void smp_invalidate_interrupt(struct pt_regs *);
107#else
108extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
109#endif
110#endif
111
112#ifdef CONFIG_X86_32
113extern void (*const interrupt[NR_VECTORS])(void);
114#endif
115
116typedef int vector_irq_t[NR_VECTORS];
117DECLARE_PER_CPU(vector_irq_t, vector_irq);
118
119#ifdef CONFIG_X86_IO_APIC
120extern void lock_vector_lock(void);
121extern void unlock_vector_lock(void);
122extern void __setup_vector_irq(int cpu);
123#else
124static inline void lock_vector_lock(void) {}
125static inline void unlock_vector_lock(void) {}
126static inline void __setup_vector_irq(int cpu) {}
127#endif
128
129#endif /* !__ASSEMBLY__ */
130
131#endif /* _ASM_X86_HW_IRQ_H */
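
vector_irq above is a per-CPU table translating hardware vectors back to Linux irq numbers; a sketch (hypothetical helper, assuming unassigned slots hold -1):

static int vector_to_irq_on_this_cpu(int vector)
{
	return __get_cpu_var(vector_irq)[vector];	/* -1 if unassigned */
}
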
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
new file mode 100644
index 00000000000..334b1a885d9
--- /dev/null
+++ b/arch/x86/include/asm/hypertransport.h
@@ -0,0 +1,45 @@
1#ifndef _ASM_X86_HYPERTRANSPORT_H
2#define _ASM_X86_HYPERTRANSPORT_H
3
4/*
5 * Constants for x86 Hypertransport Interrupts.
6 */
7
8#define HT_IRQ_LOW_BASE 0xf8000000
9
10#define HT_IRQ_LOW_VECTOR_SHIFT 16
11#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
12#define HT_IRQ_LOW_VECTOR(v) \
13 (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
14
15#define HT_IRQ_LOW_DEST_ID_SHIFT 8
16#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
17#define HT_IRQ_LOW_DEST_ID(v) \
18 (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
19
20#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
21#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
22
23#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
24#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
25
26
27#define HT_IRQ_LOW_MT_FIXED 0x0000000
28#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
29#define HT_IRQ_LOW_MT_SMI 0x0000008
30#define HT_IRQ_LOW_MT_NMI 0x000000c
31#define HT_IRQ_LOW_MT_INIT 0x0000010
32#define HT_IRQ_LOW_MT_STARTUP 0x0000014
33#define HT_IRQ_LOW_MT_EXTINT 0x0000018
34#define HT_IRQ_LOW_MT_LINT1 0x000008c
35#define HT_IRQ_LOW_MT_LINT0 0x0000098
36
37#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
38
39
40#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
41#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
42#define HT_IRQ_HIGH_DEST_ID(v) \
43 ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
44
45#endif /* _ASM_X86_HYPERTRANSPORT_H */
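
The masks above compose the low 32-bit word of a HyperTransport interrupt message; a sketch (hypothetical helper, assuming u32 from <linux/types.h>) assembling one for fixed, edge-triggered, physical delivery:

static u32 ht_irq_low_word(unsigned int vector, unsigned int dest)
{
	return HT_IRQ_LOW_BASE |
	       HT_IRQ_LOW_VECTOR(vector) |	/* bits 23:16 */
	       HT_IRQ_LOW_DEST_ID(dest) |	/* bits 15:8  */
	       HT_IRQ_LOW_DM_PHYSICAL |
	       HT_IRQ_LOW_RQEOI_EDGE |
	       HT_IRQ_LOW_MT_FIXED;
}
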
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
new file mode 100644
index 00000000000..48f0004db8c
--- /dev/null
+++ b/arch/x86/include/asm/i387.h
@@ -0,0 +1,400 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 * x86-64 work by Andi Kleen 2002
8 */
9
10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H
12
13#include <linux/sched.h>
14#include <linux/kernel_stat.h>
15#include <linux/regset.h>
16#include <linux/hardirq.h>
17#include <asm/asm.h>
18#include <asm/processor.h>
19#include <asm/sigcontext.h>
20#include <asm/user.h>
21#include <asm/uaccess.h>
22#include <asm/xsave.h>
23
24extern unsigned int sig_xstate_size;
25extern void fpu_init(void);
26extern void mxcsr_feature_mask_init(void);
27extern int init_fpu(struct task_struct *child);
28extern asmlinkage void math_state_restore(void);
29extern void init_thread_xstate(void);
30extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
31
32extern user_regset_active_fn fpregs_active, xfpregs_active;
33extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
34extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
35
36extern struct _fpx_sw_bytes fx_sw_reserved;
37#ifdef CONFIG_IA32_EMULATION
38extern unsigned int sig_xstate_ia32_size;
39extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
40struct _fpstate_ia32;
41struct _xstate_ia32;
42extern int save_i387_xstate_ia32(void __user *buf);
43extern int restore_i387_xstate_ia32(void __user *buf);
44#endif
45
46#define X87_FSW_ES (1 << 7) /* Exception Summary */
47
48#ifdef CONFIG_X86_64
49
50/* Ignore delayed exceptions from user space */
51static inline void tolerant_fwait(void)
52{
53 asm volatile("1: fwait\n"
54 "2:\n"
55 _ASM_EXTABLE(1b, 2b));
56}
57
58static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
59{
60 int err;
61
62 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
63 "2:\n"
64 ".section .fixup,\"ax\"\n"
65 "3: movl $-1,%[err]\n"
66 " jmp 2b\n"
67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
74#endif
75 return err;
76}
77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed
88   values. The kernel data segment can be sometimes 0 and sometimes the
89   new user value; both should be ok.
90   Use the PDA as the safe address because it should already be in L1. */
91static inline void clear_fpu_state(struct task_struct *tsk)
92{
93 struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
94 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
95
96 /*
97 * xsave header may indicate the init state of the FP.
98 */
99 if ((task_thread_info(tsk)->status & TS_XSAVE) &&
100 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
101 return;
102
103 if (unlikely(fx->swd & X87_FSW_ES))
104 asm volatile("fnclex");
105 alternative_input(ASM_NOP8 ASM_NOP2,
106 " emms\n" /* clear stack tags */
107 " fildl %%gs:0", /* load to clear state */
108 X86_FEATURE_FXSAVE_LEAK);
109}
110
111static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
112{
113 int err;
114
115 asm volatile("1: rex64/fxsave (%[fx])\n\t"
116 "2:\n"
117 ".section .fixup,\"ax\"\n"
118 "3: movl $-1,%[err]\n"
119 " jmp 2b\n"
120 ".previous\n"
121 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */
124 : [fx] "r" (fx), "0" (0));
125#else
126 : [fx] "cdaSDb" (fx), "0" (0));
127#endif
128 if (unlikely(err) &&
129 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
130 err = -EFAULT;
131 /* No need to clear here because the caller clears USED_MATH */
132 return err;
133}
134
135static inline void fxsave(struct task_struct *tsk)
136{
137 /* Using "rex64; fxsave %0" is broken because, if the memory operand
138 uses any extended registers for addressing, a second REX prefix
139 will be generated (to the assembler, rex64 followed by semicolon
140 is a separate instruction), and hence the 64-bitness is lost. */
141#if 0
142 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
143 starting with gas 2.16. */
144 __asm__ __volatile__("fxsaveq %0"
145 : "=m" (tsk->thread.xstate->fxsave));
146#elif 0
147 /* Using, as a workaround, the properly prefixed form below isn't
148 accepted by any binutils version so far released, complaining that
149 the same type of prefix is used twice if an extended register is
150 needed for addressing (fix submitted to mainline 2005-11-21). */
151 __asm__ __volatile__("rex64/fxsave %0"
152 : "=m" (tsk->thread.xstate->fxsave));
153#else
154 /* This, however, we can work around by forcing the compiler to select
155 an addressing mode that doesn't require extended registers. */
156 __asm__ __volatile__("rex64/fxsave (%1)"
157 : "=m" (tsk->thread.xstate->fxsave)
158 : "cdaSDb" (&tsk->thread.xstate->fxsave));
159#endif
160}
161
162static inline void __save_init_fpu(struct task_struct *tsk)
163{
164 if (task_thread_info(tsk)->status & TS_XSAVE)
165 xsave(tsk);
166 else
167 fxsave(tsk);
168
169 clear_fpu_state(tsk);
170 task_thread_info(tsk)->status &= ~TS_USEDFPU;
171}
172
173#else /* CONFIG_X86_32 */
174
175extern void finit(void);
176
177static inline void tolerant_fwait(void)
178{
179 asm volatile("fnclex ; fwait");
180}
181
182static inline void restore_fpu(struct task_struct *tsk)
183{
184 if (task_thread_info(tsk)->status & TS_XSAVE) {
185 xrstor_checking(&tsk->thread.xstate->xsave);
186 return;
187 }
188 /*
189 * The "nop" is needed to make the instructions the same
190 * length.
191 */
192 alternative_input(
193 "nop ; frstor %1",
194 "fxrstor %1",
195 X86_FEATURE_FXSR,
196 "m" (tsk->thread.xstate->fxsave));
197}
198
199/* We need a safe address that is cheap to find and that is already
200 in L1 during context switch. The best choices are unfortunately
201 different for UP and SMP */
202#ifdef CONFIG_SMP
203#define safe_address (__per_cpu_offset[0])
204#else
205#define safe_address (kstat_cpu(0).cpustat.user)
206#endif
207
208/*
209 * These must be called with preempt disabled
210 */
211static inline void __save_init_fpu(struct task_struct *tsk)
212{
213 if (task_thread_info(tsk)->status & TS_XSAVE) {
214 struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
215 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
216
217 xsave(tsk);
218
219 /*
220 * xsave header may indicate the init state of the FP.
221 */
222 if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
223 goto end;
224
225 if (unlikely(fx->swd & X87_FSW_ES))
226 asm volatile("fnclex");
227
228 /*
229 * we can do a simple return here or be paranoid :)
230 */
231 goto clear_state;
232 }
233
234	/* Use more nops than strictly needed in case the compiler
235	   varies the generated code */
236 alternative_input(
237 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
238 "fxsave %[fx]\n"
239 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
240 X86_FEATURE_FXSR,
241 [fx] "m" (tsk->thread.xstate->fxsave),
242 [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
243clear_state:
244 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
245 is pending. Clear the x87 state here by setting it to fixed
246 values. safe_address is a random variable that should be in L1 */
247 alternative_input(
248 GENERIC_NOP8 GENERIC_NOP2,
249 "emms\n\t" /* clear stack tags */
250 "fildl %[addr]", /* set F?P to defined value */
251 X86_FEATURE_FXSAVE_LEAK,
252 [addr] "m" (safe_address));
253end:
254 task_thread_info(tsk)->status &= ~TS_USEDFPU;
255}
256
257#endif /* CONFIG_X86_64 */
258
259/*
260 * Signal frame handlers...
261 */
262extern int save_i387_xstate(void __user *buf);
263extern int restore_i387_xstate(void __user *buf);
264
265static inline void __unlazy_fpu(struct task_struct *tsk)
266{
267 if (task_thread_info(tsk)->status & TS_USEDFPU) {
268 __save_init_fpu(tsk);
269 stts();
270 } else
271 tsk->fpu_counter = 0;
272}
273
274static inline void __clear_fpu(struct task_struct *tsk)
275{
276 if (task_thread_info(tsk)->status & TS_USEDFPU) {
277 tolerant_fwait();
278 task_thread_info(tsk)->status &= ~TS_USEDFPU;
279 stts();
280 }
281}
282
283static inline void kernel_fpu_begin(void)
284{
285 struct thread_info *me = current_thread_info();
286 preempt_disable();
287 if (me->status & TS_USEDFPU)
288 __save_init_fpu(me->task);
289 else
290 clts();
291}
292
293static inline void kernel_fpu_end(void)
294{
295 stts();
296 preempt_enable();
297}
298
299/*
300 * Some instructions, like VIA's padlock instructions, generate a spurious
301 * DNA fault but don't modify SSE registers. These instructions also get
302 * used from interrupt context. To prevent such kernel instructions in
303 * interrupt context from interacting wrongly with other user/kernel FPU
304 * usage, we should use them only within irq_ts_save()/irq_ts_restore().
305 */
306static inline int irq_ts_save(void)
307{
308 /*
309	 * If we are in process context, we are ok to take a spurious DNA fault:
310	 * doing clts() in process context would require preemption to be
311	 * disabled or some heavy lifting like kernel_fpu_begin().
312 */
313 if (!in_interrupt())
314 return 0;
315
316 if (read_cr0() & X86_CR0_TS) {
317 clts();
318 return 1;
319 }
320
321 return 0;
322}
323
324static inline void irq_ts_restore(int TS_state)
325{
326 if (TS_state)
327 stts();
328}
329
330#ifdef CONFIG_X86_64
331
332static inline void save_init_fpu(struct task_struct *tsk)
333{
334 __save_init_fpu(tsk);
335 stts();
336}
337
338#define unlazy_fpu __unlazy_fpu
339#define clear_fpu __clear_fpu
340
341#else /* CONFIG_X86_32 */
342
343/*
344 * These disable preemption on their own and are safe
345 */
346static inline void save_init_fpu(struct task_struct *tsk)
347{
348 preempt_disable();
349 __save_init_fpu(tsk);
350 stts();
351 preempt_enable();
352}
353
354static inline void unlazy_fpu(struct task_struct *tsk)
355{
356 preempt_disable();
357 __unlazy_fpu(tsk);
358 preempt_enable();
359}
360
361static inline void clear_fpu(struct task_struct *tsk)
362{
363 preempt_disable();
364 __clear_fpu(tsk);
365 preempt_enable();
366}
367
368#endif /* CONFIG_X86_64 */
369
370/*
371 * i387 state interaction
372 */
373static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
374{
375 if (cpu_has_fxsr) {
376 return tsk->thread.xstate->fxsave.cwd;
377 } else {
378 return (unsigned short)tsk->thread.xstate->fsave.cwd;
379 }
380}
381
382static inline unsigned short get_fpu_swd(struct task_struct *tsk)
383{
384 if (cpu_has_fxsr) {
385 return tsk->thread.xstate->fxsave.swd;
386 } else {
387 return (unsigned short)tsk->thread.xstate->fsave.swd;
388 }
389}
390
391static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
392{
393 if (cpu_has_xmm) {
394 return tsk->thread.xstate->fxsave.mxcsr;
395 } else {
396 return MXCSR_DEFAULT;
397 }
398}
399
400#endif /* _ASM_X86_I387_H */
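
Tying the TS helpers above together, the intended calling pattern looks like the following sketch (the actual fault-prone instruction is elided):

static void padlock_style_section(void)
{
	int ts = irq_ts_save();	/* does clts() only when in irq context */

	/* ... instruction that may raise a spurious DNA fault ... */

	irq_ts_restore(ts);	/* re-sets CR0.TS iff we cleared it */
}
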
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
new file mode 100644
index 00000000000..1edbf89680f
--- /dev/null
+++ b/arch/x86/include/asm/i8253.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_I8253_H
2#define _ASM_X86_I8253_H
3
4/* i8253A PIT registers */
5#define PIT_MODE 0x43
6#define PIT_CH0 0x40
7#define PIT_CH2 0x42
8
9extern spinlock_t i8253_lock;
10
11extern struct clock_event_device *global_clock_event;
12
13extern void setup_pit_timer(void);
14
15#define inb_pit inb_p
16#define outb_pit outb_p
17
18#endif /* _ASM_X86_I8253_H */
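
A minimal sketch of driving the PIT through these definitions, reading back channel 0 with the latch command (assumes <linux/spinlock.h>):

static unsigned int pit_read_ch0(void)
{
	unsigned long flags;
	unsigned int count;

	spin_lock_irqsave(&i8253_lock, flags);
	outb_pit(0x00, PIT_MODE);	/* latch counter 0 */
	count = inb_pit(PIT_CH0);	/* low byte */
	count |= inb_pit(PIT_CH0) << 8;	/* high byte */
	spin_unlock_irqrestore(&i8253_lock, flags);

	return count;
}
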
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
new file mode 100644
index 00000000000..58d7091eeb1
--- /dev/null
+++ b/arch/x86/include/asm/i8259.h
@@ -0,0 +1,63 @@
1#ifndef _ASM_X86_I8259_H
2#define _ASM_X86_I8259_H
3
4#include <linux/delay.h>
5
6extern unsigned int cached_irq_mask;
7
8#define __byte(x, y) (((unsigned char *)&(y))[x])
9#define cached_master_mask (__byte(0, cached_irq_mask))
10#define cached_slave_mask (__byte(1, cached_irq_mask))
11
12/* i8259A PIC registers */
13#define PIC_MASTER_CMD 0x20
14#define PIC_MASTER_IMR 0x21
15#define PIC_MASTER_ISR PIC_MASTER_CMD
16#define PIC_MASTER_POLL PIC_MASTER_ISR
17#define PIC_MASTER_OCW3 PIC_MASTER_ISR
18#define PIC_SLAVE_CMD 0xa0
19#define PIC_SLAVE_IMR 0xa1
20
21/* i8259A PIC related value */
22#define PIC_CASCADE_IR 2
23#define MASTER_ICW4_DEFAULT 0x01
24#define SLAVE_ICW4_DEFAULT 0x01
25#define PIC_ICW4_AEOI 2
26
27extern spinlock_t i8259A_lock;
28
29extern void init_8259A(int auto_eoi);
30extern void enable_8259A_irq(unsigned int irq);
31extern void disable_8259A_irq(unsigned int irq);
32extern unsigned int startup_8259A_irq(unsigned int irq);
33
34/* the PIC may need a careful delay on some platforms, hence specific calls */
35static inline unsigned char inb_pic(unsigned int port)
36{
37 unsigned char value = inb(port);
38
39 /*
40	 * the delay for some accesses to the PIC on the motherboard or in the
41	 * chipset must be at least one microsecond, so be safe here:
42 */
43 udelay(2);
44
45 return value;
46}
47
48static inline void outb_pic(unsigned char value, unsigned int port)
49{
50 outb(value, port);
51 /*
52	 * the delay for some accesses to the PIC on the motherboard or in the
53	 * chipset must be at least one microsecond, so be safe here:
54 */
55 udelay(2);
56}
57
58extern struct irq_chip i8259A_chip;
59
60extern void mask_8259A(void);
61extern void unmask_8259A(void);
62
63#endif /* _ASM_X86_I8259_H */
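
A sketch (hypothetical helper) of how the cached mask bytes and the slow accessors above combine to mask a line on the master PIC:

static void mask_master_pic_irq(unsigned int irq)	/* irq 0..7 */
{
	unsigned long flags;

	spin_lock_irqsave(&i8259A_lock, flags);
	cached_master_mask |= 1 << irq;			/* update the cache... */
	outb_pic(cached_master_mask, PIC_MASTER_IMR);	/* ...then the hardware */
	spin_unlock_irqrestore(&i8259A_lock, flags);
}
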
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
new file mode 100644
index 00000000000..97989c0e534
--- /dev/null
+++ b/arch/x86/include/asm/ia32.h
@@ -0,0 +1,170 @@
1#ifndef _ASM_X86_IA32_H
2#define _ASM_X86_IA32_H
3
4
5#ifdef CONFIG_IA32_EMULATION
6
7#include <linux/compat.h>
8
9/*
10 * 32 bit structures for IA32 support.
11 */
12
13#include <asm/sigcontext32.h>
14
15/* signal.h */
16struct sigaction32 {
17 unsigned int sa_handler; /* Really a pointer, but need to deal
18 with 32 bits */
19 unsigned int sa_flags;
20 unsigned int sa_restorer; /* Another 32 bit pointer */
21 compat_sigset_t sa_mask; /* A 32 bit mask */
22};
23
24struct old_sigaction32 {
25 unsigned int sa_handler; /* Really a pointer, but need to deal
26 with 32 bits */
27 compat_old_sigset_t sa_mask; /* A 32 bit mask */
28 unsigned int sa_flags;
29 unsigned int sa_restorer; /* Another 32 bit pointer */
30};
31
32typedef struct sigaltstack_ia32 {
33 unsigned int ss_sp;
34 int ss_flags;
35 unsigned int ss_size;
36} stack_ia32_t;
37
38struct ucontext_ia32 {
39 unsigned int uc_flags;
40 unsigned int uc_link;
41 stack_ia32_t uc_stack;
42 struct sigcontext_ia32 uc_mcontext;
43 compat_sigset_t uc_sigmask; /* mask last for extensibility */
44};
45
46/* This matches struct stat64 in glibc2.2, hence the absolutely
47 * insane amounts of padding around dev_t's.
48 */
49struct stat64 {
50 unsigned long long st_dev;
51 unsigned char __pad0[4];
52
53#define STAT64_HAS_BROKEN_ST_INO 1
54 unsigned int __st_ino;
55
56 unsigned int st_mode;
57 unsigned int st_nlink;
58
59 unsigned int st_uid;
60 unsigned int st_gid;
61
62 unsigned long long st_rdev;
63 unsigned char __pad3[4];
64
65 long long st_size;
66 unsigned int st_blksize;
67
68	long long st_blocks;	/* Number of 512-byte blocks allocated */
69
70 unsigned st_atime;
71 unsigned st_atime_nsec;
72 unsigned st_mtime;
73 unsigned st_mtime_nsec;
74 unsigned st_ctime;
75 unsigned st_ctime_nsec;
76
77 unsigned long long st_ino;
78} __attribute__((packed));
79
80typedef struct compat_siginfo {
81 int si_signo;
82 int si_errno;
83 int si_code;
84
85 union {
86 int _pad[((128 / sizeof(int)) - 3)];
87
88 /* kill() */
89 struct {
90 unsigned int _pid; /* sender's pid */
91 unsigned int _uid; /* sender's uid */
92 } _kill;
93
94 /* POSIX.1b timers */
95 struct {
96 compat_timer_t _tid; /* timer id */
97 int _overrun; /* overrun count */
98 compat_sigval_t _sigval; /* same as below */
99 int _sys_private; /* not to be passed to user */
100 int _overrun_incr; /* amount to add to overrun */
101 } _timer;
102
103 /* POSIX.1b signals */
104 struct {
105 unsigned int _pid; /* sender's pid */
106 unsigned int _uid; /* sender's uid */
107 compat_sigval_t _sigval;
108 } _rt;
109
110 /* SIGCHLD */
111 struct {
112 unsigned int _pid; /* which child */
113 unsigned int _uid; /* sender's uid */
114 int _status; /* exit code */
115 compat_clock_t _utime;
116 compat_clock_t _stime;
117 } _sigchld;
118
119 /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
120 struct {
121 unsigned int _addr; /* faulting insn/memory ref. */
122 } _sigfault;
123
124 /* SIGPOLL */
125 struct {
126 int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
127 int _fd;
128 } _sigpoll;
129 } _sifields;
130} compat_siginfo_t;
131
132struct sigframe32 {
133 u32 pretcode;
134 int sig;
135 struct sigcontext_ia32 sc;
136 struct _fpstate_ia32 fpstate;
137 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
138};
139
140struct rt_sigframe32 {
141 u32 pretcode;
142 int sig;
143 u32 pinfo;
144 u32 puc;
145 compat_siginfo_t info;
146 struct ucontext_ia32 uc;
147 struct _fpstate_ia32 fpstate;
148};
149
150struct ustat32 {
151 __u32 f_tfree;
152 compat_ino_t f_tinode;
153 char f_fname[6];
154 char f_fpack[6];
155};
156
157#define IA32_STACK_TOP IA32_PAGE_OFFSET
158
159#ifdef __KERNEL__
160struct linux_binprm;
161extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
162 unsigned long stack_top, int exec_stack);
163struct mm_struct;
164extern void ia32_pick_mmap_layout(struct mm_struct *mm);
165
166#endif
167
168#endif /* CONFIG_IA32_EMULATION */
169
170#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
new file mode 100644
index 00000000000..976f6ecd2ce
--- /dev/null
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_IA32_UNISTD_H
2#define _ASM_X86_IA32_UNISTD_H
3
4/*
5 * This file contains the system call numbers of the ia32 port;
6 * this is for the kernel only.
7 * Only add syscalls here where some part of the kernel needs to know
8 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
9 */
10
11#define __NR_ia32_restart_syscall 0
12#define __NR_ia32_exit 1
13#define __NR_ia32_read 3
14#define __NR_ia32_write 4
15#define __NR_ia32_sigreturn 119
16#define __NR_ia32_rt_sigreturn 173
17
18#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
new file mode 100644
index 00000000000..44c89c3a23e
--- /dev/null
+++ b/arch/x86/include/asm/idle.h
@@ -0,0 +1,16 @@
1#ifndef _ASM_X86_IDLE_H
2#define _ASM_X86_IDLE_H
3
4#define IDLE_START 1
5#define IDLE_END 2
6
7struct notifier_block;
8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n);
10
11void enter_idle(void);
12void exit_idle(void);
13
14void c1e_remove_cpu(int cpu);
15
16#endif /* _ASM_X86_IDLE_H */
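
A registration sketch for the idle notifier chain declared above (hypothetical callback; assumes <linux/notifier.h>):

static int my_idle_notify(struct notifier_block *nb, unsigned long action,
			  void *data)
{
	if (action == IDLE_START) {
		/* the CPU is entering the idle loop */
	} else if (action == IDLE_END) {
		/* the CPU is leaving the idle loop */
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_notify,
};

/* in some init path: idle_notifier_register(&my_idle_nb); */
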
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
new file mode 100644
index 00000000000..fa0fd068bc2
--- /dev/null
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
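
union cpuid10_eax above decodes the eax word of CPUID leaf 0xA; a sketch using cpuid() from <asm/processor.h>:

static unsigned int arch_perfmon_num_counters(void)
{
	union cpuid10_eax eax;
	unsigned int ebx, ecx, edx;

	/* CPUID leaf 0xA reports architectural perfmon capabilities */
	cpuid(0xa, &eax.full, &ebx, &ecx, &edx);
	return eax.split.num_counters;
}
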
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
new file mode 100644
index 00000000000..5618a103f39
--- /dev/null
+++ b/arch/x86/include/asm/io.h
@@ -0,0 +1,91 @@
1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H
3
4#define ARCH_HAS_IOREMAP_WC
5
6#include <linux/compiler.h>
7
8#define build_mmio_read(name, size, type, reg, barrier) \
9static inline type name(const volatile void __iomem *addr) \
10{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
11:"m" (*(volatile type __force *)addr) barrier); return ret; }
12
13#define build_mmio_write(name, size, type, reg, barrier) \
14static inline void name(type val, volatile void __iomem *addr) \
15{ asm volatile("mov" size " %0,%1": :reg (val), \
16"m" (*(volatile type __force *)addr) barrier); }
17
18build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
19build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
20build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
21
22build_mmio_read(__readb, "b", unsigned char, "=q", )
23build_mmio_read(__readw, "w", unsigned short, "=r", )
24build_mmio_read(__readl, "l", unsigned int, "=r", )
25
26build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
27build_mmio_write(writew, "w", unsigned short, "r", :"memory")
28build_mmio_write(writel, "l", unsigned int, "r", :"memory")
29
30build_mmio_write(__writeb, "b", unsigned char, "q", )
31build_mmio_write(__writew, "w", unsigned short, "r", )
32build_mmio_write(__writel, "l", unsigned int, "r", )
33
34#define readb_relaxed(a) __readb(a)
35#define readw_relaxed(a) __readw(a)
36#define readl_relaxed(a) __readl(a)
37#define __raw_readb __readb
38#define __raw_readw __readw
39#define __raw_readl __readl
40
41#define __raw_writeb __writeb
42#define __raw_writew __writew
43#define __raw_writel __writel
44
45#define mmiowb() barrier()
46
47#ifdef CONFIG_X86_64
48build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
49build_mmio_read(__readq, "q", unsigned long, "=r", )
50build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
51build_mmio_write(__writeq, "q", unsigned long, "r", )
52
53#define readq_relaxed(a) __readq(a)
54#define __raw_readq __readq
55#define __raw_writeq writeq
56
57/* Let people know we have them */
58#define readq readq
59#define writeq writeq
60#endif
61
62extern int iommu_bio_merge;
63
64#ifdef CONFIG_X86_32
65# include "io_32.h"
66#else
67# include "io_64.h"
68#endif
69
70extern void *xlate_dev_mem_ptr(unsigned long phys);
71extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
72
73extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
74 unsigned long prot_val);
75extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
76
77/*
78 * early_ioremap() and early_iounmap() are for temporary early boot-time
79 * mappings, before the real ioremap() is functional.
80 * A boot-time mapping is currently limited to at most 16 pages.
81 */
82extern void early_ioremap_init(void);
83extern void early_ioremap_clear(void);
84extern void early_ioremap_reset(void);
85extern void *early_ioremap(unsigned long offset, unsigned long size);
86extern void *early_memremap(unsigned long offset, unsigned long size);
87extern void early_iounmap(void *addr, unsigned long size);
88extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
89
90
91#endif /* _ASM_X86_IO_H */
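
For reference, the readl instance generated by build_mmio_read(readl, "l", unsigned int, "=r", :"memory") above expands to roughly:

static inline unsigned int readl(const volatile void __iomem *addr)
{
	unsigned int ret;

	asm volatile("movl %1,%0"
		     : "=r" (ret)
		     : "m" (*(volatile unsigned int __force *)addr)
		     : "memory");
	return ret;
}
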
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
new file mode 100644
index 00000000000..d8e242e1b39
--- /dev/null
+++ b/arch/x86/include/asm/io_32.h
@@ -0,0 +1,284 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define IO_SPACE_LIMIT 0xffff
41
42#define XQUAD_PORTIO_BASE 0xfe400000
43#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
44
45#ifdef __KERNEL__
46
47#include <asm-generic/iomap.h>
48
49#include <linux/vmalloc.h>
50
51/*
52 * Convert a virtual cached pointer to an uncached pointer
53 */
54#define xlate_dev_kmem_ptr(p) p
55
56/**
57 * virt_to_phys - map virtual addresses to physical
58 * @address: address to remap
59 *
60 * The returned physical address is the physical (CPU) mapping for
61 * the memory address given. It is only valid to use this function on
62 * addresses directly mapped or allocated via kmalloc.
63 *
64 * This function does not give bus mappings for DMA transfers. In
65 * almost all conceivable cases a device driver should not be using
66 * this function
67 */
68
69static inline unsigned long virt_to_phys(volatile void *address)
70{
71 return __pa(address);
72}
73
74/**
75 * phys_to_virt - map physical address to virtual
76 * @address: address to remap
77 *
78 * The returned virtual address is a current CPU mapping for
79 * the memory address given. It is only valid to use this function on
80 * addresses that have a kernel mapping
81 *
82 * This function does not handle bus mappings for DMA transfers. In
83 * almost all conceivable cases a device driver should not be using
84 * this function
85 */
86
87static inline void *phys_to_virt(unsigned long address)
88{
89 return __va(address);
90}
91
92/*
93 * Change "struct page" to physical address.
94 */
95#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
96
97/**
98 * ioremap - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * If the area you are trying to map is a PCI BAR you should have a
109 * look at pci_iomap().
110 */
111extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
112extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
113extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
114 unsigned long prot_val);
115
116/*
117 * The default ioremap() behavior is non-cached:
118 */
119static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
120{
121 return ioremap_nocache(offset, size);
122}
123
124extern void iounmap(volatile void __iomem *addr);
125
126/*
127 * ISA I/O bus memory addresses are 1:1 with the physical address.
128 */
129#define isa_virt_to_bus virt_to_phys
130#define isa_page_to_bus page_to_phys
131#define isa_bus_to_virt phys_to_virt
132
133/*
134 * However PCI ones are not necessarily 1:1 and therefore these interfaces
135 * are forbidden in portable PCI drivers.
136 *
137 * Allow them on x86 for legacy drivers, though.
138 */
139#define virt_to_bus virt_to_phys
140#define bus_to_virt phys_to_virt
141
142static inline void
143memset_io(volatile void __iomem *addr, unsigned char val, int count)
144{
145 memset((void __force *)addr, val, count);
146}
147
148static inline void
149memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
150{
151 __memcpy(dst, (const void __force *)src, count);
152}
153
154static inline void
155memcpy_toio(volatile void __iomem *dst, const void *src, int count)
156{
157 __memcpy((void __force *)dst, src, count);
158}
159
160/*
161 * ISA space is 'always mapped' on a typical x86 system, no need to
162 * explicitly ioremap() it. The fact that the ISA IO space is mapped
163 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
164 * are physical addresses. The following constant pointer can be
165 * used as the IO-area pointer (it can be iounmapped as well, so the
166 * analogy with PCI is quite large):
167 */
168#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
169
170/*
171 * Cache management
172 *
173 * This is needed for two cases:
174 * 1. Out of order aware processors
175 * 2. Accidentally out of order processors (PPro errata #51)
176 */
177
178#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
179
180static inline void flush_write_buffers(void)
181{
182 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
183}
184
185#else
186
187#define flush_write_buffers() do { } while (0)
188
189#endif
190
191#endif /* __KERNEL__ */
192
193extern void native_io_delay(void);
194
195extern int io_delay_type;
196extern void io_delay_init(void);
197
198#if defined(CONFIG_PARAVIRT)
199#include <asm/paravirt.h>
200#else
201
202static inline void slow_down_io(void)
203{
204 native_io_delay();
205#ifdef REALLY_SLOW_IO
206 native_io_delay();
207 native_io_delay();
208 native_io_delay();
209#endif
210}
211
212#endif
213
214#define __BUILDIO(bwl, bw, type) \
215static inline void out##bwl(unsigned type value, int port) \
216{ \
217 out##bwl##_local(value, port); \
218} \
219 \
220static inline unsigned type in##bwl(int port) \
221{ \
222 return in##bwl##_local(port); \
223}
224
225#define BUILDIO(bwl, bw, type) \
226static inline void out##bwl##_local(unsigned type value, int port) \
227{ \
228 asm volatile("out" #bwl " %" #bw "0, %w1" \
229 : : "a"(value), "Nd"(port)); \
230} \
231 \
232static inline unsigned type in##bwl##_local(int port) \
233{ \
234 unsigned type value; \
235 asm volatile("in" #bwl " %w1, %" #bw "0" \
236 : "=a"(value) : "Nd"(port)); \
237 return value; \
238} \
239 \
240static inline void out##bwl##_local_p(unsigned type value, int port) \
241{ \
242 out##bwl##_local(value, port); \
243 slow_down_io(); \
244} \
245 \
246static inline unsigned type in##bwl##_local_p(int port) \
247{ \
248 unsigned type value = in##bwl##_local(port); \
249 slow_down_io(); \
250 return value; \
251} \
252 \
253__BUILDIO(bwl, bw, type) \
254 \
255static inline void out##bwl##_p(unsigned type value, int port) \
256{ \
257 out##bwl(value, port); \
258 slow_down_io(); \
259} \
260 \
261static inline unsigned type in##bwl##_p(int port) \
262{ \
263 unsigned type value = in##bwl(port); \
264 slow_down_io(); \
265 return value; \
266} \
267 \
268static inline void outs##bwl(int port, const void *addr, unsigned long count) \
269{ \
270 asm volatile("rep; outs" #bwl \
271 : "+S"(addr), "+c"(count) : "d"(port)); \
272} \
273 \
274static inline void ins##bwl(int port, void *addr, unsigned long count) \
275{ \
276 asm volatile("rep; ins" #bwl \
277 : "+D"(addr), "+c"(count) : "d"(port)); \
278}
279
280BUILDIO(b, b, char)
281BUILDIO(w, w, short)
282BUILDIO(l, , int)
283
284#endif /* _ASM_X86_IO_32_H */
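
A usage sketch of the port-I/O family BUILDIO generates (hypothetical example reading a CMOS/RTC register via the classic 0x70/0x71 port pair; the _p variants insert slow_down_io()):

static unsigned char read_cmos(unsigned char reg)
{
	outb_p(reg, 0x70);	/* select the CMOS register */
	return inb_p(0x71);	/* read its value */
}
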
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
new file mode 100644
index 00000000000..fea325a1122
--- /dev/null
+++ b/arch/x86/include/asm/io_64.h
@@ -0,0 +1,244 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#define IO_SPACE_LIMIT 0xffff
140
141#if defined(__KERNEL__) && defined(__x86_64__)
142
143#include <linux/vmalloc.h>
144
145#ifndef __i386__
146/*
147 * Change virtual addresses to physical addresses and vv.
148 * These are pretty trivial
149 */
150static inline unsigned long virt_to_phys(volatile void *address)
151{
152 return __pa(address);
153}
154
155static inline void *phys_to_virt(unsigned long address)
156{
157 return __va(address);
158}
159#endif
160
161/*
162 * Change "struct page" to physical address.
163 */
164#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
165
166#include <asm-generic/iomap.h>
167
168/*
169 * This one maps high address device memory and turns off caching for that area.
170 * It's useful if some control registers are in such an area and write combining
171 * or read caching is not desirable:
172 */
173extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
174extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
175extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
176 unsigned long prot_val);
177
178/*
179 * The default ioremap() behavior is non-cached:
180 */
181static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
182{
183 return ioremap_nocache(offset, size);
184}
185
186extern void iounmap(volatile void __iomem *addr);
187
188extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
189
190/*
191 * ISA I/O bus memory addresses are 1:1 with the physical address.
192 */
193#define isa_virt_to_bus virt_to_phys
194#define isa_page_to_bus page_to_phys
195#define isa_bus_to_virt phys_to_virt
196
197/*
198 * However PCI ones are not necessarily 1:1 and therefore these interfaces
199 * are forbidden in portable PCI drivers.
200 *
201 * Allow them on x86 for legacy drivers, though.
202 */
203#define virt_to_bus virt_to_phys
204#define bus_to_virt phys_to_virt
205
206void __memcpy_fromio(void *, unsigned long, unsigned);
207void __memcpy_toio(unsigned long, const void *, unsigned);
208
209static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
210 unsigned len)
211{
212 __memcpy_fromio(to, (unsigned long)from, len);
213}
214
215static inline void memcpy_toio(volatile void __iomem *to, const void *from,
216 unsigned len)
217{
218 __memcpy_toio((unsigned long)to, from, len);
219}
220
221void memset_io(volatile void __iomem *a, int b, size_t c);
222
223/*
224 * ISA space is 'always mapped' on a typical x86 system, no need to
225 * explicitly ioremap() it. The fact that the ISA IO space is mapped
226 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
227 * are physical addresses. The following constant pointer can be
228 * used as the IO-area pointer (it can be iounmapped as well, so the
229 * analogy with PCI is quite large):
230 */
231#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
232
233#define flush_write_buffers()
234
235#define BIO_VMERGE_BOUNDARY iommu_bio_merge
236
237/*
238 * Convert a virtual cached pointer to an uncached pointer
239 */
240#define xlate_dev_kmem_ptr(p) p
241
242#endif /* __KERNEL__ */
243
244#endif /* _ASM_X86_IO_64_H */
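
For reference, with RETURN_TYPE defined as unsigned char, __IN(b, "") above expands to roughly the following (plus an inb_p() variant that appends slow_down_io()):

static inline unsigned char inb(unsigned short port)
{
	unsigned char _v;

	asm volatile("inb %w1,%0" : "=a" (_v) : "Nd" (port));
	return _v;
}
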
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
new file mode 100644
index 00000000000..6afd9933a7d
--- /dev/null
+++ b/arch/x86/include/asm/io_apic.h
@@ -0,0 +1,204 @@
1#ifndef _ASM_X86_IO_APIC_H
2#define _ASM_X86_IO_APIC_H
3
4#include <linux/types.h>
5#include <asm/mpspec.h>
6#include <asm/apicdef.h>
7#include <asm/irq_vectors.h>
8
9/*
10 * Intel IO-APIC support for SMP and UP systems.
11 *
12 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
13 */
14
15/* I/O Unit Redirection Table */
16#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
17#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
18#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
19#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
20#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
21#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
22#define IO_APIC_REDIR_MASKED (1 << 16)
23
24/*
25 * The structure of the IO-APIC:
26 */
27union IO_APIC_reg_00 {
28 u32 raw;
29 struct {
30 u32 __reserved_2 : 14,
31 LTS : 1,
32 delivery_type : 1,
33 __reserved_1 : 8,
34 ID : 8;
35 } __attribute__ ((packed)) bits;
36};
37
38union IO_APIC_reg_01 {
39 u32 raw;
40 struct {
41 u32 version : 8,
42 __reserved_2 : 7,
43 PRQ : 1,
44 entries : 8,
45 __reserved_1 : 8;
46 } __attribute__ ((packed)) bits;
47};
48
49union IO_APIC_reg_02 {
50 u32 raw;
51 struct {
52 u32 __reserved_2 : 24,
53 arbitration : 4,
54 __reserved_1 : 4;
55 } __attribute__ ((packed)) bits;
56};
57
58union IO_APIC_reg_03 {
59 u32 raw;
60 struct {
61 u32 boot_DT : 1,
62 __reserved_1 : 31;
63 } __attribute__ ((packed)) bits;
64};
65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry {
78 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED
80 * 001: lowest prio
81 * 111: ExtINT
82 */
83 dest_mode : 1, /* 0: physical, 1: logical */
84 delivery_status : 1,
85 polarity : 1,
86 irr : 1,
87 trigger : 1, /* 0: edge, 1: level */
88 mask : 1, /* 0: enabled, 1: disabled */
89 __reserved_2 : 15;
90
91 __u32 __reserved_3 : 24,
92 dest : 8;
93} __attribute__ ((packed));
94
95struct IR_IO_APIC_route_entry {
96 __u64 vector : 8,
97 zero : 3,
98 index2 : 1,
99 delivery_status : 1,
100 polarity : 1,
101 irr : 1,
102 trigger : 1,
103 mask : 1,
104 reserved : 31,
105 format : 1,
106 index : 15;
107} __attribute__ ((packed));
108
109#ifdef CONFIG_X86_IO_APIC
110
111/*
112 * # of IO-APICs and # of IRQ routing registers
113 */
114extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS];
116
117/*
118 * MP-BIOS irq configuration table structures:
119 */
120
121#define MP_MAX_IOAPIC_PIN 127
122
123struct mp_config_ioapic {
124 unsigned long mp_apicaddr;
125 unsigned int mp_apicid;
126 unsigned char mp_type;
127 unsigned char mp_apicver;
128 unsigned char mp_flags;
129};
130
131struct mp_config_intsrc {
132 unsigned int mp_dstapic;
133 unsigned char mp_type;
134 unsigned char mp_irqtype;
135 unsigned short mp_irqflag;
136 unsigned char mp_srcbus;
137 unsigned char mp_srcbusirq;
138 unsigned char mp_dstirq;
139};
140
141/* I/O APIC entries */
142extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
143
144/* # of MP IRQ source entries */
145extern int mp_irq_entries;
146
147/* MP IRQ source entries */
148extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
149
150/* non-0 if default (table-less) MP configuration */
151extern int mpc_default_type;
152
153/* Older SiS APIC requires we rewrite the index register */
154extern int sis_apic_bug;
155
156/* 1 if "noapic" boot option passed */
157extern int skip_ioapic_setup;
158
159/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
160extern int timer_through_8259;
161
162static inline void disable_ioapic_setup(void)
163{
164 skip_ioapic_setup = 1;
165}
166
167/*
168 * If we use the IO-APIC for IRQ routing, disable automatic
169 * assignment of PCI IRQ's.
170 */
171#define io_apic_assign_pci_irqs \
172 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
173
174#ifdef CONFIG_ACPI
175extern int io_apic_get_unique_id(int ioapic, int apic_id);
176extern int io_apic_get_version(int ioapic);
177extern int io_apic_get_redir_entries(int ioapic);
178extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
179 int edge_level, int active_high_low);
180#endif /* CONFIG_ACPI */
181
182extern int (*ioapic_renumber_irq)(int ioapic, int irq);
183extern void ioapic_init_mappings(void);
184
185#ifdef CONFIG_X86_64
186extern int save_mask_IO_APIC_setup(void);
187extern void restore_IO_APIC_setup(void);
188extern void reinit_intr_remapped_IO_APIC(int);
189#endif
190
191extern int probe_nr_irqs(void);
192
193#else /* !CONFIG_X86_IO_APIC */
194#define io_apic_assign_pci_irqs 0
195static const int timer_through_8259 = 0;
196static inline void ioapic_init_mappings(void) { }
197
198static inline int probe_nr_irqs(void)
199{
200 return NR_IRQS;
201}
202#endif /* CONFIG_X86_IO_APIC */
203
204#endif /* _ASM_X86_IO_APIC_H */
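
A sketch of populating the bitfield layout above (hypothetical helper, assuming <linux/string.h>; a real update must also go through the ioapic register write path):

static struct IO_APIC_route_entry make_masked_level_entry(unsigned int vector,
							  unsigned int apicid)
{
	struct IO_APIC_route_entry e;

	memset(&e, 0, sizeof(e));
	e.vector        = vector;
	e.delivery_mode = dest_Fixed;	/* 000: fixed delivery  */
	e.dest_mode     = 0;		/* physical destination */
	e.trigger       = 1;		/* level triggered      */
	e.mask          = 1;		/* start masked         */
	e.dest          = apicid;
	return e;
}
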
diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h
new file mode 100644
index 00000000000..b279fe06dfe
--- /dev/null
+++ b/arch/x86/include/asm/ioctl.h
@@ -0,0 +1 @@
#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
new file mode 100644
index 00000000000..0d5b23b7b06
--- /dev/null
+++ b/arch/x86/include/asm/ioctls.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_IOCTLS_H
2#define _ASM_X86_IOCTLS_H
3
4#include <asm/ioctl.h>
5
6/* 0x54 is just a magic number to make these relatively unique ('T') */
7
8#define TCGETS 0x5401
9#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
10#define TCSETSW 0x5403
11#define TCSETSF 0x5404
12#define TCGETA 0x5405
13#define TCSETA 0x5406
14#define TCSETAW 0x5407
15#define TCSETAF 0x5408
16#define TCSBRK 0x5409
17#define TCXONC 0x540A
18#define TCFLSH 0x540B
19#define TIOCEXCL 0x540C
20#define TIOCNXCL 0x540D
21#define TIOCSCTTY 0x540E
22#define TIOCGPGRP 0x540F
23#define TIOCSPGRP 0x5410
24#define TIOCOUTQ 0x5411
25#define TIOCSTI 0x5412
26#define TIOCGWINSZ 0x5413
27#define TIOCSWINSZ 0x5414
28#define TIOCMGET 0x5415
29#define TIOCMBIS 0x5416
30#define TIOCMBIC 0x5417
31#define TIOCMSET 0x5418
32#define TIOCGSOFTCAR 0x5419
33#define TIOCSSOFTCAR 0x541A
34#define FIONREAD 0x541B
35#define TIOCINQ FIONREAD
36#define TIOCLINUX 0x541C
37#define TIOCCONS 0x541D
38#define TIOCGSERIAL 0x541E
39#define TIOCSSERIAL 0x541F
40#define TIOCPKT 0x5420
41#define FIONBIO 0x5421
42#define TIOCNOTTY 0x5422
43#define TIOCSETD 0x5423
44#define TIOCGETD 0x5424
45#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
46/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
47#define TIOCSBRK 0x5427 /* BSD compatibility */
48#define TIOCCBRK 0x5428 /* BSD compatibility */
49#define TIOCGSID 0x5429 /* Return the session ID of FD */
50#define TCGETS2 _IOR('T', 0x2A, struct termios2)
51#define TCSETS2 _IOW('T', 0x2B, struct termios2)
52#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
53#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
54#define TIOCGRS485 0x542E
55#define TIOCSRS485 0x542F
56#define TIOCGPTN _IOR('T', 0x30, unsigned int)
57 /* Get Pty Number (of pty-mux device) */
58#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
59#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
60#define TCSETX 0x5433
61#define TCSETXF 0x5434
62#define TCSETXW 0x5435
63
64#define FIONCLEX 0x5450
65#define FIOCLEX 0x5451
66#define FIOASYNC 0x5452
67#define TIOCSERCONFIG 0x5453
68#define TIOCSERGWILD 0x5454
69#define TIOCSERSWILD 0x5455
70#define TIOCGLCKTRMIOS 0x5456
71#define TIOCSLCKTRMIOS 0x5457
72#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
73#define TIOCSERGETLSR 0x5459 /* Get line status register */
74#define TIOCSERGETMULTI 0x545A /* Get multiport config */
75#define TIOCSERSETMULTI 0x545B /* Set multiport config */
76
77#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
78#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
79#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
80#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
81#define FIOQSIZE 0x5460
82
83/* Used for packet mode */
84#define TIOCPKT_DATA 0
85#define TIOCPKT_FLUSHREAD 1
86#define TIOCPKT_FLUSHWRITE 2
87#define TIOCPKT_STOP 4
88#define TIOCPKT_START 8
89#define TIOCPKT_NOSTOP 16
90#define TIOCPKT_DOSTOP 32
91
92#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
93
94#endif /* _ASM_X86_IOCTLS_H */
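
These requests are issued from userspace through ioctl(2); a minimal, self-contained sketch that queries the terminal size with the TIOCGWINSZ request defined above (assumes stdin is a tty):

/* Hedged sketch: read the window size of the controlling terminal. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
        struct winsize ws;

        if (ioctl(STDIN_FILENO, TIOCGWINSZ, &ws) == -1) {
                perror("TIOCGWINSZ");
                return 1;
        }
        printf("%hu rows x %hu cols\n", ws.ws_row, ws.ws_col);
        return 0;
}
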
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
new file mode 100644
index 00000000000..98e28ea8cd1
--- /dev/null
+++ b/arch/x86/include/asm/iommu.h
@@ -0,0 +1,50 @@
1#ifndef _ASM_X86_IOMMU_H
2#define _ASM_X86_IOMMU_H
3
4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void);
6extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu;
8extern int iommu_detected;
9extern int dmar_disabled;
10extern int forbid_dac;
11
12extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
13
 14/* 10 seconds: tsc_khz is TSC cycles per millisecond, so tsc_khz * 10 * 1000 */
15#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
16
17#ifdef CONFIG_GART_IOMMU
18extern int gart_iommu_aperture;
19extern int gart_iommu_aperture_allowed;
20extern int gart_iommu_aperture_disabled;
21
22extern void early_gart_iommu_check(void);
23extern void gart_iommu_init(void);
24extern void gart_iommu_shutdown(void);
25extern void __init gart_parse_options(char *);
26extern void gart_iommu_hole_init(void);
27
28#else
29#define gart_iommu_aperture 0
30#define gart_iommu_aperture_allowed 0
31#define gart_iommu_aperture_disabled 1
32
33static inline void early_gart_iommu_check(void)
34{
35}
36static inline void gart_iommu_init(void)
37{
38}
39static inline void gart_iommu_shutdown(void)
40{
41}
42static inline void gart_parse_options(char *options)
43{
44}
45static inline void gart_iommu_hole_init(void)
46{
47}
48#endif
49
50#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
new file mode 100644
index 00000000000..ee678fd5159
--- /dev/null
+++ b/arch/x86/include/asm/ipcbuf.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_IPCBUF_H
2#define _ASM_X86_IPCBUF_H
3
4/*
5 * The ipc64_perm structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space is left for:
10 * - 32-bit mode_t and seq
11 * - 2 miscellaneous 32-bit values
12 */
13
14struct ipc64_perm {
15 __kernel_key_t key;
16 __kernel_uid32_t uid;
17 __kernel_gid32_t gid;
18 __kernel_uid32_t cuid;
19 __kernel_gid32_t cgid;
20 __kernel_mode_t mode;
21 unsigned short __pad1;
22 unsigned short seq;
23 unsigned short __pad2;
24 unsigned long __unused1;
25 unsigned long __unused2;
26};
27
28#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
new file mode 100644
index 00000000000..f89dffb28aa
--- /dev/null
+++ b/arch/x86/include/asm/ipi.h
@@ -0,0 +1,138 @@
1#ifndef _ASM_X86_IPI_H
2#define _ASM_X86_IPI_H
3
4/*
5 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2
7 *
8 * Generic APIC InterProcessor Interrupt code.
9 *
10 * Moved to include file by James Cleverdon from
11 * arch/x86-64/kernel/smp.c
12 *
13 * Copyrights from kernel/smp.c:
14 *
15 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
16 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
17 * (c) 2002,2003 Andi Kleen, SuSE Labs.
18 * Subject to the GNU Public License, v.2
19 */
20
21#include <asm/hw_irq.h>
22#include <asm/apic.h>
23#include <asm/smp.h>
24
25/*
26 * the following functions deal with sending IPIs between CPUs.
27 *
28 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
29 */
30
31static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
32 unsigned int dest)
33{
34 unsigned int icr = shortcut | dest;
35
36 switch (vector) {
37 default:
38 icr |= APIC_DM_FIXED | vector;
39 break;
40 case NMI_VECTOR:
41 icr |= APIC_DM_NMI;
42 break;
43 }
44 return icr;
45}
46
47static inline int __prepare_ICR2(unsigned int mask)
48{
49 return SET_APIC_DEST_FIELD(mask);
50}
51
52static inline void __xapic_wait_icr_idle(void)
53{
54 while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
55 cpu_relax();
56}
57
58static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
59 unsigned int dest)
60{
61 /*
62 * Subtle. In the case of the 'never do double writes' workaround
63 * we have to lock out interrupts to be safe. As we don't care
 64	 * about the value read, we use an atomic rmw access to avoid costly
65 * cli/sti. Otherwise we use an even cheaper single atomic write
66 * to the APIC.
67 */
68 unsigned int cfg;
69
70 /*
71 * Wait for idle.
72 */
73 __xapic_wait_icr_idle();
74
75 /*
76 * No need to touch the target chip field
77 */
78 cfg = __prepare_ICR(shortcut, vector, dest);
79
80 /*
81 * Send the IPI. The write to APIC_ICR fires this off.
82 */
83 native_apic_mem_write(APIC_ICR, cfg);
84}
85
86/*
87 * This is used to send an IPI with no shorthand notation (the destination is
88 * specified in bits 56 to 63 of the ICR).
89 */
90static inline void __send_IPI_dest_field(unsigned int mask, int vector,
91 unsigned int dest)
92{
93 unsigned long cfg;
94
95 /*
96 * Wait for idle.
97 */
98 if (unlikely(vector == NMI_VECTOR))
99 safe_apic_wait_icr_idle();
100 else
101 __xapic_wait_icr_idle();
102
103 /*
104 * prepare target chip field
105 */
106 cfg = __prepare_ICR2(mask);
107 native_apic_mem_write(APIC_ICR2, cfg);
108
109 /*
110 * program the ICR
111 */
112 cfg = __prepare_ICR(0, vector, dest);
113
114 /*
115 * Send the IPI. The write to APIC_ICR fires this off.
116 */
117 native_apic_mem_write(APIC_ICR, cfg);
118}
119
120static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
121{
122 unsigned long flags;
123 unsigned long query_cpu;
124
125 /*
126 * Hack. The clustered APIC addressing mode doesn't allow us to send
127 * to an arbitrary mask, so I do a unicast to each CPU instead.
128 * - mbligh
129 */
130 local_irq_save(flags);
131 for_each_cpu_mask_nr(query_cpu, mask) {
132 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
133 vector, APIC_DEST_PHYSICAL);
134 }
135 local_irq_restore(flags);
136}
137
138#endif /* _ASM_X86_IPI_H */
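
A hedged sketch of how a physical-mode APIC driver might sit on top of the unicast loop above; send_IPI_mask_phys() is an illustrative name, and RESCHEDULE_VECTOR comes from irq_vectors.h:

/* Hedged sketch: forward a cross-CPU IPI through the helper above. */
static void send_IPI_mask_phys(cpumask_t mask, int vector)
{
        send_IPI_mask_sequence(mask, vector);
}

/* e.g. send_IPI_mask_phys(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); */
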
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
new file mode 100644
index 00000000000..bae0eda9548
--- /dev/null
+++ b/arch/x86/include/asm/irq.h
@@ -0,0 +1,50 @@
1#ifndef _ASM_X86_IRQ_H
2#define _ASM_X86_IRQ_H
3/*
4 * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
5 *
6 * IRQ/IPI changes taken from work by Thomas Radke
7 * <tomsoft@informatik.tu-chemnitz.de>
8 */
9
10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h>
12
13static inline int irq_canonicalize(int irq)
14{
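	/* IRQ2 is the 8259 cascade; a request for it really means IRQ9 */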
15 return ((irq == 2) ? 9 : irq);
16}
17
18#ifdef CONFIG_X86_LOCAL_APIC
19# define ARCH_HAS_NMI_WATCHDOG
20#endif
21
22#ifdef CONFIG_4KSTACKS
23 extern void irq_ctx_init(int cpu);
24 extern void irq_ctx_exit(int cpu);
25# define __ARCH_HAS_DO_SOFTIRQ
26#else
27# define irq_ctx_init(cpu) do { } while (0)
28# define irq_ctx_exit(cpu) do { } while (0)
29# ifdef CONFIG_X86_64
30# define __ARCH_HAS_DO_SOFTIRQ
31# endif
32#endif
33
34#ifdef CONFIG_IRQBALANCE
35extern int irqbalance_disable(char *str);
36#endif
37
38#ifdef CONFIG_HOTPLUG_CPU
39#include <linux/cpumask.h>
40extern void fixup_irqs(cpumask_t map);
41#endif
42
43extern unsigned int do_IRQ(struct pt_regs *regs);
44extern void init_IRQ(void);
45extern void native_init_IRQ(void);
46
47/* Interrupt vector management */
48extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
49
50#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
new file mode 100644
index 00000000000..89c898ab298
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "irq_regs_32.h"
3#else
4# include "irq_regs_64.h"
5#endif
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
new file mode 100644
index 00000000000..af2f02d27fc
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -0,0 +1,29 @@
1/*
2 * Per-cpu current frame pointer - the location of the last exception frame on
3 * the stack, stored in the per-cpu area.
4 *
5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_32_H
8#define _ASM_X86_IRQ_REGS_32_H
9
10#include <asm/percpu.h>
11
12DECLARE_PER_CPU(struct pt_regs *, irq_regs);
13
14static inline struct pt_regs *get_irq_regs(void)
15{
16 return x86_read_percpu(irq_regs);
17}
18
19static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
20{
21 struct pt_regs *old_regs;
22
23 old_regs = get_irq_regs();
24 x86_write_percpu(irq_regs, new_regs);
25
26 return old_regs;
27}
28
29#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
new file mode 100644
index 00000000000..3dd9c0b7027
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_64.h
@@ -0,0 +1 @@
#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 00000000000..20e1fd588db
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,8 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H
3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7
8#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
new file mode 100644
index 00000000000..d843ed0e9b2
--- /dev/null
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -0,0 +1,164 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H
3
4#include <linux/threads.h>
5
6#define NMI_VECTOR 0x02
7
8/*
9 * IDT vectors usable for external interrupt sources start
10 * at 0x20:
11 */
12#define FIRST_EXTERNAL_VECTOR 0x20
13
14#ifdef CONFIG_X86_32
15# define SYSCALL_VECTOR 0x80
16#else
17# define IA32_SYSCALL_VECTOR 0x80
18#endif
19
20/*
21 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
22 * cleanup after irq migration.
23 */
24#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
25
26/*
27 * Vectors 0x30-0x3f are used for ISA interrupts.
28 */
29#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
30#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
31#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
32#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
33#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
34#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
35#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
36#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
37#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
38#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
39#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
40#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
41#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
42#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
43#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
44#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
45
46/*
47 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
48 *
49 * some of the following vectors are 'rare', they are merged
50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
51 * TLB, reschedule and local APIC vectors are performance-critical.
52 *
53 * Vectors 0xf0-0xfa are free (reserved for future Linux use).
54 */
55#ifdef CONFIG_X86_32
56
57# define SPURIOUS_APIC_VECTOR 0xff
58# define ERROR_APIC_VECTOR 0xfe
59# define INVALIDATE_TLB_VECTOR 0xfd
60# define RESCHEDULE_VECTOR 0xfc
61# define CALL_FUNCTION_VECTOR 0xfb
62# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
63# define THERMAL_APIC_VECTOR 0xf0
64
65#else
66
67#define SPURIOUS_APIC_VECTOR 0xff
68#define ERROR_APIC_VECTOR 0xfe
69#define RESCHEDULE_VECTOR 0xfd
70#define CALL_FUNCTION_VECTOR 0xfc
71#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
72#define THERMAL_APIC_VECTOR 0xfa
73#define THRESHOLD_APIC_VECTOR 0xf9
74#define UV_BAU_MESSAGE 0xf8
75#define INVALIDATE_TLB_VECTOR_END 0xf7
76#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
77
78#define NUM_INVALIDATE_TLB_VECTORS 8
79
80#endif
81
82/*
83 * Local APIC timer IRQ vector is on a different priority level,
84 * to work around the 'lost local interrupt if more than 2 IRQ
85 * sources per level' errata.
86 */
87#define LOCAL_TIMER_VECTOR 0xef
88
89/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector)
93 */
94#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
95
96#define NR_VECTORS 256
97
98#define FPU_IRQ 13
99
100#define FIRST_VM86_IRQ 3
101#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103
104#ifdef CONFIG_X86_64
105# if NR_CPUS < MAX_IO_APICS
106# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
107# else
108# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
109# endif
110
111#elif !defined(CONFIG_X86_VOYAGER)
112
113# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) || defined(CONFIG_X86_VISWS)
114
115# define NR_IRQS 224
116
117# else /* IO_APIC || PARAVIRT */
118
119# define NR_IRQS 16
120
121# endif
122
123#else /* !VISWS && !VOYAGER */
124
125# define NR_IRQS 224
126
127#endif /* VISWS */
128
129/* Voyager specific defines */
130/* These define the CPIs we use in linux */
131#define VIC_CPI_LEVEL0 0
132#define VIC_CPI_LEVEL1 1
133/* now the fake CPIs */
134#define VIC_TIMER_CPI 2
135#define VIC_INVALIDATE_CPI 3
136#define VIC_RESCHEDULE_CPI 4
137#define VIC_ENABLE_IRQ_CPI 5
138#define VIC_CALL_FUNCTION_CPI 6
139#define VIC_CALL_FUNCTION_SINGLE_CPI 7
140
141/* Now the QIC CPIs: Since we don't need the two initial levels,
142 * these are 2 less than the VIC CPIs */
143#define QIC_CPI_OFFSET 1
144#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
145#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
146#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
147#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
148#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
149#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
150
151#define VIC_START_FAKE_CPI VIC_TIMER_CPI
152#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
153
154/* this is the SYS_INT CPI. */
155#define VIC_SYS_INT 8
156#define VIC_CMN_INT 15
157
158/* This is the boot CPI for alternate processors. It gets overwritten
159 * by the above once the system has activated all available processors */
160#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
161#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
162
163
164#endif /* _ASM_X86_IRQ_VECTORS_H */
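
The layout above pins the ISA block and the first vector handed to drivers; a quick standalone arithmetic check, restating only the relevant constants from this header:

/* Hedged sketch: verify the vector arithmetic from the definitions above. */
#include <assert.h>

#define FIRST_EXTERNAL_VECTOR   0x20
#define IRQ0_VECTOR             (FIRST_EXTERNAL_VECTOR + 0x10)
#define IRQ15_VECTOR            (IRQ0_VECTOR + 15)
#define FIRST_DEVICE_VECTOR     (IRQ15_VECTOR + 2)

int main(void)
{
        assert(IRQ0_VECTOR == 0x30);            /* ISA IRQs start at 0x30 */
        assert(IRQ15_VECTOR == 0x3f);           /* ...and end at 0x3f */
        assert(FIRST_DEVICE_VECTOR == 0x41);    /* first driver vector */
        return 0;
}
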
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
new file mode 100644
index 00000000000..2bdab21f089
--- /dev/null
+++ b/arch/x86/include/asm/irqflags.h
@@ -0,0 +1,211 @@
1#ifndef _X86_IRQFLAGS_H_
2#define _X86_IRQFLAGS_H_
3
4#include <asm/processor-flags.h>
5
6#ifndef __ASSEMBLY__
7/*
8 * Interrupt control:
9 */
10
11static inline unsigned long native_save_fl(void)
12{
13 unsigned long flags;
14
15 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0"
17 : "=g" (flags)
18 : /* no input */
19 : "memory");
20
21 return flags;
22}
23
24static inline void native_restore_fl(unsigned long flags)
25{
26 asm volatile("push %0 ; popf"
27 : /* no output */
28 :"g" (flags)
29 :"memory", "cc");
30}
31
32static inline void native_irq_disable(void)
33{
34 asm volatile("cli": : :"memory");
35}
36
37static inline void native_irq_enable(void)
38{
39 asm volatile("sti": : :"memory");
40}
41
42static inline void native_safe_halt(void)
43{
44 asm volatile("sti; hlt": : :"memory");
45}
46
47static inline void native_halt(void)
48{
49 asm volatile("hlt": : :"memory");
50}
51
52#endif
53
54#ifdef CONFIG_PARAVIRT
55#include <asm/paravirt.h>
56#else
57#ifndef __ASSEMBLY__
58
59static inline unsigned long __raw_local_save_flags(void)
60{
61 return native_save_fl();
62}
63
64static inline void raw_local_irq_restore(unsigned long flags)
65{
66 native_restore_fl(flags);
67}
68
69static inline void raw_local_irq_disable(void)
70{
71 native_irq_disable();
72}
73
74static inline void raw_local_irq_enable(void)
75{
76 native_irq_enable();
77}
78
79/*
80 * Used in the idle loop; sti takes one instruction cycle
81 * to complete:
82 */
83static inline void raw_safe_halt(void)
84{
85 native_safe_halt();
86}
87
88/*
89 * Used when interrupts are already enabled or to
 90 * shut down the processor:
91 */
92static inline void halt(void)
93{
94 native_halt();
95}
96
97/*
98 * For spinlocks, etc:
99 */
100static inline unsigned long __raw_local_irq_save(void)
101{
102 unsigned long flags = __raw_local_save_flags();
103
104 raw_local_irq_disable();
105
106 return flags;
107}
108#else
109
110#define ENABLE_INTERRUPTS(x) sti
111#define DISABLE_INTERRUPTS(x) cli
112
113#ifdef CONFIG_X86_64
114#define SWAPGS swapgs
115/*
116 * Currently paravirt can't handle swapgs nicely when we
117 * don't have a stack we can rely on (such as a user space
118 * stack). So we either find a way around these or just fault
119 * and emulate if a guest tries to call swapgs directly.
120 *
121 * Either way, this is a good way to document that we don't
122 * have a reliable stack. x86_64 only.
123 */
124#define SWAPGS_UNSAFE_STACK swapgs
125
126#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
127
128#define INTERRUPT_RETURN iretq
129#define USERGS_SYSRET64 \
130 swapgs; \
131 sysretq;
132#define USERGS_SYSRET32 \
133 swapgs; \
134 sysretl
135#define ENABLE_INTERRUPTS_SYSEXIT32 \
136 swapgs; \
137 sti; \
138 sysexit
139
140#else
141#define INTERRUPT_RETURN iret
142#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
143#define GET_CR0_INTO_EAX movl %cr0, %eax
144#endif
145
146
147#endif /* __ASSEMBLY__ */
148#endif /* CONFIG_PARAVIRT */
149
150#ifndef __ASSEMBLY__
151#define raw_local_save_flags(flags) \
152 do { (flags) = __raw_local_save_flags(); } while (0)
153
154#define raw_local_irq_save(flags) \
155 do { (flags) = __raw_local_irq_save(); } while (0)
156
157static inline int raw_irqs_disabled_flags(unsigned long flags)
158{
159 return !(flags & X86_EFLAGS_IF);
160}
161
162static inline int raw_irqs_disabled(void)
163{
164 unsigned long flags = __raw_local_save_flags();
165
166 return raw_irqs_disabled_flags(flags);
167}
168
169#else
170
171#ifdef CONFIG_X86_64
172#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
173#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
174 TRACE_IRQS_ON; \
175 sti; \
176 SAVE_REST; \
177 LOCKDEP_SYS_EXIT; \
178 RESTORE_REST; \
179 cli; \
180 TRACE_IRQS_OFF;
181
182#else
183#define ARCH_LOCKDEP_SYS_EXIT \
184 pushl %eax; \
185 pushl %ecx; \
186 pushl %edx; \
187 call lockdep_sys_exit; \
188 popl %edx; \
189 popl %ecx; \
190 popl %eax;
191
192#define ARCH_LOCKDEP_SYS_EXIT_IRQ
193#endif
194
195#ifdef CONFIG_TRACE_IRQFLAGS
196# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
197# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
198#else
199# define TRACE_IRQS_ON
200# define TRACE_IRQS_OFF
201#endif
202#ifdef CONFIG_DEBUG_LOCK_ALLOC
203# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
204# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
205# else
206# define LOCKDEP_SYS_EXIT
207# define LOCKDEP_SYS_EXIT_IRQ
208# endif
209
210#endif /* __ASSEMBLY__ */
211#endif
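
The raw primitives above are normally consumed through the save/disable/restore pattern; a minimal sketch (most kernel code uses the local_irq_save()/local_irq_restore() wrappers layered on top of these):

/* Hedged sketch: protect a short critical section with the raw flags API. */
static void update_shared_counter(unsigned long *counter)
{
        unsigned long flags;

        raw_local_irq_save(flags);      /* record IF, then disable interrupts */
        (*counter)++;                   /* state also touched from IRQ context */
        raw_local_irq_restore(flags);   /* put IF back exactly as it was */
}
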
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
new file mode 100644
index 00000000000..7e5dff1de0e
--- /dev/null
+++ b/arch/x86/include/asm/ist.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_IST_H
2#define _ASM_X86_IST_H
3
4/*
5 * Include file for the interface to IST BIOS
6 * Copyright 2002 Andy Grover <andrew.grover@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 */
18
19
20#include <linux/types.h>
21
22struct ist_info {
23 __u32 signature;
24 __u32 command;
25 __u32 event;
26 __u32 perf_level;
27};
28
29#ifdef __KERNEL__
30
31extern struct ist_info ist_info;
32
33#endif /* __KERNEL__ */
34#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
new file mode 100644
index 00000000000..54c8cc53b24
--- /dev/null
+++ b/arch/x86/include/asm/k8.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_K8_H
2#define _ASM_X86_K8_H
3
4#include <linux/pci.h>
5
6extern struct pci_device_id k8_nb_ids[];
7
8extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14
15#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
new file mode 100644
index 00000000000..fa7c0b97476
--- /dev/null
+++ b/arch/x86/include/asm/kdebug.h
@@ -0,0 +1,37 @@
1#ifndef _ASM_X86_KDEBUG_H
2#define _ASM_X86_KDEBUG_H
3
4#include <linux/notifier.h>
5
6struct pt_regs;
7
8/* Grossly misnamed. */
9enum die_val {
10 DIE_OOPS = 1,
11 DIE_INT3,
12 DIE_DEBUG,
13 DIE_PANIC,
14 DIE_NMI,
15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG,
18 DIE_TRAP,
19 DIE_GPF,
20 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN,
24};
25
26extern void printk_address(unsigned long address, int reliable);
27extern void die(const char *, struct pt_regs *,long);
28extern int __must_check __die(const char *, struct pt_regs *, long);
29extern void show_registers(struct pt_regs *regs);
30extern void show_trace(struct task_struct *t, struct pt_regs *regs,
31 unsigned long *sp, unsigned long bp);
32extern void __show_regs(struct pt_regs *regs, int all);
33extern void show_regs(struct pt_regs *regs);
34extern unsigned long oops_begin(void);
35extern void oops_end(unsigned long, struct pt_regs *, int signr);
36
37#endif /* _ASM_X86_KDEBUG_H */
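
Consumers subscribe to these events through the die notifier chain; a hedged sketch of a subscriber, using register_die_notifier() as declared in linux/kdebug.h:

/* Hedged sketch: log each oops via the die notifier chain. */
#include <linux/kernel.h>
#include <linux/kdebug.h>
#include <linux/notifier.h>

static int my_die_handler(struct notifier_block *nb,
                          unsigned long val, void *data)
{
        struct die_args *args = data;

        if (val == DIE_OOPS)
                printk(KERN_INFO "oops at ip %lx\n", args->regs->ip);
        return NOTIFY_DONE;
}

static struct notifier_block my_die_nb = {
        .notifier_call = my_die_handler,
};

/* ... register_die_notifier(&my_die_nb); from module init ... */
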
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
new file mode 100644
index 00000000000..a1f22771a15
--- /dev/null
+++ b/arch/x86/include/asm/kexec.h
@@ -0,0 +1,175 @@
1#ifndef _ASM_X86_KEXEC_H
2#define _ASM_X86_KEXEC_H
3
4#ifdef CONFIG_X86_32
5# define PA_CONTROL_PAGE 0
6# define VA_CONTROL_PAGE 1
7# define PA_PGD 2
8# define VA_PGD 3
9# define PA_PTE_0 4
10# define VA_PTE_0 5
11# define PA_PTE_1 6
12# define VA_PTE_1 7
13# define PA_SWAP_PAGE 8
14# ifdef CONFIG_X86_PAE
15# define PA_PMD_0 9
16# define VA_PMD_0 10
17# define PA_PMD_1 11
18# define VA_PMD_1 12
19# define PAGES_NR 13
20# else
21# define PAGES_NR 9
22# endif
23#else
24# define PA_CONTROL_PAGE 0
25# define VA_CONTROL_PAGE 1
26# define PA_PGD 2
27# define VA_PGD 3
28# define PA_PUD_0 4
29# define VA_PUD_0 5
30# define PA_PMD_0 6
31# define VA_PMD_0 7
32# define PA_PTE_0 8
33# define VA_PTE_0 9
34# define PA_PUD_1 10
35# define VA_PUD_1 11
36# define PA_PMD_1 12
37# define VA_PMD_1 13
38# define PA_PTE_1 14
39# define VA_PTE_1 15
40# define PA_TABLE_PAGE 16
41# define PAGES_NR 17
42#endif
43
44#ifdef CONFIG_X86_32
45# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
46#endif
47
48#ifndef __ASSEMBLY__
49
50#include <linux/string.h>
51
52#include <asm/page.h>
53#include <asm/ptrace.h>
54
55/*
 56 * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return,
 57 * i.e. the highest page that is mapped directly into kernel memory,
 58 * so that kmap is not required.
59 *
60 * So far x86_64 is limited to 40 physical address bits.
61 */
62#ifdef CONFIG_X86_32
63/* Maximum physical address we can use pages from */
64# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
65/* Maximum address we can reach in physical address mode */
66# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
67/* Maximum address we can use for the control code buffer */
68# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
69
70# define KEXEC_CONTROL_PAGE_SIZE 4096
71
72/* The native architecture */
73# define KEXEC_ARCH KEXEC_ARCH_386
74
 75/* We can also handle crash dumps from a 64-bit kernel. */
76# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
77#else
78/* Maximum physical address we can use pages from */
79# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
80/* Maximum address we can reach in physical address mode */
81# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
82/* Maximum address we can use for the control pages */
83# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
84
85/* Allocate one page for the pdp and the second for the code */
86# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
87
88/* The native architecture */
89# define KEXEC_ARCH KEXEC_ARCH_X86_64
90#endif
91
92/*
 93 * The CPU does not save ss and sp on the stack if execution is
 94 * already in kernel mode when the NMI occurs. This code fixes
 95 * that up.
96 */
97static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
98 struct pt_regs *oldregs)
99{
100#ifdef CONFIG_X86_32
101 newregs->sp = (unsigned long)&(oldregs->sp);
102 asm volatile("xorl %%eax, %%eax\n\t"
103 "movw %%ss, %%ax\n\t"
104 :"=a"(newregs->ss));
105#endif
106}
107
108/*
109 * This function captures the register state if we are coming in via
110 * panic; otherwise it just fixes up ss and sp if we entered via a
111 * kernel-mode exception.
112 */
113static inline void crash_setup_regs(struct pt_regs *newregs,
114 struct pt_regs *oldregs)
115{
116 if (oldregs) {
117 memcpy(newregs, oldregs, sizeof(*newregs));
118 crash_fixup_ss_esp(newregs, oldregs);
119 } else {
120#ifdef CONFIG_X86_32
121 asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
122 asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
123 asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
124 asm volatile("movl %%esi,%0" : "=m"(newregs->si));
125 asm volatile("movl %%edi,%0" : "=m"(newregs->di));
126 asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
127 asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
128 asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
129 asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
130 asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
131 asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
132 asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
133 asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
134#else
135 asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
136 asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
137 asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
138 asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
139 asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
140 asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
141 asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
142 asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
143 asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
144 asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
145 asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
146 asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
147 asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
148 asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
149 asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
150 asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
151 asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
152 asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
153 asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
154#endif
155 newregs->ip = (unsigned long)current_text_addr();
156 }
157}
158
159#ifdef CONFIG_X86_32
160asmlinkage unsigned long
161relocate_kernel(unsigned long indirection_page,
162 unsigned long control_page,
163 unsigned long start_address,
164 unsigned int has_pae,
165 unsigned int preserve_context);
166#else
167NORET_TYPE void
168relocate_kernel(unsigned long indirection_page,
169 unsigned long page_list,
170 unsigned long start_address) ATTRIB_NORET;
171#endif
172
173#endif /* __ASSEMBLY__ */
174
175#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
new file mode 100644
index 00000000000..e6c6c808489
--- /dev/null
+++ b/arch/x86/include/asm/kgdb.h
@@ -0,0 +1,79 @@
1#ifndef _ASM_X86_KGDB_H
2#define _ASM_X86_KGDB_H
3
4/*
5 * Copyright (C) 2001-2004 Amit S. Kale
6 * Copyright (C) 2008 Wind River Systems, Inc.
7 */
8
9/*
 10 * BUFMAX defines the maximum number of characters in the inbound/outbound
 11 * buffers; at least NUMREGBYTES*2 are needed for register packets, and a
 12 * longer buffer is needed to list all threads.
13 */
14#define BUFMAX 1024
15
16/*
17 * Note that this register image is in a different order than
18 * the register image that Linux produces at interrupt time.
19 *
20 * Linux's register image is defined by struct pt_regs in ptrace.h.
21 * Just why GDB uses a different order is a historical mystery.
22 */
23#ifdef CONFIG_X86_32
24enum regnames {
25 GDB_AX, /* 0 */
26 GDB_CX, /* 1 */
27 GDB_DX, /* 2 */
28 GDB_BX, /* 3 */
29 GDB_SP, /* 4 */
30 GDB_BP, /* 5 */
31 GDB_SI, /* 6 */
32 GDB_DI, /* 7 */
33 GDB_PC, /* 8 also known as eip */
34 GDB_PS, /* 9 also known as eflags */
35 GDB_CS, /* 10 */
36 GDB_SS, /* 11 */
37 GDB_DS, /* 12 */
38 GDB_ES, /* 13 */
39 GDB_FS, /* 14 */
40 GDB_GS, /* 15 */
41};
42#define NUMREGBYTES ((GDB_GS+1)*4)
43#else /* ! CONFIG_X86_32 */
44enum regnames64 {
45 GDB_AX, /* 0 */
46 GDB_BX, /* 1 */
47 GDB_CX, /* 2 */
48 GDB_DX, /* 3 */
49 GDB_SI, /* 4 */
50 GDB_DI, /* 5 */
51 GDB_BP, /* 6 */
52 GDB_SP, /* 7 */
53 GDB_R8, /* 8 */
54 GDB_R9, /* 9 */
55 GDB_R10, /* 10 */
56 GDB_R11, /* 11 */
57 GDB_R12, /* 12 */
58 GDB_R13, /* 13 */
59 GDB_R14, /* 14 */
60 GDB_R15, /* 15 */
61 GDB_PC, /* 16 */
62};
63
64enum regnames32 {
65 GDB_PS = 34,
66 GDB_CS,
67 GDB_SS,
68};
69#define NUMREGBYTES ((GDB_SS+1)*4)
70#endif /* CONFIG_X86_32 */
71
72static inline void arch_kgdb_breakpoint(void)
73{
74 asm(" int $3");
75}
76#define BREAK_INSTR_SIZE 1
77#define CACHE_FLUSH_IS_SAFE 1
78
79#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
new file mode 100644
index 00000000000..5759c165a5c
--- /dev/null
+++ b/arch/x86/include/asm/kmap_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_KMAP_TYPES_H
2#define _ASM_X86_KMAP_TYPES_H
3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n ,
6#else
7# define D(n)
8#endif
9
10enum km_type {
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26
27#undef D
28
29#endif /* _ASM_X86_KMAP_TYPES_H */
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
new file mode 100644
index 00000000000..4fe681de1e7
--- /dev/null
+++ b/arch/x86/include/asm/kprobes.h
@@ -0,0 +1,88 @@
1#ifndef _ASM_X86_KPROBES_H
2#define _ASM_X86_KPROBES_H
3/*
4 * Kernel Probes (KProbes)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2002, 2004
21 *
22 * See arch/x86/kernel/kprobes.c for x86 kprobes history.
23 */
24#include <linux/types.h>
25#include <linux/ptrace.h>
26#include <linux/percpu.h>
27
28#define __ARCH_WANT_KPROBES_INSN_SLOT
29
30struct pt_regs;
31struct kprobe;
32
33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9
36#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \
39 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
40 THREAD_SIZE - (unsigned long)(ADDR))) \
41 ? (MAX_STACK_SIZE) \
42 : (((unsigned long)current_thread_info()) + \
43 THREAD_SIZE - (unsigned long)(ADDR)))
44
45#define flush_insn_slot(p) do { } while (0)
46
47extern const int kretprobe_blacklist_size;
48
49void arch_remove_kprobe(struct kprobe *p);
50void kretprobe_trampoline(void);
51
 52/* Architecture-specific copy of the original instruction */
53struct arch_specific_insn {
54 /* copy of the original instruction */
55 kprobe_opcode_t *insn;
56 /*
57 * boostable = -1: This instruction type is not boostable.
58 * boostable = 0: This instruction type is boostable.
59 * boostable = 1: This instruction has been boosted: we have
60 * added a relative jump after the instruction copy in insn,
61 * so no single-step and fixup are needed (unless there's
62 * a post_handler or break_handler).
63 */
64 int boostable;
65};
66
67struct prev_kprobe {
68 struct kprobe *kp;
69 unsigned long status;
70 unsigned long old_flags;
71 unsigned long saved_flags;
72};
73
74/* per-cpu kprobe control block */
75struct kprobe_ctlblk {
76 unsigned long kprobe_status;
77 unsigned long kprobe_old_flags;
78 unsigned long kprobe_saved_flags;
79 unsigned long *jprobe_saved_sp;
80 struct pt_regs jprobe_saved_regs;
81 kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
82 struct prev_kprobe prev_kprobe;
83};
84
85extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
86extern int kprobe_exceptions_notify(struct notifier_block *self,
87 unsigned long val, void *data);
88#endif /* _ASM_X86_KPROBES_H */
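
These arch hooks back the generic kprobes API; a hedged sketch of a minimal consumer module (the probed symbol is purely illustrative):

/* Hedged sketch: place a kprobe and log each hit. */
#include <linux/module.h>
#include <linux/kprobes.h>

static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
        printk(KERN_INFO "kprobe hit at %p, ip=%lx\n", p->addr, regs->ip);
        return 0;       /* let execution continue normally */
}

static struct kprobe kp = {
        .symbol_name    = "do_fork",    /* illustrative target */
        .pre_handler    = pre_handler,
};

static int __init kp_init(void)
{
        return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
        unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");
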
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
new file mode 100644
index 00000000000..b95162af0bf
--- /dev/null
+++ b/arch/x86/include/asm/kvm.h
@@ -0,0 +1,211 @@
1#ifndef _ASM_X86_KVM_H
2#define _ASM_X86_KVM_H
3
4/*
5 * KVM x86 specific structures and definitions
6 *
7 */
8
9#include <asm/types.h>
10#include <linux/ioctl.h>
11
12/* Architectural interrupt line count. */
13#define KVM_NR_INTERRUPTS 256
14
15struct kvm_memory_alias {
16 __u32 slot; /* this has a different namespace than memory slots */
17 __u32 flags;
18 __u64 guest_phys_addr;
19 __u64 memory_size;
20 __u64 target_phys_addr;
21};
22
23/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
24struct kvm_pic_state {
25 __u8 last_irr; /* edge detection */
26 __u8 irr; /* interrupt request register */
27 __u8 imr; /* interrupt mask register */
28 __u8 isr; /* interrupt service register */
29 __u8 priority_add; /* highest irq priority */
30 __u8 irq_base;
31 __u8 read_reg_select;
32 __u8 poll;
33 __u8 special_mask;
34 __u8 init_state;
35 __u8 auto_eoi;
36 __u8 rotate_on_auto_eoi;
37 __u8 special_fully_nested_mode;
38 __u8 init4; /* true if 4 byte init */
39 __u8 elcr; /* PIIX edge/trigger selection */
40 __u8 elcr_mask;
41};
42
43#define KVM_IOAPIC_NUM_PINS 24
44struct kvm_ioapic_state {
45 __u64 base_address;
46 __u32 ioregsel;
47 __u32 id;
48 __u32 irr;
49 __u32 pad;
50 union {
51 __u64 bits;
52 struct {
53 __u8 vector;
54 __u8 delivery_mode:3;
55 __u8 dest_mode:1;
56 __u8 delivery_status:1;
57 __u8 polarity:1;
58 __u8 remote_irr:1;
59 __u8 trig_mode:1;
60 __u8 mask:1;
61 __u8 reserve:7;
62 __u8 reserved[4];
63 __u8 dest_id;
64 } fields;
65 } redirtbl[KVM_IOAPIC_NUM_PINS];
66};
67
68#define KVM_IRQCHIP_PIC_MASTER 0
69#define KVM_IRQCHIP_PIC_SLAVE 1
70#define KVM_IRQCHIP_IOAPIC 2
71
72/* for KVM_GET_REGS and KVM_SET_REGS */
73struct kvm_regs {
74 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
75 __u64 rax, rbx, rcx, rdx;
76 __u64 rsi, rdi, rsp, rbp;
77 __u64 r8, r9, r10, r11;
78 __u64 r12, r13, r14, r15;
79 __u64 rip, rflags;
80};
81
82/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
83#define KVM_APIC_REG_SIZE 0x400
84struct kvm_lapic_state {
85 char regs[KVM_APIC_REG_SIZE];
86};
87
88struct kvm_segment {
89 __u64 base;
90 __u32 limit;
91 __u16 selector;
92 __u8 type;
93 __u8 present, dpl, db, s, l, g, avl;
94 __u8 unusable;
95 __u8 padding;
96};
97
98struct kvm_dtable {
99 __u64 base;
100 __u16 limit;
101 __u16 padding[3];
102};
103
104
105/* for KVM_GET_SREGS and KVM_SET_SREGS */
106struct kvm_sregs {
107 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
108 struct kvm_segment cs, ds, es, fs, gs, ss;
109 struct kvm_segment tr, ldt;
110 struct kvm_dtable gdt, idt;
111 __u64 cr0, cr2, cr3, cr4, cr8;
112 __u64 efer;
113 __u64 apic_base;
114 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
115};
116
117/* for KVM_GET_FPU and KVM_SET_FPU */
118struct kvm_fpu {
119 __u8 fpr[8][16];
120 __u16 fcw;
121 __u16 fsw;
122 __u8 ftwx; /* in fxsave format */
123 __u8 pad1;
124 __u16 last_opcode;
125 __u64 last_ip;
126 __u64 last_dp;
127 __u8 xmm[16][16];
128 __u32 mxcsr;
129 __u32 pad2;
130};
131
132struct kvm_msr_entry {
133 __u32 index;
134 __u32 reserved;
135 __u64 data;
136};
137
138/* for KVM_GET_MSRS and KVM_SET_MSRS */
139struct kvm_msrs {
140 __u32 nmsrs; /* number of msrs in entries */
141 __u32 pad;
142
143 struct kvm_msr_entry entries[0];
144};
145
146/* for KVM_GET_MSR_INDEX_LIST */
147struct kvm_msr_list {
148 __u32 nmsrs; /* number of msrs in entries */
149 __u32 indices[0];
150};
151
152
153struct kvm_cpuid_entry {
154 __u32 function;
155 __u32 eax;
156 __u32 ebx;
157 __u32 ecx;
158 __u32 edx;
159 __u32 padding;
160};
161
162/* for KVM_SET_CPUID */
163struct kvm_cpuid {
164 __u32 nent;
165 __u32 padding;
166 struct kvm_cpuid_entry entries[0];
167};
168
169struct kvm_cpuid_entry2 {
170 __u32 function;
171 __u32 index;
172 __u32 flags;
173 __u32 eax;
174 __u32 ebx;
175 __u32 ecx;
176 __u32 edx;
177 __u32 padding[3];
178};
179
180#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
181#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
182#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
183
184/* for KVM_SET_CPUID2 */
185struct kvm_cpuid2 {
186 __u32 nent;
187 __u32 padding;
188 struct kvm_cpuid_entry2 entries[0];
189};
190
191/* for KVM_GET_PIT and KVM_SET_PIT */
192struct kvm_pit_channel_state {
193 __u32 count; /* can be 65536 */
194 __u16 latched_count;
195 __u8 count_latched;
196 __u8 status_latched;
197 __u8 status;
198 __u8 read_state;
199 __u8 write_state;
200 __u8 write_latch;
201 __u8 rw_mode;
202 __u8 mode;
203 __u8 bcd;
204 __u8 gate;
205 __s64 count_load_time;
206};
207
208struct kvm_pit_state {
209 struct kvm_pit_channel_state channels[3];
210};
211#endif /* _ASM_X86_KVM_H */
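
These structures form the userspace ABI; a hedged sketch of reading a vCPU's registers through the standard KVM_GET_REGS ioctl (vcpu_fd is assumed to come from a prior KVM_CREATE_VCPU):

/* Hedged userspace sketch: dump the guest instruction pointer. */
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

static void dump_rip(int vcpu_fd)
{
        struct kvm_regs regs;

        if (ioctl(vcpu_fd, KVM_GET_REGS, &regs) < 0) {
                perror("KVM_GET_REGS");
                return;
        }
        printf("rip = 0x%llx, rflags = 0x%llx\n",
               (unsigned long long)regs.rip,
               (unsigned long long)regs.rflags);
}
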
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
new file mode 100644
index 00000000000..65679d00633
--- /dev/null
+++ b/arch/x86/include/asm/kvm_host.h
@@ -0,0 +1,752 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This header defines architecture specific interfaces, x86 version
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 */
10
11#ifndef _ASM_X86_KVM_HOST_H
12#define _ASM_X86_KVM_HOST_H
13
14#include <linux/types.h>
15#include <linux/mm.h>
16#include <linux/mmu_notifier.h>
17
18#include <linux/kvm.h>
19#include <linux/kvm_para.h>
20#include <linux/kvm_types.h>
21
22#include <asm/pvclock-abi.h>
23#include <asm/desc.h>
24
25#define KVM_MAX_VCPUS 16
26#define KVM_MEMORY_SLOTS 32
 27/* memory slots that are not exposed to userspace */
28#define KVM_PRIVATE_MEM_SLOTS 4
29
30#define KVM_PIO_PAGE_OFFSET 1
31#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
32
33#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
34#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
35#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
36 0xFFFFFF0000000000ULL)
37
38#define KVM_GUEST_CR0_MASK \
39 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
40 | X86_CR0_NW | X86_CR0_CD)
41#define KVM_VM_CR0_ALWAYS_ON \
42 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
43 | X86_CR0_MP)
44#define KVM_GUEST_CR4_MASK \
45 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
46#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
47#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
48
49#define INVALID_PAGE (~(hpa_t)0)
50#define UNMAPPED_GVA (~(gpa_t)0)
51
52/* shadow tables are PAE even on non-PAE hosts */
53#define KVM_HPAGE_SHIFT 21
54#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
55#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
56
57#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
58
59#define DE_VECTOR 0
60#define DB_VECTOR 1
61#define BP_VECTOR 3
62#define OF_VECTOR 4
63#define BR_VECTOR 5
64#define UD_VECTOR 6
65#define NM_VECTOR 7
66#define DF_VECTOR 8
67#define TS_VECTOR 10
68#define NP_VECTOR 11
69#define SS_VECTOR 12
70#define GP_VECTOR 13
71#define PF_VECTOR 14
72#define MF_VECTOR 16
73#define MC_VECTOR 18
74
75#define SELECTOR_TI_MASK (1 << 2)
76#define SELECTOR_RPL_MASK 0x03
77
78#define IOPL_SHIFT 12
79
80#define KVM_ALIAS_SLOTS 4
81
82#define KVM_PERMILLE_MMU_PAGES 20
83#define KVM_MIN_ALLOC_MMU_PAGES 64
84#define KVM_MMU_HASH_SHIFT 10
85#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
86#define KVM_MIN_FREE_MMU_PAGES 5
87#define KVM_REFILL_PAGES 25
88#define KVM_MAX_CPUID_ENTRIES 40
89#define KVM_NR_VAR_MTRR 8
90
91extern spinlock_t kvm_lock;
92extern struct list_head vm_list;
93
94struct kvm_vcpu;
95struct kvm;
96
97enum kvm_reg {
98 VCPU_REGS_RAX = 0,
99 VCPU_REGS_RCX = 1,
100 VCPU_REGS_RDX = 2,
101 VCPU_REGS_RBX = 3,
102 VCPU_REGS_RSP = 4,
103 VCPU_REGS_RBP = 5,
104 VCPU_REGS_RSI = 6,
105 VCPU_REGS_RDI = 7,
106#ifdef CONFIG_X86_64
107 VCPU_REGS_R8 = 8,
108 VCPU_REGS_R9 = 9,
109 VCPU_REGS_R10 = 10,
110 VCPU_REGS_R11 = 11,
111 VCPU_REGS_R12 = 12,
112 VCPU_REGS_R13 = 13,
113 VCPU_REGS_R14 = 14,
114 VCPU_REGS_R15 = 15,
115#endif
116 VCPU_REGS_RIP,
117 NR_VCPU_REGS
118};
119
120enum {
121 VCPU_SREG_ES,
122 VCPU_SREG_CS,
123 VCPU_SREG_SS,
124 VCPU_SREG_DS,
125 VCPU_SREG_FS,
126 VCPU_SREG_GS,
127 VCPU_SREG_TR,
128 VCPU_SREG_LDTR,
129};
130
131#include <asm/kvm_x86_emulate.h>
132
133#define KVM_NR_MEM_OBJS 40
134
135struct kvm_guest_debug {
136 int enabled;
137 unsigned long bp[4];
138 int singlestep;
139};
140
141/*
142 * We don't want allocation failures within the mmu code, so we preallocate
143 * enough memory for a single page fault in a cache.
144 */
145struct kvm_mmu_memory_cache {
146 int nobjs;
147 void *objects[KVM_NR_MEM_OBJS];
148};
149
150#define NR_PTE_CHAIN_ENTRIES 5
151
152struct kvm_pte_chain {
153 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
154 struct hlist_node link;
155};
156
157/*
158 * kvm_mmu_page_role, below, is defined as:
159 *
160 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
161 * bits 4:7 - page table level for this shadow (1-4)
162 * bits 8:9 - page table quadrant for 2-level guests
163 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
164 * bits 17:19 - common access permissions for all ptes in this shadow page
165 */
166union kvm_mmu_page_role {
167 unsigned word;
168 struct {
169 unsigned glevels:4;
170 unsigned level:4;
171 unsigned quadrant:2;
172 unsigned pad_for_nice_hex_output:6;
173 unsigned metaphysical:1;
174 unsigned access:3;
175 unsigned invalid:1;
176 };
177};
178
179struct kvm_mmu_page {
180 struct list_head link;
181 struct hlist_node hash_link;
182
183 /*
184 * The following two entries are used to key the shadow page in the
185 * hash table.
186 */
187 gfn_t gfn;
188 union kvm_mmu_page_role role;
189
190 u64 *spt;
191 /* hold the gfn of each spte inside spt */
192 gfn_t *gfns;
193 unsigned long slot_bitmap; /* One bit set per slot which has memory
194 * in this shadow page.
195 */
196 int multimapped; /* More than one parent_pte? */
197 int root_count; /* Currently serving as active root */
198 bool unsync;
199 bool unsync_children;
200 union {
201 u64 *parent_pte; /* !multimapped */
202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
203 };
204 DECLARE_BITMAP(unsync_child_bitmap, 512);
205};
206
207struct kvm_pv_mmu_op_buffer {
208 void *ptr;
209 unsigned len;
210 unsigned processed;
211 char buf[512] __aligned(sizeof(long));
212};
213
214/*
215 * x86 supports 3 paging modes (4-level 64-bit, 3-level PAE, and 2-level
216 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
217 * mode.
218 */
219struct kvm_mmu {
220 void (*new_cr3)(struct kvm_vcpu *vcpu);
221 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
222 void (*free)(struct kvm_vcpu *vcpu);
223 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
224 void (*prefetch_page)(struct kvm_vcpu *vcpu,
225 struct kvm_mmu_page *page);
226 int (*sync_page)(struct kvm_vcpu *vcpu,
227 struct kvm_mmu_page *sp);
228 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
229 hpa_t root_hpa;
230 int root_level;
231 int shadow_root_level;
232
233 u64 *pae_root;
234};
235
236struct kvm_vcpu_arch {
237 u64 host_tsc;
238 int interrupt_window_open;
239 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
240 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
241 /*
242 * rip and regs accesses must go through
243 * kvm_{register,rip}_{read,write} functions.
244 */
245 unsigned long regs[NR_VCPU_REGS];
246 u32 regs_avail;
247 u32 regs_dirty;
248
249 unsigned long cr0;
250 unsigned long cr2;
251 unsigned long cr3;
252 unsigned long cr4;
253 unsigned long cr8;
254 u64 pdptrs[4]; /* pae */
255 u64 shadow_efer;
256 u64 apic_base;
257 struct kvm_lapic *apic; /* kernel irqchip context */
258 int mp_state;
259 int sipi_vector;
260 u64 ia32_misc_enable_msr;
261 bool tpr_access_reporting;
262
263 struct kvm_mmu mmu;
264 /* only needed in kvm_pv_mmu_op() path, but it's hot so
265 * put it here to avoid allocation */
266 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
267
268 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
269 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
270 struct kvm_mmu_memory_cache mmu_page_cache;
271 struct kvm_mmu_memory_cache mmu_page_header_cache;
272
273 gfn_t last_pt_write_gfn;
274 int last_pt_write_count;
275 u64 *last_pte_updated;
276 gfn_t last_pte_gfn;
277
278 struct {
279 gfn_t gfn; /* presumed gfn during guest pte update */
280 pfn_t pfn; /* pfn corresponding to that gfn */
281 int largepage;
282 unsigned long mmu_seq;
283 } update_pte;
284
285 struct i387_fxsave_struct host_fx_image;
286 struct i387_fxsave_struct guest_fx_image;
287
288 gva_t mmio_fault_cr2;
289 struct kvm_pio_request pio;
290 void *pio_data;
291
292 struct kvm_queued_exception {
293 bool pending;
294 bool has_error_code;
295 u8 nr;
296 u32 error_code;
297 } exception;
298
299 struct kvm_queued_interrupt {
300 bool pending;
301 u8 nr;
302 } interrupt;
303
304 struct {
305 int active;
306 u8 save_iopl;
307 struct kvm_save_segment {
308 u16 selector;
309 unsigned long base;
310 u32 limit;
311 u32 ar;
312 } tr, es, ds, fs, gs;
313 } rmode;
314 int halt_request; /* real mode on Intel only */
315
316 int cpuid_nent;
317 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
318 /* emulate context */
319
320 struct x86_emulate_ctxt emulate_ctxt;
321
322 gpa_t time;
323 struct pvclock_vcpu_time_info hv_clock;
324 unsigned int hv_clock_tsc_khz;
325 unsigned int time_offset;
326 struct page *time_page;
327
328 bool nmi_pending;
329 bool nmi_injected;
330
331 u64 mtrr[0x100];
332};
333
334struct kvm_mem_alias {
335 gfn_t base_gfn;
336 unsigned long npages;
337 gfn_t target_gfn;
338};
339
340struct kvm_arch {
341 int naliases;
342 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
343
344 unsigned int n_free_mmu_pages;
345 unsigned int n_requested_mmu_pages;
346 unsigned int n_alloc_mmu_pages;
347 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
348 /*
349 * Hash table of struct kvm_mmu_page.
350 */
351 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head;
353 struct dmar_domain *intel_iommu_domain;
354 struct kvm_pic *vpic;
355 struct kvm_ioapic *vioapic;
356 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list;
358
359 int round_robin_prev_vcpu;
360 unsigned int tss_addr;
361 struct page *apic_access_page;
362
363 gpa_t wall_clock;
364
365 struct page *ept_identity_pagetable;
366 bool ept_identity_pagetable_done;
367};
368
369struct kvm_vm_stat {
370 u32 mmu_shadow_zapped;
371 u32 mmu_pte_write;
372 u32 mmu_pte_updated;
373 u32 mmu_pde_zapped;
374 u32 mmu_flooded;
375 u32 mmu_recycled;
376 u32 mmu_cache_miss;
377 u32 mmu_unsync;
378 u32 remote_tlb_flush;
379 u32 lpages;
380};
381
382struct kvm_vcpu_stat {
383 u32 pf_fixed;
384 u32 pf_guest;
385 u32 tlb_flush;
386 u32 invlpg;
387
388 u32 exits;
389 u32 io_exits;
390 u32 mmio_exits;
391 u32 signal_exits;
392 u32 irq_window_exits;
393 u32 nmi_window_exits;
394 u32 halt_exits;
395 u32 halt_wakeup;
396 u32 request_irq_exits;
397 u32 irq_exits;
398 u32 host_state_reload;
399 u32 efer_reload;
400 u32 fpu_reload;
401 u32 insn_emulation;
402 u32 insn_emulation_fail;
403 u32 hypercalls;
404 u32 irq_injections;
405};
406
407struct descriptor_table {
408 u16 limit;
409 unsigned long base;
410} __attribute__((packed));
411
412struct kvm_x86_ops {
413 int (*cpu_has_kvm_support)(void); /* __init */
414 int (*disabled_by_bios)(void); /* __init */
415 void (*hardware_enable)(void *dummy); /* __init */
416 void (*hardware_disable)(void *dummy);
417 void (*check_processor_compatibility)(void *rtn);
418 int (*hardware_setup)(void); /* __init */
419 void (*hardware_unsetup)(void); /* __exit */
420 bool (*cpu_has_accelerated_tpr)(void);
421
422 /* Create, but do not attach this VCPU */
423 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
424 void (*vcpu_free)(struct kvm_vcpu *vcpu);
425 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
426
427 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
428 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
429 void (*vcpu_put)(struct kvm_vcpu *vcpu);
430
431 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
432 struct kvm_debug_guest *dbg);
433 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
434 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
435 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
436 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
437 void (*get_segment)(struct kvm_vcpu *vcpu,
438 struct kvm_segment *var, int seg);
439 int (*get_cpl)(struct kvm_vcpu *vcpu);
440 void (*set_segment)(struct kvm_vcpu *vcpu,
441 struct kvm_segment *var, int seg);
442 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
443 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
444 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
445 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
446 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
447 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
448 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
449 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
450 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
451 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
452 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
453 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
454 int *exception);
455 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
456 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
457 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
458
459 void (*tlb_flush)(struct kvm_vcpu *vcpu);
460
461 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
462 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
463 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
464 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
465 unsigned char *hypercall_addr);
466 int (*get_irq)(struct kvm_vcpu *vcpu);
467 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
468 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
469 bool has_error_code, u32 error_code);
470 bool (*exception_injected)(struct kvm_vcpu *vcpu);
471 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
472 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
473 struct kvm_run *run);
474
475 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
476 int (*get_tdp_level)(void);
477};
478
479extern struct kvm_x86_ops *kvm_x86_ops;
480
481int kvm_mmu_module_init(void);
482void kvm_mmu_module_exit(void);
483
484void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
485int kvm_mmu_create(struct kvm_vcpu *vcpu);
486int kvm_mmu_setup(struct kvm_vcpu *vcpu);
487void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
488void kvm_mmu_set_base_ptes(u64 base_pte);
489void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
490 u64 dirty_mask, u64 nx_mask, u64 x_mask);
491
492int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
493void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
494void kvm_mmu_zap_all(struct kvm *kvm);
495unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
496void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
497
498int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
499
500int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
501 const void *val, int bytes);
502int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
503 gpa_t addr, unsigned long *ret);
504
505extern bool tdp_enabled;
506
507enum emulation_result {
508 EMULATE_DONE, /* no further processing */
509 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
510 EMULATE_FAIL, /* can't emulate this instruction */
511};
512
513#define EMULTYPE_NO_DECODE (1 << 0)
514#define EMULTYPE_TRAP_UD (1 << 1)
515int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
516 unsigned long cr2, u16 error_code, int emulation_type);
517void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
518void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
519void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
520void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
521 unsigned long *rflags);
522
523unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
524void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
525 unsigned long *rflags);
526void kvm_enable_efer_bits(u64);
527int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
528int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
529
530struct x86_emulate_ctxt;
531
532int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
533 int size, unsigned port);
534int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
535 int size, unsigned long count, int down,
536 gva_t address, int rep, unsigned port);
537void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
538int kvm_emulate_halt(struct kvm_vcpu *vcpu);
539int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
540int emulate_clts(struct kvm_vcpu *vcpu);
541int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
542 unsigned long *dest);
543int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
544 unsigned long value);
545
546void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
547int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
548 int type_bits, int seg);
549
550int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
551
552void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
553void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
554void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
555void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
556unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
557void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
558void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
559
560int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
561int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
562
563void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
564void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
565void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
566 u32 error_code);
567
568void kvm_pic_set_irq(void *opaque, int irq, int level);
569
570void kvm_inject_nmi(struct kvm_vcpu *vcpu);
571
572void fx_init(struct kvm_vcpu *vcpu);
573
574int emulator_read_std(unsigned long addr,
575 void *val,
576 unsigned int bytes,
577 struct kvm_vcpu *vcpu);
578int emulator_write_emulated(unsigned long addr,
579 const void *val,
580 unsigned int bytes,
581 struct kvm_vcpu *vcpu);
582
583unsigned long segment_base(u16 selector);
584
585void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
586void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
587 const u8 *new, int bytes);
588int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
589void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
590int kvm_mmu_load(struct kvm_vcpu *vcpu);
591void kvm_mmu_unload(struct kvm_vcpu *vcpu);
592void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
593
594int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
595
596int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
597
598int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
599void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
600
601void kvm_enable_tdp(void);
602void kvm_disable_tdp(void);
603
604int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
605int complete_pio(struct kvm_vcpu *vcpu);
606
607static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
608{
609 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
610
611 return (struct kvm_mmu_page *)page_private(page);
612}
613
614static inline u16 kvm_read_fs(void)
615{
616 u16 seg;
617 asm("mov %%fs, %0" : "=g"(seg));
618 return seg;
619}
620
621static inline u16 kvm_read_gs(void)
622{
623 u16 seg;
624 asm("mov %%gs, %0" : "=g"(seg));
625 return seg;
626}
627
628static inline u16 kvm_read_ldt(void)
629{
630 u16 ldt;
631 asm("sldt %0" : "=g"(ldt));
632 return ldt;
633}
634
635static inline void kvm_load_fs(u16 sel)
636{
637 asm("mov %0, %%fs" : : "rm"(sel));
638}
639
640static inline void kvm_load_gs(u16 sel)
641{
642 asm("mov %0, %%gs" : : "rm"(sel));
643}
644
645static inline void kvm_load_ldt(u16 sel)
646{
647 asm("lldt %0" : : "rm"(sel));
648}
649
650static inline void kvm_get_idt(struct descriptor_table *table)
651{
652 asm("sidt %0" : "=m"(*table));
653}
654
655static inline void kvm_get_gdt(struct descriptor_table *table)
656{
657 asm("sgdt %0" : "=m"(*table));
658}
659
660static inline unsigned long kvm_read_tr_base(void)
661{
662 u16 tr;
663 asm("str %0" : "=g"(tr));
664 return segment_base(tr);
665}
666
667#ifdef CONFIG_X86_64
668static inline unsigned long read_msr(unsigned long msr)
669{
670 u64 value;
671
672 rdmsrl(msr, value);
673 return value;
674}
675#endif
676
677static inline void kvm_fx_save(struct i387_fxsave_struct *image)
678{
679 asm("fxsave (%0)":: "r" (image));
680}
681
682static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
683{
684 asm("fxrstor (%0)":: "r" (image));
685}
686
687static inline void kvm_fx_finit(void)
688{
689 asm("finit");
690}
691
692static inline u32 get_rdx_init_val(void)
693{
694 return 0x600; /* P6 family */
695}
696
697static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
698{
699 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
700}
701
702#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
703#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
704#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
705#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
706#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
707#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
708#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
709#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
710#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
711#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
712#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
713
714#define MSR_IA32_TIME_STAMP_COUNTER 0x010
715
716#define TSS_IOPB_BASE_OFFSET 0x66
717#define TSS_BASE_SIZE 0x68
718#define TSS_IOPB_SIZE (65536 / 8)
719#define TSS_REDIRECTION_SIZE (256 / 8)
720#define RMODE_TSS_SIZE \
721 (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
722
723enum {
724 TASK_SWITCH_CALL = 0,
725 TASK_SWITCH_IRET = 1,
726 TASK_SWITCH_JMP = 2,
727 TASK_SWITCH_GATE = 3,
728};
729
730/*
731 * Hardware virtualization extension instructions may fault if a
732 * reboot turns off virtualization while processes are running.
733 * Trap the fault and ignore the instruction if that happens.
734 */
735asmlinkage void kvm_handle_fault_on_reboot(void);
736
737#define __kvm_handle_fault_on_reboot(insn) \
738 "666: " insn "\n\t" \
739 ".pushsection .fixup, \"ax\" \n" \
740 "667: \n\t" \
741 __ASM_SIZE(push) " $666b \n\t" \
742 "jmp kvm_handle_fault_on_reboot \n\t" \
743 ".popsection \n\t" \
744 ".pushsection __ex_table, \"a\" \n\t" \
745 _ASM_PTR " 666b, 667b \n\t" \
746 ".popsection"
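/*
 * Usage sketch: a VMX helper can wrap one of the ASM_VMX_* opcode strings
 * above so that a fault during reboot is swallowed.  The __ex short-hand
 * below is the conventional spelling in the vendor modules:
 */
#define __ex(insn) __kvm_handle_fault_on_reboot(insn)

static inline void vmclear_sketch(u64 phys_addr)
{
	asm volatile(__ex(ASM_VMX_VMCLEAR_RAX)
		     : : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory");
}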
747
748#define KVM_ARCH_WANT_MMU_NOTIFIER
749int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
750int kvm_age_hva(struct kvm *kvm, unsigned long hva);
751
752#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
new file mode 100644
index 00000000000..b8a3305ae09
--- /dev/null
+++ b/arch/x86/include/asm/kvm_para.h
@@ -0,0 +1,147 @@
1#ifndef _ASM_X86_KVM_PARA_H
2#define _ASM_X86_KVM_PARA_H
3
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
 5 * should be used to determine whether a VM is running under KVM.
6 */
7#define KVM_CPUID_SIGNATURE 0x40000000
8
9/* This CPUID returns a feature bitmap in eax. Before enabling a particular
10 * paravirtualization, the appropriate feature bit should be checked.
11 */
12#define KVM_CPUID_FEATURES 0x40000001
13#define KVM_FEATURE_CLOCKSOURCE 0
14#define KVM_FEATURE_NOP_IO_DELAY 1
15#define KVM_FEATURE_MMU_OP 2
16
17#define MSR_KVM_WALL_CLOCK 0x11
18#define MSR_KVM_SYSTEM_TIME 0x12
19
20#define KVM_MAX_MMU_OP_BATCH 32
21
22/* Operations for KVM_HC_MMU_OP */
23#define KVM_MMU_OP_WRITE_PTE 1
24#define KVM_MMU_OP_FLUSH_TLB 2
25#define KVM_MMU_OP_RELEASE_PT 3
26
27/* Payload for KVM_HC_MMU_OP */
28struct kvm_mmu_op_header {
29 __u32 op;
30 __u32 pad;
31};
32
33struct kvm_mmu_op_write_pte {
34 struct kvm_mmu_op_header header;
35 __u64 pte_phys;
36 __u64 pte_val;
37};
38
39struct kvm_mmu_op_flush_tlb {
40 struct kvm_mmu_op_header header;
41};
42
43struct kvm_mmu_op_release_pt {
44 struct kvm_mmu_op_header header;
45 __u64 pt_phys;
46};
47
48#ifdef __KERNEL__
49#include <asm/processor.h>
50
51extern void kvmclock_init(void);
52
53
54/* This instruction is vmcall. On CPUs without Intel VT, it will generate
55 * a trap that the host then rewrites to the appropriate instruction.
56 */
57#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
58
59/* For KVM hypercalls, a three-byte sequence of either the vmcall or the
60 * vmmcall instruction. The hypervisor may replace it with something else,
61 * but only these instructions are guaranteed to be supported.
62 *
63 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
64 * The hypercall number should be placed in rax and the return value will be
65 * placed in rax. No other registers will be clobbered unless explicitly
66 * noted by the particular hypercall.
67 */
68
69static inline long kvm_hypercall0(unsigned int nr)
70{
71 long ret;
72 asm volatile(KVM_HYPERCALL
73 : "=a"(ret)
74 : "a"(nr)
75 : "memory");
76 return ret;
77}
78
79static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
80{
81 long ret;
82 asm volatile(KVM_HYPERCALL
83 : "=a"(ret)
84 : "a"(nr), "b"(p1)
85 : "memory");
86 return ret;
87}
88
89static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
90 unsigned long p2)
91{
92 long ret;
93 asm volatile(KVM_HYPERCALL
94 : "=a"(ret)
95 : "a"(nr), "b"(p1), "c"(p2)
96 : "memory");
97 return ret;
98}
99
100static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
101 unsigned long p2, unsigned long p3)
102{
103 long ret;
104 asm volatile(KVM_HYPERCALL
105 : "=a"(ret)
106 : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
107 : "memory");
108 return ret;
109}
110
111static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
112 unsigned long p2, unsigned long p3,
113 unsigned long p4)
114{
115 long ret;
116 asm volatile(KVM_HYPERCALL
117 : "=a"(ret)
118 : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
119 : "memory");
120 return ret;
121}
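/*
 * Usage sketch: all variants above follow the same shape; only the number
 * of register arguments differs.  The hypercall number 42 is illustrative
 * (real numbers come from the generic <linux/kvm_para.h>):
 */
static inline long kvm_hypercall_sketch(unsigned long gpa, unsigned long len)
{
	return kvm_hypercall2(42, gpa, len);
}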
122
123static inline int kvm_para_available(void)
124{
125 unsigned int eax, ebx, ecx, edx;
126 char signature[13];
127
128 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
129 memcpy(signature + 0, &ebx, 4);
130 memcpy(signature + 4, &ecx, 4);
131 memcpy(signature + 8, &edx, 4);
132 signature[12] = 0;
133
134 if (strcmp(signature, "KVMKVMKVM") == 0)
135 return 1;
136
137 return 0;
138}
139
140static inline unsigned int kvm_arch_para_features(void)
141{
142 return cpuid_eax(KVM_CPUID_FEATURES);
143}
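/*
 * Usage sketch: gate a paravirtual feature on both the CPUID signature and
 * the advertised feature bitmap:
 */
static inline int kvm_has_clocksource_sketch(void)
{
	return kvm_para_available() &&
	       (kvm_arch_para_features() & (1 << KVM_FEATURE_CLOCKSOURCE));
}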
144
145#endif
146
147#endif /* _ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
new file mode 100644
index 00000000000..25179a29f20
--- /dev/null
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -0,0 +1,184 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef _ASM_X86_KVM_X86_EMULATE_H
12#define _ASM_X86_KVM_X86_EMULATE_H
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu);
64
65 /*
66 * read_emulated: Read bytes from emulated/special memory area.
67 * @addr: [IN ] Linear address from which to read.
68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
69 * @bytes: [IN ] Number of bytes to read from memory.
70 */
71 int (*read_emulated)(unsigned long addr,
72 void *val,
73 unsigned int bytes,
74 struct kvm_vcpu *vcpu);
75
76 /*
77	 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required).
81 * @bytes: [IN ] Number of bytes to write to memory.
82 */
83 int (*write_emulated)(unsigned long addr,
84 const void *val,
85 unsigned int bytes,
86 struct kvm_vcpu *vcpu);
87
88 /*
89 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
90 * emulated/special memory area.
91 * @addr: [IN ] Linear address to access.
92 * @old: [IN ] Value expected to be current at @addr.
93 * @new: [IN ] Value to write to @addr.
94 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
95 */
96 int (*cmpxchg_emulated)(unsigned long addr,
97 const void *old,
98 const void *new,
99 unsigned int bytes,
100 struct kvm_vcpu *vcpu);
101
102};
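/*
 * Wiring sketch: the embedder supplies one ops table to x86_decode_insn()
 * and x86_emulate_insn() below.  emulator_read_std() and
 * emulator_write_emulated() are declared in kvm_host.h; the other two
 * callback names are illustrative:
 *
 *	static struct x86_emulate_ops emulate_ops = {
 *		.read_std	  = emulator_read_std,
 *		.read_emulated	  = emulator_read_emulated,
 *		.write_emulated	  = emulator_write_emulated,
 *		.cmpxchg_emulated = emulator_cmpxchg_emulated,
 *	};
 */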
103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 bool has_seg_override;
128 u8 seg_override;
129 unsigned int d;
130 unsigned long regs[NR_VCPU_REGS];
131 unsigned long eip;
132 /* modrm */
133 u8 modrm;
134 u8 modrm_mod;
135 u8 modrm_reg;
136 u8 modrm_rm;
137 u8 use_modrm_ea;
138 bool rip_relative;
139 unsigned long modrm_ea;
140 void *modrm_ptr;
141 unsigned long modrm_val;
142 struct fetch_cache fetch;
143};
144
145struct x86_emulate_ctxt {
146 /* Register state before/after emulation. */
147 struct kvm_vcpu *vcpu;
148
149	/* Guest EFLAGS before/after emulation. */
150 unsigned long eflags;
151
152 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
153 int mode;
154
155 u32 cs_base;
156
157 /* decode cache */
158
159 struct decode_cache decode;
160};
161
162/* Repeat String Operation Prefix */
163#define REPE_PREFIX 1
164#define REPNE_PREFIX 2
165
166/* Execution mode, passed to the emulator. */
167#define X86EMUL_MODE_REAL 0 /* Real mode. */
168#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
169#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
170#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
171
172/* Host execution mode. */
173#if defined(__i386__)
174#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
175#elif defined(CONFIG_X86_64)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
177#endif
178
179int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
180 struct x86_emulate_ops *ops);
181int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
182 struct x86_emulate_ops *ops);
183
184#endif /* _ASM_X86_KVM_X86_EMULATE_H */
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
new file mode 100644
index 00000000000..46727eb37bf
--- /dev/null
+++ b/arch/x86/include/asm/ldt.h
@@ -0,0 +1,40 @@
1/*
2 * ldt.h
3 *
4 * Definitions of structures used with the modify_ldt system call.
5 */
6#ifndef _ASM_X86_LDT_H
7#define _ASM_X86_LDT_H
8
9/* Maximum number of LDT entries supported. */
10#define LDT_ENTRIES 8192
11/* The size of each LDT entry. */
12#define LDT_ENTRY_SIZE 8
13
14#ifndef __ASSEMBLY__
15/*
16 * Note: on 64-bit, base and limit are ignored, and you cannot set
17 * DS/ES/CS to anything other than the default values if you still want
18 * to do syscalls. This call is therefore mainly useful for 32-bit mode.
19 */
20struct user_desc {
21 unsigned int entry_number;
22 unsigned int base_addr;
23 unsigned int limit;
24 unsigned int seg_32bit:1;
25 unsigned int contents:2;
26 unsigned int read_exec_only:1;
27 unsigned int limit_in_pages:1;
28 unsigned int seg_not_present:1;
29 unsigned int useable:1;
30#ifdef __x86_64__
31 unsigned int lm:1;
32#endif
33};
34
35#define MODIFY_LDT_CONTENTS_DATA 0
36#define MODIFY_LDT_CONTENTS_STACK 1
37#define MODIFY_LDT_CONTENTS_CODE 2
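/*
 * Userspace sketch: installing one 32-bit data segment via the raw
 * modify_ldt(2) syscall (glibc provides no wrapper).  The buf pointer and
 * the omitted error handling are illustrative:
 *
 *	struct user_desc d = {
 *		.entry_number	= 0,
 *		.base_addr	= (unsigned int)(unsigned long)buf,
 *		.limit		= 0xfffff,
 *		.seg_32bit	= 1,
 *		.contents	= MODIFY_LDT_CONTENTS_DATA,
 *		.limit_in_pages	= 1,
 *		.useable	= 1,
 *	};
 *	syscall(SYS_modify_ldt, 1, &d, sizeof(d));
 */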
38
39#endif /* !__ASSEMBLY__ */
40#endif /* _ASM_X86_LDT_H */
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
new file mode 100644
index 00000000000..d28a507cef3
--- /dev/null
+++ b/arch/x86/include/asm/lguest.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_LGUEST_H
2#define _ASM_X86_LGUEST_H
3
4#define GDT_ENTRY_LGUEST_CS 10
5#define GDT_ENTRY_LGUEST_DS 11
6#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
7#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
8
9#ifndef __ASSEMBLY__
10#include <asm/desc.h>
11
12#define GUEST_PL 1
13
14/* Every guest maps the core switcher code. */
15#define SHARED_SWITCHER_PAGES \
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */
21#define SWITCHER_ADDR 0xFFC00000
22
23/* Found in switcher.S */
24extern unsigned long default_idt_entries[];
25
26/* Declarations for definitions in lguest_guest.S */
27extern char lguest_noirq_start[], lguest_noirq_end[];
28extern const char lgstart_cli[], lgend_cli[];
29extern const char lgstart_sti[], lgend_sti[];
30extern const char lgstart_popf[], lgend_popf[];
31extern const char lgstart_pushf[], lgend_pushf[];
32extern const char lgstart_iret[], lgend_iret[];
33
34extern void lguest_iret(void);
35extern void lguest_init(void);
36
37struct lguest_regs {
38 /* Manually saved part. */
39 unsigned long eax, ebx, ecx, edx;
40 unsigned long esi, edi, ebp;
41 unsigned long gs;
42 unsigned long fs, ds, es;
43 unsigned long trapnum, errcode;
44 /* Trap pushed part */
45 unsigned long eip;
46 unsigned long cs;
47 unsigned long eflags;
48 unsigned long esp;
49 unsigned long ss;
50};
51
52/* This is a guest-specific page, mapped read-only into the Guest. */
53struct lguest_ro_state {
54 /* Host information we need to restore when we switch back. */
55 u32 host_cr3;
56 struct desc_ptr host_idt_desc;
57 struct desc_ptr host_gdt_desc;
58 u32 host_sp;
59
60 /* Fields which are used when guest is running. */
61 struct desc_ptr guest_idt_desc;
62 struct desc_ptr guest_gdt_desc;
63 struct x86_hw_tss guest_tss;
64 struct desc_struct guest_idt[IDT_ENTRIES];
65 struct desc_struct guest_gdt[GDT_ENTRIES];
66};
67
68struct lg_cpu_arch {
69 /* The GDT entries copied into lguest_ro_state when running. */
70 struct desc_struct gdt[GDT_ENTRIES];
71
72 /* The IDT entries: some copied into lguest_ro_state when running. */
73 struct desc_struct idt[IDT_ENTRIES];
74
75	/* The address of the last guest-visible page fault (i.e. cr2). */
76 unsigned long last_pagefault;
77};
78
79static inline void lguest_set_ts(void)
80{
81 u32 cr0;
82
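	/* bit 3 (0x8) of CR0 is TS, the Task Switched flag */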
83 cr0 = read_cr0();
84 if (!(cr0 & 8))
85 write_cr0(cr0 | 8);
86}
87
88/* Full 4G segment descriptors, suitable for CS and DS. */
89#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
90#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
91
92#endif /* __ASSEMBLY__ */
93
94#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
new file mode 100644
index 00000000000..43894428c3c
--- /dev/null
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -0,0 +1,71 @@
1/* Architecture specific portion of the lguest hypercalls */
2#ifndef _ASM_X86_LGUEST_HCALL_H
3#define _ASM_X86_LGUEST_HCALL_H
4
5#define LHCALL_FLUSH_ASYNC 0
6#define LHCALL_LGUEST_INIT 1
7#define LHCALL_SHUTDOWN 2
8#define LHCALL_LOAD_GDT 3
9#define LHCALL_NEW_PGTABLE 4
10#define LHCALL_FLUSH_TLB 5
11#define LHCALL_LOAD_IDT_ENTRY 6
12#define LHCALL_SET_STACK 7
13#define LHCALL_TS 8
14#define LHCALL_SET_CLOCKEVENT 9
15#define LHCALL_HALT 10
16#define LHCALL_SET_PTE 14
17#define LHCALL_SET_PMD 15
18#define LHCALL_LOAD_TLS 16
19#define LHCALL_NOTIFY 17
20
21#define LGUEST_TRAP_ENTRY 0x1F
22
23/* Argument number 3 to LHCALL_SHUTDOWN */
24#define LGUEST_SHUTDOWN_POWEROFF 1
25#define LGUEST_SHUTDOWN_RESTART 2
26
27#ifndef __ASSEMBLY__
28#include <asm/hw_irq.h>
29
30/*G:031 But first, how does our Guest contact the Host to ask for privileged
31 * operations? There are two ways: the direct way is to make a "hypercall",
32 * to make requests of the Host Itself.
33 *
34 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
35 * above are used by real hardware interrupts). Fifteen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
38 * value makes sense, it's returned in %eax.
39 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's
42 * definition of a gentleman: "someone who is only rude intentionally". */
43static inline unsigned long
44hcall(unsigned long call,
45 unsigned long arg1, unsigned long arg2, unsigned long arg3)
46{
47 /* "int" is the Intel instruction to trigger a trap. */
48 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
49 /* The call in %eax (aka "a") might be overwritten */
50 : "=a"(call)
51 /* The arguments are in %eax, %edx, %ebx & %ecx */
52 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
53 /* "memory" means this might write somewhere in memory.
54 * This isn't true for all calls, but it's safe to tell
55 * gcc that it might happen so it doesn't get clever. */
56 : "memory");
57 return call;
58}
59/*:*/
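/*
 * Usage sketch: unused arguments are simply passed as zero (the argument
 * meaning below is illustrative):
 */
static inline void lguest_hcall_sketch(void)
{
	hcall(LHCALL_TS, 1, 0, 0);
}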
60
61/* Can't use our min() macro here: needs to be a constant */
62#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS : 32)
63
64#define LHCALL_RING_SIZE 64
65struct hcall_args {
66 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 unsigned long arg0, arg2, arg3, arg1;
68};
69
70#endif /* !__ASSEMBLY__ */
71#endif /* _ASM_X86_LGUEST_HCALL_H */
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
new file mode 100644
index 00000000000..f61ee8f937e
--- /dev/null
+++ b/arch/x86/include/asm/linkage.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H
3
4#undef notrace
5#define notrace __attribute__((no_instrument_function))
6
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/*
15 * For 32-bit UML - mark functions implemented in assembly that use
16 * regparm input parameters:
17 */
18#define asmregparm __attribute__((regparm(3)))
19
20/*
21 * Make sure the compiler doesn't do anything stupid with the
22 * arguments on the stack - they are owned by the *caller*, not
23 * the callee. This just fools gcc into not spilling into them,
24 * and keeps it from doing tailcall recursion and/or using the
25 * stack slots for temporaries, since they are live and "used"
26 * all the way to the end of the function.
27 *
28 * NOTE! On x86-64, all the arguments are in registers, so this
29 * only matters on a 32-bit kernel.
30 */
31#define asmlinkage_protect(n, ret, args...) \
32 __asmlinkage_protect##n(ret, ##args)
33#define __asmlinkage_protect_n(ret, args...) \
34 __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
35#define __asmlinkage_protect0(ret) \
36 __asmlinkage_protect_n(ret)
37#define __asmlinkage_protect1(ret, arg1) \
38 __asmlinkage_protect_n(ret, "g" (arg1))
39#define __asmlinkage_protect2(ret, arg1, arg2) \
40 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
41#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
42 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
43#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
44 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
45 "g" (arg4))
46#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
48 "g" (arg4), "g" (arg5))
49#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6))
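/*
 * Usage sketch, for a hypothetical three-argument asmlinkage syscall: the
 * protect goes just before the return so every stack argument stays live:
 *
 *	asmlinkage long sys_foo(int a, long b, long c)
 *	{
 *		long ret = do_foo(a, b, c);
 *		asmlinkage_protect(3, ret, a, b, c);
 *		return ret;
 *	}
 */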
52
53#endif
54
55#ifdef CONFIG_X86_ALIGNMENT_16
56#define __ALIGN .align 16,0x90
57#define __ALIGN_STR ".align 16,0x90"
58#endif
59
60#endif /* _ASM_X86_LINKAGE_H */
61
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
new file mode 100644
index 00000000000..47b9b6f1905
--- /dev/null
+++ b/arch/x86/include/asm/local.h
@@ -0,0 +1,235 @@
1#ifndef _ASM_X86_LOCAL_H
2#define _ASM_X86_LOCAL_H
3
4#include <linux/percpu.h>
5
6#include <asm/system.h>
7#include <asm/atomic.h>
8#include <asm/asm.h>
9
10typedef struct {
11 atomic_long_t a;
12} local_t;
13
14#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
15
16#define local_read(l) atomic_long_read(&(l)->a)
17#define local_set(l, i) atomic_long_set(&(l)->a, (i))
18
19static inline void local_inc(local_t *l)
20{
21 asm volatile(_ASM_INC "%0"
22 : "+m" (l->a.counter));
23}
24
25static inline void local_dec(local_t *l)
26{
27 asm volatile(_ASM_DEC "%0"
28 : "+m" (l->a.counter));
29}
30
31static inline void local_add(long i, local_t *l)
32{
33 asm volatile(_ASM_ADD "%1,%0"
34 : "+m" (l->a.counter)
35 : "ir" (i));
36}
37
38static inline void local_sub(long i, local_t *l)
39{
40 asm volatile(_ASM_SUB "%1,%0"
41 : "+m" (l->a.counter)
42 : "ir" (i));
43}
44
45/**
46 * local_sub_and_test - subtract value from variable and test result
47 * @i: integer value to subtract
48 * @l: pointer to type local_t
49 *
50 * Atomically subtracts @i from @l and returns
51 * true if the result is zero, or false for all
52 * other cases.
53 */
54static inline int local_sub_and_test(long i, local_t *l)
55{
56 unsigned char c;
57
58 asm volatile(_ASM_SUB "%2,%0; sete %1"
59 : "+m" (l->a.counter), "=qm" (c)
60 : "ir" (i) : "memory");
61 return c;
62}
63
64/**
65 * local_dec_and_test - decrement and test
66 * @l: pointer to type local_t
67 *
68 * Atomically decrements @l by 1 and
69 * returns true if the result is 0, or false for all other
70 * cases.
71 */
72static inline int local_dec_and_test(local_t *l)
73{
74 unsigned char c;
75
76 asm volatile(_ASM_DEC "%0; sete %1"
77 : "+m" (l->a.counter), "=qm" (c)
78 : : "memory");
79 return c != 0;
80}
81
82/**
83 * local_inc_and_test - increment and test
84 * @l: pointer to type local_t
85 *
86 * Atomically increments @l by 1
87 * and returns true if the result is zero, or false for all
88 * other cases.
89 */
90static inline int local_inc_and_test(local_t *l)
91{
92 unsigned char c;
93
94 asm volatile(_ASM_INC "%0; sete %1"
95 : "+m" (l->a.counter), "=qm" (c)
96 : : "memory");
97 return c != 0;
98}
99
100/**
101 * local_add_negative - add and test if negative
102 * @i: integer value to add
103 * @l: pointer to type local_t
104 *
105 * Atomically adds @i to @l and returns true
106 * if the result is negative, or false when
107 * result is greater than or equal to zero.
108 */
109static inline int local_add_negative(long i, local_t *l)
110{
111 unsigned char c;
112
113 asm volatile(_ASM_ADD "%2,%0; sets %1"
114 : "+m" (l->a.counter), "=qm" (c)
115 : "ir" (i) : "memory");
116 return c;
117}
118
119/**
120 * local_add_return - add and return
121 * @i: integer value to add
122 * @l: pointer to type local_t
123 *
124 * Atomically adds @i to @l and returns @i + @l
125 */
126static inline long local_add_return(long i, local_t *l)
127{
128 long __i;
129#ifdef CONFIG_M386
130 unsigned long flags;
131 if (unlikely(boot_cpu_data.x86 <= 3))
132 goto no_xadd;
133#endif
134 /* Modern 486+ processor */
135 __i = i;
136 asm volatile(_ASM_XADD "%0, %1;"
137 : "+r" (i), "+m" (l->a.counter)
138 : : "memory");
139 return i + __i;
140
141#ifdef CONFIG_M386
142no_xadd: /* Legacy 386 processor */
143 local_irq_save(flags);
144 __i = local_read(l);
145 local_set(l, i + __i);
146 local_irq_restore(flags);
147 return i + __i;
148#endif
149}
150
151static inline long local_sub_return(long i, local_t *l)
152{
153 return local_add_return(-i, l);
154}
155
156#define local_inc_return(l) (local_add_return(1, l))
157#define local_dec_return(l) (local_sub_return(1, l))
158
159#define local_cmpxchg(l, o, n) \
160 (cmpxchg_local(&((l)->a.counter), (o), (n)))
161/* Always has a lock prefix */
162#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
163
164/**
165 * local_add_unless - add unless the number is a given value
166 * @l: pointer of type local_t
167 * @a: the amount to add to l...
168 * @u: ...unless l is equal to u.
169 *
170 * Atomically adds @a to @l, so long as it was not @u.
171 * Returns non-zero if @l was not @u, and zero otherwise.
172 */
173#define local_add_unless(l, a, u) \
174({ \
175 long c, old; \
176 c = local_read((l)); \
177 for (;;) { \
178 if (unlikely(c == (u))) \
179 break; \
180 old = local_cmpxchg((l), c, c + (a)); \
181 if (likely(old == c)) \
182 break; \
183 c = old; \
184 } \
185 c != (u); \
186})
187#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
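/*
 * Usage sketch: a per-cpu event counter (DEFINE_PER_CPU and get_cpu_var
 * come from <linux/percpu.h>, included above):
 *
 *	static DEFINE_PER_CPU(local_t, events) = LOCAL_INIT(0);
 *
 *	static void note_event(void)
 *	{
 *		local_inc(&get_cpu_var(events));
 *		put_cpu_var(events);
 *	}
 */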
188
189/* On x86_32, these are no better than the atomic variants.
190 * On x86-64 these are better than the atomic variants on SMP kernels
191 * because they don't use a lock prefix.
192 */
193#define __local_inc(l) local_inc(l)
194#define __local_dec(l) local_dec(l)
195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l))
197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the CPU-local counters; otherwise we could
207   still access a variable of a previous CPU in a non-atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221})
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/mach-default/apm.h
new file mode 100644
index 00000000000..20370c6db74
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/apm.h
@@ -0,0 +1,73 @@
1/*
2 * Machine specific APM BIOS functions for generic.
3 * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5
6#ifndef _ASM_X86_MACH_DEFAULT_APM_H
7#define _ASM_X86_MACH_DEFAULT_APM_H
8
9#ifdef APM_ZERO_SEGS
10# define APM_DO_ZERO_SEGS \
11 "pushl %%ds\n\t" \
12 "pushl %%es\n\t" \
13 "xorl %%edx, %%edx\n\t" \
14 "mov %%dx, %%ds\n\t" \
15 "mov %%dx, %%es\n\t" \
16 "mov %%dx, %%fs\n\t" \
17 "mov %%dx, %%gs\n\t"
18# define APM_DO_POP_SEGS \
19 "popl %%es\n\t" \
20 "popl %%ds\n\t"
21#else
22# define APM_DO_ZERO_SEGS
23# define APM_DO_POP_SEGS
24#endif
25
26static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
27 u32 *eax, u32 *ebx, u32 *ecx,
28 u32 *edx, u32 *esi)
29{
30 /*
31 * N.B. We do NOT need a cld after the BIOS call
32 * because we always save and restore the flags.
33 */
34 __asm__ __volatile__(APM_DO_ZERO_SEGS
35 "pushl %%edi\n\t"
36 "pushl %%ebp\n\t"
37 "lcall *%%cs:apm_bios_entry\n\t"
38 "setc %%al\n\t"
39 "popl %%ebp\n\t"
40 "popl %%edi\n\t"
41 APM_DO_POP_SEGS
42 : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx),
43 "=S" (*esi)
44 : "a" (func), "b" (ebx_in), "c" (ecx_in)
45 : "memory", "cc");
46}
47
48static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
49 u32 ecx_in, u32 *eax)
50{
51 int cx, dx, si;
52 u8 error;
53
54 /*
55 * N.B. We do NOT need a cld after the BIOS call
56 * because we always save and restore the flags.
57 */
58 __asm__ __volatile__(APM_DO_ZERO_SEGS
59 "pushl %%edi\n\t"
60 "pushl %%ebp\n\t"
61 "lcall *%%cs:apm_bios_entry\n\t"
62 "setc %%bl\n\t"
63 "popl %%ebp\n\t"
64 "popl %%edi\n\t"
65 APM_DO_POP_SEGS
66 : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx),
67 "=S" (si)
68 : "a" (func), "b" (ebx_in), "c" (ecx_in)
69 : "memory", "cc");
70 return error;
71}
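/*
 * Usage sketch: a call with a single interesting output, such as CPU idle
 * (function 0x5305 in the APM 1.2 specification), fits the simple variant:
 *
 *	u32 eax;
 *	u8 err = apm_bios_call_simple_asm(0x5305, 0, 0, &eax);
 */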
72
73#endif /* _ASM_X86_MACH_DEFAULT_APM_H */
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/mach-default/do_timer.h
new file mode 100644
index 00000000000..23ecda0b28a
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/do_timer.h
@@ -0,0 +1,16 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/i8259.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the pit clock event handler. see asm/i8253.h
11 **/
12
13static inline void do_timer_interrupt_hook(void)
14{
15 global_clock_event->event_handler(global_clock_event);
16}
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
new file mode 100644
index 00000000000..6b1add8e31d
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -0,0 +1,36 @@
1/*
2 * This file is designed to contain the BUILD_INTERRUPT specifications for
3 * all of the extra named interrupt vectors used by the architecture.
 3 * Usually these are the Inter-Processor Interrupts (IPIs)
5 */
6
7/*
8 * The following vectors are part of the Linux architecture, there
9 * is no hardware IRQ pin equivalent for them, they are triggered
10 * through the ICC by us (IPIs)
11 */
12#ifdef CONFIG_X86_SMP
13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
15BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
16BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
18#endif
19
20/*
21 * every Pentium local APIC has two 'local interrupts', with a
22 * soft-definable vector attached to both interrupts, one of
23 * which is a timer interrupt, the other one is error counter
24 * overflow. Linux uses the local APIC timer interrupt to get
25 * a much simpler SMP time architecture:
26 */
27#ifdef CONFIG_X86_LOCAL_APIC
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31
32#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif
35
36#endif
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
new file mode 100644
index 00000000000..ff3a6c236c0
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -0,0 +1,156 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
3
4#ifdef CONFIG_X86_LOCAL_APIC
5
6#include <mach_apicdef.h>
7#include <asm/smp.h>
8
9#define APIC_DFR_VALUE (APIC_DFR_FLAT)
10
11static inline cpumask_t target_cpus(void)
12{
13#ifdef CONFIG_SMP
14 return cpu_online_map;
15#else
16 return cpumask_of_cpu(0);
17#endif
18}
19
20#define NO_BALANCE_IRQ (0)
21#define esr_disable (0)
22
23#ifdef CONFIG_X86_64
24#include <asm/genapic.h>
25#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
26#define INT_DEST_MODE (genapic->int_dest_mode)
27#define TARGET_CPUS (genapic->target_cpus())
28#define apic_id_registered (genapic->apic_id_registered)
29#define init_apic_ldr (genapic->init_apic_ldr)
30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
31#define phys_pkg_id (genapic->phys_pkg_id)
32#define vector_allocation_domain (genapic->vector_allocation_domain)
33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
34#define send_IPI_self (genapic->send_IPI_self)
35extern void setup_apic_routing(void);
36#else
37#define INT_DELIVERY_MODE dest_LowestPrio
38#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
39#define TARGET_CPUS (target_cpus())
40/*
41 * Set up the logical destination ID.
42 *
 43 * Intel recommends setting DFR, LDR and TPR before enabling
44 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
45 * document number 292116). So here it goes...
46 */
47static inline void init_apic_ldr(void)
48{
49 unsigned long val;
50
51 apic_write(APIC_DFR, APIC_DFR_VALUE);
52 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
53 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
54 apic_write(APIC_LDR, val);
55}
56
57static inline int apic_id_registered(void)
58{
59 return physid_isset(read_apic_id(), phys_cpu_present_map);
60}
61
62static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
63{
64 return cpus_addr(cpumask)[0];
65}
66
67static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
68{
69 return cpuid_apic >> index_msb;
70}
71
72static inline void setup_apic_routing(void)
73{
74#ifdef CONFIG_X86_IO_APIC
75 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
76 "Flat", nr_ioapics);
77#endif
78}
79
80static inline int apicid_to_node(int logical_apicid)
81{
82#ifdef CONFIG_SMP
83 return apicid_2_node[hard_smp_processor_id()];
84#else
85 return 0;
86#endif
87}
88
89static inline cpumask_t vector_allocation_domain(int cpu)
90{
91 /* Careful. Some cpus do not strictly honor the set of cpus
92 * specified in the interrupt destination when using lowest
93 * priority interrupt delivery mode.
94 *
95 * In particular there was a hyperthreading cpu observed to
96 * deliver interrupts to the wrong hyperthread when only one
97 * hyperthread was specified in the interrupt destination.
98 */
99 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
100 return domain;
101}
102#endif
103
104static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
105{
106 return physid_isset(apicid, bitmap);
107}
108
109static inline unsigned long check_apicid_present(int bit)
110{
111 return physid_isset(bit, phys_cpu_present_map);
112}
113
114static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
115{
116 return phys_map;
117}
118
119static inline int multi_timer_check(int apic, int irq)
120{
121 return 0;
122}
123
124/* Mapping from cpu number to logical apicid */
125static inline int cpu_to_logical_apicid(int cpu)
126{
127 return 1 << cpu;
128}
129
130static inline int cpu_present_to_apicid(int mps_cpu)
131{
132 if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
133 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
134 else
135 return BAD_APICID;
136}
137
138static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
139{
140 return physid_mask_of_physid(phys_apicid);
141}
142
143static inline void setup_portio_remap(void)
144{
145}
146
147static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
148{
149 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
150}
151
152static inline void enable_apic_mode(void)
153{
154}
155#endif /* CONFIG_X86_LOCAL_APIC */
156#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
new file mode 100644
index 00000000000..53179936d6c
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apicdef.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
3
4#include <asm/apic.h>
5
6#ifdef CONFIG_X86_64
7#define APIC_ID_MASK (genapic->apic_id_mask)
8#define GET_APIC_ID(x) (genapic->get_apic_id(x))
9#define SET_APIC_ID(x) (genapic->set_apic_id(x))
10#else
11#define APIC_ID_MASK (0xF<<24)
12static inline unsigned get_apic_id(unsigned long x)
13{
14 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
15 if (APIC_XAPIC(ver))
16 return (((x)>>24)&0xFF);
17 else
18 return (((x)>>24)&0xF);
19}
20
21#define GET_APIC_ID(x) get_apic_id(x)
22#endif
23
24#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
new file mode 100644
index 00000000000..fabca01ebac
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -0,0 +1,64 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
2#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
3
4/* Avoid include hell */
5#define NMI_VECTOR 0x02
6
7void send_IPI_mask_bitmask(cpumask_t mask, int vector);
8void __send_IPI_shortcut(unsigned int shortcut, int vector);
9
10extern int no_broadcast;
11
12#ifdef CONFIG_X86_64
13#include <asm/genapic.h>
14#define send_IPI_mask (genapic->send_IPI_mask)
15#else
16static inline void send_IPI_mask(cpumask_t mask, int vector)
17{
18 send_IPI_mask_bitmask(mask, vector);
19}
20#endif
21
22static inline void __local_send_IPI_allbutself(int vector)
23{
24 if (no_broadcast || vector == NMI_VECTOR) {
25 cpumask_t mask = cpu_online_map;
26
27 cpu_clear(smp_processor_id(), mask);
28 send_IPI_mask(mask, vector);
29 } else
30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
31}
32
33static inline void __local_send_IPI_all(int vector)
34{
35 if (no_broadcast || vector == NMI_VECTOR)
36 send_IPI_mask(cpu_online_map, vector);
37 else
38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
39}
40
41#ifdef CONFIG_X86_64
42#define send_IPI_allbutself (genapic->send_IPI_allbutself)
43#define send_IPI_all (genapic->send_IPI_all)
44#else
45static inline void send_IPI_allbutself(int vector)
46{
47 /*
48 * If there are no other CPUs in the system we would get an APIC send
49 * error if we tried to broadcast, so avoid sending IPIs in that case.
50 */
51 if (!(num_online_cpus() > 1))
52 return;
53
54 __local_send_IPI_allbutself(vector);
55 return;
56}
57
58static inline void send_IPI_all(int vector)
59{
60 __local_send_IPI_all(vector);
61}
62#endif
63
64#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
new file mode 100644
index 00000000000..8c1ea21238a
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpparse.h
@@ -0,0 +1,17 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
3
4static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
5 char *productid)
6{
7 return 0;
8}
9
10/* Hook from generic ACPI tables.c */
11static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
12{
13 return 0;
14}
15
16
17#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
new file mode 100644
index 00000000000..e85ede686be
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpspec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6#if CONFIG_BASE_SMALL == 0
7#define MAX_MP_BUSSES 256
8#else
9#define MAX_MP_BUSSES 32
10#endif
11
12#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach-default/mach_timer.h
new file mode 100644
index 00000000000..853728519ae
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_timer.h
@@ -0,0 +1,48 @@
1/*
2 * Machine specific calibrate_tsc() for generic.
3 * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5/* ------ Calibrate the TSC -------
6 * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
7 * Too much 64-bit arithmetic here to do this cleanly in C, and for
8 * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
9 * output busy loop as low as possible. We avoid reading the CTC registers
10 * directly because of the awkward 8-bit access mechanism of the 82C54
11 * device.
12 */
13#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
14#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
15
16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
17#define CALIBRATE_LATCH \
18 ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
19
20static inline void mach_prepare_counter(void)
21{
22 /* Set the Gate high, disable speaker */
23 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
24
25 /*
26 * Now let's take care of CTC channel 2
27 *
28 * Set the Gate high, program CTC channel 2 for mode 0,
29 * (interrupt on terminal count mode), binary count,
30 * load 5 * LATCH count, (LSB and MSB) to begin countdown.
31 *
32 * Some devices need a delay here.
33 */
34 outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
35 outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
36 outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
37}
38
39static inline void mach_countup(unsigned long *count_p)
40{
41 unsigned long count = 0;
42 do {
43 count++;
44 } while ((inb_p(0x61) & 0x20) == 0);
45 *count_p = count;
46}
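/*
 * Usage sketch: the classic calibration sequence around the two helpers
 * above (rdtscll() is assumed from <asm/msr.h>; the caller derives TSC
 * ticks per millisecond from the delta over CALIBRATE_TIME_MSEC):
 *
 *	unsigned long count;
 *	unsigned long long t1, t2;
 *
 *	mach_prepare_counter();
 *	rdtscll(t1);
 *	mach_countup(&count);
 *	rdtscll(t2);
 */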
47
48#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach-default/mach_traps.h
new file mode 100644
index 00000000000..f7920601e47
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_traps.h
@@ -0,0 +1,33 @@
1/*
2 * Machine specific NMI handling for generic.
3 * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
6#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
7
8#include <asm/mc146818rtc.h>
9
10static inline unsigned char get_nmi_reason(void)
11{
12 return inb(0x61);
13}
14
15static inline void reassert_nmi(void)
16{
17 int old_reg = -1;
18
19 if (do_i_have_lock_cmos())
20 old_reg = current_lock_cmos_reg();
21 else
22 lock_cmos(0); /* register doesn't matter here */
23 outb(0x8f, 0x70);
24 inb(0x71); /* dummy */
25 outb(0x0f, 0x70);
26 inb(0x71); /* dummy */
27 if (old_reg >= 0)
28 outb(old_reg, 0x70);
29 else
30 unlock_cmos();
31}
32
33#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
new file mode 100644
index 00000000000..d5c0b826a4f
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
3
4/*
 5 * This file copes with machines that wake up secondary CPUs by the
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#define WAKE_SECONDARY_VIA_INIT
10
11#define TRAMPOLINE_LOW phys_to_virt(0x467)
12#define TRAMPOLINE_HIGH phys_to_virt(0x469)
13
14#define boot_cpu_apicid boot_cpu_physical_apicid
15
16static inline void wait_for_init_deassert(atomic_t *deassert)
17{
18 while (!atomic_read(deassert))
19 cpu_relax();
20 return;
21}
22
23/* Nothing to do for most platforms, since cleared by the INIT cycle */
24static inline void smp_callin_clear_local_apic(void)
25{
26}
27
28static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
29{
30}
31
32static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
33{
34}
35
36#if APIC_DEBUG
37 #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
38#else
39 #define inquire_remote_apic(apicid) {}
40#endif
41
42#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/mach-default/pci-functions.h
new file mode 100644
index 00000000000..ed0bab42735
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/pci-functions.h
@@ -0,0 +1,19 @@
1/*
2 * PCI BIOS function numbering for conventional PCI BIOS
3 * systems
4 */
5
6#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX
7#define PCIBIOS_PCI_BIOS_PRESENT 0xb101
8#define PCIBIOS_FIND_PCI_DEVICE 0xb102
9#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103
10#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106
11#define PCIBIOS_READ_CONFIG_BYTE 0xb108
12#define PCIBIOS_READ_CONFIG_WORD 0xb109
13#define PCIBIOS_READ_CONFIG_DWORD 0xb10a
14#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
15#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
16#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
17#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
18#define PCIBIOS_SET_PCI_HW_INT 0xb10f
19
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/mach-default/setup_arch.h
new file mode 100644
index 00000000000..38846208b54
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/setup_arch.h
@@ -0,0 +1,3 @@
1/* Hook to call BIOS initialisation function */
2
3/* no action for generic */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
new file mode 100644
index 00000000000..dbab36d64d4
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -0,0 +1,59 @@
1/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
2 * which needs to alter them. */
3
4static inline void smpboot_clear_io_apic_irqs(void)
5{
6#ifdef CONFIG_X86_IO_APIC
7 io_apic_irqs = 0;
8#endif
9}
10
11static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
12{
13 CMOS_WRITE(0xa, 0xf);
14 local_flush_tlb();
15 pr_debug("1.\n");
16 *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
17 pr_debug("2.\n");
18 *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
19 pr_debug("3.\n");
20}
21
22static inline void smpboot_restore_warm_reset_vector(void)
23{
24 /*
25 * Install writable page 0 entry to set BIOS data area.
26 */
27 local_flush_tlb();
28
29 /*
30 * Paranoid: Set warm reset code and vector here back
31 * to default values.
32 */
33 CMOS_WRITE(0, 0xf);
34
35 *((volatile long *) phys_to_virt(0x467)) = 0;
36}
37
38static inline void __init smpboot_setup_io_apic(void)
39{
40#ifdef CONFIG_X86_IO_APIC
41 /*
42 * Here we can be sure that there is an IO-APIC in the system. Let's
43 * go and set it up:
44 */
45 if (!skip_ioapic_setup && nr_ioapics)
46 setup_IO_APIC();
47 else {
48 nr_ioapics = 0;
49 localise_nmi_watchdog();
50 }
51#endif
52}
53
54static inline void smpboot_clear_io_apic(void)
55{
56#ifdef CONFIG_X86_IO_APIC
57 nr_ioapics = 0;
58#endif
59}
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
new file mode 100644
index 00000000000..995c45efdb3
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/gpio.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
2#define _ASM_X86_MACH_GENERIC_GPIO_H
3
4int gpio_request(unsigned gpio, const char *label);
5void gpio_free(unsigned gpio);
6int gpio_direction_input(unsigned gpio);
7int gpio_direction_output(unsigned gpio, int value);
8int gpio_get_value(unsigned gpio);
9void gpio_set_value(unsigned gpio, int value);
10int gpio_to_irq(unsigned gpio);
11int irq_to_gpio(unsigned irq);
12
13#include <asm-generic/gpio.h> /* cansleep wrappers */
14
15#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
new file mode 100644
index 00000000000..5180bd7478f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
2#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
3
4#include <asm/genapic.h>
5
6#define esr_disable (genapic->ESR_DISABLE)
7#define NO_BALANCE_IRQ (genapic->no_balance_irq)
8#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
9#define INT_DEST_MODE (genapic->int_dest_mode)
10#undef APIC_DEST_LOGICAL
11#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
12#define TARGET_CPUS (genapic->target_cpus())
13#define apic_id_registered (genapic->apic_id_registered)
14#define init_apic_ldr (genapic->init_apic_ldr)
15#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
16#define setup_apic_routing (genapic->setup_apic_routing)
17#define multi_timer_check (genapic->multi_timer_check)
18#define apicid_to_node (genapic->apicid_to_node)
19#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
20#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
21#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
22#define setup_portio_remap (genapic->setup_portio_remap)
23#define check_apicid_present (genapic->check_apicid_present)
24#define check_phys_apicid_present (genapic->check_phys_apicid_present)
25#define check_apicid_used (genapic->check_apicid_used)
26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
27#define vector_allocation_domain (genapic->vector_allocation_domain)
28#define enable_apic_mode (genapic->enable_apic_mode)
29#define phys_pkg_id (genapic->phys_pkg_id)
30
31extern void generic_bigsmp_probe(void);
32
33#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
new file mode 100644
index 00000000000..68041f3802f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apicdef.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
2#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
3
4#ifndef APIC_DEFINITION
5#include <asm/genapic.h>
6
7#define GET_APIC_ID (genapic->get_apic_id)
8#define APIC_ID_MASK (genapic->apic_id_mask)
9#endif
10
11#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
new file mode 100644
index 00000000000..ffd637e3c3d
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_ipi.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
2#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
3
4#include <asm/genapic.h>
5
6#define send_IPI_mask (genapic->send_IPI_mask)
7#define send_IPI_allbutself (genapic->send_IPI_allbutself)
8#define send_IPI_all (genapic->send_IPI_all)
9
10#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
new file mode 100644
index 00000000000..048f1d46853
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpparse.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
3
4
5extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
6 char *productid);
7
8extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
9
10#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
new file mode 100644
index 00000000000..bbab5ccfd4f
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpspec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
7/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
8#define MAX_MP_BUSSES 260
9
10extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
11 char *productid);
12#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
new file mode 100644
index 00000000000..c210ab5788b
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/gpio.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
2#define _ASM_X86_MACH_RDC321X_GPIO_H
3
4#include <linux/kernel.h>
5
6extern int rdc_gpio_get_value(unsigned gpio);
7extern void rdc_gpio_set_value(unsigned gpio, int value);
8extern int rdc_gpio_direction_input(unsigned gpio);
9extern int rdc_gpio_direction_output(unsigned gpio, int value);
10extern int rdc_gpio_request(unsigned gpio, const char *label);
11extern void rdc_gpio_free(unsigned gpio);
12extern void __init rdc321x_gpio_setup(void);
13
14/* Wrappers for the arch-neutral GPIO API */
15
16static inline int gpio_request(unsigned gpio, const char *label)
17{
18 return rdc_gpio_request(gpio, label);
19}
20
21static inline void gpio_free(unsigned gpio)
22{
23 might_sleep();
24 rdc_gpio_free(gpio);
25}
26
27static inline int gpio_direction_input(unsigned gpio)
28{
29 return rdc_gpio_direction_input(gpio);
30}
31
32static inline int gpio_direction_output(unsigned gpio, int value)
33{
34 return rdc_gpio_direction_output(gpio, value);
35}
36
37static inline int gpio_get_value(unsigned gpio)
38{
39 return rdc_gpio_get_value(gpio);
40}
41
42static inline void gpio_set_value(unsigned gpio, int value)
43{
44 rdc_gpio_set_value(gpio, value);
45}
46
47static inline int gpio_to_irq(unsigned gpio)
48{
49 return gpio;
50}
51
52static inline int irq_to_gpio(unsigned irq)
53{
54 return irq;
55}
56
57/* For cansleep */
58#include <asm-generic/gpio.h>
59
60#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
new file mode 100644
index 00000000000..c8e9c8bed3d
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
@@ -0,0 +1,12 @@
1#define PFX "rdc321x: "
2
3/* General purpose configuration and data registers */
4#define RDC3210_CFGREG_ADDR 0x0CF8
5#define RDC3210_CFGREG_DATA 0x0CFC
6
7#define RDC321X_GPIO_CTRL_REG1 0x48
8#define RDC321X_GPIO_CTRL_REG2 0x84
9#define RDC321X_GPIO_DATA_REG1 0x4c
10#define RDC321X_GPIO_DATA_REG2 0x88
11
12#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h
new file mode 100644
index 00000000000..9e5a459fd15
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/do_timer.h
@@ -0,0 +1,17 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/voyager.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the PIT clock event handler; see asm/i8253.h.
11 **/
12static inline void do_timer_interrupt_hook(void)
13{
14 global_clock_event->event_handler(global_clock_event);
15 voyager_timer_interrupt();
16}
17
diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h
new file mode 100644
index 00000000000..ae52624b593
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/entry_arch.h
@@ -0,0 +1,26 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2002
4 *
5 * Author: James.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/voyager/entry_arch.h
8 *
9 * This file builds the VIC and QIC CPI gates
10 */
11
12/* initialise the voyager interrupt gates
13 *
14 * This uses the macros in irq.h to set up assembly jump gates. The
15 * calls are then redirected to the same routine with smp_ prefixed */
16BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT)
17BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT)
18BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0);
19
20/* do all the QIC interrupts */
21BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI);
22BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI);
23BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI);
24BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI);
25BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI);
26BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI);
diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h
new file mode 100644
index 00000000000..71729ca05cd
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/setup_arch.h
@@ -0,0 +1,12 @@
1#include <asm/voyager.h>
2#include <asm/setup.h>
3#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \
4 (&boot_params.apm_bios_info))
5
6/* Hook to call BIOS initialisation function */
7
8/* for voyager, pass the voyager BIOS/SUS info area to the detection
9 * routines */
10
11#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO);
12
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
new file mode 100644
index 00000000000..5a65b107ad5
--- /dev/null
+++ b/arch/x86/include/asm/math_emu.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_MATH_EMU_H
2#define _ASM_X86_MATH_EMU_H
3
4/* This structure matches the layout of the data saved to the stack
5 following a device-not-present interrupt, part of it saved
6 automatically by the 80386/80486.
7 */
8struct info {
9 long ___orig_eip;
10 long ___ebx;
11 long ___ecx;
12 long ___edx;
13 long ___esi;
14 long ___edi;
15 long ___ebp;
16 long ___eax;
17 long ___ds;
18 long ___es;
19 long ___fs;
20 long ___orig_eax;
21 long ___eip;
22 long ___cs;
23 long ___eflags;
24 long ___esp;
25 long ___ss;
26 long ___vm86_es; /* This and the following only in vm86 mode */
27 long ___vm86_ds;
28 long ___vm86_fs;
29 long ___vm86_gs;
30};
31#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
new file mode 100644
index 00000000000..01fdf5674e2
--- /dev/null
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -0,0 +1,104 @@
1/*
2 * Machine dependent access functions for RTC registers.
3 */
4#ifndef _ASM_X86_MC146818RTC_H
5#define _ASM_X86_MC146818RTC_H
6
7#include <asm/io.h>
8#include <asm/system.h>
9#include <asm/processor.h>
10#include <linux/mc146818rtc.h>
11
12#ifndef RTC_PORT
13#define RTC_PORT(x) (0x70 + (x))
14#define RTC_ALWAYS_BCD 1 /* RTC operates in BCD mode */
15#endif
16
17#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
18/*
19 * This lock provides nmi access to the CMOS/RTC registers. It has some
20 * special properties. It is owned by a CPU and stores the index register
21 * currently being accessed (if owned). The idea here is that it works
22 * like a normal lock (normally). However, in an NMI, the NMI code will
23 * first check to see if its CPU owns the lock, meaning that the NMI
24 * interrupted a read/write of the device. If it does, it goes ahead
25 * and performs the access and then restores the index register. If it does
26 * not, it locks normally.
27 *
28 * Note that since we are working with NMIs, we need this lock even in
29 * a non-SMP machine just to mark that the lock is owned.
30 *
31 * This only works with compare-and-swap. There is no other way to
32 * atomically claim the lock and set the owner.
33 */
34#include <linux/smp.h>
35extern volatile unsigned long cmos_lock;
36
37/*
38 * All of these below must be called with interrupts off, preempt
39 * disabled, etc.
40 */
41
42static inline void lock_cmos(unsigned char reg)
43{
44 unsigned long new;
45 new = ((smp_processor_id() + 1) << 8) | reg;
46 for (;;) {
47 if (cmos_lock) {
48 cpu_relax();
49 continue;
50 }
51 if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
52 return;
53 }
54}
55
56static inline void unlock_cmos(void)
57{
58 cmos_lock = 0;
59}
60
61static inline int do_i_have_lock_cmos(void)
62{
63 return (cmos_lock >> 8) == (smp_processor_id() + 1);
64}
65
66static inline unsigned char current_lock_cmos_reg(void)
67{
68 return cmos_lock & 0xff;
69}
70
71#define lock_cmos_prefix(reg) \
72 do { \
73 unsigned long cmos_flags; \
74 local_irq_save(cmos_flags); \
75 lock_cmos(reg)
76
77#define lock_cmos_suffix(reg) \
78 unlock_cmos(); \
79 local_irq_restore(cmos_flags); \
80 } while (0)
81#else
82#define lock_cmos_prefix(reg) do {} while (0)
83#define lock_cmos_suffix(reg) do {} while (0)
84#define lock_cmos(reg)
85#define unlock_cmos()
86#define do_i_have_lock_cmos() 0
87#define current_lock_cmos_reg() 0
88#endif
89
90/*
91 * The machines supported so far all access the RTC index register via
92 * an ISA port access, but the way to access the data register differs ...
93 */
94#define CMOS_READ(addr) rtc_cmos_read(addr)
95#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
96unsigned char rtc_cmos_read(unsigned char addr);
97void rtc_cmos_write(unsigned char val, unsigned char addr);
98
99extern int mach_set_rtc_mmss(unsigned long nowtime);
100extern unsigned long mach_get_cmos_time(void);
101
102#define RTC_IRQ 8
103
104#endif /* _ASM_X86_MC146818RTC_H */
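The prefix/suffix pair above deliberately opens and closes a brace scope, so the two macros must appear as a matched pair in the same block. A sketch of the intended usage, in the style of an rtc_cmos_read() implementation (the body here is an assumption for illustration, not the arch/x86/kernel/rtc.c code in this tree):

unsigned char example_cmos_read(unsigned char addr)
{
        unsigned char val;

        lock_cmos_prefix(addr);         /* opens a scope: irqs off, lock held */
        outb(addr, RTC_PORT(0));        /* select the index register */
        val = inb(RTC_PORT(1));         /* read the data register */
        lock_cmos_suffix(addr);         /* unlock, irqs restored, scope closed */
        return val;
}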
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
new file mode 100644
index 00000000000..eedbb6cc1ef
--- /dev/null
+++ b/arch/x86/include/asm/mca.h
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Platform specific MCA defines */
4#ifndef _ASM_X86_MCA_H
5#define _ASM_X86_MCA_H
6
7/* Maximal number of MCA slots - actually, some machines have less, but
8 * they all have sufficient number of POS registers to cover 8.
9 */
10#define MCA_MAX_SLOT_NR 8
11
12/* Most machines have only one MCA bus. The only multiple bus machines
13 * I know have at most two */
14#define MAX_MCA_BUSSES 2
15
16#define MCA_PRIMARY_BUS 0
17#define MCA_SECONDARY_BUS 1
18
19/* Dummy slot numbers on primary MCA for integrated functions */
20#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR)
21#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1)
22#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
23
24/* Dummy POS values for integrated functions */
25#define MCA_DUMMY_POS_START 0x10000
26#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1)
27#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2)
28#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3)
29
30/* MCA registers */
31
32#define MCA_MOTHERBOARD_SETUP_REG 0x94
33#define MCA_ADAPTER_SETUP_REG 0x96
34#define MCA_POS_REG(n) (0x100+(n))
35
36#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */
37
38/* Max number of adapters, including both slots and various integrated
39 * things.
40 */
41#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
42
43#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
new file mode 100644
index 00000000000..45271aef82d
--- /dev/null
+++ b/arch/x86/include/asm/mca_dma.h
@@ -0,0 +1,201 @@
1#ifndef _ASM_X86_MCA_DMA_H
2#define _ASM_X86_MCA_DMA_H
3
4#include <asm/io.h>
5#include <linux/ioport.h>
6
7/*
8 * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to
9 * standard PC dma, but it certainly has its quirks. DMA register addresses
10 * are in a different place and there are some added functions. Most of this
11 * should be pretty obvious on inspection. Note that the user must divide
12 * count by 2 when using 16-bit dma; that is not handled by these functions.
13 *
14 * Ramen Noodles are yummy.
15 *
16 * 1998 Tymm Twillman <tymm@computer.org>
17 */
18
19/*
20 * Registers that are used by the DMA controller; FN is the function register
21 * (tell the controller what to do) and EXE is the execution register (how
22 * to do it)
23 */
24
25#define MCA_DMA_REG_FN 0x18
26#define MCA_DMA_REG_EXE 0x1A
27
28/*
29 * Functions that the DMA controller can do
30 */
31
32#define MCA_DMA_FN_SET_IO 0x00
33#define MCA_DMA_FN_SET_ADDR 0x20
34#define MCA_DMA_FN_GET_ADDR 0x30
35#define MCA_DMA_FN_SET_COUNT 0x40
36#define MCA_DMA_FN_GET_COUNT 0x50
37#define MCA_DMA_FN_GET_STATUS 0x60
38#define MCA_DMA_FN_SET_MODE 0x70
39#define MCA_DMA_FN_SET_ARBUS 0x80
40#define MCA_DMA_FN_MASK 0x90
41#define MCA_DMA_FN_RESET_MASK 0xA0
42#define MCA_DMA_FN_MASTER_CLEAR 0xD0
43
44/*
45 * Modes (used by setting MCA_DMA_FN_MODE in the function register)
46 *
47 * Note that the MODE_READ is read from memory (write to device), and
48 * MODE_WRITE is vice-versa.
49 */
50
51#define MCA_DMA_MODE_XFER 0x04 /* read by default */
52#define MCA_DMA_MODE_READ 0x04 /* same as XFER */
53#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */
54#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */
55#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */
56
57
58/**
59 * mca_enable_dma - enable DMA on a channel
60 * @dmanr: DMA channel
61 *
62 * Enable the MCA bus DMA on a channel. This can be called from
63 * IRQ context.
64 */
65
66static inline void mca_enable_dma(unsigned int dmanr)
67{
68 outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
69}
70
71/**
72 * mca_disable_dma - disable DMA on a channel
73 * @dmanr: DMA channel
74 *
75 * Disable the MCA bus DMA on a channel. This can be called from
76 * IRQ context.
77 */
78
79static inline void mca_disable_dma(unsigned int dmanr)
80{
81 outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
82}
83
84/**
85 * mca_set_dma_addr - load a 24bit DMA address
86 * @dmanr: DMA channel
87 * @a: 24bit bus address
88 *
89 * Load the address register in the DMA controller. This has a 24bit
90 * limitation (16Mb).
91 */
92
93static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
94{
95 outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
96 outb(a & 0xff, MCA_DMA_REG_EXE);
97 outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
98 outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
99}
100
101/**
102 * mca_get_dma_addr - read back a 24bit DMA address
103 * @dmanr: DMA channel
104 *
105 * Read the address register in the DMA controller. This has a 24bit
106 * limitation (16Mb). The return is a bus address.
107 */
108
109static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
110{
111 unsigned int addr;
112
113 outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
114 addr = inb(MCA_DMA_REG_EXE);
115 addr |= inb(MCA_DMA_REG_EXE) << 8;
116 addr |= inb(MCA_DMA_REG_EXE) << 16;
117
118 return addr;
119}
120
121/**
122 * mca_set_dma_count - load a 16bit transfer count
123 * @dmanr: DMA channel
124 * @count: count
125 *
126 * Set the DMA count for this channel. This can be up to 64Kbytes.
127 * Setting a count of zero will not do what you expect.
128 */
129
130static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
131{
132 count--; /* transfers one more than count -- correct for this */
133
134 outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
135 outb(count & 0xff, MCA_DMA_REG_EXE);
136 outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
137}
138
139/**
140 * mca_get_dma_residue - get the remaining bytes to transfer
141 * @dmanr: DMA channel
142 *
143 * This function returns the number of bytes left to transfer
144 * on this DMA channel.
145 */
146
147static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
148{
149 unsigned short count;
150
151 outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
152 count = 1 + inb(MCA_DMA_REG_EXE);
153 count += inb(MCA_DMA_REG_EXE) << 8;
154
155 return count;
156}
157
158/**
159 * mca_set_dma_io - set the port for an I/O transfer
160 * @dmanr: DMA channel
161 * @io_addr: an I/O port number
162 *
163 * Unlike the ISA bus DMA controllers, DMA on the MCA bus can transfer
164 * to or from an I/O port target.
165 */
166
167static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
168{
169 /*
170 * DMA from a port address -- set the io address
171 */
172
173 outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
174 outb(io_addr & 0xff, MCA_DMA_REG_EXE);
175 outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE);
176}
177
178/**
179 * mca_set_dma_mode - set the DMA mode
180 * @dmanr: DMA channel
181 * @mode: mode to set
182 *
183 * The DMA controller supports several modes. The mode values you can
184 * set are:
185 *
186 * %MCA_DMA_MODE_READ when reading from the DMA device.
187 *
188 * %MCA_DMA_MODE_WRITE when writing to the DMA device.
189 *
190 * %MCA_DMA_MODE_IO to do DMA to or from an I/O port.
191 *
192 * %MCA_DMA_MODE_16 to do 16bit transfers.
193 */
194
195static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
196{
197 outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
198 outb(mode, MCA_DMA_REG_EXE);
199}
200
201#endif /* _ASM_X86_MCA_DMA_H */
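Taken together, these helpers let a driver program a whole transfer in a few steps: mask the channel, load address and count, pick a mode, unmask. A hedged sketch (the function name is invented; this is not taken from any in-tree driver):

/*
 * Feed @count bytes from memory at 24-bit bus address @addr to an
 * I/O device on channel @dmanr (MODE_READ == read from memory).
 */
static void example_mca_dma_to_device(unsigned int dmanr,
                                      unsigned int addr, unsigned int count)
{
        mca_disable_dma(dmanr);                 /* mask while reprogramming */
        mca_set_dma_addr(dmanr, addr);          /* 24-bit limit: below 16Mb */
        mca_set_dma_count(dmanr, count);        /* up to 64K, never zero */
        mca_set_dma_mode(dmanr, MCA_DMA_MODE_READ);
        mca_enable_dma(dmanr);                  /* unmask; transfer may start */
}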
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
new file mode 100644
index 00000000000..1d6e17c2f23
--- /dev/null
+++ b/arch/x86/include/asm/mce.h
@@ -0,0 +1,130 @@
1#ifndef _ASM_X86_MCE_H
2#define _ASM_X86_MCE_H
3
4#ifdef __x86_64__
5
6#include <asm/ioctls.h>
7#include <asm/types.h>
8
9/*
10 * Machine Check support for x86
11 */
12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
17#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
18
19#define MCI_STATUS_VAL (1UL<<63) /* valid error */
20#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
21#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
22#define MCI_STATUS_EN (1UL<<60) /* error enabled */
23#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
24#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
25#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */
26
27/* Fields are zero when not available */
28struct mce {
29 __u64 status;
30 __u64 misc;
31 __u64 addr;
32 __u64 mcgstatus;
33 __u64 ip;
34 __u64 tsc; /* cpu time stamp counter */
35 __u64 res1; /* for future extension */
36 __u64 res2; /* ditto */
37 __u8 cs; /* code segment */
38 __u8 bank; /* machine check bank */
39 __u8 cpu; /* cpu that raised the error */
40 __u8 finished; /* entry is valid */
41 __u32 pad;
42};
43
44/*
45 * This structure contains all data related to the MCE log. Also
46 * carries a signature to make it easier to find from external
47 * debugging tools. Each entry is only valid when its finished flag
48 * is set.
49 */
50
51#define MCE_LOG_LEN 32
52
53struct mce_log {
54 char signature[12]; /* "MACHINECHECK" */
55 unsigned len; /* = MCE_LOG_LEN */
56 unsigned next;
57 unsigned flags;
58 unsigned pad0;
59 struct mce entry[MCE_LOG_LEN];
60};
61
62#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
63
64#define MCE_LOG_SIGNATURE "MACHINECHECK"
65
66#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
67#define MCE_GET_LOG_LEN _IOR('M', 2, int)
68#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
69
70/* Software defined banks */
71#define MCE_EXTENDED_BANK 128
72#define MCE_THERMAL_BANK (MCE_EXTENDED_BANK + 0)
73
74#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
75#define K8_MCE_THRESHOLD_BANK_0 (K8_MCE_THRESHOLD_BASE + 0 * 9)
76#define K8_MCE_THRESHOLD_BANK_1 (K8_MCE_THRESHOLD_BASE + 1 * 9)
77#define K8_MCE_THRESHOLD_BANK_2 (K8_MCE_THRESHOLD_BASE + 2 * 9)
78#define K8_MCE_THRESHOLD_BANK_3 (K8_MCE_THRESHOLD_BASE + 3 * 9)
79#define K8_MCE_THRESHOLD_BANK_4 (K8_MCE_THRESHOLD_BASE + 4 * 9)
80#define K8_MCE_THRESHOLD_BANK_5 (K8_MCE_THRESHOLD_BASE + 5 * 9)
81#define K8_MCE_THRESHOLD_DRAM_ECC (K8_MCE_THRESHOLD_BANK_4 + 0)
82
83#endif /* __x86_64__ */
84
85#ifdef __KERNEL__
86
87#ifdef CONFIG_X86_32
88extern int mce_disabled;
89#else /* CONFIG_X86_32 */
90
91#include <asm/atomic.h>
92
93void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96
97#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c);
99#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
101#endif
102
103#ifdef CONFIG_X86_MCE_AMD
104void mce_amd_feature_init(struct cpuinfo_x86 *c);
105#else
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif
108
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
110
111extern atomic_t mce_entry;
112
113extern void do_machine_check(struct pt_regs *, long);
114extern int mce_notify_user(void);
115
116#endif /* !CONFIG_X86_32 */
117
118
119
120#ifdef CONFIG_X86_MCE
121extern void mcheck_init(struct cpuinfo_x86 *c);
122#else
123#define mcheck_init(c) do { } while (0)
124#endif
125extern void stop_mce(void);
126extern void restart_mce(void);
127
128#endif /* __KERNEL__ */
129
130#endif /* _ASM_X86_MCE_H */
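Because entries in the ring only become meaningful once their finished flag is set, a consumer of mce_log must check it before trusting status. A sketch of such a scan on x86-64 (the helper is hypothetical):

/* Count valid, uncorrected errors currently held in a log. */
static unsigned int example_count_uc_errors(const struct mce_log *log)
{
        unsigned int i, n = 0;

        for (i = 0; i < MCE_LOG_LEN; i++) {
                const struct mce *m = &log->entry[i];

                if (!m->finished)       /* slot not written out yet */
                        continue;
                if ((m->status & MCI_STATUS_VAL) &&
                    (m->status & MCI_STATUS_UC))
                        n++;
        }
        return n;
}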
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
new file mode 100644
index 00000000000..c882664716c
--- /dev/null
+++ b/arch/x86/include/asm/microcode.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_MICROCODE_H
2#define _ASM_X86_MICROCODE_H
3
4struct cpu_signature {
5 unsigned int sig;
6 unsigned int pf;
7 unsigned int rev;
8};
9
10struct device;
11
12struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
14 int (*request_microcode_fw) (int cpu, struct device *device);
15
16 void (*apply_microcode) (int cpu);
17
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu);
20};
21
22struct ucode_cpu_info {
23 struct cpu_signature cpu_sig;
24 int valid;
25 void *mc;
26};
27extern struct ucode_cpu_info ucode_cpu_info[];
28
29#ifdef CONFIG_MICROCODE_INTEL
30extern struct microcode_ops * __init init_intel_microcode(void);
31#else
32static inline struct microcode_ops * __init init_intel_microcode(void)
33{
34 return NULL;
35}
36#endif /* CONFIG_MICROCODE_INTEL */
37
38#ifdef CONFIG_MICROCODE_AMD
39extern struct microcode_ops * __init init_amd_microcode(void);
40#else
41static inline struct microcode_ops * __init init_amd_microcode(void)
42{
43 return NULL;
44}
45#endif
46
47#endif /* _ASM_X86_MICROCODE_H */
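The paired init hooks above compile to NULL-returning stubs when a vendor backend is configured out, so a generic loader can probe both unconditionally. A sketch of that dispatch, loosely modelled on what a microcode core driver might do (function and variable names are assumptions):

#include <linux/errno.h>
#include <asm/processor.h>

static struct microcode_ops *microcode_ops;

static int __init example_select_ops(struct cpuinfo_x86 *c)
{
        if (c->x86_vendor == X86_VENDOR_INTEL)
                microcode_ops = init_intel_microcode();
        else if (c->x86_vendor == X86_VENDOR_AMD)
                microcode_ops = init_amd_microcode();

        return microcode_ops ? 0 : -ENODEV;     /* no backend compiled in */
}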
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 00000000000..90bc4108a4f
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H
3
4#include <asm-generic/mman.h>
5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7
8#define MAP_GROWSDOWN 0x0100 /* stack-like segment */
9#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
10#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
11#define MAP_LOCKED 0x2000 /* pages are locked */
12#define MAP_NORESERVE 0x4000 /* don't check for reservations */
13#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
14#define MAP_NONBLOCK 0x10000 /* do not block on IO */
15#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
16
17#define MCL_CURRENT 1 /* lock all current mappings */
18#define MCL_FUTURE 2 /* lock all future mappings */
19
20#endif /* _ASM_X86_MMAN_H */
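From userspace these flags are simply OR-ed into the mmap() flags argument. A hedged example requesting a prefaulted anonymous mapping below 4GB (MAP_32BIT is only honoured on x86-64, and glibc exposes it under _GNU_SOURCE):

#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdio.h>

int main(void)
{
        /* Anonymous, prefaulted, and guaranteed below 4GB. */
        void *p = mmap(NULL, 1 << 20, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_32BIT | MAP_POPULATE,
                       -1, 0);

        if (p == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        printf("mapped at %p\n", p);
        return 0;
}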
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
new file mode 100644
index 00000000000..9b119da1d10
--- /dev/null
+++ b/arch/x86/include/asm/mmconfig.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MMCONFIG_H
2#define _ASM_X86_MMCONFIG_H
3
4#ifdef CONFIG_PCI_MMCONFIG
5extern void __cpuinit fam10h_check_enable_mmcfg(void);
6extern void __cpuinit check_enable_amd_mmconf_dmi(void);
7#else
8static inline void fam10h_check_enable_mmcfg(void) { }
9static inline void check_enable_amd_mmconf_dmi(void) { }
10#endif
11
12#endif /* _ASM_X86_MMCONFIG_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
new file mode 100644
index 00000000000..80a1dee5bea
--- /dev/null
+++ b/arch/x86/include/asm/mmu.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_MMU_H
2#define _ASM_X86_MMU_H
3
4#include <linux/spinlock.h>
5#include <linux/mutex.h>
6
7/*
8 * The x86 doesn't have an MMU context, but
9 * we put the segment information here.
10 */
11typedef struct {
12 void *ldt;
13 int size;
14 struct mutex lock;
15 void *vdso;
16} mm_context_t;
17
18#ifdef CONFIG_SMP
19void leave_mm(int cpu);
20#else
21static inline void leave_mm(int cpu)
22{
23}
24#endif
25
26#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
new file mode 100644
index 00000000000..8aeeb3fd73d
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context.h
@@ -0,0 +1,37 @@
1#ifndef _ASM_X86_MMU_CONTEXT_H
2#define _ASM_X86_MMU_CONTEXT_H
3
4#include <asm/desc.h>
5#include <asm/atomic.h>
6#include <asm/pgalloc.h>
7#include <asm/tlbflush.h>
8#include <asm/paravirt.h>
9#ifndef CONFIG_PARAVIRT
10#include <asm-generic/mm_hooks.h>
11
12static inline void paravirt_activate_mm(struct mm_struct *prev,
13 struct mm_struct *next)
14{
15}
16#endif /* !CONFIG_PARAVIRT */
17
18/*
19 * Used for LDT copy/destruction.
20 */
21int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
22void destroy_context(struct mm_struct *mm);
23
24#ifdef CONFIG_X86_32
25# include "mmu_context_32.h"
26#else
27# include "mmu_context_64.h"
28#endif
29
30#define activate_mm(prev, next) \
31do { \
32 paravirt_activate_mm((prev), (next)); \
33 switch_mm((prev), (next), NULL); \
34} while (0)
35
36
37#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
new file mode 100644
index 00000000000..8e10015781f
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -0,0 +1,56 @@
1#ifndef _ASM_X86_MMU_CONTEXT_32_H
2#define _ASM_X86_MMU_CONTEXT_32_H
3
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{
6#ifdef CONFIG_SMP
7 unsigned cpu = smp_processor_id();
8 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
9 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
10#endif
11}
12
13static inline void switch_mm(struct mm_struct *prev,
14 struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 int cpu = smp_processor_id();
18
19 if (likely(prev != next)) {
20 /* stop flush ipis for the previous mm */
21 cpu_clear(cpu, prev->cpu_vm_mask);
22#ifdef CONFIG_SMP
23 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
24 per_cpu(cpu_tlbstate, cpu).active_mm = next;
25#endif
26 cpu_set(cpu, next->cpu_vm_mask);
27
28 /* Re-load page tables */
29 load_cr3(next->pgd);
30
31 /*
32 * load the LDT, if the LDT is different:
33 */
34 if (unlikely(prev->context.ldt != next->context.ldt))
35 load_LDT_nolock(&next->context);
36 }
37#ifdef CONFIG_SMP
38 else {
39 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
40 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
41
42 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
43 /* We were in lazy tlb mode and leave_mm disabled
44 * tlb flush IPI delivery. We must reload %cr3.
45 */
46 load_cr3(next->pgd);
47 load_LDT_nolock(&next->context);
48 }
49 }
50#endif
51}
52
53#define deactivate_mm(tsk, mm) \
54 asm("movl %0,%%gs": :"r" (0));
55
56#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
new file mode 100644
index 00000000000..677d36e9540
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -0,0 +1,54 @@
1#ifndef _ASM_X86_MMU_CONTEXT_64_H
2#define _ASM_X86_MMU_CONTEXT_64_H
3
4#include <asm/pda.h>
5
6static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
7{
8#ifdef CONFIG_SMP
9 if (read_pda(mmu_state) == TLBSTATE_OK)
10 write_pda(mmu_state, TLBSTATE_LAZY);
11#endif
12}
13
14static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 unsigned cpu = smp_processor_id();
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 write_pda(mmu_state, TLBSTATE_OK);
23 write_pda(active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26 load_cr3(next->pgd);
27
28 if (unlikely(next->context.ldt != prev->context.ldt))
29 load_LDT_nolock(&next->context);
30 }
31#ifdef CONFIG_SMP
32 else {
33 write_pda(mmu_state, TLBSTATE_OK);
34 if (read_pda(active_mm) != next)
35 BUG();
36 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37 /* We were in lazy tlb mode and leave_mm disabled
38 * tlb flush IPI delivery. We must reload CR3
39 * to make sure to use no freed page tables.
40 */
41 load_cr3(next->pgd);
42 load_LDT_nolock(&next->context);
43 }
44 }
45#endif
46}
47
48#define deactivate_mm(tsk, mm) \
49do { \
50 load_gs_index(0); \
51 asm volatile("movl %0,%%fs"::"r"(0)); \
52} while (0)
53
54#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
new file mode 100644
index 00000000000..5cbf3135b97
--- /dev/null
+++ b/arch/x86/include/asm/mmx.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_MMX_H
2#define _ASM_X86_MMX_H
3
4/*
5 * MMX 3Dnow! helper operations
6 */
7
8#include <linux/types.h>
9
10extern void *_mmx_memcpy(void *to, const void *from, size_t size);
11extern void mmx_clear_page(void *page);
12extern void mmx_copy_page(void *to, void *from);
13
14#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
new file mode 100644
index 00000000000..64217ea16a3
--- /dev/null
+++ b/arch/x86/include/asm/mmzone.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "mmzone_32.h"
3#else
4# include "mmzone_64.h"
5#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
new file mode 100644
index 00000000000..485bdf059ff
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -0,0 +1,134 @@
1/*
2 * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
3 *
4 */
5
6#ifndef _ASM_X86_MMZONE_32_H
7#define _ASM_X86_MMZONE_32_H
8
9#include <asm/smp.h>
10
11#ifdef CONFIG_NUMA
12extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid])
14
15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows any one of the NUMA architectures to be compiled
22 * in, while still falling back to the flat function if the
23 * architecture-specific detection fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34
35extern int early_pfn_to_nid(unsigned long pfn);
36
37#else /* !CONFIG_NUMA */
38
39#define get_memcfg_numa get_memcfg_numa_flat
40
41#endif /* CONFIG_NUMA */
42
43#ifdef CONFIG_DISCONTIGMEM
44
45/*
46 * generic node memory support, the following assumptions apply:
47 *
48 * 1) memory comes in 64Mb contiguous chunks which are either present or not
49 * 2) we will not have more than 64Gb in total
50 *
51 * for now assume that 64Gb is max amount of RAM for whole system
52 * 64Gb / 4096bytes/page = 16777216 pages
53 */
54#define MAX_NR_PAGES 16777216
55#define MAX_ELEMENTS 1024
56#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
57
58extern s8 physnode_map[];
59
60static inline int pfn_to_nid(unsigned long pfn)
61{
62#ifdef CONFIG_NUMA
63 return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
64#else
65 return 0;
66#endif
67}
68
69/*
70 * Following are macros that each NUMA implementation must define.
71 */
72
73#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
74#define node_end_pfn(nid) \
75({ \
76 pg_data_t *__pgdat = NODE_DATA(nid); \
77 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
78})
79
80static inline int pfn_valid(int pfn)
81{
82 int nid = pfn_to_nid(pfn);
83
84 if (nid >= 0)
85 return (pfn < node_end_pfn(nid));
86 return 0;
87}
88
89#endif /* CONFIG_DISCONTIGMEM */
90
91#ifdef CONFIG_NEED_MULTIPLE_NODES
92
93/*
94 * Following are macros that are specific to this numa platform.
95 */
96#define reserve_bootmem(addr, size, flags) \
97 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
98#define alloc_bootmem(x) \
99 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
100#define alloc_bootmem_nopanic(x) \
101 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
102 __pa(MAX_DMA_ADDRESS))
103#define alloc_bootmem_low(x) \
104 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
105#define alloc_bootmem_pages(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
107#define alloc_bootmem_pages_nopanic(x) \
108 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
109 __pa(MAX_DMA_ADDRESS))
110#define alloc_bootmem_low_pages(x) \
111 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
112#define alloc_bootmem_node(pgdat, x) \
113({ \
114 struct pglist_data __maybe_unused \
115 *__alloc_bootmem_node__pgdat = (pgdat); \
116 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
117 __pa(MAX_DMA_ADDRESS)); \
118})
119#define alloc_bootmem_pages_node(pgdat, x) \
120({ \
121 struct pglist_data __maybe_unused \
122 *__alloc_bootmem_node__pgdat = (pgdat); \
123 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
124 __pa(MAX_DMA_ADDRESS)); \
125})
126#define alloc_bootmem_low_pages_node(pgdat, x) \
127({ \
128 struct pglist_data __maybe_unused \
129 *__alloc_bootmem_node__pgdat = (pgdat); \
130 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
131})
132#endif /* CONFIG_NEED_MULTIPLE_NODES */
133
134#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
new file mode 100644
index 00000000000..a5b3817d4b9
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -0,0 +1,51 @@
1/* K8 NUMA support */
2/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
3/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H
6
7
8#ifdef CONFIG_NUMA
9
10#include <linux/mmdebug.h>
11
12#include <asm/smp.h>
13
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[];
27
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid])
38
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages)
42
43extern int early_pfn_to_nid(unsigned long pfn);
44
45#ifdef CONFIG_NUMA_EMU
46#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
47#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
48#endif
49
50#endif
51#endif /* _ASM_X86_MMZONE_64_H */
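phys_to_nid() above is one shift plus one s16 table lookup; everything interesting happens when the map is built. A purely illustrative sketch of what populating memnode could look like for an invented two-node, 4GB-per-node box with 256MB map granularity (all numbers and the function itself are assumptions):

/*
 * Invented topology: node 0 owns 0-4GB, node 1 owns 4-8GB, and
 * shift 28 gives 256MB per map slot, so 32 slots cover 8GB.
 */
static void __init example_build_memnode_map(void)
{
        int slot;

        memnode.shift = 28;
        memnode.map = memnode.embedded_map;     /* 32 <= 56, fits inline */
        memnode.mapsize = 32;

        for (slot = 0; slot < 32; slot++)
                memnode.map[slot] = slot < 16 ? 0 : 1;
}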
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
new file mode 100644
index 00000000000..47d62743c4d
--- /dev/null
+++ b/arch/x86/include/asm/module.h
@@ -0,0 +1,80 @@
1#ifndef _ASM_X86_MODULE_H
2#define _ASM_X86_MODULE_H
3
4/* x86_32/64 are simple */
5struct mod_arch_specific {};
6
7#ifdef CONFIG_X86_32
8# define Elf_Shdr Elf32_Shdr
9# define Elf_Sym Elf32_Sym
10# define Elf_Ehdr Elf32_Ehdr
11#else
12# define Elf_Shdr Elf64_Shdr
13# define Elf_Sym Elf64_Sym
14# define Elf_Ehdr Elf64_Ehdr
15#endif
16
17#ifdef CONFIG_X86_64
18/* X86_64 does not define MODULE_PROC_FAMILY */
19#elif defined CONFIG_M386
20#define MODULE_PROC_FAMILY "386 "
21#elif defined CONFIG_M486
22#define MODULE_PROC_FAMILY "486 "
23#elif defined CONFIG_M586
24#define MODULE_PROC_FAMILY "586 "
25#elif defined CONFIG_M586TSC
26#define MODULE_PROC_FAMILY "586TSC "
27#elif defined CONFIG_M586MMX
28#define MODULE_PROC_FAMILY "586MMX "
29#elif defined CONFIG_MCORE2
30#define MODULE_PROC_FAMILY "CORE2 "
31#elif defined CONFIG_M686
32#define MODULE_PROC_FAMILY "686 "
33#elif defined CONFIG_MPENTIUMII
34#define MODULE_PROC_FAMILY "PENTIUMII "
35#elif defined CONFIG_MPENTIUMIII
36#define MODULE_PROC_FAMILY "PENTIUMIII "
37#elif defined CONFIG_MPENTIUMM
38#define MODULE_PROC_FAMILY "PENTIUMM "
39#elif defined CONFIG_MPENTIUM4
40#define MODULE_PROC_FAMILY "PENTIUM4 "
41#elif defined CONFIG_MK6
42#define MODULE_PROC_FAMILY "K6 "
43#elif defined CONFIG_MK7
44#define MODULE_PROC_FAMILY "K7 "
45#elif defined CONFIG_MK8
46#define MODULE_PROC_FAMILY "K8 "
47#elif defined CONFIG_X86_ELAN
48#define MODULE_PROC_FAMILY "ELAN "
49#elif defined CONFIG_MCRUSOE
50#define MODULE_PROC_FAMILY "CRUSOE "
51#elif defined CONFIG_MEFFICEON
52#define MODULE_PROC_FAMILY "EFFICEON "
53#elif defined CONFIG_MWINCHIPC6
54#define MODULE_PROC_FAMILY "WINCHIPC6 "
55#elif defined CONFIG_MWINCHIP3D
56#define MODULE_PROC_FAMILY "WINCHIP3D "
57#elif defined CONFIG_MCYRIXIII
58#define MODULE_PROC_FAMILY "CYRIXIII "
59#elif defined CONFIG_MVIAC3_2
60#define MODULE_PROC_FAMILY "VIAC3-2 "
61#elif defined CONFIG_MVIAC7
62#define MODULE_PROC_FAMILY "VIAC7 "
63#elif defined CONFIG_MGEODEGX1
64#define MODULE_PROC_FAMILY "GEODEGX1 "
65#elif defined CONFIG_MGEODE_LX
66#define MODULE_PROC_FAMILY "GEODE "
67#else
68#error unknown processor family
69#endif
70
71#ifdef CONFIG_X86_32
72# ifdef CONFIG_4KSTACKS
73# define MODULE_STACKSIZE "4KSTACKS "
74# else
75# define MODULE_STACKSIZE ""
76# endif
77# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
78#endif
79
80#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
new file mode 100644
index 00000000000..91885c28f66
--- /dev/null
+++ b/arch/x86/include/asm/mpspec.h
@@ -0,0 +1,145 @@
1#ifndef _ASM_X86_MPSPEC_H
2#define _ASM_X86_MPSPEC_H
3
4#include <linux/init.h>
5
6#include <asm/mpspec_def.h>
7
8extern int apic_version[MAX_APICS];
9
10#ifdef CONFIG_X86_32
11#include <mach_mpspec.h>
12
13extern unsigned int def_to_bigsmp;
14extern u8 apicid_2_node[];
15extern int pic_mode;
16
17#ifdef CONFIG_X86_NUMAQ
18extern int mp_bus_id_to_node[MAX_MP_BUSSES];
19extern int mp_bus_id_to_local[MAX_MP_BUSSES];
20extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
21#endif
22
23#define MAX_APICID 256
24
25#else
26
27#define MAX_MP_BUSSES 256
28/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
29#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
30
31#endif
32
33extern void early_find_smp_config(void);
34extern void early_get_smp_config(void);
35
36#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
37extern int mp_bus_id_to_type[MAX_MP_BUSSES];
38#endif
39
40extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
41
42extern unsigned int boot_cpu_physical_apicid;
43extern unsigned int max_physical_apicid;
44extern int smp_found_config;
45extern int mpc_default_type;
46extern unsigned long mp_lapic_addr;
47
48extern void find_smp_config(void);
49extern void get_smp_config(void);
50#ifdef CONFIG_X86_MPPARSE
51extern void early_reserve_e820_mpc_new(void);
52#else
53static inline void early_reserve_e820_mpc_new(void) { }
54#endif
55
56void __cpuinit generic_processor_info(int apicid, int version);
57#ifdef CONFIG_ACPI
58extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
59extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
60 u32 gsi);
61extern void mp_config_acpi_legacy_irqs(void);
62extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
63#ifdef CONFIG_X86_IO_APIC
64extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
65 u32 gsi, int triggering, int polarity);
66#else
67static inline int
68mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
69 u32 gsi, int triggering, int polarity)
70{
71 return 0;
72}
73#endif
74#endif /* CONFIG_ACPI */
75
76#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
77
78struct physid_mask {
79 unsigned long mask[PHYSID_ARRAY_SIZE];
80};
81
82typedef struct physid_mask physid_mask_t;
83
84#define physid_set(physid, map) set_bit(physid, (map).mask)
85#define physid_clear(physid, map) clear_bit(physid, (map).mask)
86#define physid_isset(physid, map) test_bit(physid, (map).mask)
87#define physid_test_and_set(physid, map) \
88 test_and_set_bit(physid, (map).mask)
89
90#define physids_and(dst, src1, src2) \
91 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
92
93#define physids_or(dst, src1, src2) \
94 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
95
96#define physids_clear(map) \
97 bitmap_zero((map).mask, MAX_APICS)
98
99#define physids_complement(dst, src) \
100 bitmap_complement((dst).mask, (src).mask, MAX_APICS)
101
102#define physids_empty(map) \
103 bitmap_empty((map).mask, MAX_APICS)
104
105#define physids_equal(map1, map2) \
106 bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
107
108#define physids_weight(map) \
109 bitmap_weight((map).mask, MAX_APICS)
110
111#define physids_shift_right(d, s, n) \
112 bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
113
114#define physids_shift_left(d, s, n) \
115 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
116
117#define physids_coerce(map) ((map).mask[0])
118
119#define physids_promote(physids) \
120 ({ \
121 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
122 __physid_mask.mask[0] = physids; \
123 __physid_mask; \
124 })
125
126/* Note: will create very large stack frames if physid_mask_t is big */
127#define physid_mask_of_physid(physid) \
128 ({ \
129 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
130 physid_set(physid, __physid_mask); \
131 __physid_mask; \
132 })
133
134static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
135{
136 physids_clear(*map);
137 physid_set(physid, *map);
138}
139
140#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
141#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
142
143extern physid_mask_t phys_cpu_present_map;
144
145#endif /* _ASM_X86_MPSPEC_H */
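The physid machinery is a thin bitmap wrapper sized by MAX_APICS; the macros take the mask structure itself, not a pointer. A short sketch of how they compose (the function is hypothetical):

static void example_physids(void)
{
        physid_mask_t mask = PHYSID_MASK_NONE;
        physid_mask_t present;
        int id;

        for (id = 0; id < 4; id++)
                physid_set(id, mask);           /* mark APIC ids 0..3 */

        /* Intersect with the APIC ids actually present in the system. */
        physids_and(present, mask, phys_cpu_present_map);

        if (physid_isset(2, present))
                ;       /* APIC id 2 is both marked and present */
}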
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
new file mode 100644
index 00000000000..e3ace7d1d35
--- /dev/null
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -0,0 +1,180 @@
1#ifndef _ASM_X86_MPSPEC_DEF_H
2#define _ASM_X86_MPSPEC_DEF_H
3
4/*
5 * Structure definitions for SMP machines following the
6 * Intel Multiprocessing Specification 1.1 and 1.4.
7 */
8
9/*
10 * This tag identifies where the SMP configuration
11 * information is.
12 */
13
14#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
15
16#ifdef CONFIG_X86_32
17# define MAX_MPC_ENTRY 1024
18# define MAX_APICS 256
19#else
20# if NR_CPUS <= 255
21# define MAX_APICS 255
22# else
23# define MAX_APICS 32768
24# endif
25#endif
26
27struct intel_mp_floating {
28 char mpf_signature[4]; /* "_MP_" */
29 unsigned int mpf_physptr; /* Configuration table address */
30 unsigned char mpf_length; /* Our length (paragraphs) */
31 unsigned char mpf_specification; /* Specification version */
32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */
33 unsigned char mpf_feature1; /* Standard or configuration ? */
34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
35 unsigned char mpf_feature3; /* Unused (0) */
36 unsigned char mpf_feature4; /* Unused (0) */
37 unsigned char mpf_feature5; /* Unused (0) */
38};
39
40#define MPC_SIGNATURE "PCMP"
41
42struct mp_config_table {
43 char mpc_signature[4];
44 unsigned short mpc_length; /* Size of table */
45 char mpc_spec; /* 0x01 */
46 char mpc_checksum;
47 char mpc_oem[8];
48 char mpc_productid[12];
49 unsigned int mpc_oemptr; /* 0 if not present */
50 unsigned short mpc_oemsize; /* 0 if not present */
51 unsigned short mpc_oemcount;
52 unsigned int mpc_lapic; /* APIC address */
53 unsigned int reserved;
54};
55
56/* Followed by entries */
57
58#define MP_PROCESSOR 0
59#define MP_BUS 1
60#define MP_IOAPIC 2
61#define MP_INTSRC 3
62#define MP_LINTSRC 4
63/* Used by IBM NUMA-Q to describe node locality */
64#define MP_TRANSLATION 192
65
66#define CPU_ENABLED 1 /* Processor is available */
67#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
68
69#define CPU_STEPPING_MASK 0x000F
70#define CPU_MODEL_MASK 0x00F0
71#define CPU_FAMILY_MASK 0x0F00
72
73struct mpc_config_processor {
74 unsigned char mpc_type;
75 unsigned char mpc_apicid; /* Local APIC number */
76 unsigned char mpc_apicver; /* Its version */
77 unsigned char mpc_cpuflag;
78 unsigned int mpc_cpufeature;
79 unsigned int mpc_featureflag; /* CPUID feature value */
80 unsigned int mpc_reserved[2];
81};
82
83struct mpc_config_bus {
84 unsigned char mpc_type;
85 unsigned char mpc_busid;
86 unsigned char mpc_bustype[6];
87};
88
89/* List of Bus Type string values, Intel MP Spec. */
90#define BUSTYPE_EISA "EISA"
91#define BUSTYPE_ISA "ISA"
92#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
93#define BUSTYPE_MCA "MCA"
94#define BUSTYPE_VL "VL" /* Local bus */
95#define BUSTYPE_PCI "PCI"
96#define BUSTYPE_PCMCIA "PCMCIA"
97#define BUSTYPE_CBUS "CBUS"
98#define BUSTYPE_CBUSII "CBUSII"
99#define BUSTYPE_FUTURE "FUTURE"
100#define BUSTYPE_MBI "MBI"
101#define BUSTYPE_MBII "MBII"
102#define BUSTYPE_MPI "MPI"
103#define BUSTYPE_MPSA "MPSA"
104#define BUSTYPE_NUBUS "NUBUS"
105#define BUSTYPE_TC "TC"
106#define BUSTYPE_VME "VME"
107#define BUSTYPE_XPRESS "XPRESS"
108
109#define MPC_APIC_USABLE 0x01
110
111struct mpc_config_ioapic {
112 unsigned char mpc_type;
113 unsigned char mpc_apicid;
114 unsigned char mpc_apicver;
115 unsigned char mpc_flags;
116 unsigned int mpc_apicaddr;
117};
118
119struct mpc_config_intsrc {
120 unsigned char mpc_type;
121 unsigned char mpc_irqtype;
122 unsigned short mpc_irqflag;
123 unsigned char mpc_srcbus;
124 unsigned char mpc_srcbusirq;
125 unsigned char mpc_dstapic;
126 unsigned char mpc_dstirq;
127};
128
129enum mp_irq_source_types {
130 mp_INT = 0,
131 mp_NMI = 1,
132 mp_SMI = 2,
133 mp_ExtINT = 3
134};
135
136#define MP_IRQDIR_DEFAULT 0
137#define MP_IRQDIR_HIGH 1
138#define MP_IRQDIR_LOW 3
139
140#define MP_APIC_ALL 0xFF
141
142struct mpc_config_lintsrc {
143 unsigned char mpc_type;
144 unsigned char mpc_irqtype;
145 unsigned short mpc_irqflag;
146 unsigned char mpc_srcbusid;
147 unsigned char mpc_srcbusirq;
148 unsigned char mpc_destapic;
149 unsigned char mpc_destapiclint;
150};
151
152#define MPC_OEM_SIGNATURE "_OEM"
153
154struct mp_config_oemtable {
155 char oem_signature[4];
156 unsigned short oem_length; /* Size of table */
157 char oem_rev; /* 0x01 */
158 char oem_checksum;
159 char mpc_oem[8];
160};
161
162/*
163 * Default configurations
164 *
165 * 1 2 CPU ISA 82489DX
166 * 2 2 CPU EISA 82489DX (neither IRQ 0 timer nor IRQ 13 DMA chaining)
167 * 3 2 CPU EISA 82489DX
168 * 4 2 CPU MCA 82489DX
169 * 5 2 CPU ISA+PCI
170 * 6 2 CPU EISA+PCI
171 * 7 2 CPU MCA+PCI
172 */
173
174enum mp_bustype {
175 MP_BUS_ISA = 1,
176 MP_BUS_EISA,
177 MP_BUS_PCI,
178 MP_BUS_MCA,
179};
180#endif /* _ASM_X86_MPSPEC_DEF_H */
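The floating pointer structure is located by scanning 16-byte boundaries for SMP_MAGIC_IDENT and checking that its bytes sum to zero, which mpf_checksum makes true by construction. A sketch of that scan, loosely following the shape of the mpparse code (the helper names are invented, and phys_to_virt() from <asm/io.h> is assumed):

static int __init example_mpf_checksum(unsigned char *mp, int len)
{
        int sum = 0;

        while (len--)
                sum += *mp++;
        return sum & 0xFF;                      /* zero for a valid table */
}

static struct intel_mp_floating * __init example_scan_mpf(unsigned long base,
                                                          unsigned long length)
{
        unsigned int *bp = phys_to_virt(base);

        while (length >= 16) {
                struct intel_mp_floating *mpf = (struct intel_mp_floating *)bp;

                if (*bp == SMP_MAGIC_IDENT && mpf->mpf_length == 1 &&
                    !example_mpf_checksum((unsigned char *)bp, 16))
                        return mpf;
                bp += 4;                        /* advance 16 bytes */
                length -= 16;
        }
        return NULL;
}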
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
new file mode 100644
index 00000000000..7e4e9481f51
--- /dev/null
+++ b/arch/x86/include/asm/msgbuf.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_MSGBUF_H
2#define _ASM_X86_MSGBUF_H
3
4/*
5 * The msqid64_ds structure for i386 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on i386 is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on x86-64 is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16struct msqid64_ds {
17 struct ipc64_perm msg_perm;
18 __kernel_time_t msg_stime; /* last msgsnd time */
19#ifdef __i386__
20 unsigned long __unused1;
21#endif
22 __kernel_time_t msg_rtime; /* last msgrcv time */
23#ifdef __i386__
24 unsigned long __unused2;
25#endif
26 __kernel_time_t msg_ctime; /* last change time */
27#ifdef __i386__
28 unsigned long __unused3;
29#endif
30 unsigned long msg_cbytes; /* current number of bytes on queue */
31 unsigned long msg_qnum; /* number of messages in queue */
32 unsigned long msg_qbytes; /* max number of bytes on queue */
33 __kernel_pid_t msg_lspid; /* pid of last msgsnd */
34 __kernel_pid_t msg_lrpid; /* last receive pid */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
new file mode 100644
index 00000000000..6706b3006f1
--- /dev/null
+++ b/arch/x86/include/asm/msidef.h
@@ -0,0 +1,55 @@
1#ifndef _ASM_X86_MSIDEF_H
2#define _ASM_X86_MSIDEF_H
3
4/*
5 * Constants for Intel APIC based MSI messages.
6 */
7
8/*
9 * Shifts for MSI data
10 */
11
12#define MSI_DATA_VECTOR_SHIFT 0
13#define MSI_DATA_VECTOR_MASK 0x000000ff
14#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
15 MSI_DATA_VECTOR_MASK)
16
17#define MSI_DATA_DELIVERY_MODE_SHIFT 8
18#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
19#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
20
21#define MSI_DATA_LEVEL_SHIFT 14
22#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
23#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
24
25#define MSI_DATA_TRIGGER_SHIFT 15
26#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
27#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
28
29/*
30 * Shift/mask fields for msi address
31 */
32
33#define MSI_ADDR_BASE_HI 0
34#define MSI_ADDR_BASE_LO 0xfee00000
35
36#define MSI_ADDR_DEST_MODE_SHIFT 2
37#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
38#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
39
40#define MSI_ADDR_REDIRECTION_SHIFT 3
41#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
42 /* dedicated cpu */
43#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
44 /* lowest priority */
45
46#define MSI_ADDR_DEST_ID_SHIFT 12
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK)
50
51#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3)
53#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
54#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
55#endif /* _ASM_X86_MSIDEF_H */
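Composing an MSI message is just OR-ing these fields into the address and data words. A sketch delivering vector 0x41 to APIC id 3 as a fixed, edge-triggered interrupt (the vector and destination are illustrative):

static void example_compose_msi(u32 *addr_lo, u32 *data)
{
        *addr_lo = MSI_ADDR_BASE_LO |
                   MSI_ADDR_DEST_MODE_PHYSICAL |
                   MSI_ADDR_REDIRECTION_CPU |
                   MSI_ADDR_DEST_ID(3);

        *data = MSI_DATA_TRIGGER_EDGE |
                MSI_DATA_LEVEL_ASSERT |
                MSI_DATA_DELIVERY_FIXED |
                MSI_DATA_VECTOR(0x41);
}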
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
new file mode 100644
index 00000000000..e38859d577a
--- /dev/null
+++ b/arch/x86/include/asm/msr-index.h
@@ -0,0 +1,332 @@
1#ifndef _ASM_X86_MSR_INDEX_H
2#define _ASM_X86_MSR_INDEX_H
3
4/* CPU model specific register (MSR) numbers */
5
6/* x86-64 specific MSRs */
7#define MSR_EFER 0xc0000080 /* extended feature register */
8#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
9#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
10#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
11#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
15
16/* EFER bits: */
17#define _EFER_SCE 0 /* SYSCALL/SYSRET */
18#define _EFER_LME 8 /* Long mode enable */
19#define _EFER_LMA 10 /* Long mode active (read-only) */
20#define _EFER_NX 11 /* No execute enable */
21
22#define EFER_SCE (1<<_EFER_SCE)
23#define EFER_LME (1<<_EFER_LME)
24#define EFER_LMA (1<<_EFER_LMA)
25#define EFER_NX (1<<_EFER_NX)
26
27/* Intel MSRs. Some also available on other CPUs */
28#define MSR_IA32_PERFCTR0 0x000000c1
29#define MSR_IA32_PERFCTR1 0x000000c2
30#define MSR_FSB_FREQ 0x000000cd
31
32#define MSR_MTRRcap 0x000000fe
33#define MSR_IA32_BBL_CR_CTL 0x00000119
34
35#define MSR_IA32_SYSENTER_CS 0x00000174
36#define MSR_IA32_SYSENTER_ESP 0x00000175
37#define MSR_IA32_SYSENTER_EIP 0x00000176
38
39#define MSR_IA32_MCG_CAP 0x00000179
40#define MSR_IA32_MCG_STATUS 0x0000017a
41#define MSR_IA32_MCG_CTL 0x0000017b
42
43#define MSR_IA32_PEBS_ENABLE 0x000003f1
44#define MSR_IA32_DS_AREA 0x00000600
45#define MSR_IA32_PERF_CAPABILITIES 0x00000345
46
47#define MSR_MTRRfix64K_00000 0x00000250
48#define MSR_MTRRfix16K_80000 0x00000258
49#define MSR_MTRRfix16K_A0000 0x00000259
50#define MSR_MTRRfix4K_C0000 0x00000268
51#define MSR_MTRRfix4K_C8000 0x00000269
52#define MSR_MTRRfix4K_D0000 0x0000026a
53#define MSR_MTRRfix4K_D8000 0x0000026b
54#define MSR_MTRRfix4K_E0000 0x0000026c
55#define MSR_MTRRfix4K_E8000 0x0000026d
56#define MSR_MTRRfix4K_F0000 0x0000026e
57#define MSR_MTRRfix4K_F8000 0x0000026f
58#define MSR_MTRRdefType 0x000002ff
59
60#define MSR_IA32_CR_PAT 0x00000277
61
62#define MSR_IA32_DEBUGCTLMSR 0x000001d9
63#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
64#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
65#define MSR_IA32_LASTINTFROMIP 0x000001dd
66#define MSR_IA32_LASTINTTOIP 0x000001de
67
68/* DEBUGCTLMSR bits (others vary by model): */
69#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
70#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
71
72#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
73#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
74
75#define MSR_IA32_MC0_CTL 0x00000400
76#define MSR_IA32_MC0_STATUS 0x00000401
77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403
79
80#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186
83#define MSR_P6_EVNTSEL1 0x00000187
84
85/* AMD64 MSRs. Not complete. See the architecture manual for a more
86 complete list. */
87
88#define MSR_AMD64_NB_CFG 0xc001001f
89#define MSR_AMD64_IBSFETCHCTL 0xc0011030
90#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
91#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
92#define MSR_AMD64_IBSOPCTL 0xc0011033
93#define MSR_AMD64_IBSOPRIP 0xc0011034
94#define MSR_AMD64_IBSOPDATA 0xc0011035
95#define MSR_AMD64_IBSOPDATA2 0xc0011036
96#define MSR_AMD64_IBSOPDATA3 0xc0011037
97#define MSR_AMD64_IBSDCLINAD 0xc0011038
98#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
99#define MSR_AMD64_IBSCTL 0xc001103a
100
101/* Fam 10h MSRs */
102#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
103#define FAM10H_MMIO_CONF_ENABLE (1<<0)
104#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
105#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
106#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
107#define FAM10H_MMIO_CONF_BASE_SHIFT 20
108
109/* K8 MSRs */
110#define MSR_K8_TOP_MEM1 0xc001001a
111#define MSR_K8_TOP_MEM2 0xc001001d
112#define MSR_K8_SYSCFG 0xc0010010
113#define MSR_K8_HWCR 0xc0010015
114#define MSR_K8_INT_PENDING_MSG 0xc0010055
115/* C1E active bits in int pending message */
116#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
117#define MSR_K8_TSEG_ADDR 0xc0010112
118#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
119#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
120#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
121
122/* K7 MSRs */
123#define MSR_K7_EVNTSEL0 0xc0010000
124#define MSR_K7_PERFCTR0 0xc0010004
125#define MSR_K7_EVNTSEL1 0xc0010001
126#define MSR_K7_PERFCTR1 0xc0010005
127#define MSR_K7_EVNTSEL2 0xc0010002
128#define MSR_K7_PERFCTR2 0xc0010006
129#define MSR_K7_EVNTSEL3 0xc0010003
130#define MSR_K7_PERFCTR3 0xc0010007
131#define MSR_K7_CLK_CTL 0xc001001b
132#define MSR_K7_HWCR 0xc0010015
133#define MSR_K7_FID_VID_CTL 0xc0010041
134#define MSR_K7_FID_VID_STATUS 0xc0010042
135
136/* K6 MSRs */
137#define MSR_K6_EFER 0xc0000080
138#define MSR_K6_STAR 0xc0000081
139#define MSR_K6_WHCR 0xc0000082
140#define MSR_K6_UWCCR 0xc0000085
141#define MSR_K6_EPMR 0xc0000086
142#define MSR_K6_PSOR 0xc0000087
143#define MSR_K6_PFIR 0xc0000088
144
145/* Centaur-Hauls/IDT defined MSRs. */
146#define MSR_IDT_FCR1 0x00000107
147#define MSR_IDT_FCR2 0x00000108
148#define MSR_IDT_FCR3 0x00000109
149#define MSR_IDT_FCR4 0x0000010a
150
151#define MSR_IDT_MCR0 0x00000110
152#define MSR_IDT_MCR1 0x00000111
153#define MSR_IDT_MCR2 0x00000112
154#define MSR_IDT_MCR3 0x00000113
155#define MSR_IDT_MCR4 0x00000114
156#define MSR_IDT_MCR5 0x00000115
157#define MSR_IDT_MCR6 0x00000116
158#define MSR_IDT_MCR7 0x00000117
159#define MSR_IDT_MCR_CTRL 0x00000120
160
161/* VIA Cyrix defined MSRs*/
162#define MSR_VIA_FCR 0x00001107
163#define MSR_VIA_LONGHAUL 0x0000110a
164#define MSR_VIA_RNG 0x0000110b
165#define MSR_VIA_BCR2 0x00001147
166
167/* Transmeta defined MSRs */
168#define MSR_TMTA_LONGRUN_CTRL 0x80868010
169#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
170#define MSR_TMTA_LRTI_READOUT 0x80868018
171#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
172
173/* Intel defined MSRs. */
174#define MSR_IA32_P5_MC_ADDR 0x00000000
175#define MSR_IA32_P5_MC_TYPE 0x00000001
176#define MSR_IA32_TSC 0x00000010
177#define MSR_IA32_PLATFORM_ID 0x00000017
178#define MSR_IA32_EBL_CR_POWERON 0x0000002a
179#define MSR_IA32_FEATURE_CONTROL 0x0000003a
180
181#define FEATURE_CONTROL_LOCKED (1<<0)
182#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
183
184#define MSR_IA32_APICBASE 0x0000001b
185#define MSR_IA32_APICBASE_BSP (1<<8)
186#define MSR_IA32_APICBASE_ENABLE (1<<11)
187#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
188
189#define MSR_IA32_UCODE_WRITE 0x00000079
190#define MSR_IA32_UCODE_REV 0x0000008b
191
192#define MSR_IA32_PERF_STATUS 0x00000198
193#define MSR_IA32_PERF_CTL 0x00000199
194
195#define MSR_IA32_MPERF 0x000000e7
196#define MSR_IA32_APERF 0x000000e8
197
198#define MSR_IA32_THERM_CONTROL 0x0000019a
199#define MSR_IA32_THERM_INTERRUPT 0x0000019b
200#define MSR_IA32_THERM_STATUS 0x0000019c
201#define MSR_IA32_MISC_ENABLE 0x000001a0
202
203/* Intel Model 6 */
204#define MSR_P6_EVNTSEL0 0x00000186
205#define MSR_P6_EVNTSEL1 0x00000187
206
207/* P4/Xeon+ specific */
208#define MSR_IA32_MCG_EAX 0x00000180
209#define MSR_IA32_MCG_EBX 0x00000181
210#define MSR_IA32_MCG_ECX 0x00000182
211#define MSR_IA32_MCG_EDX 0x00000183
212#define MSR_IA32_MCG_ESI 0x00000184
213#define MSR_IA32_MCG_EDI 0x00000185
214#define MSR_IA32_MCG_EBP 0x00000186
215#define MSR_IA32_MCG_ESP 0x00000187
216#define MSR_IA32_MCG_EFLAGS 0x00000188
217#define MSR_IA32_MCG_EIP 0x00000189
218#define MSR_IA32_MCG_RESERVED 0x0000018a
219
220/* Pentium IV performance counter MSRs */
221#define MSR_P4_BPU_PERFCTR0 0x00000300
222#define MSR_P4_BPU_PERFCTR1 0x00000301
223#define MSR_P4_BPU_PERFCTR2 0x00000302
224#define MSR_P4_BPU_PERFCTR3 0x00000303
225#define MSR_P4_MS_PERFCTR0 0x00000304
226#define MSR_P4_MS_PERFCTR1 0x00000305
227#define MSR_P4_MS_PERFCTR2 0x00000306
228#define MSR_P4_MS_PERFCTR3 0x00000307
229#define MSR_P4_FLAME_PERFCTR0 0x00000308
230#define MSR_P4_FLAME_PERFCTR1 0x00000309
231#define MSR_P4_FLAME_PERFCTR2 0x0000030a
232#define MSR_P4_FLAME_PERFCTR3 0x0000030b
233#define MSR_P4_IQ_PERFCTR0 0x0000030c
234#define MSR_P4_IQ_PERFCTR1 0x0000030d
235#define MSR_P4_IQ_PERFCTR2 0x0000030e
236#define MSR_P4_IQ_PERFCTR3 0x0000030f
237#define MSR_P4_IQ_PERFCTR4 0x00000310
238#define MSR_P4_IQ_PERFCTR5 0x00000311
239#define MSR_P4_BPU_CCCR0 0x00000360
240#define MSR_P4_BPU_CCCR1 0x00000361
241#define MSR_P4_BPU_CCCR2 0x00000362
242#define MSR_P4_BPU_CCCR3 0x00000363
243#define MSR_P4_MS_CCCR0 0x00000364
244#define MSR_P4_MS_CCCR1 0x00000365
245#define MSR_P4_MS_CCCR2 0x00000366
246#define MSR_P4_MS_CCCR3 0x00000367
247#define MSR_P4_FLAME_CCCR0 0x00000368
248#define MSR_P4_FLAME_CCCR1 0x00000369
249#define MSR_P4_FLAME_CCCR2 0x0000036a
250#define MSR_P4_FLAME_CCCR3 0x0000036b
251#define MSR_P4_IQ_CCCR0 0x0000036c
252#define MSR_P4_IQ_CCCR1 0x0000036d
253#define MSR_P4_IQ_CCCR2 0x0000036e
254#define MSR_P4_IQ_CCCR3 0x0000036f
255#define MSR_P4_IQ_CCCR4 0x00000370
256#define MSR_P4_IQ_CCCR5 0x00000371
257#define MSR_P4_ALF_ESCR0 0x000003ca
258#define MSR_P4_ALF_ESCR1 0x000003cb
259#define MSR_P4_BPU_ESCR0 0x000003b2
260#define MSR_P4_BPU_ESCR1 0x000003b3
261#define MSR_P4_BSU_ESCR0 0x000003a0
262#define MSR_P4_BSU_ESCR1 0x000003a1
263#define MSR_P4_CRU_ESCR0 0x000003b8
264#define MSR_P4_CRU_ESCR1 0x000003b9
265#define MSR_P4_CRU_ESCR2 0x000003cc
266#define MSR_P4_CRU_ESCR3 0x000003cd
267#define MSR_P4_CRU_ESCR4 0x000003e0
268#define MSR_P4_CRU_ESCR5 0x000003e1
269#define MSR_P4_DAC_ESCR0 0x000003a8
270#define MSR_P4_DAC_ESCR1 0x000003a9
271#define MSR_P4_FIRM_ESCR0 0x000003a4
272#define MSR_P4_FIRM_ESCR1 0x000003a5
273#define MSR_P4_FLAME_ESCR0 0x000003a6
274#define MSR_P4_FLAME_ESCR1 0x000003a7
275#define MSR_P4_FSB_ESCR0 0x000003a2
276#define MSR_P4_FSB_ESCR1 0x000003a3
277#define MSR_P4_IQ_ESCR0 0x000003ba
278#define MSR_P4_IQ_ESCR1 0x000003bb
279#define MSR_P4_IS_ESCR0 0x000003b4
280#define MSR_P4_IS_ESCR1 0x000003b5
281#define MSR_P4_ITLB_ESCR0 0x000003b6
282#define MSR_P4_ITLB_ESCR1 0x000003b7
283#define MSR_P4_IX_ESCR0 0x000003c8
284#define MSR_P4_IX_ESCR1 0x000003c9
285#define MSR_P4_MOB_ESCR0 0x000003aa
286#define MSR_P4_MOB_ESCR1 0x000003ab
287#define MSR_P4_MS_ESCR0 0x000003c0
288#define MSR_P4_MS_ESCR1 0x000003c1
289#define MSR_P4_PMH_ESCR0 0x000003ac
290#define MSR_P4_PMH_ESCR1 0x000003ad
291#define MSR_P4_RAT_ESCR0 0x000003bc
292#define MSR_P4_RAT_ESCR1 0x000003bd
293#define MSR_P4_SAAT_ESCR0 0x000003ae
294#define MSR_P4_SAAT_ESCR1 0x000003af
295#define MSR_P4_SSU_ESCR0 0x000003be
296#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
297
298#define MSR_P4_TBPU_ESCR0 0x000003c2
299#define MSR_P4_TBPU_ESCR1 0x000003c3
300#define MSR_P4_TC_ESCR0 0x000003c4
301#define MSR_P4_TC_ESCR1 0x000003c5
302#define MSR_P4_U2L_ESCR0 0x000003b0
303#define MSR_P4_U2L_ESCR1 0x000003b1
304
305/* Intel Core-based CPU performance counters */
306#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
307#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
308#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
309#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
310#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
311#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
312#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
313
314/* Geode defined MSRs */
315#define MSR_GEODE_BUSCONT_CONF0 0x00001900
316
317/* Intel VT MSRs */
318#define MSR_IA32_VMX_BASIC 0x00000480
319#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
320#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
321#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
322#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
323#define MSR_IA32_VMX_MISC 0x00000485
324#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
325#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
326#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
327#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
328#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
329#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
330#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
331
332#endif /* _ASM_X86_MSR_INDEX_H */
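
To make the MSR_IA32_APICBASE bit layout above concrete, here is a minimal
standalone sketch that decodes the BSP, enable, and base fields. The sample
value 0xfee00900 is illustrative, and UL suffixes are added so the masks build
cleanly outside the kernel:

#include <stdio.h>
#include <stdint.h>

#define APICBASE_BSP     (1UL << 8)
#define APICBASE_ENABLE  (1UL << 11)
#define APICBASE_BASE    (0xfffffUL << 12)

int main(void)
{
	/* A plausible value read from MSR 0x1b on the boot CPU. */
	uint64_t val = 0xfee00900;

	printf("BSP:     %d\n", !!(val & APICBASE_BSP));
	printf("enabled: %d\n", !!(val & APICBASE_ENABLE));
	printf("base:    %#llx\n",
	       (unsigned long long)(val & APICBASE_BASE));	/* 0xfee00000 */
	return 0;
}
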
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
new file mode 100644
index 00000000000..46be2fa7ac2
--- /dev/null
+++ b/arch/x86/include/asm/msr.h
@@ -0,0 +1,247 @@
1#ifndef _ASM_X86_MSR_H
2#define _ASM_X86_MSR_H
3
4#include <asm/msr-index.h>
5
6#ifndef __ASSEMBLY__
7# include <linux/types.h>
8#endif
9
10#ifdef __KERNEL__
11#ifndef __ASSEMBLY__
12
13#include <asm/asm.h>
14#include <asm/errno.h>
15
16static inline unsigned long long native_read_tscp(unsigned int *aux)
17{
18 unsigned long low, high;
19 asm volatile(".byte 0x0f,0x01,0xf9" /* rdtscp */
20 : "=a" (low), "=d" (high), "=c" (*aux));
21 return low | ((u64)high << 32);
22}
23
24/*
25 * the i386 calling convention returns a 64-bit value in edx:eax, while
26 * x86_64 returns it in rax. Also, the "A" constraint does not really
27 * mean rdx:rax on x86_64, so we need specialized behaviour for each
28 * architecture.
29 */
30#ifdef CONFIG_X86_64
31#define DECLARE_ARGS(val, low, high) unsigned low, high
32#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
33#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
34#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
35#else
36#define DECLARE_ARGS(val, low, high) unsigned long long val
37#define EAX_EDX_VAL(val, low, high) (val)
38#define EAX_EDX_ARGS(val, low, high) "A" (val)
39#define EAX_EDX_RET(val, low, high) "=A" (val)
40#endif
41
42static inline unsigned long long native_read_msr(unsigned int msr)
43{
44 DECLARE_ARGS(val, low, high);
45
46 asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
47 return EAX_EDX_VAL(val, low, high);
48}
49
50static inline unsigned long long native_read_msr_safe(unsigned int msr,
51 int *err)
52{
53 DECLARE_ARGS(val, low, high);
54
55 asm volatile("2: rdmsr ; xor %[err],%[err]\n"
56 "1:\n\t"
57 ".section .fixup,\"ax\"\n\t"
58 "3: mov %[fault],%[err] ; jmp 1b\n\t"
59 ".previous\n\t"
60 _ASM_EXTABLE(2b, 3b)
61 : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
62 : "c" (msr), [fault] "i" (-EFAULT));
63 return EAX_EDX_VAL(val, low, high);
64}
65
66static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
67 int *err)
68{
69 DECLARE_ARGS(val, low, high);
70
71 asm volatile("2: rdmsr ; xor %0,%0\n"
72 "1:\n\t"
73 ".section .fixup,\"ax\"\n\t"
74 "3: mov %3,%0 ; jmp 1b\n\t"
75 ".previous\n\t"
76 _ASM_EXTABLE(2b, 3b)
77 : "=r" (*err), EAX_EDX_RET(val, low, high)
78 : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT)); /* magic AMD "passcode" in %edi */
79 return EAX_EDX_VAL(val, low, high);
80}
81
82static inline void native_write_msr(unsigned int msr,
83 unsigned low, unsigned high)
84{
85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
86}
87
88static inline int native_write_msr_safe(unsigned int msr,
89 unsigned low, unsigned high)
90{
91 int err;
92 asm volatile("2: wrmsr ; xor %[err],%[err]\n"
93 "1:\n\t"
94 ".section .fixup,\"ax\"\n\t"
95 "3: mov %[fault],%[err] ; jmp 1b\n\t"
96 ".previous\n\t"
97 _ASM_EXTABLE(2b, 3b)
98 : [err] "=a" (err)
99 : "c" (msr), "0" (low), "d" (high),
100 [fault] "i" (-EFAULT)
101 : "memory");
102 return err;
103}
104
105extern unsigned long long native_read_tsc(void);
106
107static __always_inline unsigned long long __native_read_tsc(void)
108{
109 DECLARE_ARGS(val, low, high);
110
111 rdtsc_barrier();
112 asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
113 rdtsc_barrier();
114
115 return EAX_EDX_VAL(val, low, high);
116}
117
118static inline unsigned long long native_read_pmc(int counter)
119{
120 DECLARE_ARGS(val, low, high);
121
122 asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
123 return EAX_EDX_VAL(val, low, high);
124}
125
126#ifdef CONFIG_PARAVIRT
127#include <asm/paravirt.h>
128#else
129#include <linux/errno.h>
130/*
131 * Access to model-specific registers (available on 586 and better only).
132 * Note: the rd* operations modify the parameters directly (without using
133 * pointer indirection); this allows gcc to optimize better.
134 */
135
136#define rdmsr(msr, val1, val2) \
137do { \
138 u64 __val = native_read_msr((msr)); \
139 (val1) = (u32)__val; \
140 (val2) = (u32)(__val >> 32); \
141} while (0)
142
143static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
144{
145 native_write_msr(msr, low, high);
146}
147
148#define rdmsrl(msr, val) \
149 ((val) = native_read_msr((msr)))
150
151#define wrmsrl(msr, val) \
152 native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
153
154/* wrmsr with exception handling */
155static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
156{
157 return native_write_msr_safe(msr, low, high);
158}
159
160/* rdmsr with exception handling */
161#define rdmsr_safe(msr, p1, p2) \
162({ \
163 int __err; \
164 u64 __val = native_read_msr_safe((msr), &__err); \
165 (*p1) = (u32)__val; \
166 (*p2) = (u32)(__val >> 32); \
167 __err; \
168})
169
170static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
171{
172 int err;
173
174 *p = native_read_msr_safe(msr, &err);
175 return err;
176}
177static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
178{
179 int err;
180
181 *p = native_read_msr_amd_safe(msr, &err);
182 return err;
183}
184
185#define rdtscl(low) \
186 ((low) = (u32)native_read_tsc())
187
188#define rdtscll(val) \
189 ((val) = native_read_tsc())
190
191#define rdpmc(counter, low, high) \
192do { \
193 u64 _l = native_read_pmc((counter)); \
194 (low) = (u32)_l; \
195 (high) = (u32)(_l >> 32); \
196} while (0)
197
198#define rdtscp(low, high, aux) \
199do { \
200 unsigned long long _val = native_read_tscp(&(aux)); \
201 (low) = (u32)_val; \
202 (high) = (u32)(_val >> 32); \
203} while (0)
204
205#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
206
207#endif /* !CONFIG_PARAVIRT */
208
209
210#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
211 (u32)((val) >> 32))
212
213#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) /* 0x10 = MSR_IA32_TSC */
214
215#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) /* 0xc0000103 = MSR_TSC_AUX */
216
217#ifdef CONFIG_SMP
218int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
219int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
220int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
221int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
222#else /* CONFIG_SMP */
223static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
224{
225 rdmsr(msr_no, *l, *h);
226 return 0;
227}
228static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229{
230 wrmsr(msr_no, l, h);
231 return 0;
232}
233static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
234 u32 *l, u32 *h)
235{
236 return rdmsr_safe(msr_no, l, h);
237}
238static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
239{
240 return wrmsr_safe(msr_no, l, h);
241}
242#endif /* CONFIG_SMP */
243#endif /* __ASSEMBLY__ */
244#endif /* __KERNEL__ */
245
246
247#endif /* _ASM_X86_MSR_H */
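
As a usage sketch for the accessors above, consider a hypothetical kernel-side
helper (the function name is invented and this only works in ring 0; it is not
part of the patch) contrasting the checked and unchecked variants:

/* Hypothetical helper contrasting checked and unchecked MSR access. */
static int example_msr_usage(void)
{
	u64 misc;
	u32 lo, hi;
	int err;

	/* Checked read: returns -EFAULT instead of oopsing if the MSR
	 * does not exist on this CPU. */
	err = rdmsr_safe(MSR_IA32_PLATFORM_ID, &lo, &hi);
	if (err)
		return err;

	/* Unchecked 64-bit read-modify-write of an MSR known to exist. */
	rdmsrl(MSR_IA32_MISC_ENABLE, misc);
	wrmsrl(MSR_IA32_MISC_ENABLE, misc);

	return 0;
}
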
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
new file mode 100644
index 00000000000..7c1e4258b31
--- /dev/null
+++ b/arch/x86/include/asm/mtrr.h
@@ -0,0 +1,173 @@
1/* Generic MTRR (Memory Type Range Register) ioctls.
2
3 Copyright (C) 1997-1999 Richard Gooch
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
19 Richard Gooch may be reached by email at rgooch@atnf.csiro.au
20 The postal address is:
21 Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
22*/
23#ifndef _ASM_X86_MTRR_H
24#define _ASM_X86_MTRR_H
25
26#include <linux/ioctl.h>
27#include <linux/errno.h>
28
29#define MTRR_IOCTL_BASE 'M'
30
31struct mtrr_sentry {
32 unsigned long base; /* Base address */
33 unsigned int size; /* Size of region */
34 unsigned int type; /* Type of region */
35};
36
37/* Warning: this structure has a different order from i386
38 on x86-64. The 32-bit emulation code takes care of that.
39 But you need to use this layout for 64-bit, otherwise your X server
40 will break. */
41
42#ifdef __i386__
43struct mtrr_gentry {
44 unsigned int regnum; /* Register number */
45 unsigned long base; /* Base address */
46 unsigned int size; /* Size of region */
47 unsigned int type; /* Type of region */
48};
49
50#else /* __i386__ */
51
52struct mtrr_gentry {
53 unsigned long base; /* Base address */
54 unsigned int size; /* Size of region */
55 unsigned int regnum; /* Register number */
56 unsigned int type; /* Type of region */
57};
58#endif /* !__i386__ */
59
60/* These are the various ioctls */
61#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
62#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
63#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
64#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
65#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
66#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
67#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
68#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
69#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
70#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
71
72/* These are the region types */
73#define MTRR_TYPE_UNCACHABLE 0
74#define MTRR_TYPE_WRCOMB 1
75/*#define MTRR_TYPE_ 2*/
76/*#define MTRR_TYPE_ 3*/
77#define MTRR_TYPE_WRTHROUGH 4
78#define MTRR_TYPE_WRPROT 5
79#define MTRR_TYPE_WRBACK 6
80#define MTRR_NUM_TYPES 7
81
82#ifdef __KERNEL__
83
84/* The following functions are for use by other drivers */
85# ifdef CONFIG_MTRR
86extern u8 mtrr_type_lookup(u64 addr, u64 end);
87extern void mtrr_save_fixed_ranges(void *);
88extern void mtrr_save_state(void);
89extern int mtrr_add(unsigned long base, unsigned long size,
90 unsigned int type, bool increment);
91extern int mtrr_add_page(unsigned long base, unsigned long size,
92 unsigned int type, bool increment);
93extern int mtrr_del(int reg, unsigned long base, unsigned long size);
94extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
95extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
96extern void mtrr_ap_init(void);
97extern void mtrr_bp_init(void);
98extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
99extern int amd_special_default_mtrr(void);
100# else
101static inline u8 mtrr_type_lookup(u64 addr, u64 end)
102{
103 /*
104 * Return no-MTRRs:
105 */
106 return 0xff;
107}
108#define mtrr_save_fixed_ranges(arg) do {} while (0)
109#define mtrr_save_state() do {} while (0)
110static inline int mtrr_add(unsigned long base, unsigned long size,
111 unsigned int type, bool increment)
112{
113 return -ENODEV;
114}
115static inline int mtrr_add_page(unsigned long base, unsigned long size,
116 unsigned int type, bool increment)
117{
118 return -ENODEV;
119}
120static inline int mtrr_del(int reg, unsigned long base, unsigned long size)
121{
122 return -ENODEV;
123}
124static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size)
125{
126 return -ENODEV;
127}
128static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
129{
130 return 0;
131}
132static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
133{
134}
135
136#define mtrr_ap_init() do {} while (0)
137#define mtrr_bp_init() do {} while (0)
138# endif
139
140#ifdef CONFIG_COMPAT
141#include <linux/compat.h>
142
143struct mtrr_sentry32 {
144 compat_ulong_t base; /* Base address */
145 compat_uint_t size; /* Size of region */
146 compat_uint_t type; /* Type of region */
147};
148
149struct mtrr_gentry32 {
150 compat_ulong_t regnum; /* Register number */
151 compat_uint_t base; /* Base address */
152 compat_uint_t size; /* Size of region */
153 compat_uint_t type; /* Type of region */
154};
155
156#define MTRR_IOCTL_BASE 'M'
157
158#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
159#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
160#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
161#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
162#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
163#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
164#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
165#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
166#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
167#define MTRRIOC32_KILL_PAGE_ENTRY \
168 _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
169#endif /* CONFIG_COMPAT */
170
171#endif /* __KERNEL__ */
172
173#endif /* _ASM_X86_MTRR_H */
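
The ioctl interface above is reached from userspace through /proc/mtrr (see
Documentation/mtrr.txt). A minimal sketch; the base and size values are purely
illustrative, and the caller needs CAP_SYS_ADMIN:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mtrr.h>

int main(void)
{
	struct mtrr_sentry sentry = {
		.base = 0xd0000000,	/* e.g. a framebuffer aperture */
		.size = 0x01000000,	/* 16 MB */
		.type = MTRR_TYPE_WRCOMB,
	};
	int fd = open("/proc/mtrr", O_WRONLY);

	if (fd < 0 || ioctl(fd, MTRRIOC_ADD_ENTRY, &sentry) < 0) {
		perror("mtrr");
		return 1;
	}
	close(fd);
	return 0;
}
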
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
new file mode 100644
index 00000000000..a731b9c573a
--- /dev/null
+++ b/arch/x86/include/asm/mutex.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "mutex_32.h"
3#else
4# include "mutex_64.h"
5#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
new file mode 100644
index 00000000000..03f90c8a5a7
--- /dev/null
+++ b/arch/x86/include/asm/mutex_32.h
@@ -0,0 +1,125 @@
1/*
2 * Assembly implementation of the mutex fastpath, based on atomic
3 * decrement/increment.
4 *
5 * started by Ingo Molnar:
6 *
7 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
8 */
9#ifndef _ASM_X86_MUTEX_32_H
10#define _ASM_X86_MUTEX_32_H
11
12#include <asm/alternative.h>
13
14/**
15 * __mutex_fastpath_lock - try to take the lock by moving the count
16 * from 1 to a 0 value
17 * @count: pointer of type atomic_t
18 * @fail_fn: function to call if the original value was not 1
19 *
20 * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
21 * wasn't 1 originally. This function MUST leave the value lower than 1
22 * even when the "1" assertion wasn't true.
23 */
24#define __mutex_fastpath_lock(count, fail_fn) \
25do { \
26 unsigned int dummy; \
27 \
28 typecheck(atomic_t *, count); \
29 typecheck_fn(void (*)(atomic_t *), fail_fn); \
30 \
31 asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
32 " jns 1f \n" \
33 " call " #fail_fn "\n" \
34 "1:\n" \
35 : "=a" (dummy) \
36 : "a" (count) \
37 : "memory", "ecx", "edx"); \
38} while (0)
39
40
41/**
42 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
43 * from 1 to a 0 value
44 * @count: pointer of type atomic_t
45 * @fail_fn: function to call if the original value was not 1
46 *
47 * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
48 * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
49 * or anything the slow path function returns
50 */
51static inline int __mutex_fastpath_lock_retval(atomic_t *count,
52 int (*fail_fn)(atomic_t *))
53{
54 if (unlikely(atomic_dec_return(count) < 0))
55 return fail_fn(count);
56 else
57 return 0;
58}
59
60/**
61 * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
62 * @count: pointer of type atomic_t
63 * @fail_fn: function to call if the original value was not 0
64 *
65 * Try to promote the mutex from 0 to 1. If it wasn't 0, call <fail_fn>.
66 * In the failure case, this function is allowed to either set the value
67 * to 1, or to set it to a value lower than 1.
68 *
69 * If the implementation sets it to a value lower than 1, the
70 * __mutex_slowpath_needs_to_unlock() macro needs to return 1; otherwise
71 * it needs to return 0.
72 */
73#define __mutex_fastpath_unlock(count, fail_fn) \
74do { \
75 unsigned int dummy; \
76 \
77 typecheck(atomic_t *, count); \
78 typecheck_fn(void (*)(atomic_t *), fail_fn); \
79 \
80 asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
81 " jg 1f\n" \
82 " call " #fail_fn "\n" \
83 "1:\n" \
84 : "=a" (dummy) \
85 : "a" (count) \
86 : "memory", "ecx", "edx"); \
87} while (0)
88
89#define __mutex_slowpath_needs_to_unlock() 1
90
91/**
92 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
93 *
94 * @count: pointer of type atomic_t
95 * @fail_fn: fallback function
96 *
97 * Change the count from 1 to a value lower than 1, and return 0 (failure)
98 * if it wasn't 1 originally, or return 1 (success) otherwise. This function
99 * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
100 * Additionally, if the value was < 0 originally, this function must not
101 * set it to 0 on failure.
102 */
103static inline int __mutex_fastpath_trylock(atomic_t *count,
104 int (*fail_fn)(atomic_t *))
105{
106 /*
107 * We have two variants here. The cmpxchg based one is the best one
108 * because it never induces a false contention state. It is included
109 * here because architectures using the inc/dec algorithms over the
110 * xchg ones are much more likely to support cmpxchg natively.
111 *
112 * If not, we fall back to the spinlock-based variant - that is
113 * just as efficient (and simpler) as a 'destructive' probing of
114 * the mutex state would be.
115 */
116#ifdef __HAVE_ARCH_CMPXCHG
117 if (likely(atomic_cmpxchg(count, 1, 0) == 1))
118 return 1;
119 return 0;
120#else
121 return fail_fn(count);
122#endif
123}
124
125#endif /* _ASM_X86_MUTEX_32_H */
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
new file mode 100644
index 00000000000..68a87b0f8e2
--- /dev/null
+++ b/arch/x86/include/asm/mutex_64.h
@@ -0,0 +1,100 @@
1/*
2 * Assembly implementation of the mutex fastpath, based on atomic
3 * decrement/increment.
4 *
5 * started by Ingo Molnar:
6 *
7 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
8 */
9#ifndef _ASM_X86_MUTEX_64_H
10#define _ASM_X86_MUTEX_64_H
11
12/**
13 * __mutex_fastpath_lock - decrement and call function if negative
14 * @v: pointer of type atomic_t
15 * @fail_fn: function to call if the result is negative
16 *
17 * Atomically decrements @v and calls <fail_fn> if the result is negative.
18 */
19#define __mutex_fastpath_lock(v, fail_fn) \
20do { \
21 unsigned long dummy; \
22 \
23 typecheck(atomic_t *, v); \
24 typecheck_fn(void (*)(atomic_t *), fail_fn); \
25 \
26 asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
27 " jns 1f \n" \
28 " call " #fail_fn "\n" \
29 "1:" \
30 : "=D" (dummy) \
31 : "D" (v) \
32 : "rax", "rsi", "rdx", "rcx", \
33 "r8", "r9", "r10", "r11", "memory"); \
34} while (0)
35
36/**
37 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
38 * from 1 to a 0 value
39 * @count: pointer of type atomic_t
40 * @fail_fn: function to call if the original value was not 1
41 *
42 * Change the count from 1 to a value lower than 1, and call <fail_fn> if
43 * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
44 * or anything the slow path function returns
45 */
46static inline int __mutex_fastpath_lock_retval(atomic_t *count,
47 int (*fail_fn)(atomic_t *))
48{
49 if (unlikely(atomic_dec_return(count) < 0))
50 return fail_fn(count);
51 else
52 return 0;
53}
54
55/**
56 * __mutex_fastpath_unlock - increment and call function if nonpositive
57 * @v: pointer of type atomic_t
58 * @fail_fn: function to call if the result is nonpositive
59 *
60 * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
61 */
62#define __mutex_fastpath_unlock(v, fail_fn) \
63do { \
64 unsigned long dummy; \
65 \
66 typecheck(atomic_t *, v); \
67 typecheck_fn(void (*)(atomic_t *), fail_fn); \
68 \
69 asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
70 " jg 1f\n" \
71 " call " #fail_fn "\n" \
72 "1:" \
73 : "=D" (dummy) \
74 : "D" (v) \
75 : "rax", "rsi", "rdx", "rcx", \
76 "r8", "r9", "r10", "r11", "memory"); \
77} while (0)
78
79#define __mutex_slowpath_needs_to_unlock() 1
80
81/**
82 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
83 *
84 * @count: pointer of type atomic_t
85 * @fail_fn: fallback function
86 *
87 * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
88 * if it wasn't 1 originally. [the fallback function is never used on
89 * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
90 */
91static inline int __mutex_fastpath_trylock(atomic_t *count,
92 int (*fail_fn)(atomic_t *))
93{
94 if (likely(atomic_cmpxchg(count, 1, 0) == 1))
95 return 1;
96 else
97 return 0;
98}
99
100#endif /* _ASM_X86_MUTEX_64_H */
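
Both mutex fastpath headers above implement the same counter protocol: 1 is
unlocked, 0 is locked, and anything below 0 is locked with possible waiters.
The following userspace-only model (GCC __atomic builtins, not the kernel
implementation) illustrates why the cmpxchg trylock never creates false
contention while the decrement-based lock path deliberately does:

/* Userspace model of the mutex count protocol: 1 = unlocked,
 * 0 = locked, <0 = locked with possible waiters. Illustrative only. */
#include <assert.h>

static int count = 1;

static int trylock(void)
{
	/* cmpxchg variant: succeeds only on an exact 1 -> 0 transition,
	 * so a failed attempt leaves the count untouched. */
	int expected = 1;
	return __atomic_compare_exchange_n(&count, &expected, 0, 0,
					   __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

static int lock_fastpath(void)
{
	/* decl-based variant: a failed attempt drives the count below 0,
	 * which is what forces the unlocker into the slowpath. */
	return __atomic_sub_fetch(&count, 1, __ATOMIC_ACQUIRE) >= 0;
}

int main(void)
{
	assert(trylock());		/* 1 -> 0: acquired          */
	assert(!trylock());		/* count stays 0: no waiters */
	assert(!lock_fastpath());	/* 0 -> -1: slowpath needed  */
	return 0;
}
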
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
new file mode 100644
index 00000000000..c45a0a568df
--- /dev/null
+++ b/arch/x86/include/asm/nmi.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_NMI_H
2#define _ASM_X86_NMI_H
3
4#include <linux/pm.h>
5#include <asm/irq.h>
6#include <asm/io.h>
7
8#ifdef ARCH_HAS_NMI_WATCHDOG
9
10/**
11 * do_nmi_callback
12 *
13 * Check to see if a callback exists and execute it. Return 1
14 * if the handler exists and was handled successfully.
15 */
16int do_nmi_callback(struct pt_regs *regs, int cpu);
17
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int);
26extern void release_evntsel_nmi(unsigned int);
27
28extern void setup_apic_nmi_watchdog(void *);
29extern void stop_apic_nmi_watchdog(void *);
30extern void disable_timer_nmi_watchdog(void);
31extern void enable_timer_nmi_watchdog(void);
32extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
33extern void cpu_nmi_set_wd_enabled(void);
34
35extern atomic_t nmi_active;
36extern unsigned int nmi_watchdog;
37#define NMI_NONE 0
38#define NMI_IO_APIC 1
39#define NMI_LOCAL_APIC 2
40#define NMI_INVALID 3
41
42struct ctl_table;
43struct file;
44extern int proc_nmi_enabled(struct ctl_table *, int, struct file *,
45 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic;
47
48void __trigger_all_cpu_backtrace(void);
49#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
50
51static inline void localise_nmi_watchdog(void)
52{
53 if (nmi_watchdog == NMI_IO_APIC)
54 nmi_watchdog = NMI_LOCAL_APIC;
55}
56
57/* check if nmi_watchdog is active (ie was specified at boot) */
58static inline int nmi_watchdog_active(void)
59{
60 /*
61 * actually it should be:
62 * return (nmi_watchdog == NMI_LOCAL_APIC ||
63 * nmi_watchdog == NMI_IO_APIC)
64 * but since they are powers of two we can use a
65 * cheaper way --cvg
66 */
67 return nmi_watchdog & 0x3;
68}
69#endif
70
71void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void);
79void restart_nmi(void);
80
81#endif /* _ASM_X86_NMI_H */
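
A standalone check of the mask trick used by nmi_watchdog_active() above:
because the two active modes are distinct single bits, one AND replaces two
equality tests. Note that NMI_INVALID (3) would also satisfy the mask, so the
shortcut assumes that value is never left in nmi_watchdog:

#include <assert.h>

#define NMI_NONE	0
#define NMI_IO_APIC	1
#define NMI_LOCAL_APIC	2

int main(void)
{
	assert(!(NMI_NONE & 0x3));	/* inactive                   */
	assert(NMI_IO_APIC & 0x3);	/* same as (wd == 1 || wd == 2) */
	assert(NMI_LOCAL_APIC & 0x3);
	return 0;
}
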
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
new file mode 100644
index 00000000000..ad2668ee1aa
--- /dev/null
+++ b/arch/x86/include/asm/nops.h
@@ -0,0 +1,118 @@
1#ifndef _ASM_X86_NOPS_H
2#define _ASM_X86_NOPS_H
3
4/* Define nops for use with alternative() */
5
6/* generic versions from gas
7 1: nop
8 the following instructions are NOT nops in 64-bit mode,
9 for 64-bit mode use K8 or P6 nops instead
10 2: movl %esi,%esi
11 3: leal 0x00(%esi),%esi
12 4: leal 0x00(,%esi,1),%esi
13 6: leal 0x00000000(%esi),%esi
14 7: leal 0x00000000(,%esi,1),%esi
15*/
16#define GENERIC_NOP1 ".byte 0x90\n"
17#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
18#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
19#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
20#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
21#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
22#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
23#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
24
25/* Opteron 64bit nops
26 1: nop
27 2: osp nop
28 3: osp osp nop
29 4: osp osp osp nop
30*/
31#define K8_NOP1 GENERIC_NOP1
32#define K8_NOP2 ".byte 0x66,0x90\n"
33#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
34#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
35#define K8_NOP5 K8_NOP3 K8_NOP2
36#define K8_NOP6 K8_NOP3 K8_NOP3
37#define K8_NOP7 K8_NOP4 K8_NOP3
38#define K8_NOP8 K8_NOP4 K8_NOP4
39
40/* K7 nops
41 uses eax dependencies (arbitrary choice)
42 1: nop
43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax
45 4: leal 0x00(,%eax,1),%eax
46 6: leal 0x00000000(%eax),%eax
47 7: leal 0x00000000(,%eax,1),%eax
48*/
49#define K7_NOP1 GENERIC_NOP1
50#define K7_NOP2 ".byte 0x8b,0xc0\n"
51#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
52#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
53#define K7_NOP5 K7_NOP4 ASM_NOP1
54#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
55#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
56#define K7_NOP8 K7_NOP7 ASM_NOP1
57
58/* P6 nops
59 uses eax dependencies (Intel-recommended choice)
60 1: nop
61 2: osp nop
62 3: nopl (%eax)
63 4: nopl 0x00(%eax)
64 5: nopl 0x00(%eax,%eax,1)
65 6: osp nopl 0x00(%eax,%eax,1)
66 7: nopl 0x00000000(%eax)
67 8: nopl 0x00000000(%eax,%eax,1)
68*/
69#define P6_NOP1 GENERIC_NOP1
70#define P6_NOP2 ".byte 0x66,0x90\n"
71#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
72#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
73#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
74#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
75#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
76#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
77
78#if defined(CONFIG_MK7)
79#define ASM_NOP1 K7_NOP1
80#define ASM_NOP2 K7_NOP2
81#define ASM_NOP3 K7_NOP3
82#define ASM_NOP4 K7_NOP4
83#define ASM_NOP5 K7_NOP5
84#define ASM_NOP6 K7_NOP6
85#define ASM_NOP7 K7_NOP7
86#define ASM_NOP8 K7_NOP8
87#elif defined(CONFIG_X86_P6_NOP)
88#define ASM_NOP1 P6_NOP1
89#define ASM_NOP2 P6_NOP2
90#define ASM_NOP3 P6_NOP3
91#define ASM_NOP4 P6_NOP4
92#define ASM_NOP5 P6_NOP5
93#define ASM_NOP6 P6_NOP6
94#define ASM_NOP7 P6_NOP7
95#define ASM_NOP8 P6_NOP8
96#elif defined(CONFIG_X86_64)
97#define ASM_NOP1 K8_NOP1
98#define ASM_NOP2 K8_NOP2
99#define ASM_NOP3 K8_NOP3
100#define ASM_NOP4 K8_NOP4
101#define ASM_NOP5 K8_NOP5
102#define ASM_NOP6 K8_NOP6
103#define ASM_NOP7 K8_NOP7
104#define ASM_NOP8 K8_NOP8
105#else
106#define ASM_NOP1 GENERIC_NOP1
107#define ASM_NOP2 GENERIC_NOP2
108#define ASM_NOP3 GENERIC_NOP3
109#define ASM_NOP4 GENERIC_NOP4
110#define ASM_NOP5 GENERIC_NOP5
111#define ASM_NOP6 GENERIC_NOP6
112#define ASM_NOP7 GENERIC_NOP7
113#define ASM_NOP8 GENERIC_NOP8
114#endif
115
116#define ASM_NOP_MAX 8
117
118#endif /* _ASM_X86_NOPS_H */
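
A usage sketch for the macros above (the helper name is invented): padding
with ASM_NOP5 reserves five bytes of the best nop encoding for the configured
CPU, which the alternatives mechanism mentioned at the top of the header can
later overwrite with, for example, a 5-byte call or jmp:

#include <asm/nops.h>

/* Hypothetical patch site: five bytes of nops, chosen per CPU type. */
static inline void nop5_patch_site(void)
{
	asm volatile(ASM_NOP5);
}
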
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
new file mode 100644
index 00000000000..27da400d313
--- /dev/null
+++ b/arch/x86/include/asm/numa.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "numa_32.h"
3#else
4# include "numa_64.h"
5#endif
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
new file mode 100644
index 00000000000..e9f5db79624
--- /dev/null
+++ b/arch/x86/include/asm/numa_32.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H
3
4extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu);
6
7#ifdef CONFIG_NUMA
8extern void set_highmem_pages_init(void);
9#endif
10
11#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
new file mode 100644
index 00000000000..064ed6df4cb
--- /dev/null
+++ b/arch/x86/include/asm/numa_64.h
@@ -0,0 +1,43 @@
1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H
3
4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6
7struct bootnode {
8 u64 start;
9 u64 end;
10};
11
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16
17extern void numa_init_array(void);
18extern int numa_off;
19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC];
24
25extern unsigned long numa_free_all_bootmem(void);
26extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end);
28
29#ifdef CONFIG_NUMA
30extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu);
33extern void __cpuinit numa_add_cpu(int cpu);
34extern void __cpuinit numa_remove_cpu(int cpu);
35#else
36static inline void init_cpu_to_node(void) { }
37static inline void numa_set_node(int cpu, int node) { }
38static inline void numa_clear_node(int cpu) { }
39static inline void numa_add_cpu(int cpu) { }
40static inline void numa_remove_cpu(int cpu) { }
41#endif
42
43#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
new file mode 100644
index 00000000000..1e8bd30b4c1
--- /dev/null
+++ b/arch/x86/include/asm/numaq.h
@@ -0,0 +1,169 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#ifndef _ASM_X86_NUMAQ_H
27#define _ASM_X86_NUMAQ_H
28
29#ifdef CONFIG_X86_NUMAQ
30
31extern int found_numaq;
32extern int get_memcfg_numaq(void);
33
34/*
35 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
36 */
37#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
38 quad space */
39
40/*
41 * Communication area for each processor on lynxer-processor tests.
42 *
43 * NOTE: If you change the size of this eachproc structure you need
44 * to change the definition for EACH_QUAD_SIZE.
45 */
46struct eachquadmem {
47 unsigned int priv_mem_start; /* Starting address of this */
48 /* quad's private memory. */
49 /* This is always 0. */
50 /* In MB. */
51 unsigned int priv_mem_size; /* Size of this quad's */
52 /* private memory. */
53 /* In MB. */
54 unsigned int low_shrd_mem_strp_start;/* Starting address of this */
55 /* quad's low shared block */
56 /* (untranslated). */
57 /* In MB. */
58 unsigned int low_shrd_mem_start; /* Starting address of this */
59 /* quad's low shared memory */
60 /* (untranslated). */
61 /* In MB. */
62 unsigned int low_shrd_mem_size; /* Size of this quad's low */
63 /* shared memory. */
64 /* In MB. */
65 unsigned int lmmio_copb_start; /* Starting address of this */
66 /* quad's local memory */
67 /* mapped I/O in the */
68 /* compatibility OPB. */
69 /* In MB. */
70 unsigned int lmmio_copb_size; /* Size of this quad's local */
71 /* memory mapped I/O in the */
72 /* compatibility OPB. */
73 /* In MB. */
74 unsigned int lmmio_nopb_start; /* Starting address of this */
75 /* quad's local memory */
76 /* mapped I/O in the */
77 /* non-compatibility OPB. */
78 /* In MB. */
79 unsigned int lmmio_nopb_size; /* Size of this quad's local */
80 /* memory mapped I/O in the */
81 /* non-compatibility OPB. */
82 /* In MB. */
83 unsigned int io_apic_0_start; /* Starting address of I/O */
84 /* APIC 0. */
85 unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
86 unsigned int io_apic_1_start; /* Starting address of I/O */
87 /* APIC 1. */
88 unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
89 unsigned int hi_shrd_mem_start; /* Starting address of this */
90 /* quad's high shared memory.*/
91 /* In MB. */
92 unsigned int hi_shrd_mem_size; /* Size of this quad's high */
93 /* shared memory. */
94 /* In MB. */
95 unsigned int mps_table_addr; /* Address of this quad's */
96 /* MPS tables from BIOS, */
97 /* in system space.*/
98 unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
99 /* local access of MDC. */
100 unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
101 /* remote access of MDC. */
102 unsigned int mm_port_io_start; /* Starting address of this */
103 /* quad's memory mapped Port */
104 /* I/O space. */
105 unsigned int mm_port_io_size; /* Size of this quad's memory*/
106 /* mapped Port I/O space. */
107 unsigned int mm_rmt_io_apic_start; /* Starting address of this */
108 /* quad's memory mapped */
109 /* remote I/O APIC space. */
110 unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
111 /* mapped remote I/O APIC */
112 /* space. */
113 unsigned int mm_isa_start; /* Starting address of this */
114 /* quad's memory mapped ISA */
115 /* space (contains MDC */
116 /* memory space). */
117 unsigned int mm_isa_size; /* Size of this quad's memory*/
118 /* mapped ISA space (contains*/
119 /* MDC memory space). */
120 unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
121 unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
122};
123
124/*
125 * Note: This structure must NOT be changed unless the multiproc and
126 * OS are changed to reflect the new structure.
127 */
128struct sys_cfg_data {
129 unsigned int quad_id;
130 unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
131 unsigned int scd_version; /* Version number of this table. */
132 unsigned int first_quad_id;
133 unsigned int quads_present31_0; /* 1 bit for each quad */
134 unsigned int quads_present63_32; /* 1 bit for each quad */
135 unsigned int config_flags;
136 unsigned int boot_flags;
137 unsigned int csr_start_addr; /* Absolute value (not in MB) */
138 unsigned int csr_size; /* Absolute value (not in MB) */
139 unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
140 unsigned int lcl_apic_size; /* Absolute value (not in MB) */
141 unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
142 unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
143 /* may not be totally populated */
144 unsigned int split_mem_enbl; /* 0 for no low shared memory */
145 unsigned int mmio_sz; /* Size of total system memory mapped I/O */
146 /* (in MB). */
147 unsigned int quad_spin_lock; /* Spare location used for quad */
148 /* bringup. */
149 unsigned int nonzero55; /* For checksumming. */
150 unsigned int nonzeroaa; /* For checksumming. */
151 unsigned int scd_magic_number;
152 unsigned int system_type;
153 unsigned int checksum;
154 /*
155 * memory configuration area for each quad
156 */
157 struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
158};
159
160void numaq_tsc_disable(void);
161
162#else
163static inline int get_memcfg_numaq(void)
164{
165 return 0;
166}
167#endif /* CONFIG_X86_NUMAQ */
168#endif /* _ASM_X86_NUMAQ_H */
169
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
new file mode 100644
index 00000000000..0bf2a06b7a4
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -0,0 +1,136 @@
1#ifndef __ASM_NUMAQ_APIC_H
2#define __ASM_NUMAQ_APIC_H
3
4#include <asm/io.h>
5#include <linux/mmzone.h>
6#include <linux/nodemask.h>
7
8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
9
10static inline cpumask_t target_cpus(void)
11{
12 return CPU_MASK_ALL;
13}
14
15#define NO_BALANCE_IRQ (1)
16#define esr_disable (1)
17
18#define INT_DELIVERY_MODE dest_LowestPrio
19#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
20
21static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
22{
23 return physid_isset(apicid, bitmap);
24}
25static inline unsigned long check_apicid_present(int bit)
26{
27 return physid_isset(bit, phys_cpu_present_map);
28}
29#define apicid_cluster(apicid) (apicid & 0xF0)
30
31static inline int apic_id_registered(void)
32{
33 return 1;
34}
35
36static inline void init_apic_ldr(void)
37{
38 /* Already done in NUMA-Q firmware */
39}
40
41static inline void setup_apic_routing(void)
42{
43 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
44 "NUMA-Q", nr_ioapics);
45}
46
47/*
48 * Skip adding the timer int on secondary nodes, which causes
49 * a small but painful rift in the time-space continuum.
50 */
51static inline int multi_timer_check(int apic, int irq)
52{
53 return apic != 0 && irq == 0;
54}
55
56static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
57{
58 /* We don't have a good way to do this yet - hack */
59 return physids_promote(0xFUL);
60}
61
62/* Mapping from cpu number to logical apicid */
63extern u8 cpu_2_logical_apicid[];
64static inline int cpu_to_logical_apicid(int cpu)
65{
66 if (cpu >= NR_CPUS)
67 return BAD_APICID;
68 return (int)cpu_2_logical_apicid[cpu];
69}
70
71/*
72 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
73 * cpu to APIC ID relation to properly interact with the intelligent
74 * mode of the cluster controller.
75 */
76static inline int cpu_present_to_apicid(int mps_cpu)
77{
78 if (mps_cpu < 60)
79 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
80 else
81 return BAD_APICID;
82}
83
84static inline int apicid_to_node(int logical_apicid)
85{
86 return logical_apicid >> 4;
87}
88
89static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
90{
91 int node = apicid_to_node(logical_apicid);
92 int cpu = __ffs(logical_apicid & 0xf);
93
94 return physid_mask_of_physid(cpu + 4*node);
95}
96
97extern void *xquad_portio;
98
99static inline void setup_portio_remap(void)
100{
101 int num_quads = num_online_nodes();
102
103 if (num_quads <= 1)
104 return;
105
106 printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
107 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
108 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
109 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
110}
111
112static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
113{
114 return (1);
115}
116
117static inline void enable_apic_mode(void)
118{
119}
120
121/*
122 * We use physical apicids here, not logical, so just return the default
123 * physical broadcast to stop people from breaking us
124 */
125static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
126{
127 return (int) 0xF;
128}
129
130/* No NUMA-Q box has an HT CPU, but it can't hurt to use the default code. */
131static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
132{
133 return cpuid_apic >> index_msb;
134}
135
136#endif /* __ASM_NUMAQ_APIC_H */
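
To make the cpu_present_to_apicid() encoding above concrete, a standalone
check with arbitrary sample CPU numbers: the quad number lands in the high
nibble and a one-hot CPU bit in the low nibble, which is exactly what
apicid_to_node() and apicid_to_cpu_present() invert:

#include <assert.h>

static int cpu_present_to_apicid(int mps_cpu)
{
	return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
}

int main(void)
{
	assert(cpu_present_to_apicid(0) == 0x01);	/* quad 0, cpu 0 */
	assert(cpu_present_to_apicid(3) == 0x08);	/* quad 0, cpu 3 */
	assert(cpu_present_to_apicid(5) == 0x12);	/* quad 1, cpu 1 */
	return 0;
}
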
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
new file mode 100644
index 00000000000..e012a46cc22
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apicdef.h
@@ -0,0 +1,14 @@
1#ifndef __ASM_NUMAQ_APICDEF_H
2#define __ASM_NUMAQ_APICDEF_H
3
4
5#define APIC_ID_MASK (0xF<<24)
6
7static inline unsigned get_apic_id(unsigned long x)
8{
9 return (((x)>>24)&0x0F);
10}
11
12#define GET_APIC_ID(x) get_apic_id(x)
13
14#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
new file mode 100644
index 00000000000..935588d286c
--- /dev/null
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_NUMAQ_IPI_H
2#define __ASM_NUMAQ_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
new file mode 100644
index 00000000000..252292e077b
--- /dev/null
+++ b/arch/x86/include/asm/numaq/mpparse.h
@@ -0,0 +1,7 @@
1#ifndef __ASM_NUMAQ_MPPARSE_H
2#define __ASM_NUMAQ_MPPARSE_H
3
4extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
5 char *productid);
6
7#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
new file mode 100644
index 00000000000..c577bda5b1c
--- /dev/null
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -0,0 +1,43 @@
1#ifndef __ASM_NUMAQ_WAKECPU_H
2#define __ASM_NUMAQ_WAKECPU_H
3
4/* This file copes with machines that wake up secondary CPUs by NMIs */
5
6#define WAKE_SECONDARY_VIA_NMI
7
8#define TRAMPOLINE_LOW phys_to_virt(0x8)
9#define TRAMPOLINE_HIGH phys_to_virt(0xa)
10
11#define boot_cpu_apicid boot_cpu_logical_apicid
12
13/* We don't do anything here because we use NMI's to boot instead */
14static inline void wait_for_init_deassert(atomic_t *deassert)
15{
16}
17
18/*
19 * Because we use NMIs rather than the INIT-STARTUP sequence to
20 * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
21 */
22static inline void smp_callin_clear_local_apic(void)
23{
24 clear_local_APIC();
25}
26
27static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
28{
29 printk("Storing NMI vector\n");
30 *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
31 *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
32}
33
34static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
35{
36 printk("Restoring NMI vector\n");
37 *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
38 *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
39}
40
41#define inquire_remote_apic(apicid) {}
42
43#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 00000000000..834a30295fa
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,132 @@
1/* OLPC machine specific definitions */
2
3#ifndef _ASM_X86_OLPC_H
4#define _ASM_X86_OLPC_H
5
6#include <asm/geode.h>
7
8struct olpc_platform_t {
9 int flags;
10 uint32_t boardrev;
11 int ecver;
12};
13
14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02
16#define OLPC_F_VSA 0x04
17
18#ifdef CONFIG_OLPC
19
20extern struct olpc_platform_t olpc_platform_info;
21
22/*
23 * OLPC board IDs contain the major build number within the mask 0x0ff0,
24 * and the minor build number within 0x000f. Pre-builds have a minor
25 * number less than 8, and normal builds start at 8. For example, 0x0B10
26 * is a PreB1, and 0x0C18 is a C1.
27 */
28
29static inline uint32_t olpc_board(uint8_t id)
30{
31 return (id << 4) | 0x8;
32}
33
34static inline uint32_t olpc_board_pre(uint8_t id)
35{
36 return id << 4;
37}
38
39static inline int machine_is_olpc(void)
40{
41 return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
42}
43
44/*
45 * The DCON is OLPC's Display Controller. It has a number of unique
46 * features that we might want to take advantage of..
47 */
48static inline int olpc_has_dcon(void)
49{
50 return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
51}
52
53/*
54 * The VSA is software from AMD that typical Geode BIOSes will include.
55 * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
56 * not include the VSA; instead, PCI is emulated by the kernel.
57 *
58 * The VSA is described further in arch/x86/pci/olpc.c.
59 */
60static inline int olpc_has_vsa(void)
61{
62 return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
63}
64
65/*
66 * The "Mass Production" version of OLPC's XO is identified as being model
67 * C2. During the prototype phase, the following models (in chronological
68 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
69 * were based on Geode GX CPUs, and models after that were based upon
70 * Geode LX CPUs. There were also some hand-assembled models floating
71 * around, referred to as PreB1, PreB2, etc.
72 */
73static inline int olpc_board_at_least(uint32_t rev)
74{
75 return olpc_platform_info.boardrev >= rev;
76}
77
78#else
79
80static inline int machine_is_olpc(void)
81{
82 return 0;
83}
84
85static inline int olpc_has_dcon(void)
86{
87 return 0;
88}
89
90static inline int olpc_has_vsa(void)
91{
92 return 0;
93}
94
95#endif
96
97/* EC related functions */
98
99extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
100 unsigned char *outbuf, size_t outlen);
101
102extern int olpc_ec_mask_set(uint8_t bits);
103extern int olpc_ec_mask_unset(uint8_t bits);
104
105/* EC commands */
106
107#define EC_FIRMWARE_REV 0x08
108
109/* SCI source values */
110
111#define EC_SCI_SRC_EMPTY 0x00
112#define EC_SCI_SRC_GAME 0x01
113#define EC_SCI_SRC_BATTERY 0x02
114#define EC_SCI_SRC_BATSOC 0x04
115#define EC_SCI_SRC_BATERR 0x08
116#define EC_SCI_SRC_EBOOK 0x10
117#define EC_SCI_SRC_WLAN 0x20
118#define EC_SCI_SRC_ACPWR 0x40
119#define EC_SCI_SRC_ALL 0x7F
120
121/* GPIO assignments */
122
123#define OLPC_GPIO_MIC_AC geode_gpio(1)
124#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
125#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
126#define OLPC_GPIO_SMB_CLK geode_gpio(14)
127#define OLPC_GPIO_SMB_DATA geode_gpio(15)
128#define OLPC_GPIO_WORKAUX geode_gpio(24)
129#define OLPC_GPIO_LID geode_gpio(26)
130#define OLPC_GPIO_ECSCI geode_gpio(27)
131
132#endif /* _ASM_X86_OLPC_H */
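
A standalone check of the board-revision encoding described in the header
comment above, reproducing its two worked examples (0x0B10 is a PreB1 and
0x0C18 is a C1):

#include <assert.h>
#include <stdint.h>

static uint32_t olpc_board(uint8_t id)     { return (id << 4) | 0x8; }
static uint32_t olpc_board_pre(uint8_t id) { return id << 4; }

int main(void)
{
	assert(olpc_board(0xc1) == 0x0c18);	/* first normal C1 build */
	assert(olpc_board_pre(0xb1) == 0x0b10);	/* PreB1 */
	return 0;
}
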
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
new file mode 100644
index 00000000000..e9873a2e869
--- /dev/null
+++ b/arch/x86/include/asm/page.h
@@ -0,0 +1,209 @@
1#ifndef _ASM_X86_PAGE_H
2#define _ASM_X86_PAGE_H
3
4#include <linux/const.h>
5
6/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10
11#ifdef __KERNEL__
12
13#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
14#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
15
16/* Cast PAGE_MASK to a signed type so that it is sign-extended if
17 virtual addresses are 32-bits but physical addresses are larger
18 (ie, 32-bit PAE). */
19#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
20
21/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
22#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
23
24/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
26
27#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
28#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
29
30#define HPAGE_SHIFT PMD_SHIFT
31#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
32#define HPAGE_MASK (~(HPAGE_SIZE - 1))
33#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
34
35#define HUGE_MAX_HSTATE 2
36
37#ifndef __ASSEMBLY__
38#include <linux/types.h>
39#endif
40
41#ifdef CONFIG_X86_64
42#include <asm/page_64.h>
43#else
44#include <asm/page_32.h>
45#endif /* CONFIG_X86_64 */
46
47#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
48
49#define VM_DATA_DEFAULT_FLAGS \
50 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
51 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
52
53
54#ifndef __ASSEMBLY__
55
56typedef struct { pgdval_t pgd; } pgd_t;
57typedef struct { pgprotval_t pgprot; } pgprot_t;
58
59extern int page_is_ram(unsigned long pagenr);
60extern int pagerange_is_ram(unsigned long start, unsigned long end);
61extern int devmem_is_allowed(unsigned long pagenr);
62extern void map_devmem(unsigned long pfn, unsigned long size,
63 pgprot_t vma_prot);
64extern void unmap_devmem(unsigned long pfn, unsigned long size,
65 pgprot_t vma_prot);
66
67extern unsigned long max_low_pfn_mapped;
68extern unsigned long max_pfn_mapped;
69
70struct page;
71
72static inline void clear_user_page(void *page, unsigned long vaddr,
73 struct page *pg)
74{
75 clear_page(page);
76}
77
78static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
79 struct page *topage)
80{
81 copy_page(to, from);
82}
83
84#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
85 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
86#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
87
88static inline pgd_t native_make_pgd(pgdval_t val)
89{
90 return (pgd_t) { val };
91}
92
93static inline pgdval_t native_pgd_val(pgd_t pgd)
94{
95 return pgd.pgd;
96}
97
98#if PAGETABLE_LEVELS >= 3
99#if PAGETABLE_LEVELS == 4
100typedef struct { pudval_t pud; } pud_t;
101
102static inline pud_t native_make_pud(pmdval_t val)
103{
104 return (pud_t) { val };
105}
106
107static inline pudval_t native_pud_val(pud_t pud)
108{
109 return pud.pud;
110}
111#else /* PAGETABLE_LEVELS == 3 */
112#include <asm-generic/pgtable-nopud.h>
113
114static inline pudval_t native_pud_val(pud_t pud)
115{
116 return native_pgd_val(pud.pgd);
117}
118#endif /* PAGETABLE_LEVELS == 4 */
119
120typedef struct { pmdval_t pmd; } pmd_t;
121
122static inline pmd_t native_make_pmd(pmdval_t val)
123{
124 return (pmd_t) { val };
125}
126
127static inline pmdval_t native_pmd_val(pmd_t pmd)
128{
129 return pmd.pmd;
130}
131#else /* PAGETABLE_LEVELS == 2 */
132#include <asm-generic/pgtable-nopmd.h>
133
134static inline pmdval_t native_pmd_val(pmd_t pmd)
135{
136 return native_pgd_val(pmd.pud.pgd);
137}
138#endif /* PAGETABLE_LEVELS >= 3 */
139
140static inline pte_t native_make_pte(pteval_t val)
141{
142 return (pte_t) { .pte = val };
143}
144
145static inline pteval_t native_pte_val(pte_t pte)
146{
147 return pte.pte;
148}
149
150static inline pteval_t native_pte_flags(pte_t pte)
151{
152 return native_pte_val(pte) & PTE_FLAGS_MASK;
153}
154
155#define pgprot_val(x) ((x).pgprot)
156#define __pgprot(x) ((pgprot_t) { (x) } )
157
158#ifdef CONFIG_PARAVIRT
159#include <asm/paravirt.h>
160#else /* !CONFIG_PARAVIRT */
161
162#define pgd_val(x) native_pgd_val(x)
163#define __pgd(x) native_make_pgd(x)
164
165#ifndef __PAGETABLE_PUD_FOLDED
166#define pud_val(x) native_pud_val(x)
167#define __pud(x) native_make_pud(x)
168#endif
169
170#ifndef __PAGETABLE_PMD_FOLDED
171#define pmd_val(x) native_pmd_val(x)
172#define __pmd(x) native_make_pmd(x)
173#endif
174
175#define pte_val(x) native_pte_val(x)
176#define pte_flags(x) native_pte_flags(x)
177#define __pte(x) native_make_pte(x)
178
179#endif /* CONFIG_PARAVIRT */
180
181#define __pa(x) __phys_addr((unsigned long)(x))
182#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
183/* __pa_symbol should be used for C-visible symbols.
184 This seems to be the official gcc-blessed way to do such arithmetic. */
185#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
186
187#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
188
189#define __boot_va(x) __va(x)
190#define __boot_pa(x) __pa(x)
191
192/*
193 * virt_to_page(kaddr) returns a valid pointer if and only if
194 * virt_addr_valid(kaddr) returns true.
195 */
196#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
197#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
198extern bool __virt_addr_valid(unsigned long kaddr);
199#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
200
201#endif /* __ASSEMBLY__ */
202
203#include <asm-generic/memory_model.h>
204#include <asm-generic/page.h>
205
206#define __HAVE_ARCH_GATE_AREA 1
207
208#endif /* __KERNEL__ */
209#endif /* _ASM_X86_PAGE_H */
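
The one-member struct wrappers above (pgd_t, pud_t, pmd_t, pte_t) exist purely for C type checking: boxing the raw integer in a struct gives each pagetable level a distinct type, so a pteval_t cannot silently be used where a pgdval_t is expected, while the accessor pairs (native_make_*/native_*_val) remain the only way in and out. A minimal standalone sketch of the same technique, with hypothetical names (nothing below is from the header itself):

#include <stdint.h>

typedef uint64_t exval_t;

/* One-member struct: same size and ABI as the raw value, but a
 * distinct type, so levels cannot be mixed without going through
 * the accessors. */
typedef struct { exval_t v; } ex_entry_t;

static inline ex_entry_t ex_make_entry(exval_t val)
{
	return (ex_entry_t) { val };
}

static inline exval_t ex_entry_val(ex_entry_t e)
{
	return e.v;
}

int main(void)
{
	ex_entry_t e = ex_make_entry(0x1000 | 0x3);

	/* ex_entry_val() is the only way back to the raw bits. */
	return ex_entry_val(e) == (0x1000 | 0x3) ? 0 : 1;
}
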
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
new file mode 100644
index 00000000000..bcde0d7b432
--- /dev/null
+++ b/arch/x86/include/asm/page_32.h
@@ -0,0 +1,136 @@
1#ifndef _ASM_X86_PAGE_32_H
2#define _ASM_X86_PAGE_32_H
3
4/*
5 * This handles the memory map.
6 *
7 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
8 * a virtual address space of one gigabyte, which limits the
9 * amount of physical memory you can use to about 950MB.
10 *
11 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
12 * and CONFIG_HIGHMEM64G options in the kernel configuration.
13 */
14#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
15
16#ifdef CONFIG_4KSTACKS
17#define THREAD_ORDER 0
18#else
19#define THREAD_ORDER 1
20#endif
21#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22
23#define STACKFAULT_STACK 0
24#define DOUBLEFAULT_STACK 1
25#define NMI_STACK 0
26#define DEBUG_STACK 0
27#define MCE_STACK 0
28#define N_EXCEPTION_STACKS 1
29
30#ifdef CONFIG_X86_PAE
31/* 44=32+12, the limit we can fit into an unsigned long pfn */
32#define __PHYSICAL_MASK_SHIFT 44
33#define __VIRTUAL_MASK_SHIFT 32
34#define PAGETABLE_LEVELS 3
35
36#ifndef __ASSEMBLY__
37typedef u64 pteval_t;
38typedef u64 pmdval_t;
39typedef u64 pudval_t;
40typedef u64 pgdval_t;
41typedef u64 pgprotval_t;
42
43typedef union {
44 struct {
45 unsigned long pte_low, pte_high;
46 };
47 pteval_t pte;
48} pte_t;
49#endif /* __ASSEMBLY__ */
50
51#else /* !CONFIG_X86_PAE */
52#define __PHYSICAL_MASK_SHIFT 32
53#define __VIRTUAL_MASK_SHIFT 32
54#define PAGETABLE_LEVELS 2
55
56#ifndef __ASSEMBLY__
57typedef unsigned long pteval_t;
58typedef unsigned long pmdval_t;
59typedef unsigned long pudval_t;
60typedef unsigned long pgdval_t;
61typedef unsigned long pgprotval_t;
62
63typedef union {
64 pteval_t pte;
65 pteval_t pte_low;
66} pte_t;
67
68#endif /* __ASSEMBLY__ */
69#endif /* CONFIG_X86_PAE */
70
71#ifndef __ASSEMBLY__
72typedef struct page *pgtable_t;
73#endif
74
75#ifdef CONFIG_HUGETLB_PAGE
76#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
77#endif
78
79#ifndef __ASSEMBLY__
80#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
81#ifdef CONFIG_DEBUG_VIRTUAL
82extern unsigned long __phys_addr(unsigned long);
83#else
84#define __phys_addr(x) __phys_addr_nodebug(x)
85#endif
86#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
87
88#ifdef CONFIG_FLATMEM
89#define pfn_valid(pfn) ((pfn) < max_mapnr)
90#endif /* CONFIG_FLATMEM */
91
92extern int nx_enabled;
93
94/*
95 * This much address space is reserved for vmalloc() and iomap()
96 * as well as fixmap mappings.
97 */
98extern unsigned int __VMALLOC_RESERVE;
99extern int sysctl_legacy_va_layout;
100
101extern void find_low_pfn_range(void);
102extern unsigned long init_memory_mapping(unsigned long start,
103 unsigned long end);
104extern void initmem_init(unsigned long, unsigned long);
105extern void free_initmem(void);
106extern void setup_bootmem_allocator(void);
107
108
109#ifdef CONFIG_X86_USE_3DNOW
110#include <asm/mmx.h>
111
112static inline void clear_page(void *page)
113{
114 mmx_clear_page(page);
115}
116
117static inline void copy_page(void *to, void *from)
118{
119 mmx_copy_page(to, from);
120}
121#else /* !CONFIG_X86_USE_3DNOW */
122#include <linux/string.h>
123
124static inline void clear_page(void *page)
125{
126 memset(page, 0, PAGE_SIZE);
127}
128
129static inline void copy_page(void *to, void *from)
130{
131 memcpy(to, from, PAGE_SIZE);
132}
133#endif /* CONFIG_X86_USE_3DNOW */
134#endif /* !__ASSEMBLY__ */
135
136#endif /* _ASM_X86_PAGE_32_H */
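
On 32-bit, __phys_addr()/__va() are pure offset arithmetic: lowmem is mapped linearly at PAGE_OFFSET, so converting between a kernel virtual address and a physical address is a single subtraction or addition. A small userspace sketch of that round-trip, assuming the common 0xC0000000 default for CONFIG_PAGE_OFFSET:

#include <assert.h>

#define EX_PAGE_OFFSET	0xC0000000UL	/* assumed CONFIG_PAGE_OFFSET */

static unsigned long ex_pa(unsigned long vaddr)
{
	return vaddr - EX_PAGE_OFFSET;	/* mirrors __phys_addr_nodebug() */
}

static unsigned long ex_va(unsigned long paddr)
{
	return paddr + EX_PAGE_OFFSET;	/* mirrors __va() */
}

int main(void)
{
	/* Physical 1MB appears at PAGE_OFFSET + 1MB in lowmem. */
	assert(ex_pa(0xC0100000UL) == 0x00100000UL);
	assert(ex_va(ex_pa(0xC0100000UL)) == 0xC0100000UL);
	return 0;
}
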
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
new file mode 100644
index 00000000000..5ebca29f44f
--- /dev/null
+++ b/arch/x86/include/asm/page_64.h
@@ -0,0 +1,105 @@
1#ifndef _ASM_X86_PAGE_64_H
2#define _ASM_X86_PAGE_64_H
3
4#define PAGETABLE_LEVELS 4
5
6#define THREAD_ORDER 1
7#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
8#define CURRENT_MASK (~(THREAD_SIZE - 1))
9
10#define EXCEPTION_STACK_ORDER 0
11#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
12
13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
15
16#define IRQSTACK_ORDER 2
17#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
18
19#define STACKFAULT_STACK 1
20#define DOUBLEFAULT_STACK 2
21#define NMI_STACK 3
22#define DEBUG_STACK 4
23#define MCE_STACK 5
24#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
25
26#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
27#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
28
29/*
30 * Set __PAGE_OFFSET to the most negative possible address +
31 * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
32 * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
33 * what Xen requires.
34 */
35#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
36
37#define __PHYSICAL_START CONFIG_PHYSICAL_START
38#define __KERNEL_ALIGN 0x200000
39
40/*
41 * Make sure the kernel is aligned to a 2MB address. Catching this at
42 * compile time is better than failing at boot. Change your config and
43 * rebuild with a 2MB-aligned CONFIG_PHYSICAL_START.
44 */
45#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
46#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
47#endif
48
49#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
50#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
51
52/* See Documentation/x86_64/mm.txt for a description of the memory map. */
53#define __PHYSICAL_MASK_SHIFT 46
54#define __VIRTUAL_MASK_SHIFT 48
55
56/*
57 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
58 * arch/x86/kernel/head_64.S), and it is mapped here:
59 */
60#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
61#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
62
63#ifndef __ASSEMBLY__
64void clear_page(void *page);
65void copy_page(void *to, void *from);
66
67/* duplicates the declaration in bootmem.h */
68extern unsigned long max_pfn;
69extern unsigned long phys_base;
70
71extern unsigned long __phys_addr(unsigned long);
72#define __phys_reloc_hide(x) (x)
73
74/*
75 * These are used to make use of C type-checking.
76 */
77typedef unsigned long pteval_t;
78typedef unsigned long pmdval_t;
79typedef unsigned long pudval_t;
80typedef unsigned long pgdval_t;
81typedef unsigned long pgprotval_t;
82
83typedef struct page *pgtable_t;
84
85typedef struct { pteval_t pte; } pte_t;
86
87#define vmemmap ((struct page *)VMEMMAP_START)
88
89extern unsigned long init_memory_mapping(unsigned long start,
90 unsigned long end);
91
92extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
93extern void free_initmem(void);
94
95extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
96extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
97
98#endif /* !__ASSEMBLY__ */
99
100#ifdef CONFIG_FLATMEM
101#define pfn_valid(pfn) ((pfn) < max_pfn)
102#endif
103
104
105#endif /* _ASM_X86_PAGE_64_H */
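
Concretely, with the (assumed) default CONFIG_PHYSICAL_START of 0x200000, the constants above give __START_KERNEL = 0xffffffff80000000 + 0x200000 = 0xffffffff80200000; any CONFIG_PHYSICAL_START that is not a multiple of __KERNEL_ALIGN (2MB) trips the #error at build time. A sketch of the same check outside the kernel:

#include <stdio.h>

#define EX_START_KERNEL_MAP	0xffffffff80000000UL
#define EX_KERNEL_ALIGN		0x200000UL	/* 2MB, as above */
#define EX_PHYSICAL_START	0x200000UL	/* assumed config value */

int main(void)
{
	if (EX_PHYSICAL_START % EX_KERNEL_ALIGN) {
		fprintf(stderr, "start address not 2MB aligned\n");
		return 1;
	}
	/* Prints __START_KERNEL = 0xffffffff80200000 for this config. */
	printf("__START_KERNEL = 0x%lx\n",
	       EX_START_KERNEL_MAP + EX_PHYSICAL_START);
	return 0;
}
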
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
new file mode 100644
index 00000000000..6f0d0422f4c
--- /dev/null
+++ b/arch/x86/include/asm/param.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_PARAM_H
2#define _ASM_X86_PARAM_H
3
4#ifdef __KERNEL__
5# define HZ CONFIG_HZ /* Internal kernel timer frequency */
6# define USER_HZ 100 /* some user interfaces are */
7# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
8#endif
9
10#ifndef HZ
11#define HZ 100
12#endif
13
14#define EXEC_PAGESIZE 4096
15
16#ifndef NOGROUP
17#define NOGROUP (-1)
18#endif
19
20#define MAXHOSTNAMELEN 64 /* max length of hostname */
21
22#endif /* _ASM_X86_PARAM_H */
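
The split between HZ and USER_HZ matters for the ABI: tick counts exported to userspace (times(), /proc CPU stats) are always in USER_HZ units regardless of the CONFIG_HZ the kernel was built with, so userspace must scale by sysconf(_SC_CLK_TCK), which reports the 100 defined here. A short sketch of the consumer side:

#include <stdio.h>
#include <sys/times.h>
#include <unistd.h>

int main(void)
{
	struct tms t;
	clock_t ticks = times(&t);		/* in USER_HZ ticks */
	long user_hz = sysconf(_SC_CLK_TCK);	/* 100 on x86, per above */

	printf("cpu time: %.2f s\n",
	       (double)(t.tms_utime + t.tms_stime) / user_hz);
	return (ticks == (clock_t)-1);
}
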
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
new file mode 100644
index 00000000000..ba3e2ff6aed
--- /dev/null
+++ b/arch/x86/include/asm/paravirt.h
@@ -0,0 +1,1650 @@
1#ifndef _ASM_X86_PARAVIRT_H
2#define _ASM_X86_PARAVIRT_H
3/* Various instructions on x86 need to be replaced for
4 * para-virtualization: those hooks are defined here. */
5
6#ifdef CONFIG_PARAVIRT
7#include <asm/page.h>
8#include <asm/asm.h>
9
10/* Bitmask of what can be clobbered: usually at least eax. */
11#define CLBR_NONE 0
12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2)
15
16#ifdef CONFIG_X86_64
17#define CLBR_RSI (1 << 3)
18#define CLBR_RDI (1 << 4)
19#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8)
23#define CLBR_ANY ((1 << 9) - 1)
24#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all the regs the platform has. For i386, that's just the three above. */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */
29
30#ifndef __ASSEMBLY__
31#include <linux/types.h>
32#include <linux/cpumask.h>
33#include <asm/kmap_types.h>
34#include <asm/desc_defs.h>
35
36struct page;
37struct thread_struct;
38struct desc_ptr;
39struct tss_struct;
40struct mm_struct;
41struct desc_struct;
42
43/* general info */
44struct pv_info {
45 unsigned int kernel_rpl;
46 int shared_kernel_pmd;
47 int paravirt_enabled;
48 const char *name;
49};
50
51struct pv_init_ops {
52 /*
53 * Patch may replace one of the defined code sequences with
54 * arbitrary code, subject to the same register constraints.
55 * This generally means the code is not free to clobber any
56 * registers other than EAX. The patch function should return
57 * the number of bytes of code generated, as we nop pad the
58 * rest in generic code.
59 */
60 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
61 unsigned long addr, unsigned len);
62
63 /* Basic arch-specific setup */
64 void (*arch_setup)(void);
65 char *(*memory_setup)(void);
66 void (*post_allocator_init)(void);
67
68 /* Print a banner to identify the environment */
69 void (*banner)(void);
70};
71
72
73struct pv_lazy_ops {
74 /* Set deferred update mode, used for batching operations. */
75 void (*enter)(void);
76 void (*leave)(void);
77};
78
79struct pv_time_ops {
80 void (*time_init)(void);
81
82 /* Get and set the time of day */
83 unsigned long (*get_wallclock)(void);
84 int (*set_wallclock)(unsigned long);
85
86 unsigned long long (*sched_clock)(void);
87 unsigned long (*get_tsc_khz)(void);
88};
89
90struct pv_cpu_ops {
91 /* hooks for various privileged instructions */
92 unsigned long (*get_debugreg)(int regno);
93 void (*set_debugreg)(int regno, unsigned long value);
94
95 void (*clts)(void);
96
97 unsigned long (*read_cr0)(void);
98 void (*write_cr0)(unsigned long);
99
100 unsigned long (*read_cr4_safe)(void);
101 unsigned long (*read_cr4)(void);
102 void (*write_cr4)(unsigned long);
103
104#ifdef CONFIG_X86_64
105 unsigned long (*read_cr8)(void);
106 void (*write_cr8)(unsigned long);
107#endif
108
109 /* Segment descriptor handling */
110 void (*load_tr_desc)(void);
111 void (*load_gdt)(const struct desc_ptr *);
112 void (*load_idt)(const struct desc_ptr *);
113 void (*store_gdt)(struct desc_ptr *);
114 void (*store_idt)(struct desc_ptr *);
115 void (*set_ldt)(const void *desc, unsigned entries);
116 unsigned long (*store_tr)(void);
117 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
118#ifdef CONFIG_X86_64
119 void (*load_gs_index)(unsigned int idx);
120#endif
121 void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
122 const void *desc);
123 void (*write_gdt_entry)(struct desc_struct *,
124 int entrynum, const void *desc, int size);
125 void (*write_idt_entry)(gate_desc *,
126 int entrynum, const gate_desc *gate);
127 void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
128 void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
129
130 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
131
132 void (*set_iopl_mask)(unsigned mask);
133
134 void (*wbinvd)(void);
135 void (*io_delay)(void);
136
137 /* cpuid emulation, mostly so that caps bits can be disabled */
138 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
139 unsigned int *ecx, unsigned int *edx);
140
141 /* MSR, PMC and TSC operations.
142 err = 0/-EFAULT. wrmsr returns 0/-EFAULT. */
143 u64 (*read_msr_amd)(unsigned int msr, int *err);
144 u64 (*read_msr)(unsigned int msr, int *err);
145 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
146
147 u64 (*read_tsc)(void);
148 u64 (*read_pmc)(int counter);
149 unsigned long long (*read_tscp)(unsigned int *aux);
150
151 /*
152 * Atomically enable interrupts and return to userspace. This
153 * is only ever used to return to 32-bit processes; in a
154 * 64-bit kernel, it's used for 32-on-64 compat processes, but
155 * never native 64-bit processes. (Jump, not call.)
156 */
157 void (*irq_enable_sysexit)(void);
158
159 /*
160 * Switch to usermode gs and return to 64-bit usermode using
161 * sysret. Only used in 64-bit kernels to return to 64-bit
162 * processes. Usermode register state, including %rsp, must
163 * already be restored.
164 */
165 void (*usergs_sysret64)(void);
166
167 /*
168 * Switch to usermode gs and return to 32-bit usermode using
169 * sysret. Used to return to 32-on-64 compat processes.
170 * Other usermode register state, including %esp, must already
171 * be restored.
172 */
173 void (*usergs_sysret32)(void);
174
175 /* Normal iret. Jump to this with the standard iret stack
176 frame set up. */
177 void (*iret)(void);
178
179 void (*swapgs)(void);
180
181 struct pv_lazy_ops lazy_mode;
182};
183
184struct pv_irq_ops {
185 void (*init_IRQ)(void);
186
187 /*
188 * Get/set interrupt state. save_fl and restore_fl are only
189 * expected to use X86_EFLAGS_IF; all other bits
190 * returned from save_fl are undefined, and may be ignored by
191 * restore_fl.
192 */
193 unsigned long (*save_fl)(void);
194 void (*restore_fl)(unsigned long);
195 void (*irq_disable)(void);
196 void (*irq_enable)(void);
197 void (*safe_halt)(void);
198 void (*halt)(void);
199
200#ifdef CONFIG_X86_64
201 void (*adjust_exception_frame)(void);
202#endif
203};
204
205struct pv_apic_ops {
206#ifdef CONFIG_X86_LOCAL_APIC
207 void (*setup_boot_clock)(void);
208 void (*setup_secondary_clock)(void);
209
210 void (*startup_ipi_hook)(int phys_apicid,
211 unsigned long start_eip,
212 unsigned long start_esp);
213#endif
214};
215
216struct pv_mmu_ops {
217 /*
218 * Called before/after init_mm pagetable setup. setup_start
219 * may reset %cr3, and may pre-install parts of the pagetable;
220 * pagetable setup is expected to preserve any existing
221 * mapping.
222 */
223 void (*pagetable_setup_start)(pgd_t *pgd_base);
224 void (*pagetable_setup_done)(pgd_t *pgd_base);
225
226 unsigned long (*read_cr2)(void);
227 void (*write_cr2)(unsigned long);
228
229 unsigned long (*read_cr3)(void);
230 void (*write_cr3)(unsigned long);
231
232 /*
233 * Hooks for intercepting the creation/use/destruction of an
234 * mm_struct.
235 */
236 void (*activate_mm)(struct mm_struct *prev,
237 struct mm_struct *next);
238 void (*dup_mmap)(struct mm_struct *oldmm,
239 struct mm_struct *mm);
240 void (*exit_mmap)(struct mm_struct *mm);
241
242
243 /* TLB operations */
244 void (*flush_tlb_user)(void);
245 void (*flush_tlb_kernel)(void);
246 void (*flush_tlb_single)(unsigned long addr);
247 void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
248 unsigned long va);
249
250 /* Hooks for allocating and freeing a pagetable top-level */
251 int (*pgd_alloc)(struct mm_struct *mm);
252 void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
253
254 /*
255 * Hooks for allocating/releasing pagetable pages when they're
256 * attached to a pagetable
257 */
258 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
259 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
260 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
261 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
262 void (*release_pte)(unsigned long pfn);
263 void (*release_pmd)(unsigned long pfn);
264 void (*release_pud)(unsigned long pfn);
265
266 /* Pagetable manipulation functions */
267 void (*set_pte)(pte_t *ptep, pte_t pteval);
268 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep, pte_t pteval);
270 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
271 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
272 pte_t *ptep);
273 void (*pte_update_defer)(struct mm_struct *mm,
274 unsigned long addr, pte_t *ptep);
275
276 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
277 pte_t *ptep);
278 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
279 pte_t *ptep, pte_t pte);
280
281 pteval_t (*pte_val)(pte_t);
282 pteval_t (*pte_flags)(pte_t);
283 pte_t (*make_pte)(pteval_t pte);
284
285 pgdval_t (*pgd_val)(pgd_t);
286 pgd_t (*make_pgd)(pgdval_t pgd);
287
288#if PAGETABLE_LEVELS >= 3
289#ifdef CONFIG_X86_PAE
290 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
291 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
292 pte_t *ptep, pte_t pte);
293 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
294 pte_t *ptep);
295 void (*pmd_clear)(pmd_t *pmdp);
296
297#endif /* CONFIG_X86_PAE */
298
299 void (*set_pud)(pud_t *pudp, pud_t pudval);
300
301 pmdval_t (*pmd_val)(pmd_t);
302 pmd_t (*make_pmd)(pmdval_t pmd);
303
304#if PAGETABLE_LEVELS == 4
305 pudval_t (*pud_val)(pud_t);
306 pud_t (*make_pud)(pudval_t pud);
307
308 void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
309#endif /* PAGETABLE_LEVELS == 4 */
310#endif /* PAGETABLE_LEVELS >= 3 */
311
312#ifdef CONFIG_HIGHPTE
313 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
314#endif
315
316 struct pv_lazy_ops lazy_mode;
317
318 /* dom0 ops */
319
320 /* Sometimes the physical address is a pfn, and sometimes it's
321 an mfn. We can tell which is which from the index. */
322 void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
323 unsigned long phys, pgprot_t flags);
324};
325
326struct raw_spinlock;
327struct pv_lock_ops {
328 int (*spin_is_locked)(struct raw_spinlock *lock);
329 int (*spin_is_contended)(struct raw_spinlock *lock);
330 void (*spin_lock)(struct raw_spinlock *lock);
331 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
332 int (*spin_trylock)(struct raw_spinlock *lock);
333 void (*spin_unlock)(struct raw_spinlock *lock);
334};
335
336/* This contains all the paravirt structures: we get a convenient
337 * number for each function from its offset, which we use to indicate
338 * what to patch. */
339struct paravirt_patch_template {
340 struct pv_init_ops pv_init_ops;
341 struct pv_time_ops pv_time_ops;
342 struct pv_cpu_ops pv_cpu_ops;
343 struct pv_irq_ops pv_irq_ops;
344 struct pv_apic_ops pv_apic_ops;
345 struct pv_mmu_ops pv_mmu_ops;
346 struct pv_lock_ops pv_lock_ops;
347};
348
349extern struct pv_info pv_info;
350extern struct pv_init_ops pv_init_ops;
351extern struct pv_time_ops pv_time_ops;
352extern struct pv_cpu_ops pv_cpu_ops;
353extern struct pv_irq_ops pv_irq_ops;
354extern struct pv_apic_ops pv_apic_ops;
355extern struct pv_mmu_ops pv_mmu_ops;
356extern struct pv_lock_ops pv_lock_ops;
357
358#define PARAVIRT_PATCH(x) \
359 (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
360
361#define paravirt_type(op) \
362 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
363 [paravirt_opptr] "m" (op)
364#define paravirt_clobber(clobber) \
365 [paravirt_clobber] "i" (clobber)
366
367/*
368 * Generate some code, and mark it as patchable by the
369 * apply_paravirt() alternate instruction patcher.
370 */
371#define _paravirt_alt(insn_string, type, clobber) \
372 "771:\n\t" insn_string "\n" "772:\n" \
373 ".pushsection .parainstructions,\"a\"\n" \
374 _ASM_ALIGN "\n" \
375 _ASM_PTR " 771b\n" \
376 " .byte " type "\n" \
377 " .byte 772b-771b\n" \
378 " .short " clobber "\n" \
379 ".popsection\n"
380
381/* Generate patchable code, with the default asm parameters. */
382#define paravirt_alt(insn_string) \
383 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
384
385/* Simple instruction patching code. */
386#define DEF_NATIVE(ops, name, code) \
387 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
388 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
389
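/*
 * Illustrative sketch, not part of this patch: DEF_NATIVE() captures a
 * native instruction sequence between start_/end_ markers so the patcher
 * can measure it and paste it over a pv_op call site. The native patch
 * implementations use it along the lines of:
 *
 *	DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
 *	DEF_NATIVE(pv_irq_ops, irq_enable, "sti");
 *
 * which emits start_pv_irq_ops_irq_disable[] / end_pv_irq_ops_irq_disable[]
 * around the bare "cli", ready for paravirt_patch_insns().
 */
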
390unsigned paravirt_patch_nop(void);
391unsigned paravirt_patch_ignore(unsigned len);
392unsigned paravirt_patch_call(void *insnbuf,
393 const void *target, u16 tgt_clobbers,
394 unsigned long addr, u16 site_clobbers,
395 unsigned len);
396unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
397 unsigned long addr, unsigned len);
398unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
399 unsigned long addr, unsigned len);
400
401unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
402 const char *start, const char *end);
403
404unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
405 unsigned long addr, unsigned len);
406
407int paravirt_disable_iospace(void);
408
409/*
410 * This generates an indirect call based on the operation type number.
411 * The type number, computed in PARAVIRT_PATCH, is derived from the
412 * offset into the paravirt_patch_template structure, and can therefore be
413 * freely converted back into a structure offset.
414 */
415#define PARAVIRT_CALL "call *%[paravirt_opptr];"
416
417/*
418 * These macros are intended to wrap calls through one of the paravirt
419 * ops structs, so that they can be later identified and patched at
420 * runtime.
421 *
422 * Normally, a call to a pv_op function is a simple indirect call:
423 * (pv_op_struct.operations)(args...).
424 *
425 * Unfortunately, this is a relatively slow operation for modern CPUs,
426 * because it cannot necessarily determine what the destination
427 * address is. In this case, the address is a runtime constant, so at
428 * the very least we can patch the call to be a simple direct call, or
429 * ideally, patch an inline implementation into the callsite. (Direct
430 * calls are essentially free, because the call and return addresses
431 * are completely predictable.)
432 *
433 * For i386, these macros rely on the standard gcc "regparm(3)" calling
434 * convention, in which the first three arguments are placed in %eax,
435 * %edx, %ecx (in that order), and the remaining arguments are placed
436 * on the stack. All caller-save registers (eax,edx,ecx) are expected
437 * to be modified (either clobbered or used for return values).
438 * x86_64, on the other hand, already specifies a register-based calling
439 * convention, returning in %rax, with parameters going in %rdi, %rsi,
440 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
441 * special handling for dealing with 4 arguments, unlike i386.
442 * However, x86_64 also has to clobber all caller-saved registers, which,
443 * unfortunately, are quite a few (r8 - r11).
444 *
445 * The call instruction itself is marked by placing its start address
446 * and size into the .parainstructions section, so that
447 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
448 * appropriate patching under the control of the backend pv_init_ops
449 * implementation.
450 *
451 * Unfortunately there's no way to get gcc to generate the args setup
452 * for the call, and then allow the call itself to be generated by an
453 * inline asm. Because of this, we must do the complete arg setup and
454 * return value handling from within these macros. This is fairly
455 * cumbersome.
456 *
457 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
458 * It could be extended to more arguments, but there would be little
459 * to be gained from that. For each number of arguments, there are
460 * the two VCALL and CALL variants for void and non-void functions.
461 *
462 * When there is a return value, the invoker of the macro must specify
463 * the return type. The macro then uses sizeof() on that type to
464 * determine whether it's a 32- or 64-bit value, and places the return
465 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
466 * 64-bit). For x86_64 machines, it just returns in %rax regardless of
467 * the return value size.
468 *
469 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments;
470 * i386 places such a pair in two consecutive argument slots,
471 * in low,high order.
472 *
473 * Small structures are passed and returned in registers. The macro
474 * calling convention can't directly deal with this, so the wrapper
475 * functions must do this.
476 *
477 * These PVOP_* macros are only defined within this header. This
478 * means that all uses must be wrapped in inline functions. This also
479 * makes sure the incoming and outgoing types are always correct.
480 */
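/*
 * Illustrative sketch, not part of this patch: the wrapper pattern the
 * rest of this header follows. A non-void, zero-argument op is wrapped as
 *
 *	static inline unsigned long read_cr2(void)
 *	{
 *		return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
 *	}
 *
 * which expands to an asm volatile performing an indirect call through
 * pv_mmu_ops.read_cr2, with the caller-save registers listed as outputs
 * (i.e. clobbered), and records the call site in .parainstructions so
 * apply_paravirt() can later patch it in place.
 */
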
481#ifdef CONFIG_X86_32
482#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
483#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
484#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
485 "=c" (__ecx)
486#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
487#define EXTRA_CLOBBERS
488#define VEXTRA_CLOBBERS
489#else
490#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
491#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
492#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
493 "=S" (__esi), "=d" (__edx), \
494 "=c" (__ecx)
495
496#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
497
498#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
499#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
500#endif
501
502#ifdef CONFIG_PARAVIRT_DEBUG
503#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
504#else
505#define PVOP_TEST_NULL(op) ((void)op)
506#endif
507
508#define __PVOP_CALL(rettype, op, pre, post, ...) \
509 ({ \
510 rettype __ret; \
511 PVOP_CALL_ARGS; \
512 PVOP_TEST_NULL(op); \
513 /* This is 32-bit specific, but is okay in 64-bit */ \
514 /* since this condition will never hold */ \
515 if (sizeof(rettype) > sizeof(unsigned long)) { \
516 asm volatile(pre \
517 paravirt_alt(PARAVIRT_CALL) \
518 post \
519 : PVOP_CALL_CLOBBERS \
520 : paravirt_type(op), \
521 paravirt_clobber(CLBR_ANY), \
522 ##__VA_ARGS__ \
523 : "memory", "cc" EXTRA_CLOBBERS); \
524 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
525 } else { \
526 asm volatile(pre \
527 paravirt_alt(PARAVIRT_CALL) \
528 post \
529 : PVOP_CALL_CLOBBERS \
530 : paravirt_type(op), \
531 paravirt_clobber(CLBR_ANY), \
532 ##__VA_ARGS__ \
533 : "memory", "cc" EXTRA_CLOBBERS); \
534 __ret = (rettype)__eax; \
535 } \
536 __ret; \
537 })
538#define __PVOP_VCALL(op, pre, post, ...) \
539 ({ \
540 PVOP_VCALL_ARGS; \
541 PVOP_TEST_NULL(op); \
542 asm volatile(pre \
543 paravirt_alt(PARAVIRT_CALL) \
544 post \
545 : PVOP_VCALL_CLOBBERS \
546 : paravirt_type(op), \
547 paravirt_clobber(CLBR_ANY), \
548 ##__VA_ARGS__ \
549 : "memory", "cc" VEXTRA_CLOBBERS); \
550 })
551
552#define PVOP_CALL0(rettype, op) \
553 __PVOP_CALL(rettype, op, "", "")
554#define PVOP_VCALL0(op) \
555 __PVOP_VCALL(op, "", "")
556
557#define PVOP_CALL1(rettype, op, arg1) \
558 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
559#define PVOP_VCALL1(op, arg1) \
560 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
561
562#define PVOP_CALL2(rettype, op, arg1, arg2) \
563 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
564 "1" ((unsigned long)(arg2)))
565#define PVOP_VCALL2(op, arg1, arg2) \
566 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
567 "1" ((unsigned long)(arg2)))
568
569#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
570 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
571 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
572#define PVOP_VCALL3(op, arg1, arg2, arg3) \
573 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
574 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
575
576/* The 4-argument case is the only one that differs; on x86_64 it is much simpler */
577#ifdef CONFIG_X86_32
578#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
579 __PVOP_CALL(rettype, op, \
580 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
581 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
582 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
583#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
584 __PVOP_VCALL(op, \
585 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
586 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
587 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
588#else
589#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
590 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
591 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
592 "3"((unsigned long)(arg4)))
593#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
594 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
595 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
596 "3"((unsigned long)(arg4)))
597#endif
598
599static inline int paravirt_enabled(void)
600{
601 return pv_info.paravirt_enabled;
602}
603
604static inline void load_sp0(struct tss_struct *tss,
605 struct thread_struct *thread)
606{
607 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
608}
609
610#define ARCH_SETUP pv_init_ops.arch_setup();
611static inline unsigned long get_wallclock(void)
612{
613 return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
614}
615
616static inline int set_wallclock(unsigned long nowtime)
617{
618 return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
619}
620
621static inline void (*choose_time_init(void))(void)
622{
623 return pv_time_ops.time_init;
624}
625
626/* The paravirtualized CPUID instruction. */
627static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
628 unsigned int *ecx, unsigned int *edx)
629{
630 PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
631}
632
633/*
634 * These special macros can be used to get or set a debugging register
635 */
636static inline unsigned long paravirt_get_debugreg(int reg)
637{
638 return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
639}
640#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
641static inline void set_debugreg(unsigned long val, int reg)
642{
643 PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
644}
645
646static inline void clts(void)
647{
648 PVOP_VCALL0(pv_cpu_ops.clts);
649}
650
651static inline unsigned long read_cr0(void)
652{
653 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
654}
655
656static inline void write_cr0(unsigned long x)
657{
658 PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
659}
660
661static inline unsigned long read_cr2(void)
662{
663 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
664}
665
666static inline void write_cr2(unsigned long x)
667{
668 PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
669}
670
671static inline unsigned long read_cr3(void)
672{
673 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
674}
675
676static inline void write_cr3(unsigned long x)
677{
678 PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
679}
680
681static inline unsigned long read_cr4(void)
682{
683 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
684}
685static inline unsigned long read_cr4_safe(void)
686{
687 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
688}
689
690static inline void write_cr4(unsigned long x)
691{
692 PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
693}
694
695#ifdef CONFIG_X86_64
696static inline unsigned long read_cr8(void)
697{
698 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
699}
700
701static inline void write_cr8(unsigned long x)
702{
703 PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
704}
705#endif
706
707static inline void raw_safe_halt(void)
708{
709 PVOP_VCALL0(pv_irq_ops.safe_halt);
710}
711
712static inline void halt(void)
713{
714 PVOP_VCALL0(pv_irq_ops.halt);
715}
716
717static inline void wbinvd(void)
718{
719 PVOP_VCALL0(pv_cpu_ops.wbinvd);
720}
721
722#define get_kernel_rpl() (pv_info.kernel_rpl)
723
724static inline u64 paravirt_read_msr(unsigned msr, int *err)
725{
726 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
727}
728static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
729{
730 return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
731}
732static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
733{
734 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
735}
736
737/* These should all do BUG_ON(_err), but our headers are too tangled. */
738#define rdmsr(msr, val1, val2) \
739do { \
740 int _err; \
741 u64 _l = paravirt_read_msr(msr, &_err); \
742 val1 = (u32)_l; \
743 val2 = _l >> 32; \
744} while (0)
745
746#define wrmsr(msr, val1, val2) \
747do { \
748 paravirt_write_msr(msr, val1, val2); \
749} while (0)
750
751#define rdmsrl(msr, val) \
752do { \
753 int _err; \
754 val = paravirt_read_msr(msr, &_err); \
755} while (0)
756
757#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
758#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
759
760/* rdmsr with exception handling */
761#define rdmsr_safe(msr, a, b) \
762({ \
763 int _err; \
764 u64 _l = paravirt_read_msr(msr, &_err); \
765 (*a) = (u32)_l; \
766 (*b) = _l >> 32; \
767 _err; \
768})
769
770static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
771{
772 int err;
773
774 *p = paravirt_read_msr(msr, &err);
775 return err;
776}
777static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
778{
779 int err;
780
781 *p = paravirt_read_msr_amd(msr, &err);
782 return err;
783}
784
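/*
 * Illustrative sketch, not part of this patch: typical use of the
 * fault-checking wrappers above. Reading the architectural APIC base
 * MSR (0x1b) without oopsing if the RDMSR takes a #GP:
 *
 *	unsigned long long apicbase;
 *
 *	if (rdmsrl_safe(0x1b, &apicbase))
 *		return -EIO;	(the RDMSR faulted)
 *
 * rdmsrl_safe() propagates the _err value from paravirt_read_msr(),
 * so 0 means the read succeeded.
 */
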
785static inline u64 paravirt_read_tsc(void)
786{
787 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
788}
789
790#define rdtscl(low) \
791do { \
792 u64 _l = paravirt_read_tsc(); \
793 low = (int)_l; \
794} while (0)
795
796#define rdtscll(val) (val = paravirt_read_tsc())
797
798static inline unsigned long long paravirt_sched_clock(void)
799{
800 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
801}
802#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
803
804static inline unsigned long long paravirt_read_pmc(int counter)
805{
806 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
807}
808
809#define rdpmc(counter, low, high) \
810do { \
811 u64 _l = paravirt_read_pmc(counter); \
812 low = (u32)_l; \
813 high = _l >> 32; \
814} while (0)
815
816static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
817{
818 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
819}
820
821#define rdtscp(low, high, aux) \
822do { \
823 unsigned int __aux; \
824 unsigned long __val = paravirt_rdtscp(&__aux); \
825 (low) = (u32)__val; \
826 (high) = (u32)(__val >> 32); \
827 (aux) = __aux; \
828} while (0)
829
830#define rdtscpll(val, aux) \
831do { \
832 unsigned int __aux; \
833 val = paravirt_rdtscp(&__aux); \
834 (aux) = __aux; \
835} while (0)
836
837static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
838{
839 PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
840}
841
842static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
843{
844 PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
845}
846
847static inline void load_TR_desc(void)
848{
849 PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
850}
851static inline void load_gdt(const struct desc_ptr *dtr)
852{
853 PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
854}
855static inline void load_idt(const struct desc_ptr *dtr)
856{
857 PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
858}
859static inline void set_ldt(const void *addr, unsigned entries)
860{
861 PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
862}
863static inline void store_gdt(struct desc_ptr *dtr)
864{
865 PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
866}
867static inline void store_idt(struct desc_ptr *dtr)
868{
869 PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
870}
871static inline unsigned long paravirt_store_tr(void)
872{
873 return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
874}
875#define store_tr(tr) ((tr) = paravirt_store_tr())
876static inline void load_TLS(struct thread_struct *t, unsigned cpu)
877{
878 PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
879}
880
881#ifdef CONFIG_X86_64
882static inline void load_gs_index(unsigned int gs)
883{
884 PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
885}
886#endif
887
888static inline void write_ldt_entry(struct desc_struct *dt, int entry,
889 const void *desc)
890{
891 PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
892}
893
894static inline void write_gdt_entry(struct desc_struct *dt, int entry,
895 void *desc, int type)
896{
897 PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
898}
899
900static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
901{
902 PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
903}
904static inline void set_iopl_mask(unsigned mask)
905{
906 PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
907}
908
909/* The paravirtualized I/O functions */
910static inline void slow_down_io(void)
911{
912 pv_cpu_ops.io_delay();
913#ifdef REALLY_SLOW_IO
914 pv_cpu_ops.io_delay();
915 pv_cpu_ops.io_delay();
916 pv_cpu_ops.io_delay();
917#endif
918}
919
920#ifdef CONFIG_X86_LOCAL_APIC
921static inline void setup_boot_clock(void)
922{
923 PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
924}
925
926static inline void setup_secondary_clock(void)
927{
928 PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
929}
930#endif
931
932static inline void paravirt_post_allocator_init(void)
933{
934 if (pv_init_ops.post_allocator_init)
935 (*pv_init_ops.post_allocator_init)();
936}
937
938static inline void paravirt_pagetable_setup_start(pgd_t *base)
939{
940 (*pv_mmu_ops.pagetable_setup_start)(base);
941}
942
943static inline void paravirt_pagetable_setup_done(pgd_t *base)
944{
945 (*pv_mmu_ops.pagetable_setup_done)(base);
946}
947
948#ifdef CONFIG_SMP
949static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
950 unsigned long start_esp)
951{
952 PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
953 phys_apicid, start_eip, start_esp);
954}
955#endif
956
957static inline void paravirt_activate_mm(struct mm_struct *prev,
958 struct mm_struct *next)
959{
960 PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
961}
962
963static inline void arch_dup_mmap(struct mm_struct *oldmm,
964 struct mm_struct *mm)
965{
966 PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
967}
968
969static inline void arch_exit_mmap(struct mm_struct *mm)
970{
971 PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
972}
973
974static inline void __flush_tlb(void)
975{
976 PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
977}
978static inline void __flush_tlb_global(void)
979{
980 PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
981}
982static inline void __flush_tlb_single(unsigned long addr)
983{
984 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
985}
986
987static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
988 unsigned long va)
989{
990 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
991}
992
993static inline int paravirt_pgd_alloc(struct mm_struct *mm)
994{
995 return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
996}
997
998static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
999{
1000 PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
1001}
1002
1003static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1004{
1005 PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
1006}
1007static inline void paravirt_release_pte(unsigned long pfn)
1008{
1009 PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
1010}
1011
1012static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1013{
1014 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
1015}
1016
1017static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
1018 unsigned long start, unsigned long count)
1019{
1020 PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
1021}
1022static inline void paravirt_release_pmd(unsigned long pfn)
1023{
1024 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
1025}
1026
1027static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1028{
1029 PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
1030}
1031static inline void paravirt_release_pud(unsigned long pfn)
1032{
1033 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
1034}
1035
1036#ifdef CONFIG_HIGHPTE
1037static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
1038{
1039 unsigned long ret;
1040 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
1041 return (void *)ret;
1042}
1043#endif
1044
1045static inline void pte_update(struct mm_struct *mm, unsigned long addr,
1046 pte_t *ptep)
1047{
1048 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
1049}
1050
1051static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
1052 pte_t *ptep)
1053{
1054 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
1055}
1056
1057static inline pte_t __pte(pteval_t val)
1058{
1059 pteval_t ret;
1060
1061 if (sizeof(pteval_t) > sizeof(long))
1062 ret = PVOP_CALL2(pteval_t,
1063 pv_mmu_ops.make_pte,
1064 val, (u64)val >> 32);
1065 else
1066 ret = PVOP_CALL1(pteval_t,
1067 pv_mmu_ops.make_pte,
1068 val);
1069
1070 return (pte_t) { .pte = ret };
1071}
1072
1073static inline pteval_t pte_val(pte_t pte)
1074{
1075 pteval_t ret;
1076
1077 if (sizeof(pteval_t) > sizeof(long))
1078 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
1079 pte.pte, (u64)pte.pte >> 32);
1080 else
1081 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
1082 pte.pte);
1083
1084 return ret;
1085}
1086
1087static inline pteval_t pte_flags(pte_t pte)
1088{
1089 pteval_t ret;
1090
1091 if (sizeof(pteval_t) > sizeof(long))
1092 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
1093 pte.pte, (u64)pte.pte >> 32);
1094 else
1095 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
1096 pte.pte);
1097
1098#ifdef CONFIG_PARAVIRT_DEBUG
1099 BUG_ON(ret & PTE_PFN_MASK);
1100#endif
1101 return ret;
1102}
1103
1104static inline pgd_t __pgd(pgdval_t val)
1105{
1106 pgdval_t ret;
1107
1108 if (sizeof(pgdval_t) > sizeof(long))
1109 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
1110 val, (u64)val >> 32);
1111 else
1112 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
1113 val);
1114
1115 return (pgd_t) { ret };
1116}
1117
1118static inline pgdval_t pgd_val(pgd_t pgd)
1119{
1120 pgdval_t ret;
1121
1122 if (sizeof(pgdval_t) > sizeof(long))
1123 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
1124 pgd.pgd, (u64)pgd.pgd >> 32);
1125 else
1126 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
1127 pgd.pgd);
1128
1129 return ret;
1130}
1131
1132#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
1133static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
1134 pte_t *ptep)
1135{
1136 pteval_t ret;
1137
1138 ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
1139 mm, addr, ptep);
1140
1141 return (pte_t) { .pte = ret };
1142}
1143
1144static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
1145 pte_t *ptep, pte_t pte)
1146{
1147 if (sizeof(pteval_t) > sizeof(long))
1148 /* 5 arg words */
1149 pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
1150 else
1151 PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
1152 mm, addr, ptep, pte.pte);
1153}
1154
1155static inline void set_pte(pte_t *ptep, pte_t pte)
1156{
1157 if (sizeof(pteval_t) > sizeof(long))
1158 PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
1159 pte.pte, (u64)pte.pte >> 32);
1160 else
1161 PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
1162 pte.pte);
1163}
1164
1165static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
1166 pte_t *ptep, pte_t pte)
1167{
1168 if (sizeof(pteval_t) > sizeof(long))
1169 /* 5 arg words */
1170 pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
1171 else
1172 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
1173}
1174
1175static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
1176{
1177 pmdval_t val = native_pmd_val(pmd);
1178
1179 if (sizeof(pmdval_t) > sizeof(long))
1180 PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
1181 else
1182 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
1183}
1184
1185#if PAGETABLE_LEVELS >= 3
1186static inline pmd_t __pmd(pmdval_t val)
1187{
1188 pmdval_t ret;
1189
1190 if (sizeof(pmdval_t) > sizeof(long))
1191 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
1192 val, (u64)val >> 32);
1193 else
1194 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
1195 val);
1196
1197 return (pmd_t) { ret };
1198}
1199
1200static inline pmdval_t pmd_val(pmd_t pmd)
1201{
1202 pmdval_t ret;
1203
1204 if (sizeof(pmdval_t) > sizeof(long))
1205 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
1206 pmd.pmd, (u64)pmd.pmd >> 32);
1207 else
1208 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
1209 pmd.pmd);
1210
1211 return ret;
1212}
1213
1214static inline void set_pud(pud_t *pudp, pud_t pud)
1215{
1216 pudval_t val = native_pud_val(pud);
1217
1218 if (sizeof(pudval_t) > sizeof(long))
1219 PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
1220 val, (u64)val >> 32);
1221 else
1222 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
1223 val);
1224}
1225#if PAGETABLE_LEVELS == 4
1226static inline pud_t __pud(pudval_t val)
1227{
1228 pudval_t ret;
1229
1230 if (sizeof(pudval_t) > sizeof(long))
1231 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
1232 val, (u64)val >> 32);
1233 else
1234 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
1235 val);
1236
1237 return (pud_t) { ret };
1238}
1239
1240static inline pudval_t pud_val(pud_t pud)
1241{
1242 pudval_t ret;
1243
1244 if (sizeof(pudval_t) > sizeof(long))
1245 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
1246 pud.pud, (u64)pud.pud >> 32);
1247 else
1248 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
1249 pud.pud);
1250
1251 return ret;
1252}
1253
1254static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
1255{
1256 pgdval_t val = native_pgd_val(pgd);
1257
1258 if (sizeof(pgdval_t) > sizeof(long))
1259 PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
1260 val, (u64)val >> 32);
1261 else
1262 PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
1263 val);
1264}
1265
1266static inline void pgd_clear(pgd_t *pgdp)
1267{
1268 set_pgd(pgdp, __pgd(0));
1269}
1270
1271static inline void pud_clear(pud_t *pudp)
1272{
1273 set_pud(pudp, __pud(0));
1274}
1275
1276#endif /* PAGETABLE_LEVELS == 4 */
1277
1278#endif /* PAGETABLE_LEVELS >= 3 */
1279
1280#ifdef CONFIG_X86_PAE
1281/* Special-case pte-setting operations for PAE, which can't update a
1282 64-bit pte atomically */
1283static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1284{
1285 PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
1286 pte.pte, pte.pte >> 32);
1287}
1288
1289static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1290 pte_t *ptep, pte_t pte)
1291{
1292 /* 5 arg words */
1293 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
1294}
1295
1296static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1297 pte_t *ptep)
1298{
1299 PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
1300}
1301
1302static inline void pmd_clear(pmd_t *pmdp)
1303{
1304 PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
1305}
1306#else /* !CONFIG_X86_PAE */
1307static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1308{
1309 set_pte(ptep, pte);
1310}
1311
1312static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1313 pte_t *ptep, pte_t pte)
1314{
1315 set_pte(ptep, pte);
1316}
1317
1318static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1319 pte_t *ptep)
1320{
1321 set_pte_at(mm, addr, ptep, __pte(0));
1322}
1323
1324static inline void pmd_clear(pmd_t *pmdp)
1325{
1326 set_pmd(pmdp, __pmd(0));
1327}
1328#endif /* CONFIG_X86_PAE */
1329
1330/* Lazy mode for batching updates / context switch */
1331enum paravirt_lazy_mode {
1332 PARAVIRT_LAZY_NONE,
1333 PARAVIRT_LAZY_MMU,
1334 PARAVIRT_LAZY_CPU,
1335};
1336
1337enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1338void paravirt_enter_lazy_cpu(void);
1339void paravirt_leave_lazy_cpu(void);
1340void paravirt_enter_lazy_mmu(void);
1341void paravirt_leave_lazy_mmu(void);
1342void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1343
1344#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
1345static inline void arch_enter_lazy_cpu_mode(void)
1346{
1347 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
1348}
1349
1350static inline void arch_leave_lazy_cpu_mode(void)
1351{
1352 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
1353}
1354
1355static inline void arch_flush_lazy_cpu_mode(void)
1356{
1357 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
1358 arch_leave_lazy_cpu_mode();
1359 arch_enter_lazy_cpu_mode();
1360 }
1361}
1362
1363
1364#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1365static inline void arch_enter_lazy_mmu_mode(void)
1366{
1367 PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
1368}
1369
1370static inline void arch_leave_lazy_mmu_mode(void)
1371{
1372 PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
1373}
1374
1375static inline void arch_flush_lazy_mmu_mode(void)
1376{
1377 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
1378 arch_leave_lazy_mmu_mode();
1379 arch_enter_lazy_mmu_mode();
1380 }
1381}
1382
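/*
 * Illustrative sketch, not part of this patch: the lazy-MMU hooks above
 * let a backend batch a run of pagetable updates into one flush, e.g.:
 *
 *	arch_enter_lazy_mmu_mode();
 *	for (; addr < end; addr += PAGE_SIZE, ptep++)
 *		set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep));
 *	arch_leave_lazy_mmu_mode();
 *
 * Between enter and leave the set_pte_at() calls may merely be queued;
 * leave() forces everything out. arch_flush_lazy_mmu_mode() exists for
 * code that must see the updates while formally still in lazy mode.
 */
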
1383static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1384 unsigned long phys, pgprot_t flags)
1385{
1386 pv_mmu_ops.set_fixmap(idx, phys, flags);
1387}
1388
1389void _paravirt_nop(void);
1390#define paravirt_nop ((void *)_paravirt_nop)
1391
1392void paravirt_use_bytelocks(void);
1393
1394#ifdef CONFIG_SMP
1395
1396static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
1397{
1398 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
1399}
1400
1401static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
1402{
1403 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
1404}
1405
1406static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
1407{
1408 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
1409}
1410
1411static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
1412 unsigned long flags)
1413{
1414 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
1415}
1416
1417static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
1418{
1419 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
1420}
1421
1422static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
1423{
1424 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
1425}
1426
1427#endif
1428
1429/* These all sit in the .parainstructions section to tell us what to patch. */
1430struct paravirt_patch_site {
1431 u8 *instr; /* original instructions */
1432 u8 instrtype; /* type of this instruction */
1433 u8 len; /* length of original instruction */
1434 u16 clobbers; /* what registers you may clobber */
1435};
1436
1437extern struct paravirt_patch_site __parainstructions[],
1438 __parainstructions_end[];
1439
1440#ifdef CONFIG_X86_32
1441#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
1442#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
1443#define PV_FLAGS_ARG "0"
1444#define PV_EXTRA_CLOBBERS
1445#define PV_VEXTRA_CLOBBERS
1446#else
1447/* We save some registers, but not all of them; that would be too much. We
1448 * clobber all caller-saved registers except the argument register. */
1449#define PV_SAVE_REGS "pushq %%rdi;"
1450#define PV_RESTORE_REGS "popq %%rdi;"
1451#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
1452#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
1453#define PV_FLAGS_ARG "D"
1454#endif
1455
1456static inline unsigned long __raw_local_save_flags(void)
1457{
1458 unsigned long f;
1459
1460 asm volatile(paravirt_alt(PV_SAVE_REGS
1461 PARAVIRT_CALL
1462 PV_RESTORE_REGS)
1463 : "=a"(f)
1464 : paravirt_type(pv_irq_ops.save_fl),
1465 paravirt_clobber(CLBR_EAX)
1466 : "memory", "cc" PV_VEXTRA_CLOBBERS);
1467 return f;
1468}
1469
1470static inline void raw_local_irq_restore(unsigned long f)
1471{
1472 asm volatile(paravirt_alt(PV_SAVE_REGS
1473 PARAVIRT_CALL
1474 PV_RESTORE_REGS)
1475 : "=a"(f)
1476 : PV_FLAGS_ARG(f),
1477 paravirt_type(pv_irq_ops.restore_fl),
1478 paravirt_clobber(CLBR_EAX)
1479 : "memory", "cc" PV_EXTRA_CLOBBERS);
1480}
1481
1482static inline void raw_local_irq_disable(void)
1483{
1484 asm volatile(paravirt_alt(PV_SAVE_REGS
1485 PARAVIRT_CALL
1486 PV_RESTORE_REGS)
1487 :
1488 : paravirt_type(pv_irq_ops.irq_disable),
1489 paravirt_clobber(CLBR_EAX)
1490 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1491}
1492
1493static inline void raw_local_irq_enable(void)
1494{
1495 asm volatile(paravirt_alt(PV_SAVE_REGS
1496 PARAVIRT_CALL
1497 PV_RESTORE_REGS)
1498 :
1499 : paravirt_type(pv_irq_ops.irq_enable),
1500 paravirt_clobber(CLBR_EAX)
1501 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1502}
1503
1504static inline unsigned long __raw_local_irq_save(void)
1505{
1506 unsigned long f;
1507
1508 f = __raw_local_save_flags();
1509 raw_local_irq_disable();
1510 return f;
1511}
1512
1513
1514/* Make sure as little as possible of this mess escapes. */
1515#undef PARAVIRT_CALL
1516#undef __PVOP_CALL
1517#undef __PVOP_VCALL
1518#undef PVOP_VCALL0
1519#undef PVOP_CALL0
1520#undef PVOP_VCALL1
1521#undef PVOP_CALL1
1522#undef PVOP_VCALL2
1523#undef PVOP_CALL2
1524#undef PVOP_VCALL3
1525#undef PVOP_CALL3
1526#undef PVOP_VCALL4
1527#undef PVOP_CALL4
1528
1529#else /* __ASSEMBLY__ */
1530
1531#define _PVSITE(ptype, clobbers, ops, word, algn) \
1532771:; \
1533 ops; \
1534772:; \
1535 .pushsection .parainstructions,"a"; \
1536 .align algn; \
1537 word 771b; \
1538 .byte ptype; \
1539 .byte 772b-771b; \
1540 .short clobbers; \
1541 .popsection
1542
1543
1544#ifdef CONFIG_X86_64
1545#define PV_SAVE_REGS \
1546 push %rax; \
1547 push %rcx; \
1548 push %rdx; \
1549 push %rsi; \
1550 push %rdi; \
1551 push %r8; \
1552 push %r9; \
1553 push %r10; \
1554 push %r11
1555#define PV_RESTORE_REGS \
1556 pop %r11; \
1557 pop %r10; \
1558 pop %r9; \
1559 pop %r8; \
1560 pop %rdi; \
1561 pop %rsi; \
1562 pop %rdx; \
1563 pop %rcx; \
1564 pop %rax
1565#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1566#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1567#define PARA_INDIRECT(addr) *addr(%rip)
1568#else
1569#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
1570#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
1571#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1572#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1573#define PARA_INDIRECT(addr) *%cs:addr
1574#endif
1575
1576#define INTERRUPT_RETURN \
1577 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
1578 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
1579
1580#define DISABLE_INTERRUPTS(clobbers) \
1581 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1582 PV_SAVE_REGS; \
1583 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
1584 PV_RESTORE_REGS;) \
1585
1586#define ENABLE_INTERRUPTS(clobbers) \
1587 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1588 PV_SAVE_REGS; \
1589 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
1590 PV_RESTORE_REGS;)
1591
1592#define USERGS_SYSRET32 \
1593 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
1594 CLBR_NONE, \
1595 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
1596
1597#ifdef CONFIG_X86_32
1598#define GET_CR0_INTO_EAX \
1599 push %ecx; push %edx; \
1600 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
1601 pop %edx; pop %ecx
1602
1603#define ENABLE_INTERRUPTS_SYSEXIT \
1604 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
1605 CLBR_NONE, \
1606 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
1607
1608
1609#else /* !CONFIG_X86_32 */
1610
1611/*
1612 * If swapgs is used while the userspace stack is still current,
1613 * there's no way to call a pvop. The PV replacement *must* be
1614 * inlined, or the swapgs instruction must be trapped and emulated.
1615 */
1616#define SWAPGS_UNSAFE_STACK \
1617 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1618 swapgs)
1619
1620#define SWAPGS \
1621 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1622 PV_SAVE_REGS; \
1623 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
1624 PV_RESTORE_REGS \
1625 )
1626
1627#define GET_CR2_INTO_RCX \
1628 call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \
1629 movq %rax, %rcx; \
1630 xorq %rax, %rax;
1631
1632#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
1633 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
1634 CLBR_NONE, \
1635 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
1636
1637#define USERGS_SYSRET64 \
1638 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
1639 CLBR_NONE, \
1640 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
1641
1642#define ENABLE_INTERRUPTS_SYSEXIT32 \
1643 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
1644 CLBR_NONE, \
1645 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
1646#endif /* CONFIG_X86_32 */
1647
1648#endif /* __ASSEMBLY__ */
1649#endif /* CONFIG_PARAVIRT */
1650#endif /* _ASM_X86_PARAVIRT_H */
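
For orientation, a 64-bit PARA_SITE user such as INTERRUPT_RETURN expands to roughly the following (a hand-written sketch, not assembler output; the operand values are resolved by the assembler):

	771:	jmp *pv_cpu_ops+PV_CPU_iret(%rip)	/* patchable indirect jump */
	772:
		.pushsection .parainstructions, "a"
		.align 8
		.quad 771b				/* address of the patch site */
		.byte PARA_PATCH(pv_cpu_ops, PV_CPU_iret)	/* which op this site uses */
		.byte 772b-771b				/* bytes available for patching */
		.short CLBR_NONE			/* registers the site may clobber */
		.popsection

apply_paravirt() later walks .parainstructions and may replace the jmp with inline code or a direct call, padding any remaining bytes with nops.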
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
new file mode 100644
index 00000000000..3c4ffeb467e
--- /dev/null
+++ b/arch/x86/include/asm/parport.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_PARPORT_H
2#define _ASM_X86_PARPORT_H
3
4static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma);
5static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma)
6{
7 return parport_pc_find_isa_ports(autoirq, autodma);
8}
9
10#endif /* _ASM_X86_PARPORT_H */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
new file mode 100644
index 00000000000..b8493b3b989
--- /dev/null
+++ b/arch/x86/include/asm/pat.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_PAT_H
2#define _ASM_X86_PAT_H
3
4#include <linux/types.h>
5
6#ifdef CONFIG_X86_PAT
7extern int pat_enabled;
8extern void validate_pat_support(struct cpuinfo_x86 *c);
9#else
10static const int pat_enabled;
11static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
12#endif
13
14extern void pat_init(void);
15
16extern int reserve_memtype(u64 start, u64 end,
17 unsigned long req_type, unsigned long *ret_type);
18extern int free_memtype(u64 start, u64 end);
19
20extern void pat_disable(char *reason);
21
22#endif /* _ASM_X86_PAT_H */
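
A hypothetical use of this interface, for a driver wanting write-combining on an MMIO window (the base and size are invented; _PAGE_CACHE_WC comes from pgtable.h, and reserve_memtype may hand back a weaker type through its last argument):

	u64 base = 0xd0000000ULL, size = 0x100000ULL;	/* invented MMIO window */
	unsigned long want = _PAGE_CACHE_WC, got;

	if (reserve_memtype(base, base + size, want, &got) == 0) {
		/* 'got' may be weaker than 'want', e.g. UC- when PAT is off */
		/* ... map and use the range with the 'got' attribute ... */
		free_memtype(base, base + size);
	}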
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
new file mode 100644
index 00000000000..b1e7a45d868
--- /dev/null
+++ b/arch/x86/include/asm/pci-direct.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_PCI_DIRECT_H
2#define _ASM_X86_PCI_DIRECT_H
3
4#include <linux/types.h>
5
6/* Direct PCI access. This is used for PCI accesses in early boot before
7 the PCI subsystem works. */
8
9extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
10extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
11extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
12extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
13extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
14extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
15
16extern int early_pci_allowed(void);
17
18extern unsigned int pci_early_dump_regs;
19extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
20extern void early_dump_pci_devices(void);
21#endif /* _ASM_X86_PCI_DIRECT_H */
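
A sketch of typical early-boot usage: config-space offset 0 holds the vendor ID in the low 16 bits (0xffff means no device responded), so a minimal probe of device 00:00.0 looks like this:

	if (early_pci_allowed()) {
		u32 id = read_pci_config(0, 0, 0, 0);	/* bus 0, slot 0, func 0 */

		if ((id & 0xffff) != 0xffff)		/* a device responded */
			early_dump_pci_device(0, 0, 0);
	}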
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
new file mode 100644
index 00000000000..875b38edf19
--- /dev/null
+++ b/arch/x86/include/asm/pci.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_PCI_H
2#define _ASM_X86_PCI_H
3
4#include <linux/mm.h> /* for struct page */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <asm/scatterlist.h>
9#include <asm/io.h>
10
11#ifdef __KERNEL__
12
13struct pci_sysdata {
14 int domain; /* PCI domain */
15 int node; /* NUMA node */
16#ifdef CONFIG_X86_64
17 void *iommu; /* IOMMU private data */
18#endif
19};
20
21extern int pci_routeirq;
22
23/* scan a bus after allocating a pci_sysdata for it */
24extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
25 int node);
26extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
27
28static inline int pci_domain_nr(struct pci_bus *bus)
29{
30 struct pci_sysdata *sd = bus->sysdata;
31 return sd->domain;
32}
33
34static inline int pci_proc_domain(struct pci_bus *bus)
35{
36 return pci_domain_nr(bus);
37}
38
39
40/* Can be used to override the logic in pci_scan_bus for skipping
41 already-configured bus numbers - to be used for buggy BIOSes
42 or architectures with incomplete PCI setup by the loader */
43
44#ifdef CONFIG_PCI
45extern unsigned int pcibios_assign_all_busses(void);
46#else
47#define pcibios_assign_all_busses() 0
48#endif
49#define pcibios_scan_all_fns(a, b) 0
50
51extern unsigned long pci_mem_start;
52#define PCIBIOS_MIN_IO 0x1000
53#define PCIBIOS_MIN_MEM (pci_mem_start)
54
55#define PCIBIOS_MIN_CARDBUS_IO 0x4000
56
57void pcibios_config_init(void);
58struct pci_bus *pcibios_scan_root(int bus);
59
60void pcibios_set_master(struct pci_dev *dev);
61void pcibios_penalize_isa_irq(int irq, int active);
62struct irq_routing_table *pcibios_get_irq_routing_table(void);
63int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
64
65
66#define HAVE_PCI_MMAP
67extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
68 enum pci_mmap_state mmap_state,
69 int write_combine);
70
71
72#ifdef CONFIG_PCI
73extern void early_quirks(void);
74static inline void pci_dma_burst_advice(struct pci_dev *pdev,
75 enum pci_dma_burst_strategy *strat,
76 unsigned long *strategy_parameter)
77{
78 *strat = PCI_DMA_BURST_INFINITY;
79 *strategy_parameter = ~0UL;
80}
81#else
82static inline void early_quirks(void) { }
83#endif
84
85#endif /* __KERNEL__ */
86
87#ifdef CONFIG_X86_32
88# include "pci_32.h"
89#else
90# include "pci_64.h"
91#endif
92
93/* implement the pci_ DMA API in terms of the generic device dma_ one */
94#include <asm-generic/pci-dma-compat.h>
95
96/* generic pci stuff */
97#include <asm-generic/pci.h>
98
99#ifdef CONFIG_NUMA
100/* Returns the node based on pci bus */
101static inline int __pcibus_to_node(struct pci_bus *bus)
102{
103 struct pci_sysdata *sd = bus->sysdata;
104
105 return sd->node;
106}
107
108static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
109{
110 return node_to_cpumask(__pcibus_to_node(bus));
111}
112#endif
113
114#endif /* _ASM_X86_PCI_H */
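
As a sketch of how these pieces fit together (assuming CONFIG_NUMA and the usual x86 pci_root_ops config accessors): an arch caller scans a root bus bound to a NUMA node, and generic code can later recover the node from the bus.

	struct pci_bus *bus = pci_scan_bus_on_node(0, &pci_root_ops, 1);

	if (bus)
		printk(KERN_INFO "bus %02x is on node %d\n",
		       bus->number, __pcibus_to_node(bus));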
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
new file mode 100644
index 00000000000..6f1213a6ef4
--- /dev/null
+++ b/arch/x86/include/asm/pci_32.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_PCI_32_H
2#define _ASM_X86_PCI_32_H
3
4
5#ifdef __KERNEL__
6
7
8/* Dynamic DMA mapping stuff.
9 * i386 has everything mapped statically.
10 */
11
12struct pci_dev;
13
14/* The PCI address space equals the physical memory
15 * address space. The networking and block device layers use
16 * this boolean for bounce-buffer decisions.
17 */
18#define PCI_DMA_BUS_IS_PHYS (1)
19
20/* pci_unmap_{page,single} is a nop so... */
21#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
22#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
23#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
24#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
26#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
27#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
28 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
29
30
31#endif /* __KERNEL__ */
32
33
34#endif /* _ASM_X86_PCI_32_H */
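
The zero-length arrays above let drivers declare their DMA-unmap bookkeeping unconditionally while it compiles to nothing on i386. A typical (invented) descriptor:

	struct ring_desc {
		struct sk_buff *skb;			/* from <linux/skbuff.h> */
		DECLARE_PCI_UNMAP_ADDR(mapping)		/* zero bytes on i386 */
		DECLARE_PCI_UNMAP_LEN(len)		/* likewise */
	};

On x86-64 with an IOMMU configured (see pci_64.h below), the same declarations expand to real fields.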
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 00000000000..5b28995d664
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,66 @@
1#ifndef _ASM_X86_PCI_64_H
2#define _ASM_X86_PCI_64_H
3
4#ifdef __KERNEL__
5
6#ifdef CONFIG_CALGARY_IOMMU
7static inline void *pci_iommu(struct pci_bus *bus)
8{
9 struct pci_sysdata *sd = bus->sysdata;
10 return sd->iommu;
11}
12
13static inline void set_pci_iommu(struct pci_bus *bus, void *val)
14{
15 struct pci_sysdata *sd = bus->sysdata;
16 sd->iommu = val;
17}
18#endif /* CONFIG_CALGARY_IOMMU */
19
20extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
21 int reg, int len, u32 *value);
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value);
24
25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27
28/* The PCI address space equals the physical memory
29 * address space. The networking and block device layers use
30 * this boolean for bounce-buffer decisions.
31 *
32 * On AMD64 it mostly holds, but we set it to zero if a hardware
33 * IOMMU (gart) or a software IOMMU (swiotlb) is available.
34 */
35#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
36
37#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
38
39#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
40 dma_addr_t ADDR_NAME;
41#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
42 __u32 LEN_NAME;
43#define pci_unmap_addr(PTR, ADDR_NAME) \
44 ((PTR)->ADDR_NAME)
45#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
46 (((PTR)->ADDR_NAME) = (VAL))
47#define pci_unmap_len(PTR, LEN_NAME) \
48 ((PTR)->LEN_NAME)
49#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
50 (((PTR)->LEN_NAME) = (VAL))
51
52#else
53/* No IOMMU */
54
55#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
56#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
57#define pci_unmap_addr(PTR, ADDR_NAME) (0)
58#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
59#define pci_unmap_len(PTR, LEN_NAME) (0)
60#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
61
62#endif
63
64#endif /* __KERNEL__ */
65
66#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
new file mode 100644
index 00000000000..2fbfff88df3
--- /dev/null
+++ b/arch/x86/include/asm/pda.h
@@ -0,0 +1,137 @@
1#ifndef _ASM_X86_PDA_H
2#define _ASM_X86_PDA_H
3
4#ifndef __ASSEMBLY__
5#include <linux/stddef.h>
6#include <linux/types.h>
7#include <linux/cache.h>
8#include <asm/page.h>
9
10/* Per-processor data structure. %gs points to it while the kernel runs */
11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */
23#endif
24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
27 unsigned int __softirq_pending;
28	unsigned int __nmi_count;	/* number of NMIs on this CPU */
29 short mmu_state;
30 short isidle;
31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs;
33 unsigned irq0_irqs;
34 unsigned irq_resched_count;
35 unsigned irq_call_count;
36 unsigned irq_tlb_count;
37 unsigned irq_thermal_count;
38 unsigned irq_threshold_count;
39 unsigned irq_spurious_count;
40} ____cacheline_aligned_in_smp;
41
42extern struct x8664_pda **_cpu_pda;
43extern void pda_init(int);
44
45#define cpu_pda(i) (_cpu_pda[i])
46
47/*
48 * There is no fast way to get the base address of the PDA; all the accesses
49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
50 */
51extern void __bad_pda_field(void) __attribute__((noreturn));
52
53/*
54 * proxy_pda doesn't actually exist, but tell gcc it is accessed for
55 * all PDA accesses so it gets read/write dependencies right.
56 */
57extern struct x8664_pda _proxy_pda;
58
59#define pda_offset(field) offsetof(struct x8664_pda, field)
60
61#define pda_to_op(op, field, val) \
62do { \
63 typedef typeof(_proxy_pda.field) T__; \
64 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
65 switch (sizeof(_proxy_pda.field)) { \
66 case 2: \
67 asm(op "w %1,%%gs:%c2" : \
68 "+m" (_proxy_pda.field) : \
69 "ri" ((T__)val), \
70 "i"(pda_offset(field))); \
71 break; \
72 case 4: \
73 asm(op "l %1,%%gs:%c2" : \
74 "+m" (_proxy_pda.field) : \
75 "ri" ((T__)val), \
76 "i" (pda_offset(field))); \
77 break; \
78 case 8: \
79 asm(op "q %1,%%gs:%c2": \
80 "+m" (_proxy_pda.field) : \
81 "ri" ((T__)val), \
82 "i"(pda_offset(field))); \
83 break; \
84 default: \
85 __bad_pda_field(); \
86 } \
87} while (0)
88
89#define pda_from_op(op, field) \
90({ \
91 typeof(_proxy_pda.field) ret__; \
92 switch (sizeof(_proxy_pda.field)) { \
93 case 2: \
94 asm(op "w %%gs:%c1,%0" : \
95 "=r" (ret__) : \
96 "i" (pda_offset(field)), \
97 "m" (_proxy_pda.field)); \
98 break; \
99 case 4: \
100 asm(op "l %%gs:%c1,%0": \
101 "=r" (ret__): \
102 "i" (pda_offset(field)), \
103 "m" (_proxy_pda.field)); \
104 break; \
105 case 8: \
106 asm(op "q %%gs:%c1,%0": \
107 "=r" (ret__) : \
108 "i" (pda_offset(field)), \
109 "m" (_proxy_pda.field)); \
110 break; \
111 default: \
112 __bad_pda_field(); \
113 } \
114 ret__; \
115})
116
117#define read_pda(field) pda_from_op("mov", field)
118#define write_pda(field, val) pda_to_op("mov", field, val)
119#define add_pda(field, val) pda_to_op("add", field, val)
120#define sub_pda(field, val) pda_to_op("sub", field, val)
121#define or_pda(field, val) pda_to_op("or", field, val)
122
123/* This is not atomic against other CPUs -- CPU preemption needs to be off */
124#define test_and_clear_bit_pda(bit, field) \
125({ \
126 int old__; \
127 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
128 : "=r" (old__), "+m" (_proxy_pda.field) \
129 : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
130 old__; \
131})
132
133#endif
134
135#define PDA_STACKOFFSET (5*8)
136
137#endif /* _ASM_X86_PDA_H */
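
In use, each accessor compiles down to a single %gs-relative instruction; a minimal sketch:

	static void pda_example(void)
	{
		if (read_pda(cpunumber) == 0)	/* one mov from %gs:36 */
			add_pda(irq0_irqs, 1);	/* one add; no lock prefix is
						   needed, %gs pins the CPU */
	}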
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
new file mode 100644
index 00000000000..ece72053ba6
--- /dev/null
+++ b/arch/x86/include/asm/percpu.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_PERCPU_H
2#define _ASM_X86_PERCPU_H
3
4#ifdef CONFIG_X86_64
5#include <linux/compiler.h>
6
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset
8 in the PDA. Longer term, the PDA and every per-cpu variable
9 should just be put into a single section and referenced directly
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
20#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. A single instruction would be atomic with respect to
30 * preemption and interrupts; matching that exactly would require
31 * disabling interrupts here. However, because these accessors can
32 * be used from within interrupt-disable/enable paths, we can't
33 * disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49	} while (0)
50
51#else /* CONFIG_X86_64 */
52
53#ifdef __ASSEMBLY__
54
55/*
56 * PER_CPU finds an address of a per-cpu variable.
57 *
58 * Args:
59 * var - variable name
60 * reg - 32bit register
61 *
62 * The resulting address is stored in the "reg" argument.
63 *
64 * Example:
65 * PER_CPU(cpu_gdt_descr, %ebx)
66 */
67#ifdef CONFIG_SMP
68#define PER_CPU(var, reg) \
69 movl %fs:per_cpu__##this_cpu_off, reg; \
70 lea per_cpu__##var(reg), reg
71#define PER_CPU_VAR(var) %fs:per_cpu__##var
72#else /* ! SMP */
73#define PER_CPU(var, reg) \
74 movl $per_cpu__##var, reg
75#define PER_CPU_VAR(var) per_cpu__##var
76#endif /* SMP */
77
78#else /* ...!ASSEMBLY */
79
80/*
81 * PER_CPU finds an address of a per-cpu variable.
82 *
83 * Args:
84 * var - variable name
85 * cpu - 32bit register containing the current CPU number
86 *
87 * The resulting address is stored in the "cpu" argument.
88 *
89 * Example:
90 * PER_CPU(cpu_gdt_descr, %ebx)
91 */
92#ifdef CONFIG_SMP
93
94#define __my_cpu_offset x86_read_percpu(this_cpu_off)
95
96/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
97#define __percpu_seg "%%fs:"
98
99#else /* !SMP */
100
101#define __percpu_seg ""
102
103#endif /* SMP */
104
105#include <asm-generic/percpu.h>
106
107/* We can use this directly for local CPU (faster). */
108DECLARE_PER_CPU(unsigned long, this_cpu_off);
109
110/* For arch-specific code, we can use direct single-insn ops (they
111 * don't give an lvalue though). */
112extern void __bad_percpu_size(void);
113
114#define percpu_to_op(op, var, val) \
115do { \
116 typedef typeof(var) T__; \
117 if (0) { \
118 T__ tmp__; \
119 tmp__ = (val); \
120 } \
121 switch (sizeof(var)) { \
122 case 1: \
123 asm(op "b %1,"__percpu_seg"%0" \
124 : "+m" (var) \
125 : "ri" ((T__)val)); \
126 break; \
127 case 2: \
128 asm(op "w %1,"__percpu_seg"%0" \
129 : "+m" (var) \
130 : "ri" ((T__)val)); \
131 break; \
132 case 4: \
133 asm(op "l %1,"__percpu_seg"%0" \
134 : "+m" (var) \
135 : "ri" ((T__)val)); \
136 break; \
137 default: __bad_percpu_size(); \
138 } \
139} while (0)
140
141#define percpu_from_op(op, var) \
142({ \
143 typeof(var) ret__; \
144 switch (sizeof(var)) { \
145 case 1: \
146 asm(op "b "__percpu_seg"%1,%0" \
147 : "=r" (ret__) \
148 : "m" (var)); \
149 break; \
150 case 2: \
151 asm(op "w "__percpu_seg"%1,%0" \
152 : "=r" (ret__) \
153 : "m" (var)); \
154 break; \
155 case 4: \
156 asm(op "l "__percpu_seg"%1,%0" \
157 : "=r" (ret__) \
158 : "m" (var)); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162 ret__; \
163})
164
165#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
166#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
167#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
168#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
169#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
170#endif /* !__ASSEMBLY__ */
171#endif /* !CONFIG_X86_64 */
172
173#ifdef CONFIG_SMP
174
175/*
176 * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
177 * variables that are initialized and accessed before there are per_cpu
178 * areas allocated.
179 */
180
181#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
182 DEFINE_PER_CPU(_type, _name) = _initvalue; \
183 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
184 { [0 ... NR_CPUS-1] = _initvalue }; \
185 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
186
187#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
188 EXPORT_PER_CPU_SYMBOL(_name)
189
190#define DECLARE_EARLY_PER_CPU(_type, _name) \
191 DECLARE_PER_CPU(_type, _name); \
192 extern __typeof__(_type) *_name##_early_ptr; \
193 extern __typeof__(_type) _name##_early_map[]
194
195#define early_per_cpu_ptr(_name) (_name##_early_ptr)
196#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
197#define early_per_cpu(_name, _cpu) \
198 (early_per_cpu_ptr(_name) ? \
199 early_per_cpu_ptr(_name)[_cpu] : \
200 per_cpu(_name, _cpu))
201
202#else /* !CONFIG_SMP */
203#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
204 DEFINE_PER_CPU(_type, _name) = _initvalue
205
206#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
207 EXPORT_PER_CPU_SYMBOL(_name)
208
209#define DECLARE_EARLY_PER_CPU(_type, _name) \
210 DECLARE_PER_CPU(_type, _name)
211
212#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
213#define early_per_cpu_ptr(_name) NULL
214/* no early_per_cpu_map() */
215
216#endif /* !CONFIG_SMP */
217
218#endif /* _ASM_X86_PERCPU_H */
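
A sketch of the early-per-cpu pattern, modeled on x86_cpu_to_apicid (BAD_APICID is its conventional initializer; the helper function below is invented): reads are served from the __initdata map until setup_per_cpu_areas() clears the _early_ptr, after which they hit the real per-cpu area.

	DEFINE_EARLY_PER_CPU(u16, x86_cpu_to_apicid, BAD_APICID);

	static u16 apicid_of(int cpu)
	{
		/* works both before and after the per-cpu areas exist */
		return early_per_cpu(x86_cpu_to_apicid, cpu);
	}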
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
new file mode 100644
index 00000000000..cb7c151a8bf
--- /dev/null
+++ b/arch/x86/include/asm/pgalloc.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_PGALLOC_H
2#define _ASM_X86_PGALLOC_H
3
4#include <linux/threads.h>
5#include <linux/mm.h> /* for struct page */
6#include <linux/pagemap.h>
7
8static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
14static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
15static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
16static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
17static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
18 unsigned long start, unsigned long count) {}
19static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
20static inline void paravirt_release_pte(unsigned long pfn) {}
21static inline void paravirt_release_pmd(unsigned long pfn) {}
22static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif
24
25/*
26 * Allocate and free page tables.
27 */
28extern pgd_t *pgd_alloc(struct mm_struct *);
29extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
30
31extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
32extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
33
34/* Should really implement gc for free page table pages. This could be
35 done with a reference count in struct page. */
36
37static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
38{
39 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
40 free_page((unsigned long)pte);
41}
42
43static inline void pte_free(struct mm_struct *mm, struct page *pte)
44{
45 __free_page(pte);
46}
47
48extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
49
50static inline void pmd_populate_kernel(struct mm_struct *mm,
51 pmd_t *pmd, pte_t *pte)
52{
53 paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
54 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
55}
56
57static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
58 struct page *pte)
59{
60 unsigned long pfn = page_to_pfn(pte);
61
62 paravirt_alloc_pte(mm, pfn);
63 set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
64}
65
66#define pmd_pgtable(pmd) pmd_page(pmd)
67
68#if PAGETABLE_LEVELS > 2
69static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
70{
71 return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
72}
73
74static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
75{
76 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
77 free_page((unsigned long)pmd);
78}
79
80extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
81
82#ifdef CONFIG_X86_PAE
83extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
84#else /* !CONFIG_X86_PAE */
85static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
86{
87 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
88 set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
89}
90#endif /* CONFIG_X86_PAE */
91
92#if PAGETABLE_LEVELS > 3
93static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
94{
95 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
96 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
97}
98
99static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
100{
101 return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
102}
103
104static inline void pud_free(struct mm_struct *mm, pud_t *pud)
105{
106 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
107 free_page((unsigned long)pud);
108}
109
110extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
111#endif /* PAGETABLE_LEVELS > 3 */
112#endif /* PAGETABLE_LEVELS > 2 */
113
114#endif /* _ASM_X86_PGALLOC_H */
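
A simplified sketch of how the fault path glues these together, loosely modeled on __pte_alloc() in mm/memory.c (locking shown, error paths trimmed):

	static int alloc_and_install_pte(struct mm_struct *mm, pmd_t *pmd,
					 unsigned long address)
	{
		pgtable_t new = pte_alloc_one(mm, address);

		if (!new)
			return -ENOMEM;

		spin_lock(&mm->page_table_lock);
		if (pmd_present(*pmd))			/* lost the race */
			pte_free(mm, new);
		else
			pmd_populate(mm, pmd, new);	/* paravirt hook + set_pmd */
		spin_unlock(&mm->page_table_lock);
		return 0;
	}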
diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level-defs.h
new file mode 100644
index 00000000000..d77db8990ea
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level-defs.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
3
4#define SHARED_KERNEL_PMD 0
5
6/*
7 * traditional i386 two-level paging structure:
8 */
9
10#define PGDIR_SHIFT 22
11#define PTRS_PER_PGD 1024
12
13/*
14 * the i386 is two-level, so we don't really have any
15 * PMD directory physically.
16 */
17
18#define PTRS_PER_PTE 1024
19
20#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
new file mode 100644
index 00000000000..b17edfd2362
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -0,0 +1,79 @@
1#ifndef _ASM_X86_PGTABLE_2LEVEL_H
2#define _ASM_X86_PGTABLE_2LEVEL_H
3
4#define pte_ERROR(e) \
5 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
6#define pgd_ERROR(e) \
7 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
8
9/*
10 * Certain architectures need to do special things when PTEs
11 * within a page table are directly modified. Thus, the following
12 * hook is made available.
13 */
14static inline void native_set_pte(pte_t *ptep, pte_t pte)
15{
16 *ptep = pte;
17}
18
19static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
20{
21 *pmdp = pmd;
22}
23
24static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
25{
26 native_set_pte(ptep, pte);
27}
28
29static inline void native_set_pte_present(struct mm_struct *mm,
30 unsigned long addr,
31 pte_t *ptep, pte_t pte)
32{
33 native_set_pte(ptep, pte);
34}
35
36static inline void native_pmd_clear(pmd_t *pmdp)
37{
38 native_set_pmd(pmdp, __pmd(0));
39}
40
41static inline void native_pte_clear(struct mm_struct *mm,
42 unsigned long addr, pte_t *xp)
43{
44 *xp = native_make_pte(0);
45}
46
47#ifdef CONFIG_SMP
48static inline pte_t native_ptep_get_and_clear(pte_t *xp)
49{
50 return __pte(xchg(&xp->pte_low, 0));
51}
52#else
53#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
54#endif
55
56#define pte_none(x) (!(x).pte_low)
57
58/*
59 * Bits 0, 6 and 7 are taken; split the 29 bits of offset
60 * into this range:
61 */
62#define PTE_FILE_MAX_BITS 29
63
64#define pte_to_pgoff(pte) \
65 ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5))
66
67#define pgoff_to_pte(off) \
68 ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \
69 (((off) >> 5) << 8) + _PAGE_FILE })
70
71/* Encode and de-code a swap entry */
72#define __swp_type(x) (((x).val >> 1) & 0x1f)
73#define __swp_offset(x) ((x).val >> 8)
74#define __swp_entry(type, offset) \
75 ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
76#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
77#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
78
79#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
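
To make the encoding concrete, a round-trip check (illustrative only; BUG_ON is from <linux/kernel.h>): bits 1-5 of pte_low carry the low five offset bits, bits 8 and up the remaining 24, leaving bits 0, 6 and 7 to the _PAGE_PRESENT/_PAGE_FILE/_PAGE_PSE flags.

	static void __init check_file_pte_encoding(void)
	{
		pgoff_t off = 0x12345678UL & ((1UL << PTE_FILE_MAX_BITS) - 1);
		pte_t pte = pgoff_to_pte(off);

		BUG_ON(!(pte.pte_low & _PAGE_FILE));	/* marked as a file pte */
		BUG_ON(pte.pte_low & _PAGE_PRESENT);	/* and as not present */
		BUG_ON(pte_to_pgoff(pte) != off);	/* the offset survives */
	}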
diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level-defs.h
new file mode 100644
index 00000000000..62561367653
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level-defs.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
3
4#ifdef CONFIG_PARAVIRT
5#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
6#else
7#define SHARED_KERNEL_PMD 1
8#endif
9
10/*
11 * PGDIR_SHIFT determines what a top-level page table entry can map
12 */
13#define PGDIR_SHIFT 30
14#define PTRS_PER_PGD 4
15
16/*
17 * PMD_SHIFT determines the size of the area a middle-level
18 * page table can map
19 */
20#define PMD_SHIFT 21
21#define PTRS_PER_PMD 512
22
23/*
24 * entries per page directory level
25 */
26#define PTRS_PER_PTE 512
27
28#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
new file mode 100644
index 00000000000..fb16cec702e
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -0,0 +1,175 @@
1#ifndef _ASM_X86_PGTABLE_3LEVEL_H
2#define _ASM_X86_PGTABLE_3LEVEL_H
3
4/*
5 * Intel Physical Address Extension (PAE) Mode - three-level page
6 * tables on PPro+ CPUs.
7 *
8 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
9 */
10
11#define pte_ERROR(e) \
12 printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
14#define pmd_ERROR(e) \
15 printk("%s:%d: bad pmd %p(%016Lx).\n", \
16 __FILE__, __LINE__, &(e), pmd_val(e))
17#define pgd_ERROR(e) \
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e))
20
21static inline int pud_none(pud_t pud)
22{
23 return pud_val(pud) == 0;
24}
25
26static inline int pud_bad(pud_t pud)
27{
28 return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
29}
30
31static inline int pud_present(pud_t pud)
32{
33 return pud_val(pud) & _PAGE_PRESENT;
34}
35
36/* Rules for using set_pte: the pte being assigned *must* be
37 * either not present or in a state where the hardware will
38 * not attempt to update the pte. In places where this is
39 * not possible, use ptep_get_and_clear to obtain the old pte
40 * value and then use set_pte to update it. -ben
41 */
42static inline void native_set_pte(pte_t *ptep, pte_t pte)
43{
44 ptep->pte_high = pte.pte_high;
45 smp_wmb();
46 ptep->pte_low = pte.pte_low;
47}
48
49/*
50 * Since this is only called on user PTEs, and the page fault handler
51 * must handle the already racy situation of simultaneous page faults,
52 * we are justified in merely clearing the PTE present bit, followed
53 * by a set. The ordering here is important.
54 */
55static inline void native_set_pte_present(struct mm_struct *mm,
56 unsigned long addr,
57 pte_t *ptep, pte_t pte)
58{
59 ptep->pte_low = 0;
60 smp_wmb();
61 ptep->pte_high = pte.pte_high;
62 smp_wmb();
63 ptep->pte_low = pte.pte_low;
64}
65
66static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
67{
68 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
69}
70
71static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
72{
73 set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
74}
75
76static inline void native_set_pud(pud_t *pudp, pud_t pud)
77{
78 set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
79}
80
81/*
82 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
83 * entry, so clear the bottom half first and enforce ordering with a compiler
84 * barrier.
85 */
86static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
87 pte_t *ptep)
88{
89 ptep->pte_low = 0;
90 smp_wmb();
91 ptep->pte_high = 0;
92}
93
94static inline void native_pmd_clear(pmd_t *pmd)
95{
96 u32 *tmp = (u32 *)pmd;
97 *tmp = 0;
98 smp_wmb();
99 *(tmp + 1) = 0;
100}
101
102static inline void pud_clear(pud_t *pudp)
103{
104 unsigned long pgd;
105
106 set_pud(pudp, __pud(0));
107
108 /*
109 * According to Intel App note "TLBs, Paging-Structure Caches,
110 * and Their Invalidation", April 2007, document 317080-001,
111 * section 8.1: in PAE mode we explicitly have to flush the
112 * TLB via cr3 if the top-level pgd is changed...
113 *
114 * Make sure the pud entry we're updating is within the
115 * current pgd to avoid unnecessary TLB flushes.
116 */
117 pgd = read_cr3();
118 if (__pa(pudp) >= pgd && __pa(pudp) <
119 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
120 write_cr3(pgd);
121}
122
123#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
124
125#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
126
127
128/* Find an entry in the second-level page table.. */
129#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
130 pmd_index(address))
131
132#ifdef CONFIG_SMP
133static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
134{
135 pte_t res;
136
137 /* xchg acts as a barrier before the setting of the high bits */
138 res.pte_low = xchg(&ptep->pte_low, 0);
139 res.pte_high = ptep->pte_high;
140 ptep->pte_high = 0;
141
142 return res;
143}
144#else
145#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
146#endif
147
148#define __HAVE_ARCH_PTE_SAME
149static inline int pte_same(pte_t a, pte_t b)
150{
151 return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
152}
153
154static inline int pte_none(pte_t pte)
155{
156 return !pte.pte_low && !pte.pte_high;
157}
158
159/*
160 * Bits 0, 6 and 7 are taken in the low part of the pte;
161 * put the 32 bits of offset into the high part.
162 */
163#define pte_to_pgoff(pte) ((pte).pte_high)
164#define pgoff_to_pte(off) \
165 ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
166#define PTE_FILE_MAX_BITS 32
167
168/* Encode and de-code a swap entry */
169#define __swp_type(x) (((x).val) & 0x1f)
170#define __swp_offset(x) ((x).val >> 5)
171#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
172#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
173#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
174
175#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
new file mode 100644
index 00000000000..c012f3b1167
--- /dev/null
+++ b/arch/x86/include/asm/pgtable.h
@@ -0,0 +1,562 @@
1#ifndef _ASM_X86_PGTABLE_H
2#define _ASM_X86_PGTABLE_H
3
4#define FIRST_USER_ADDRESS 0
5
6#define _PAGE_BIT_PRESENT 0 /* is present */
7#define _PAGE_BIT_RW 1 /* writeable */
8#define _PAGE_BIT_USER 2 /* userspace addressable */
9#define _PAGE_BIT_PWT 3 /* page write through */
10#define _PAGE_BIT_PCD 4 /* page cache disabled */
11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
13#define _PAGE_BIT_FILE 6
14#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
15#define _PAGE_BIT_PAT 7 /* on 4KB pages */
16#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
17#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
18#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
19#define _PAGE_BIT_UNUSED3 11
20#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
21#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
22#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
23#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24
25#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
26#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
27#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
28#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
29#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
30#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
31#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
32#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
33#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
34#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
35#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
36#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
37#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
38#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
39#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
40#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
41#define __HAVE_ARCH_PTE_SPECIAL
42
43#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
44#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
45#else
46#define _PAGE_NX (_AT(pteval_t, 0))
47#endif
48
49/* If _PAGE_PRESENT is clear, we use these: */
50#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping,
51 * saved PTE; unset:swap */
52#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
53 pte_present gives true */
54
55#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
56 _PAGE_ACCESSED | _PAGE_DIRTY)
57#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
58 _PAGE_DIRTY)
59
60/* Set of bits not changed in pte_modify */
61#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
62 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
63
64#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
65#define _PAGE_CACHE_WB (0)
66#define _PAGE_CACHE_WC (_PAGE_PWT)
67#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
68#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
69
70#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
71#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
72 _PAGE_ACCESSED | _PAGE_NX)
73
74#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
75 _PAGE_USER | _PAGE_ACCESSED)
76#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
77 _PAGE_ACCESSED | _PAGE_NX)
78#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
79 _PAGE_ACCESSED)
80#define PAGE_COPY PAGE_COPY_NOEXEC
81#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
82 _PAGE_ACCESSED | _PAGE_NX)
83#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
84 _PAGE_ACCESSED)
85
86#define __PAGE_KERNEL_EXEC \
87 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
88#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
89
90#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
91#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
92#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
93#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
94#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
95#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
96#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
97#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
98#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
99#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
100#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
101
102#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
103#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
104#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
105#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
106
107#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
108#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
109#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
110#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
111#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
112#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
113#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
114#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
115#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
116#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
117#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
118#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
119#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
120
121#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
122#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
123#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
124#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
125
126/* xwr */
127#define __P000 PAGE_NONE
128#define __P001 PAGE_READONLY
129#define __P010 PAGE_COPY
130#define __P011 PAGE_COPY
131#define __P100 PAGE_READONLY_EXEC
132#define __P101 PAGE_READONLY_EXEC
133#define __P110 PAGE_COPY_EXEC
134#define __P111 PAGE_COPY_EXEC
135
136#define __S000 PAGE_NONE
137#define __S001 PAGE_READONLY
138#define __S010 PAGE_SHARED
139#define __S011 PAGE_SHARED
140#define __S100 PAGE_READONLY_EXEC
141#define __S101 PAGE_READONLY_EXEC
142#define __S110 PAGE_SHARED_EXEC
143#define __S111 PAGE_SHARED_EXEC
144
145/*
146 * early identity mapping pte attrib macros.
147 */
148#ifdef CONFIG_X86_64
149#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
150#else
151/*
152 * For PDE_IDENT_ATTR, include the USER bit. As the PDE and PTE protection
153 * bits are combined, this allows the user to access the high-address-mapped
154 * VDSO in the presence of CONFIG_COMPAT_VDSO.
155 */
156#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
157#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
158#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
159#endif
160
161#ifndef __ASSEMBLY__
162
163/*
164 * ZERO_PAGE is a global shared page that is always zero: used
165 * for zero-mapped memory areas etc..
166 */
167extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
168#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
169
170extern spinlock_t pgd_lock;
171extern struct list_head pgd_list;
172
173/*
174 * The following only work if pte_present() is true.
175 * Undefined behaviour if not.
176 */
177static inline int pte_dirty(pte_t pte)
178{
179 return pte_flags(pte) & _PAGE_DIRTY;
180}
181
182static inline int pte_young(pte_t pte)
183{
184 return pte_flags(pte) & _PAGE_ACCESSED;
185}
186
187static inline int pte_write(pte_t pte)
188{
189 return pte_flags(pte) & _PAGE_RW;
190}
191
192static inline int pte_file(pte_t pte)
193{
194 return pte_flags(pte) & _PAGE_FILE;
195}
196
197static inline int pte_huge(pte_t pte)
198{
199 return pte_flags(pte) & _PAGE_PSE;
200}
201
202static inline int pte_global(pte_t pte)
203{
204 return pte_flags(pte) & _PAGE_GLOBAL;
205}
206
207static inline int pte_exec(pte_t pte)
208{
209 return !(pte_flags(pte) & _PAGE_NX);
210}
211
212static inline int pte_special(pte_t pte)
213{
214 return pte_flags(pte) & _PAGE_SPECIAL;
215}
216
217static inline unsigned long pte_pfn(pte_t pte)
218{
219 return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
220}
221
222#define pte_page(pte) pfn_to_page(pte_pfn(pte))
223
224static inline int pmd_large(pmd_t pte)
225{
226 return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
227 (_PAGE_PSE | _PAGE_PRESENT);
228}
229
230static inline pte_t pte_mkclean(pte_t pte)
231{
232 return __pte(pte_val(pte) & ~_PAGE_DIRTY);
233}
234
235static inline pte_t pte_mkold(pte_t pte)
236{
237 return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
238}
239
240static inline pte_t pte_wrprotect(pte_t pte)
241{
242 return __pte(pte_val(pte) & ~_PAGE_RW);
243}
244
245static inline pte_t pte_mkexec(pte_t pte)
246{
247 return __pte(pte_val(pte) & ~_PAGE_NX);
248}
249
250static inline pte_t pte_mkdirty(pte_t pte)
251{
252 return __pte(pte_val(pte) | _PAGE_DIRTY);
253}
254
255static inline pte_t pte_mkyoung(pte_t pte)
256{
257 return __pte(pte_val(pte) | _PAGE_ACCESSED);
258}
259
260static inline pte_t pte_mkwrite(pte_t pte)
261{
262 return __pte(pte_val(pte) | _PAGE_RW);
263}
264
265static inline pte_t pte_mkhuge(pte_t pte)
266{
267 return __pte(pte_val(pte) | _PAGE_PSE);
268}
269
270static inline pte_t pte_clrhuge(pte_t pte)
271{
272 return __pte(pte_val(pte) & ~_PAGE_PSE);
273}
274
275static inline pte_t pte_mkglobal(pte_t pte)
276{
277 return __pte(pte_val(pte) | _PAGE_GLOBAL);
278}
279
280static inline pte_t pte_clrglobal(pte_t pte)
281{
282 return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
283}
284
285static inline pte_t pte_mkspecial(pte_t pte)
286{
287 return __pte(pte_val(pte) | _PAGE_SPECIAL);
288}
289
290extern pteval_t __supported_pte_mask;
291
292static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
293{
294 return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
295 pgprot_val(pgprot)) & __supported_pte_mask);
296}
297
298static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
299{
300 return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
301 pgprot_val(pgprot)) & __supported_pte_mask);
302}
303
304static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
305{
306 pteval_t val = pte_val(pte);
307
308 /*
309 * Chop off the NX bit (if present), and add the NX portion of
310 * the newprot (if present):
311 */
312 val &= _PAGE_CHG_MASK;
313 val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
314
315 return __pte(val);
316}
317
318/* mprotect needs to preserve PAT bits when updating vm_page_prot */
319#define pgprot_modify pgprot_modify
320static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
321{
322 pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
323 pgprotval_t addbits = pgprot_val(newprot);
324 return __pgprot(preservebits | addbits);
325}
326
327#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
328
329#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
330
331#ifndef __ASSEMBLY__
332#define __HAVE_PHYS_MEM_ACCESS_PROT
333struct file;
334pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
335 unsigned long size, pgprot_t vma_prot);
336int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
337 unsigned long size, pgprot_t *vma_prot);
338#endif
339
340/* Install a pte for a particular vaddr in kernel space. */
341void set_pte_vaddr(unsigned long vaddr, pte_t pte);
342
343#ifdef CONFIG_X86_32
344extern void native_pagetable_setup_start(pgd_t *base);
345extern void native_pagetable_setup_done(pgd_t *base);
346#else
347static inline void native_pagetable_setup_start(pgd_t *base) {}
348static inline void native_pagetable_setup_done(pgd_t *base) {}
349#endif
350
351struct seq_file;
352extern void arch_report_meminfo(struct seq_file *m);
353
354#ifdef CONFIG_PARAVIRT
355#include <asm/paravirt.h>
356#else /* !CONFIG_PARAVIRT */
357#define set_pte(ptep, pte) native_set_pte(ptep, pte)
358#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
359
360#define set_pte_present(mm, addr, ptep, pte) \
361 native_set_pte_present(mm, addr, ptep, pte)
362#define set_pte_atomic(ptep, pte) \
363 native_set_pte_atomic(ptep, pte)
364
365#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
366
367#ifndef __PAGETABLE_PUD_FOLDED
368#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
369#define pgd_clear(pgd) native_pgd_clear(pgd)
370#endif
371
372#ifndef set_pud
373# define set_pud(pudp, pud) native_set_pud(pudp, pud)
374#endif
375
376#ifndef __PAGETABLE_PMD_FOLDED
377#define pud_clear(pud) native_pud_clear(pud)
378#endif
379
380#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
381#define pmd_clear(pmd) native_pmd_clear(pmd)
382
383#define pte_update(mm, addr, ptep) do { } while (0)
384#define pte_update_defer(mm, addr, ptep) do { } while (0)
385
386static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
387{
388 native_pagetable_setup_start(base);
389}
390
391static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
392{
393 native_pagetable_setup_done(base);
394}
395#endif /* CONFIG_PARAVIRT */
396
397#endif /* __ASSEMBLY__ */
398
399#ifdef CONFIG_X86_32
400# include "pgtable_32.h"
401#else
402# include "pgtable_64.h"
403#endif
404
405/*
406 * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
407 *
408 * this macro returns the index of the entry in the pgd page which would
409 * control the given virtual address
410 */
411#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
412
413/*
414 * pgd_offset() returns a (pgd_t *)
415 * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
416 */
417#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
418/*
419 * a shortcut which implies the use of the kernel's pgd, instead
420 * of a process's
421 */
422#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
423
424
425#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
426#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
427
428#ifndef __ASSEMBLY__
429
430enum {
431 PG_LEVEL_NONE,
432 PG_LEVEL_4K,
433 PG_LEVEL_2M,
434 PG_LEVEL_1G,
435 PG_LEVEL_NUM
436};
437
438#ifdef CONFIG_PROC_FS
439extern void update_page_count(int level, unsigned long pages);
440#else
441static inline void update_page_count(int level, unsigned long pages) { }
442#endif
443
444/*
445 * Helper function that returns the kernel pagetable entry controlling
446 * the virtual address 'address'. NULL means no pagetable entry present.
447 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
448 * as a pte too.
449 */
450extern pte_t *lookup_address(unsigned long address, unsigned int *level);
451
452/* local pte updates need not use xchg for locking */
453static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
454{
455 pte_t res = *ptep;
456
457 /* Pure native function needs no input for mm, addr */
458 native_pte_clear(NULL, 0, ptep);
459 return res;
460}
461
462static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
463				     pte_t *ptep, pte_t pte)
464{
465 native_set_pte(ptep, pte);
466}
467
468#ifndef CONFIG_PARAVIRT
469/*
470 * Rules for using pte_update - it must be called after any PTE update which
471 * has not been done using the set_pte / clear_pte interfaces. It is used by
472 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
473 * updates should either be sets, clears, or set_pte_atomic for P->P
474 * transitions, which means this hook should only be called for user PTEs.
475 * This hook implies a P->P protection or access change has taken place, which
476 * requires a subsequent TLB flush. The notification can optionally be delayed
477 * until the TLB flush event by using the pte_update_defer form of the
478 * interface, but care must be taken to assure that the flush happens while
479 * still holding the same page table lock so that the shadow and primary pages
480 * do not become out of sync on SMP.
481 */
482#define pte_update(mm, addr, ptep) do { } while (0)
483#define pte_update_defer(mm, addr, ptep) do { } while (0)
484#endif
485
486/*
487 * We only update the dirty/accessed state if we set
488 * the dirty bit by hand in the kernel, since the hardware
489 * will do the accessed bit for us, and we don't want to
490 * race with other CPU's that might be updating the dirty
491 * bit at the same time.
492 */
493struct vm_area_struct;
494
495#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
496extern int ptep_set_access_flags(struct vm_area_struct *vma,
497 unsigned long address, pte_t *ptep,
498 pte_t entry, int dirty);
499
500#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
501extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
502 unsigned long addr, pte_t *ptep);
503
504#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
505extern int ptep_clear_flush_young(struct vm_area_struct *vma,
506 unsigned long address, pte_t *ptep);
507
508#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
509static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
510 pte_t *ptep)
511{
512 pte_t pte = native_ptep_get_and_clear(ptep);
513 pte_update(mm, addr, ptep);
514 return pte;
515}
516
517#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
518static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
519 unsigned long addr, pte_t *ptep,
520 int full)
521{
522 pte_t pte;
523 if (full) {
524 /*
525 * Full address destruction in progress; paravirt does not
526 * care about updates and native needs no locking
527 */
528 pte = native_local_ptep_get_and_clear(ptep);
529 } else {
530 pte = ptep_get_and_clear(mm, addr, ptep);
531 }
532 return pte;
533}
534
535#define __HAVE_ARCH_PTEP_SET_WRPROTECT
536static inline void ptep_set_wrprotect(struct mm_struct *mm,
537 unsigned long addr, pte_t *ptep)
538{
539 clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
540 pte_update(mm, addr, ptep);
541}
542
543/*
544 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
545 *
546 * dst - pointer to a pgd range anywhere on a pgd page
547 * src - ""
548 * count - the number of pgds to copy.
549 *
550 * dst and src can be on the same page, but the range must not overlap,
551 * and must not cross a page boundary.
552 */
553static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
554{
555 memcpy(dst, src, count * sizeof(pgd_t));
556}
557
558
559#include <asm-generic/pgtable.h>
560#endif /* __ASSEMBLY__ */
561
562#endif /* _ASM_X86_PGTABLE_H */
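
As an example of the lookup_address() helper declared above, a debug-style dump of a kernel mapping (a sketch; the function name is invented):

	static void show_kernel_mapping(unsigned long addr)
	{
		unsigned int level;
		pte_t *pte = lookup_address(addr, &level);

		if (pte && pte_present(*pte))
			printk(KERN_DEBUG "%lx: level %u, pfn %lx, %s\n",
			       addr, level, pte_pfn(*pte),
			       pte_write(*pte) ? "rw" : "ro");
	}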
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
new file mode 100644
index 00000000000..f9d5889b336
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -0,0 +1,191 @@
1#ifndef _ASM_X86_PGTABLE_32_H
2#define _ASM_X86_PGTABLE_32_H
3
4
5/*
6 * The Linux memory management assumes a three-level page table setup. On
7 * the i386, we use that, but "fold" the mid level into the top-level page
8 * table, so that we physically have the same two-level page table as the
9 * i386 mmu expects.
10 *
11 * This file contains the functions and defines necessary to modify and use
12 * the i386 page table tree.
13 */
14#ifndef __ASSEMBLY__
15#include <asm/processor.h>
16#include <asm/fixmap.h>
17#include <linux/threads.h>
18#include <asm/paravirt.h>
19
20#include <linux/bitops.h>
21#include <linux/slab.h>
22#include <linux/list.h>
23#include <linux/spinlock.h>
24
25struct mm_struct;
26struct vm_area_struct;
27
28extern pgd_t swapper_pg_dir[1024];
29
30static inline void pgtable_cache_init(void) { }
31static inline void check_pgt_cache(void) { }
32void paging_init(void);
33
34extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
35
36/*
37 * The Linux x86 paging architecture is 'compile-time dual-mode', it
38 * implements both the traditional 2-level x86 page tables and the
39 * newer 3-level PAE-mode page tables.
40 */
41#ifdef CONFIG_X86_PAE
42# include <asm/pgtable-3level-defs.h>
43# define PMD_SIZE (1UL << PMD_SHIFT)
44# define PMD_MASK (~(PMD_SIZE - 1))
45#else
46# include <asm/pgtable-2level-defs.h>
47#endif
48
49#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
50#define PGDIR_MASK (~(PGDIR_SIZE - 1))
51
52/* Just any arbitrary offset to the start of the vmalloc VM area: the
53 * current 8MB value just means that there will be an 8MB "hole" after the
54 * physical memory until the kernel virtual memory starts. That means that
55 * any out-of-bounds memory accesses will hopefully be caught.
56 * The vmalloc() routines leave a hole of 4kB between each vmalloced
57 * area for the same reason. ;)
58 */
59#define VMALLOC_OFFSET (8 * 1024 * 1024)
60#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
61#ifdef CONFIG_X86_PAE
62#define LAST_PKMAP 512
63#else
64#define LAST_PKMAP 1024
65#endif
66
67#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
68 & PMD_MASK)
69
70#ifdef CONFIG_HIGHMEM
71# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
72#else
73# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
74#endif
75
76#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
77
78/*
79 * Define this if things work differently on an i386 and an i486:
80 * it will (on an i486) warn about kernel memory accesses that are
81 * done without an 'access_ok(VERIFY_WRITE,..)'
82 */
83#undef TEST_ACCESS_OK
84
85/* The boot page tables (all created as a single array) */
86extern unsigned long pg0[];
87
88#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
89
90/* To avoid harmful races, pmd_none(x) should check only the lower half under PAE */
91#define pmd_none(x) (!(unsigned long)pmd_val((x)))
92#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
93#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
94
95#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
96
97#ifdef CONFIG_X86_PAE
98# include <asm/pgtable-3level.h>
99#else
100# include <asm/pgtable-2level.h>
101#endif
102
103/*
104 * Macro to mark a page protection value as "uncacheable".
105 * On processors which do not support it, this is a no-op.
106 */
107#define pgprot_noncached(prot) \
108 ((boot_cpu_data.x86 > 3) \
109 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
110 : (prot))
111
112/*
113 * Conversion functions: convert a page and protection to a page entry,
114 * and a page entry and page directory to the page they refer to.
115 */
116#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
117
118
119static inline int pud_large(pud_t pud) { return 0; }
120
121/*
122 * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
123 *
124 * this macro returns the index of the entry in the pmd page which would
125 * control the given virtual address
126 */
127#define pmd_index(address) \
128 (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
129
130/*
131 * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
132 *
133 * this macro returns the index of the entry in the pte page which would
134 * control the given virtual address
135 */
136#define pte_index(address) \
137 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
138#define pte_offset_kernel(dir, address) \
139 ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
140
141#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
142
143#define pmd_page_vaddr(pmd) \
144 ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
145
146#if defined(CONFIG_HIGHPTE)
147#define pte_offset_map(dir, address) \
148 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
149 pte_index((address)))
150#define pte_offset_map_nested(dir, address) \
151 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
152 pte_index((address)))
153#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
154#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
155#else
156#define pte_offset_map(dir, address) \
157 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
158#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
159#define pte_unmap(pte) do { } while (0)
160#define pte_unmap_nested(pte) do { } while (0)
161#endif
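/*
 * Illustrative sketch (not part of the original header): a minimal
 * walk from an mm down to a pte using the accessors above, pairing
 * pte_offset_map() with pte_unmap() as CONFIG_HIGHPTE requires.
 * pgd_offset()/pud_offset() come from the generic pgtable layer and
 * lookup_pte_example() is a hypothetical name; error checks
 * (pgd_none() etc.) are omitted for brevity.
 */
static inline pte_t lookup_pte_example(struct mm_struct *mm,
				       unsigned long address)
{
	pgd_t *pgd = pgd_offset(mm, address);
	pud_t *pud = pud_offset(pgd, address);
	pmd_t *pmd = pmd_offset(pud, address);
	pte_t *pte = pte_offset_map(pmd, address);	/* may kmap_atomic */
	pte_t entry = *pte;

	pte_unmap(pte);		/* balances the kmap when HIGHPTE is set */
	return entry;
}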
162
163/* Clear a kernel PTE and flush it from the TLB */
164#define kpte_clear_flush(ptep, vaddr) \
165do { \
166 pte_clear(&init_mm, (vaddr), (ptep)); \
167 __flush_tlb_one((vaddr)); \
168} while (0)
169
170/*
171 * The i386 doesn't have any external MMU info: the kernel page
172 * tables contain all the necessary information.
173 */
174#define update_mmu_cache(vma, address, pte) do { } while (0)
175
176#endif /* !__ASSEMBLY__ */
177
178/*
179 * kern_addr_valid() is (1) for FLATMEM and (0) for
180 * SPARSEMEM and DISCONTIGMEM
181 */
182#ifdef CONFIG_FLATMEM
183#define kern_addr_valid(addr) (1)
184#else
185#define kern_addr_valid(kaddr) (0)
186#endif
187
188#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
189 remap_pfn_range(vma, vaddr, pfn, size, prot)
190
191#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
new file mode 100644
index 00000000000..545a0e042bb
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -0,0 +1,285 @@
1#ifndef _ASM_X86_PGTABLE_64_H
2#define _ASM_X86_PGTABLE_64_H
3
4#include <linux/const.h>
5#ifndef __ASSEMBLY__
6
7/*
8 * This file contains the functions and defines necessary to modify and use
9 * the x86-64 page table tree.
10 */
11#include <asm/processor.h>
12#include <linux/bitops.h>
13#include <linux/threads.h>
14#include <asm/pda.h>
15
16extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512];
18extern pmd_t level2_kernel_pgt[512];
19extern pmd_t level2_fixmap_pgt[512];
20extern pmd_t level2_ident_pgt[512];
21extern pgd_t init_level4_pgt[];
22
23#define swapper_pg_dir init_level4_pgt
24
25extern void paging_init(void);
26
27#endif /* !__ASSEMBLY__ */
28
29#define SHARED_KERNEL_PMD 0
30
31/*
32 * PGDIR_SHIFT determines what a top-level page table entry can map
33 */
34#define PGDIR_SHIFT 39
35#define PTRS_PER_PGD 512
36
37/*
38 * PUD_SHIFT determines the size of the area a 3rd-level page table entry can map
39 */
40#define PUD_SHIFT 30
41#define PTRS_PER_PUD 512
42
43/*
44 * PMD_SHIFT determines the size of the area a middle-level
45 * page table can map
46 */
47#define PMD_SHIFT 21
48#define PTRS_PER_PMD 512
49
50/*
51 * entries per page directory level
52 */
53#define PTRS_PER_PTE 512
54
55#ifndef __ASSEMBLY__
56
57#define pte_ERROR(e) \
58 printk("%s:%d: bad pte %p(%016lx).\n", \
59 __FILE__, __LINE__, &(e), pte_val(e))
60#define pmd_ERROR(e) \
61 printk("%s:%d: bad pmd %p(%016lx).\n", \
62 __FILE__, __LINE__, &(e), pmd_val(e))
63#define pud_ERROR(e) \
64 printk("%s:%d: bad pud %p(%016lx).\n", \
65 __FILE__, __LINE__, &(e), pud_val(e))
66#define pgd_ERROR(e) \
67 printk("%s:%d: bad pgd %p(%016lx).\n", \
68 __FILE__, __LINE__, &(e), pgd_val(e))
69
70#define pgd_none(x) (!pgd_val(x))
71#define pud_none(x) (!pud_val(x))
72
73struct mm_struct;
74
75void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
76
77
78static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
79 pte_t *ptep)
80{
81 *ptep = native_make_pte(0);
82}
83
84static inline void native_set_pte(pte_t *ptep, pte_t pte)
85{
86 *ptep = pte;
87}
88
89static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
90{
91 native_set_pte(ptep, pte);
92}
93
94static inline pte_t native_ptep_get_and_clear(pte_t *xp)
95{
96#ifdef CONFIG_SMP
97 return native_make_pte(xchg(&xp->pte, 0));
98#else
99 /* This is native_local_ptep_get_and_clear(),
100 duplicated here because of a cyclic header dependency */
101 pte_t ret = *xp;
102 native_pte_clear(NULL, 0, xp);
103 return ret;
104#endif
105}
106
107static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
108{
109 *pmdp = pmd;
110}
111
112static inline void native_pmd_clear(pmd_t *pmd)
113{
114 native_set_pmd(pmd, native_make_pmd(0));
115}
116
117static inline void native_set_pud(pud_t *pudp, pud_t pud)
118{
119 *pudp = pud;
120}
121
122static inline void native_pud_clear(pud_t *pud)
123{
124 native_set_pud(pud, native_make_pud(0));
125}
126
127static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
128{
129 *pgdp = pgd;
130}
131
132static inline void native_pgd_clear(pgd_t *pgd)
133{
134 native_set_pgd(pgd, native_make_pgd(0));
135}
136
137#define pte_same(a, b) ((a).pte == (b).pte)
138
139#endif /* !__ASSEMBLY__ */
140
141#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
142#define PMD_MASK (~(PMD_SIZE - 1))
143#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
144#define PUD_MASK (~(PUD_SIZE - 1))
145#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
146#define PGDIR_MASK (~(PGDIR_SIZE - 1))
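/*
 * For reference, the coverage implied by the shifts above: a pte
 * maps 4KB, a pmd entry 1UL << 21 = 2MB, a pud entry 1UL << 30 = 1GB,
 * and a pgd entry 1UL << 39 = 512GB; four levels of 512 entries each
 * thus span the 48-bit virtual address space.
 */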
147
148
149#define MAXMEM _AC(0x00003fffffffffff, UL)
150#define VMALLOC_START _AC(0xffffc20000000000, UL)
151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
152#define VMEMMAP_START _AC(0xffffe20000000000, UL)
153#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
154#define MODULES_END _AC(0xffffffffff000000, UL)
155#define MODULES_LEN (MODULES_END - MODULES_VADDR)
156
157#ifndef __ASSEMBLY__
158
159static inline int pgd_bad(pgd_t pgd)
160{
161 return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
162}
163
164static inline int pud_bad(pud_t pud)
165{
166 return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
167}
168
169static inline int pmd_bad(pmd_t pmd)
170{
171 return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
172}
173
174#define pte_none(x) (!pte_val((x)))
175#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
176
177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
178
179/*
180 * Macro to mark a page protection value as "uncacheable".
181 */
182#define pgprot_noncached(prot) \
183 (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
184
185/*
186 * Conversion functions: convert a page and protection to a page entry,
187 * and a page entry and page directory to the page they refer to.
188 */
189
190/*
191 * Level 4 access.
192 */
193#define pgd_page_vaddr(pgd) \
194 ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
195#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
196#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
197static inline int pgd_large(pgd_t pgd) { return 0; }
198#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
199
200/* PUD - Level3 access */
201/* to find an entry in a page-table-directory. */
202#define pud_page_vaddr(pud) \
203 ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
204#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
205#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
206#define pud_offset(pgd, address) \
207 ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
208#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
209
210static inline int pud_large(pud_t pte)
211{
212 return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
213 (_PAGE_PSE | _PAGE_PRESENT);
214}
215
216/* PMD - Level 2 access */
217#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
218#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
219
220#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
221#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
222 pmd_index(address))
223#define pmd_none(x) (!pmd_val((x)))
224#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
225#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
226#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
227
228#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
229#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
230 _PAGE_FILE })
231#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
232
233/* PTE - Level 1 access. */
234
235/* page, protection -> pte */
236#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
237
238#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
239#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
240 pte_index((address)))
241
242/* x86-64 always has all page tables mapped. */
243#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
244#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
245#define pte_unmap(pte) /* NOP */
246#define pte_unmap_nested(pte) /* NOP */
247
248#define update_mmu_cache(vma, address, pte) do { } while (0)
249
250extern int direct_gbpages;
251
252/* Encode and de-code a swap entry */
253#define __swp_type(x) (((x).val >> 1) & 0x3f)
254#define __swp_offset(x) ((x).val >> 8)
255#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \
256 ((offset) << 8) })
257#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
258#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
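/*
 * Illustrative sketch (not part of the original header): round-tripping
 * a swap entry through the encoding above. The type sits in bits 1-6
 * and the offset from bit 8 up, so bit 0 (_PAGE_PRESENT) stays clear
 * and a swap pte can never look present. Assumes swp_entry_t from
 * <linux/swap.h> is visible; swap_roundtrip_example() is a
 * hypothetical name.
 */
static inline int swap_roundtrip_example(unsigned long type,
					 unsigned long offset)
{
	pte_t pte = __swp_entry_to_pte(__swp_entry(type, offset));

	/* decoding recovers the inputs (for type < 64) */
	return __swp_type(__pte_to_swp_entry(pte)) == type &&
	       __swp_offset(__pte_to_swp_entry(pte)) == offset;
}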
259
260extern int kern_addr_valid(unsigned long addr);
261extern void cleanup_highmap(void);
262
263#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
264 remap_pfn_range(vma, vaddr, pfn, size, prot)
265
266#define HAVE_ARCH_UNMAPPED_AREA
267#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
268
269#define pgtable_cache_init() do { } while (0)
270#define check_pgt_cache() do { } while (0)
271
272#define PAGE_AGP PAGE_KERNEL_NOCACHE
273#define HAVE_PAGE_AGP 1
274
275/* fs/proc/kcore.c */
276#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
277#define kc_offset_to_vaddr(o) \
278 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
279 ? ((o) | ~__VIRTUAL_MASK) \
280 : (o))
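/*
 * Worked example (assuming __VIRTUAL_MASK_SHIFT == 48): an offset
 * with bit 47 set, e.g. 0x0000800000000000, gets the top 16 bits
 * filled in and becomes the canonical kernel address
 * 0xffff800000000000; offsets with bit 47 clear map back unchanged.
 */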
281
282#define __HAVE_ARCH_PTE_SAME
283#endif /* !__ASSEMBLY__ */
284
285#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h
new file mode 100644
index 00000000000..c98509d3149
--- /dev/null
+++ b/arch/x86/include/asm/poll.h
@@ -0,0 +1 @@
#include <asm-generic/poll.h>
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
new file mode 100644
index 00000000000..bb7133dc155
--- /dev/null
+++ b/arch/x86/include/asm/posix_types.h
@@ -0,0 +1,13 @@
1#ifdef __KERNEL__
2# ifdef CONFIG_X86_32
3# include "posix_types_32.h"
4# else
5# include "posix_types_64.h"
6# endif
7#else
8# ifdef __i386__
9# include "posix_types_32.h"
10# else
11# include "posix_types_64.h"
12# endif
13#endif
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
new file mode 100644
index 00000000000..f7d9adf82e5
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -0,0 +1,85 @@
1#ifndef _ASM_X86_POSIX_TYPES_32_H
2#define _ASM_X86_POSIX_TYPES_32_H
3
4/*
5 * This file is generally used by user-level software, so you need to
6 * be a little careful about namespace pollution etc. Also, we cannot
7 * assume GCC is being used.
8 */
9
10typedef unsigned long __kernel_ino_t;
11typedef unsigned short __kernel_mode_t;
12typedef unsigned short __kernel_nlink_t;
13typedef long __kernel_off_t;
14typedef int __kernel_pid_t;
15typedef unsigned short __kernel_ipc_pid_t;
16typedef unsigned short __kernel_uid_t;
17typedef unsigned short __kernel_gid_t;
18typedef unsigned int __kernel_size_t;
19typedef int __kernel_ssize_t;
20typedef int __kernel_ptrdiff_t;
21typedef long __kernel_time_t;
22typedef long __kernel_suseconds_t;
23typedef long __kernel_clock_t;
24typedef int __kernel_timer_t;
25typedef int __kernel_clockid_t;
26typedef int __kernel_daddr_t;
27typedef char * __kernel_caddr_t;
28typedef unsigned short __kernel_uid16_t;
29typedef unsigned short __kernel_gid16_t;
30typedef unsigned int __kernel_uid32_t;
31typedef unsigned int __kernel_gid32_t;
32
33typedef unsigned short __kernel_old_uid_t;
34typedef unsigned short __kernel_old_gid_t;
35typedef unsigned short __kernel_old_dev_t;
36
37#ifdef __GNUC__
38typedef long long __kernel_loff_t;
39#endif
40
41typedef struct {
42 int val[2];
43} __kernel_fsid_t;
44
45#if defined(__KERNEL__)
46
47#undef __FD_SET
48#define __FD_SET(fd,fdsetp) \
49 asm volatile("btsl %1,%0": \
50 "+m" (*(__kernel_fd_set *)(fdsetp)) \
51 : "r" ((int)(fd)))
52
53#undef __FD_CLR
54#define __FD_CLR(fd,fdsetp) \
55 asm volatile("btrl %1,%0": \
56 "+m" (*(__kernel_fd_set *)(fdsetp)) \
57 : "r" ((int) (fd)))
58
59#undef __FD_ISSET
60#define __FD_ISSET(fd,fdsetp) \
61 (__extension__ \
62 ({ \
63 unsigned char __result; \
64 asm volatile("btl %1,%2 ; setb %0" \
65 : "=q" (__result) \
66 : "r" ((int)(fd)), \
67 "m" (*(__kernel_fd_set *)(fdsetp))); \
68 __result; \
69}))
70
71#undef __FD_ZERO
72#define __FD_ZERO(fdsetp) \
73do { \
74 int __d0, __d1; \
75 asm volatile("cld ; rep ; stosl" \
76 : "=m" (*(__kernel_fd_set *)(fdsetp)), \
77 "=&c" (__d0), "=&D" (__d1) \
78 : "a" (0), "1" (__FDSET_LONGS), \
79 "2" ((__kernel_fd_set *)(fdsetp)) \
80 : "memory"); \
81} while (0)
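/*
 * Illustrative sketch (not part of the original header) of how the
 * primitives above compose; fd_set_example() is a hypothetical name.
 */
static inline int fd_set_example(__kernel_fd_set *fdsetp, int fd)
{
	__FD_ZERO(fdsetp);		/* rep;stosl clears every word */
	__FD_SET(fd, fdsetp);		/* btsl sets a single bit */
	return __FD_ISSET(fd, fdsetp);	/* btl + setb reads it back: 1 */
}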
82
83#endif /* defined(__KERNEL__) */
84
85#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
new file mode 100644
index 00000000000..eb8d2d92b63
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_64.h
@@ -0,0 +1,119 @@
1#ifndef _ASM_X86_POSIX_TYPES_64_H
2#define _ASM_X86_POSIX_TYPES_64_H
3
4/*
5 * This file is generally used by user-level software, so you need to
6 * be a little careful about namespace pollution etc. Also, we cannot
7 * assume GCC is being used.
8 */
9
10typedef unsigned long __kernel_ino_t;
11typedef unsigned int __kernel_mode_t;
12typedef unsigned long __kernel_nlink_t;
13typedef long __kernel_off_t;
14typedef int __kernel_pid_t;
15typedef int __kernel_ipc_pid_t;
16typedef unsigned int __kernel_uid_t;
17typedef unsigned int __kernel_gid_t;
18typedef unsigned long __kernel_size_t;
19typedef long __kernel_ssize_t;
20typedef long __kernel_ptrdiff_t;
21typedef long __kernel_time_t;
22typedef long __kernel_suseconds_t;
23typedef long __kernel_clock_t;
24typedef int __kernel_timer_t;
25typedef int __kernel_clockid_t;
26typedef int __kernel_daddr_t;
27typedef char * __kernel_caddr_t;
28typedef unsigned short __kernel_uid16_t;
29typedef unsigned short __kernel_gid16_t;
30
31#ifdef __GNUC__
32typedef long long __kernel_loff_t;
33#endif
34
35typedef struct {
36 int val[2];
37} __kernel_fsid_t;
38
39typedef unsigned short __kernel_old_uid_t;
40typedef unsigned short __kernel_old_gid_t;
41typedef __kernel_uid_t __kernel_uid32_t;
42typedef __kernel_gid_t __kernel_gid32_t;
43
44typedef unsigned long __kernel_old_dev_t;
45
46#ifdef __KERNEL__
47
48#undef __FD_SET
49static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
50{
51 unsigned long _tmp = fd / __NFDBITS;
52 unsigned long _rem = fd % __NFDBITS;
53 fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
54}
55
56#undef __FD_CLR
57static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
58{
59 unsigned long _tmp = fd / __NFDBITS;
60 unsigned long _rem = fd % __NFDBITS;
61 fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
62}
63
64#undef __FD_ISSET
65static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
66{
67 unsigned long _tmp = fd / __NFDBITS;
68 unsigned long _rem = fd % __NFDBITS;
69 return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
70}
71
72/*
73 * This will unroll the loop for the normal constant cases (8 or 32 longs,
74 * for 256 and 1024-bit fd_sets respectively)
75 */
76#undef __FD_ZERO
77static inline void __FD_ZERO(__kernel_fd_set *p)
78{
79 unsigned long *tmp = p->fds_bits;
80 int i;
81
82 if (__builtin_constant_p(__FDSET_LONGS)) {
83 switch (__FDSET_LONGS) {
84 case 32:
85 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
86 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
87 tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
88 tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
89 tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
90 tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
91 tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
92 tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
93 return;
94 case 16:
95 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
96 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
97 tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
98 tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
99 return;
100 case 8:
101 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
102 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
103 return;
104 case 4:
105 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
106 return;
107 }
108 }
109 i = __FDSET_LONGS;
110 while (i) {
111 i--;
112 *tmp = 0;
113 tmp++;
114 }
115}
116
117#endif /* defined(__KERNEL__) */
118
119#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
new file mode 100644
index 00000000000..fe681147a4f
--- /dev/null
+++ b/arch/x86/include/asm/prctl.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_PRCTL_H
2#define _ASM_X86_PRCTL_H
3
4#define ARCH_SET_GS 0x1001
5#define ARCH_SET_FS 0x1002
6#define ARCH_GET_FS 0x1003
7#define ARCH_GET_GS 0x1004
8
9
10#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
new file mode 100644
index 00000000000..1198f2a0e42
--- /dev/null
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -0,0 +1,38 @@
1/*
2 * NSC/Cyrix CPU indexed register access. Must use inline functions
3 * instead of macros to ensure correct access ordering.
4 * Access order is always 0x22 (=offset), 0x23 (=value).
5 *
6 * When using the old macros a line like
7 * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
8 * gets expanded to:
9 * do {
10 * outb((CX86_CCR2), 0x22);
11 * outb((({
12 * outb((CX86_CCR2), 0x22);
13 * inb(0x23);
14 * }) | 0x88), 0x23);
15 * } while (0);
16 *
17 * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
18 */
19
20static inline u8 getCx86(u8 reg)
21{
22 outb(reg, 0x22);
23 return inb(0x23);
24}
25
26static inline void setCx86(u8 reg, u8 data)
27{
28 outb(reg, 0x22);
29 outb(data, 0x23);
30}
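/*
 * Illustrative sketch (not part of the original header): the
 * read-modify-write from the comment above done with the inline
 * helpers, so the bus sees index, data, index, data -- never two
 * index writes in a row. CX86_CCR2 is defined in
 * <asm/processor-flags.h>.
 */
static inline void cyrix_rmw_example(void)
{
	setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
}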
31
32#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
33
34#define setCx86_old(reg, data) do { \
35 outb((reg), 0x22); \
36 outb((data), 0x23); \
37} while (0)
38
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 00000000000..7a3e836eb2a
--- /dev/null
+++ b/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PROCESSOR_FLAGS_H
2#define _ASM_X86_PROCESSOR_FLAGS_H
3/* Various flags defined: can be included from assembler. */
4
5/*
6 * EFLAGS bits
7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF	0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
14#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
15#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
16#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
17#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
18#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
19#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
20#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
21#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
22#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
23#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
24#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
25
26/*
27 * Basic CPU control in CR0
28 */
29#define X86_CR0_PE 0x00000001 /* Protection Enable */
30#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */
31#define X86_CR0_EM 0x00000004 /* Emulation */
32#define X86_CR0_TS 0x00000008 /* Task Switched */
33#define X86_CR0_ET 0x00000010 /* Extension Type */
34#define X86_CR0_NE 0x00000020 /* Numeric Error */
35#define X86_CR0_WP 0x00010000 /* Write Protect */
36#define X86_CR0_AM 0x00040000 /* Alignment Mask */
37#define X86_CR0_NW 0x20000000 /* Not Write-through */
38#define X86_CR0_CD 0x40000000 /* Cache Disable */
39#define X86_CR0_PG 0x80000000 /* Paging */
40
41/*
42 * Paging options in CR3
43 */
44#define X86_CR3_PWT 0x00000008 /* Page Write Through */
45#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
46
47/*
48 * Intel CPU features in CR4
49 */
50#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
51#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
52#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
53#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
54#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
55#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
56#define X86_CR4_MCE 0x00000040 /* Machine check enable */
57#define X86_CR4_PGE 0x00000080 /* enable global pages */
58#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63
64/*
65 * x86-64 Task Priority Register, CR8
66 */
67#define X86_CR8_TPR 0x0000000F /* task priority register */
68
69/*
70 * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
71 */
72
73/*
74 * NSC/Cyrix CPU configuration register indexes
75 */
76#define CX86_PCR0 0x20
77#define CX86_GCR 0xb8
78#define CX86_CCR0 0xc0
79#define CX86_CCR1 0xc1
80#define CX86_CCR2 0xc2
81#define CX86_CCR3 0xc3
82#define CX86_CCR4 0xe8
83#define CX86_CCR5 0xe9
84#define CX86_CCR6 0xea
85#define CX86_CCR7 0xeb
86#define CX86_PCR1 0xf0
87#define CX86_DIR0 0xfe
88#define CX86_DIR1 0xff
89#define CX86_ARR_BASE 0xc4
90#define CX86_RCR_BASE 0xdc
91
92#ifdef __KERNEL__
93#ifdef CONFIG_VM86
94#define X86_VM_MASK X86_EFLAGS_VM
95#else
96#define X86_VM_MASK 0 /* No VM86 support */
97#endif
98#endif
99
100#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
new file mode 100644
index 00000000000..5ca01e38326
--- /dev/null
+++ b/arch/x86/include/asm/processor.h
@@ -0,0 +1,936 @@
1#ifndef _ASM_X86_PROCESSOR_H
2#define _ASM_X86_PROCESSOR_H
3
4#include <asm/processor-flags.h>
5
6/* Forward declaration, a strange C thing */
7struct task_struct;
8struct mm_struct;
9
10#include <asm/vm86.h>
11#include <asm/math_emu.h>
12#include <asm/segment.h>
13#include <asm/types.h>
14#include <asm/sigcontext.h>
15#include <asm/current.h>
16#include <asm/cpufeature.h>
17#include <asm/system.h>
18#include <asm/page.h>
19#include <asm/percpu.h>
20#include <asm/msr.h>
21#include <asm/desc_defs.h>
22#include <asm/nops.h>
23#include <asm/ds.h>
24
25#include <linux/personality.h>
26#include <linux/cpumask.h>
27#include <linux/cache.h>
28#include <linux/threads.h>
29#include <linux/init.h>
30
31/*
32 * Default implementation of macro that returns current
33 * instruction pointer ("program counter").
34 */
35static inline void *current_text_addr(void)
36{
37 void *pc;
38
39 asm volatile("mov $1f, %0; 1:":"=r" (pc));
40
41 return pc;
42}
43
44#ifdef CONFIG_X86_VSMP
45# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
46# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
47#else
48# define ARCH_MIN_TASKALIGN 16
49# define ARCH_MIN_MMSTRUCT_ALIGN 0
50#endif
51
52/*
53 * CPU type and hardware bug flags. Kept separately for each CPU.
54 * Members of this structure are referenced in head.S, so think twice
55 * before touching them. [mj]
56 */
57
58struct cpuinfo_x86 {
59 __u8 x86; /* CPU family */
60 __u8 x86_vendor; /* CPU vendor */
61 __u8 x86_model;
62 __u8 x86_mask;
63#ifdef CONFIG_X86_32
64 char wp_works_ok; /* It doesn't on 386's */
65
66 /* Problems on some 486Dx4's and old 386's: */
67 char hlt_works_ok;
68 char hard_math;
69 char rfu;
70 char fdiv_bug;
71 char f00f_bug;
72 char coma_bug;
73 char pad0;
74#else
75	/* Number of 4K pages in DTLB/ITLB combined: */
76 int x86_tlbsize;
77 __u8 x86_virt_bits;
78 __u8 x86_phys_bits;
79#endif
80 /* CPUID returned core id bits: */
81 __u8 x86_coreid_bits;
82 /* Max extended CPUID function supported: */
83 __u32 extended_cpuid_level;
84 /* Maximum supported CPUID level, -1=no CPUID: */
85 int cpuid_level;
86 __u32 x86_capability[NCAPINTS];
87 char x86_vendor_id[16];
88 char x86_model_id[64];
89	/* in KB - valid for CPUs which support this call: */
90 int x86_cache_size;
91 int x86_cache_alignment; /* In bytes */
92 int x86_power;
93 unsigned long loops_per_jiffy;
94#ifdef CONFIG_SMP
95 /* cpus sharing the last level cache: */
96 cpumask_t llc_shared_map;
97#endif
98 /* cpuid returned max cores value: */
99 u16 x86_max_cores;
100 u16 apicid;
101 u16 initial_apicid;
102 u16 x86_clflush_size;
103#ifdef CONFIG_SMP
104 /* number of cores as seen by the OS: */
105 u16 booted_cores;
106 /* Physical processor id: */
107 u16 phys_proc_id;
108 /* Core id: */
109 u16 cpu_core_id;
110 /* Index into per_cpu list: */
111 u16 cpu_index;
112#endif
113} __attribute__((__aligned__(SMP_CACHE_BYTES)));
114
115#define X86_VENDOR_INTEL 0
116#define X86_VENDOR_CYRIX 1
117#define X86_VENDOR_AMD 2
118#define X86_VENDOR_UMC 3
119#define X86_VENDOR_CENTAUR 5
120#define X86_VENDOR_TRANSMETA 7
121#define X86_VENDOR_NSC 8
122#define X86_VENDOR_NUM 9
123
124#define X86_VENDOR_UNKNOWN 0xff
125
126/*
127 * capabilities of CPUs
128 */
129extern struct cpuinfo_x86 boot_cpu_data;
130extern struct cpuinfo_x86 new_cpu_data;
131
132extern struct tss_struct doublefault_tss;
133extern __u32 cleared_cpu_caps[NCAPINTS];
134
135#ifdef CONFIG_SMP
136DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
137#define cpu_data(cpu) per_cpu(cpu_info, cpu)
138#define current_cpu_data __get_cpu_var(cpu_info)
139#else
140#define cpu_data(cpu) boot_cpu_data
141#define current_cpu_data boot_cpu_data
142#endif
143
144extern const struct seq_operations cpuinfo_op;
145
146static inline int hlt_works(int cpu)
147{
148#ifdef CONFIG_X86_32
149 return cpu_data(cpu).hlt_works_ok;
150#else
151 return 1;
152#endif
153}
154
155#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
156
157extern void cpu_detect(struct cpuinfo_x86 *c);
158
159extern struct pt_regs *idle_regs(struct pt_regs *);
160
161extern void early_cpu_init(void);
162extern void identify_boot_cpu(void);
163extern void identify_secondary_cpu(struct cpuinfo_x86 *);
164extern void print_cpu_info(struct cpuinfo_x86 *);
165extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
166extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
167extern unsigned short num_cache_leaves;
168
169extern void detect_extended_topology(struct cpuinfo_x86 *c);
170extern void detect_ht(struct cpuinfo_x86 *c);
171
172static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
173 unsigned int *ecx, unsigned int *edx)
174{
175 /* ecx is often an input as well as an output. */
176 asm("cpuid"
177 : "=a" (*eax),
178 "=b" (*ebx),
179 "=c" (*ecx),
180 "=d" (*edx)
181 : "0" (*eax), "2" (*ecx));
182}
183
184static inline void load_cr3(pgd_t *pgdir)
185{
186 write_cr3(__pa(pgdir));
187}
188
189#ifdef CONFIG_X86_32
190/* This is the TSS defined by the hardware. */
191struct x86_hw_tss {
192 unsigned short back_link, __blh;
193 unsigned long sp0;
194 unsigned short ss0, __ss0h;
195 unsigned long sp1;
196 /* ss1 caches MSR_IA32_SYSENTER_CS: */
197 unsigned short ss1, __ss1h;
198 unsigned long sp2;
199 unsigned short ss2, __ss2h;
200 unsigned long __cr3;
201 unsigned long ip;
202 unsigned long flags;
203 unsigned long ax;
204 unsigned long cx;
205 unsigned long dx;
206 unsigned long bx;
207 unsigned long sp;
208 unsigned long bp;
209 unsigned long si;
210 unsigned long di;
211 unsigned short es, __esh;
212 unsigned short cs, __csh;
213 unsigned short ss, __ssh;
214 unsigned short ds, __dsh;
215 unsigned short fs, __fsh;
216 unsigned short gs, __gsh;
217 unsigned short ldt, __ldth;
218 unsigned short trace;
219 unsigned short io_bitmap_base;
220
221} __attribute__((packed));
222#else
223struct x86_hw_tss {
224 u32 reserved1;
225 u64 sp0;
226 u64 sp1;
227 u64 sp2;
228 u64 reserved2;
229 u64 ist[7];
230 u32 reserved3;
231 u32 reserved4;
232 u16 reserved5;
233 u16 io_bitmap_base;
234
235} __attribute__((packed)) ____cacheline_aligned;
236#endif
237
238/*
239 * IO-bitmap sizes:
240 */
241#define IO_BITMAP_BITS 65536
242#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
243#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
244#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
245#define INVALID_IO_BITMAP_OFFSET 0x8000
246#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
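/*
 * For reference: 65536 bits of I/O bitmap = 8192 bytes, i.e. 2048
 * longs on 32-bit and 1024 longs on 64-bit kernels.
 */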
247
248struct tss_struct {
249 /*
250 * The hardware state:
251 */
252 struct x86_hw_tss x86_tss;
253
254 /*
255 * The extra 1 is there because the CPU will access an
256 * additional byte beyond the end of the IO permission
257 * bitmap. The extra byte must be all 1 bits, and must
258 * be within the limit.
259 */
260 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
261 /*
262 * Cache the current maximum and the last task that used the bitmap:
263 */
264 unsigned long io_bitmap_max;
265 struct thread_struct *io_bitmap_owner;
266
267 /*
268 * .. and then another 0x100 bytes for the emergency kernel stack:
269 */
270 unsigned long stack[64];
271
272} ____cacheline_aligned;
273
274DECLARE_PER_CPU(struct tss_struct, init_tss);
275
276/*
277 * Save the original ist values for checking stack pointers during debugging
278 */
279struct orig_ist {
280 unsigned long ist[7];
281};
282
283#define MXCSR_DEFAULT 0x1f80
284
285struct i387_fsave_struct {
286 u32 cwd; /* FPU Control Word */
287 u32 swd; /* FPU Status Word */
288 u32 twd; /* FPU Tag Word */
289 u32 fip; /* FPU IP Offset */
290 u32 fcs; /* FPU IP Selector */
291 u32 foo; /* FPU Operand Pointer Offset */
292 u32 fos; /* FPU Operand Pointer Selector */
293
294 /* 8*10 bytes for each FP-reg = 80 bytes: */
295 u32 st_space[20];
296
297	/* Software status information [not touched by FSAVE]: */
298 u32 status;
299};
300
301struct i387_fxsave_struct {
302 u16 cwd; /* Control Word */
303 u16 swd; /* Status Word */
304 u16 twd; /* Tag Word */
305 u16 fop; /* Last Instruction Opcode */
306 union {
307 struct {
308 u64 rip; /* Instruction Pointer */
309 u64 rdp; /* Data Pointer */
310 };
311 struct {
312 u32 fip; /* FPU IP Offset */
313 u32 fcs; /* FPU IP Selector */
314 u32 foo; /* FPU Operand Offset */
315 u32 fos; /* FPU Operand Selector */
316 };
317 };
318 u32 mxcsr; /* MXCSR Register State */
319 u32 mxcsr_mask; /* MXCSR Mask */
320
321 /* 8*16 bytes for each FP-reg = 128 bytes: */
322 u32 st_space[32];
323
324 /* 16*16 bytes for each XMM-reg = 256 bytes: */
325 u32 xmm_space[64];
326
327 u32 padding[12];
328
329 union {
330 u32 padding1[12];
331 u32 sw_reserved[12];
332 };
333
334} __attribute__((aligned(16)));
335
336struct i387_soft_struct {
337 u32 cwd;
338 u32 swd;
339 u32 twd;
340 u32 fip;
341 u32 fcs;
342 u32 foo;
343 u32 fos;
344 /* 8*10 bytes for each FP-reg = 80 bytes: */
345 u32 st_space[20];
346 u8 ftop;
347 u8 changed;
348 u8 lookahead;
349 u8 no_update;
350 u8 rm;
351 u8 alimit;
352 struct info *info;
353 u32 entry_eip;
354};
355
356struct xsave_hdr_struct {
357 u64 xstate_bv;
358 u64 reserved1[2];
359 u64 reserved2[5];
360} __attribute__((packed));
361
362struct xsave_struct {
363 struct i387_fxsave_struct i387;
364 struct xsave_hdr_struct xsave_hdr;
365 /* new processor state extensions will go here */
366} __attribute__ ((packed, aligned (64)));
367
368union thread_xstate {
369 struct i387_fsave_struct fsave;
370 struct i387_fxsave_struct fxsave;
371 struct i387_soft_struct soft;
372 struct xsave_struct xsave;
373};
374
375#ifdef CONFIG_X86_64
376DECLARE_PER_CPU(struct orig_ist, orig_ist);
377#endif
378
379extern void print_cpu_info(struct cpuinfo_x86 *);
380extern unsigned int xstate_size;
381extern void free_thread_xstate(struct task_struct *);
382extern struct kmem_cache *task_xstate_cachep;
383extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
384extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
385extern unsigned short num_cache_leaves;
386
387struct thread_struct {
388 /* Cached TLS descriptors: */
389 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
390 unsigned long sp0;
391 unsigned long sp;
392#ifdef CONFIG_X86_32
393 unsigned long sysenter_cs;
394#else
395 unsigned long usersp; /* Copy from PDA */
396 unsigned short es;
397 unsigned short ds;
398 unsigned short fsindex;
399 unsigned short gsindex;
400#endif
401 unsigned long ip;
402 unsigned long fs;
403 unsigned long gs;
404 /* Hardware debugging registers: */
405 unsigned long debugreg0;
406 unsigned long debugreg1;
407 unsigned long debugreg2;
408 unsigned long debugreg3;
409 unsigned long debugreg6;
410 unsigned long debugreg7;
411 /* Fault info: */
412 unsigned long cr2;
413 unsigned long trap_no;
414 unsigned long error_code;
415 /* floating point and extended processor state */
416 union thread_xstate *xstate;
417#ifdef CONFIG_X86_32
418 /* Virtual 86 mode info */
419 struct vm86_struct __user *vm86_info;
420 unsigned long screen_bitmap;
421 unsigned long v86flags;
422 unsigned long v86mask;
423 unsigned long saved_sp0;
424 unsigned int saved_fs;
425 unsigned int saved_gs;
426#endif
427 /* IO permissions: */
428 unsigned long *io_bitmap_ptr;
429 unsigned long iopl;
430 /* Max allowed port in the bitmap, in bytes: */
431 unsigned io_bitmap_max;
432/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
433 unsigned long debugctlmsr;
434#ifdef CONFIG_X86_DS
435/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
436 struct ds_context *ds_ctx;
437#endif /* CONFIG_X86_DS */
438#ifdef CONFIG_X86_PTRACE_BTS
439/* the signal to send on a bts buffer overflow */
440 unsigned int bts_ovfl_signal;
441#endif /* CONFIG_X86_PTRACE_BTS */
442};
443
444static inline unsigned long native_get_debugreg(int regno)
445{
446 unsigned long val = 0; /* Damn you, gcc! */
447
448 switch (regno) {
449 case 0:
450 asm("mov %%db0, %0" :"=r" (val));
451 break;
452 case 1:
453 asm("mov %%db1, %0" :"=r" (val));
454 break;
455 case 2:
456 asm("mov %%db2, %0" :"=r" (val));
457 break;
458 case 3:
459 asm("mov %%db3, %0" :"=r" (val));
460 break;
461 case 6:
462 asm("mov %%db6, %0" :"=r" (val));
463 break;
464 case 7:
465 asm("mov %%db7, %0" :"=r" (val));
466 break;
467 default:
468 BUG();
469 }
470 return val;
471}
472
473static inline void native_set_debugreg(int regno, unsigned long value)
474{
475 switch (regno) {
476 case 0:
477 asm("mov %0, %%db0" ::"r" (value));
478 break;
479 case 1:
480 asm("mov %0, %%db1" ::"r" (value));
481 break;
482 case 2:
483 asm("mov %0, %%db2" ::"r" (value));
484 break;
485 case 3:
486 asm("mov %0, %%db3" ::"r" (value));
487 break;
488 case 6:
489 asm("mov %0, %%db6" ::"r" (value));
490 break;
491 case 7:
492 asm("mov %0, %%db7" ::"r" (value));
493 break;
494 default:
495 BUG();
496 }
497}
498
499/*
500 * Set IOPL bits in EFLAGS from given mask
501 */
502static inline void native_set_iopl_mask(unsigned mask)
503{
504#ifdef CONFIG_X86_32
505 unsigned int reg;
506
507 asm volatile ("pushfl;"
508 "popl %0;"
509 "andl %1, %0;"
510 "orl %2, %0;"
511 "pushl %0;"
512 "popfl"
513 : "=&r" (reg)
514 : "i" (~X86_EFLAGS_IOPL), "r" (mask));
515#endif
516}
517
518static inline void
519native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
520{
521 tss->x86_tss.sp0 = thread->sp0;
522#ifdef CONFIG_X86_32
523 /* Only happens when SEP is enabled, no need to test "SEP"arately: */
524 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
525 tss->x86_tss.ss1 = thread->sysenter_cs;
526 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
527 }
528#endif
529}
530
531static inline void native_swapgs(void)
532{
533#ifdef CONFIG_X86_64
534 asm volatile("swapgs" ::: "memory");
535#endif
536}
537
538#ifdef CONFIG_PARAVIRT
539#include <asm/paravirt.h>
540#else
541#define __cpuid native_cpuid
542#define paravirt_enabled() 0
543
544/*
545 * These special macros can be used to get or set a debugging register
546 */
547#define get_debugreg(var, register) \
548 (var) = native_get_debugreg(register)
549#define set_debugreg(value, register) \
550 native_set_debugreg(register, value)
551
552static inline void load_sp0(struct tss_struct *tss,
553 struct thread_struct *thread)
554{
555 native_load_sp0(tss, thread);
556}
557
558#define set_iopl_mask native_set_iopl_mask
559#endif /* CONFIG_PARAVIRT */
560
561/*
562 * Save the cr4 feature set we're using (i.e.
563 * Pentium 4MB enable and PPro Global page
564 * enable), so that any CPUs that boot up
565 * after us can get the correct flags.
566 */
567extern unsigned long mmu_cr4_features;
568
569static inline void set_in_cr4(unsigned long mask)
570{
571 unsigned cr4;
572
573 mmu_cr4_features |= mask;
574 cr4 = read_cr4();
575 cr4 |= mask;
576 write_cr4(cr4);
577}
578
579static inline void clear_in_cr4(unsigned long mask)
580{
581 unsigned cr4;
582
583 mmu_cr4_features &= ~mask;
584 cr4 = read_cr4();
585 cr4 &= ~mask;
586 write_cr4(cr4);
587}
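/*
 * Illustrative sketch (not part of the original header): enabling a
 * CR4 feature through the helper above so mmu_cr4_features stays in
 * sync for CPUs that boot later; enable_fxsr_example() is a
 * hypothetical name.
 */
static inline void enable_fxsr_example(void)
{
	set_in_cr4(X86_CR4_OSFXSR);	/* updates CR4 and the saved mask */
}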
588
589typedef struct {
590 unsigned long seg;
591} mm_segment_t;
592
593
594/*
595 * create a kernel thread without removing it from tasklists
596 */
597extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
598
599/* Free all resources held by a thread. */
600extern void release_thread(struct task_struct *);
601
602/* Prepare to copy thread state - unlazy all lazy state */
603extern void prepare_to_copy(struct task_struct *tsk);
604
605unsigned long get_wchan(struct task_struct *p);
606
607/*
608 * Generic CPUID function
609 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx,
610 * resulting in stale register contents being returned.
611 */
612static inline void cpuid(unsigned int op,
613 unsigned int *eax, unsigned int *ebx,
614 unsigned int *ecx, unsigned int *edx)
615{
616 *eax = op;
617 *ecx = 0;
618 __cpuid(eax, ebx, ecx, edx);
619}
620
621/* Some CPUID calls want 'count' to be placed in ecx */
622static inline void cpuid_count(unsigned int op, int count,
623 unsigned int *eax, unsigned int *ebx,
624 unsigned int *ecx, unsigned int *edx)
625{
626 *eax = op;
627 *ecx = count;
628 __cpuid(eax, ebx, ecx, edx);
629}
630
631/*
632 * CPUID functions returning a single datum
633 */
634static inline unsigned int cpuid_eax(unsigned int op)
635{
636 unsigned int eax, ebx, ecx, edx;
637
638 cpuid(op, &eax, &ebx, &ecx, &edx);
639
640 return eax;
641}
642
643static inline unsigned int cpuid_ebx(unsigned int op)
644{
645 unsigned int eax, ebx, ecx, edx;
646
647 cpuid(op, &eax, &ebx, &ecx, &edx);
648
649 return ebx;
650}
651
652static inline unsigned int cpuid_ecx(unsigned int op)
653{
654 unsigned int eax, ebx, ecx, edx;
655
656 cpuid(op, &eax, &ebx, &ecx, &edx);
657
658 return ecx;
659}
660
661static inline unsigned int cpuid_edx(unsigned int op)
662{
663 unsigned int eax, ebx, ecx, edx;
664
665 cpuid(op, &eax, &ebx, &ecx, &edx);
666
667 return edx;
668}
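/*
 * Illustrative sketch (not part of the original header): CPUID leaf 0
 * spreads the 12-byte vendor string ("GenuineIntel", "AuthenticAMD",
 * ...) across ebx, edx, ecx in that order; cpu_vendor_example() is a
 * hypothetical name.
 */
static inline void cpu_vendor_example(unsigned int vendor[4])
{
	unsigned int eax;

	cpuid(0, &eax, &vendor[0], &vendor[2], &vendor[1]);
	vendor[3] = 0;	/* NUL-terminate when read as a char array */
}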
669
670/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
671static inline void rep_nop(void)
672{
673 asm volatile("rep; nop" ::: "memory");
674}
675
676static inline void cpu_relax(void)
677{
678 rep_nop();
679}
680
681/* Stop speculative execution: */
682static inline void sync_core(void)
683{
684 int tmp;
685
686 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
687 : "ebx", "ecx", "edx", "memory");
688}
689
690static inline void __monitor(const void *eax, unsigned long ecx,
691 unsigned long edx)
692{
693 /* "monitor %eax, %ecx, %edx;" */
694 asm volatile(".byte 0x0f, 0x01, 0xc8;"
695 :: "a" (eax), "c" (ecx), "d"(edx));
696}
697
698static inline void __mwait(unsigned long eax, unsigned long ecx)
699{
700 /* "mwait %eax, %ecx;" */
701 asm volatile(".byte 0x0f, 0x01, 0xc9;"
702 :: "a" (eax), "c" (ecx));
703}
704
705static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
706{
707 trace_hardirqs_on();
708 /* "mwait %eax, %ecx;" */
709 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
710 :: "a" (eax), "c" (ecx));
711}
712
713extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
714
715extern void select_idle_routine(const struct cpuinfo_x86 *c);
716
717extern unsigned long boot_option_idle_override;
718extern unsigned long idle_halt;
719extern unsigned long idle_nomwait;
720
721/*
722 * On systems with caches, caches must be flushed as the absolute
723 * last instruction before going into a suspended halt. Otherwise,
724 * dirty data can linger in the cache and become stale on resume,
725 * leading to strange errors.
726 *
727 * Perform a variety of operations to guarantee that the compiler
728 * will not reorder instructions. wbinvd itself is serializing,
729 * so the processor will not reorder.
730 *
731 * Systems without cache can just go into halt.
732 */
733static inline void wbinvd_halt(void)
734{
735 mb();
736 /* check for clflush to determine if wbinvd is legal */
737 if (cpu_has_clflush)
738 asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
739 else
740 while (1)
741 halt();
742}
743
744extern void enable_sep_cpu(void);
745extern int sysenter_setup(void);
746
747/* Defined in head.S */
748extern struct desc_ptr early_gdt_descr;
749
750extern void cpu_set_gdt(int);
751extern void switch_to_new_gdt(void);
752extern void cpu_init(void);
753extern void init_gdt(int cpu);
754
755static inline void update_debugctlmsr(unsigned long debugctlmsr)
756{
757#ifndef CONFIG_X86_DEBUGCTLMSR
758 if (boot_cpu_data.x86 < 6)
759 return;
760#endif
761 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
762}
763
764/*
765 * from system description table in BIOS. Mostly for MCA use, but
766 * others may find it useful:
767 */
768extern unsigned int machine_id;
769extern unsigned int machine_submodel_id;
770extern unsigned int BIOS_revision;
771
772/* Boot loader type from the setup header: */
773extern int bootloader_type;
774
775extern char ignore_fpu_irq;
776
777#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
778#define ARCH_HAS_PREFETCHW
779#define ARCH_HAS_SPINLOCK_PREFETCH
780
781#ifdef CONFIG_X86_32
782# define BASE_PREFETCH ASM_NOP4
783# define ARCH_HAS_PREFETCH
784#else
785# define BASE_PREFETCH "prefetcht0 (%1)"
786#endif
787
788/*
789 * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
790 *
791 * It's not worth caring about 3dnow prefetches for the K6
792 * because they are microcoded there and very slow.
793 */
794static inline void prefetch(const void *x)
795{
796 alternative_input(BASE_PREFETCH,
797 "prefetchnta (%1)",
798 X86_FEATURE_XMM,
799 "r" (x));
800}
801
802/*
803 * 3dnow prefetch to get an exclusive cache line.
804 * Useful for spinlocks to avoid one state transition in the
805 * cache coherency protocol:
806 */
807static inline void prefetchw(const void *x)
808{
809 alternative_input(BASE_PREFETCH,
810 "prefetchw (%1)",
811 X86_FEATURE_3DNOW,
812 "r" (x));
813}
814
815static inline void spin_lock_prefetch(const void *x)
816{
817 prefetchw(x);
818}
819
820#ifdef CONFIG_X86_32
821/*
822 * User space process size: 3GB (default).
823 */
824#define TASK_SIZE PAGE_OFFSET
825#define STACK_TOP TASK_SIZE
826#define STACK_TOP_MAX STACK_TOP
827
828#define INIT_THREAD { \
829 .sp0 = sizeof(init_stack) + (long)&init_stack, \
830 .vm86_info = NULL, \
831 .sysenter_cs = __KERNEL_CS, \
832 .io_bitmap_ptr = NULL, \
833 .fs = __KERNEL_PERCPU, \
834}
835
836/*
837 * Note that the .io_bitmap member must be extra-big. This is because
838 * the CPU will access an additional byte beyond the end of the IO
839 * permission bitmap. The extra byte must be all 1 bits, and must
840 * be within the limit.
841 */
842#define INIT_TSS { \
843 .x86_tss = { \
844 .sp0 = sizeof(init_stack) + (long)&init_stack, \
845 .ss0 = __KERNEL_DS, \
846 .ss1 = __KERNEL_CS, \
847 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
848 }, \
849 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
850}
851
852extern unsigned long thread_saved_pc(struct task_struct *tsk);
853
854#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
855#define KSTK_TOP(info) \
856({ \
857 unsigned long *__ptr = (unsigned long *)(info); \
858 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
859})
860
861/*
862 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
863 * This is necessary to guarantee that the entire "struct pt_regs"
864 * is accessible even if the CPU hasn't stored the SS/ESP registers
865 * on the stack (interrupt gate does not save these registers
866 * when switching to the same priv ring).
867 * Therefore beware: accessing the ss/esp fields of the
868 * "struct pt_regs" is possible, but they may contain the
869 * completely wrong values.
870 */
871#define task_pt_regs(task) \
872({ \
873 struct pt_regs *__regs__; \
874 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
875 __regs__ - 1; \
876})
877
878#define KSTK_ESP(task) (task_pt_regs(task)->sp)
879
880#else
881/*
882 * User space process size: 47 bits minus one guard page.
883 */
884#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
885
886/* This decides where the kernel will search for a free chunk of vm
887 * space during mmap's.
888 */
889#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
890 0xc0000000 : 0xFFFFe000)
891
892#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
893 IA32_PAGE_OFFSET : TASK_SIZE64)
894#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
895 IA32_PAGE_OFFSET : TASK_SIZE64)
896
897#define STACK_TOP TASK_SIZE
898#define STACK_TOP_MAX TASK_SIZE64
899
900#define INIT_THREAD { \
901 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
902}
903
904#define INIT_TSS { \
905 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
906}
907
908/*
909 * Return saved PC of a blocked thread.
910 * What is this good for? It will always be the scheduler or ret_from_fork.
911 */
912#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
913
914#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
915#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
916#endif /* CONFIG_X86_64 */
917
918extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
919 unsigned long new_sp);
920
921/*
922 * This decides where the kernel will search for a free chunk of vm
923 * space during mmap's.
924 */
925#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
926
927#define KSTK_EIP(task) (task_pt_regs(task)->ip)
928
929/* Get/set a process' ability to use the timestamp counter instruction */
930#define GET_TSC_CTL(adr) get_tsc_mode((adr))
931#define SET_TSC_CTL(val) set_tsc_mode((val))
932
933extern int get_tsc_mode(unsigned long adr);
934extern int set_tsc_mode(unsigned int val);
935
936#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
new file mode 100644
index 00000000000..d6a22f92ba7
--- /dev/null
+++ b/arch/x86/include/asm/proto.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_PROTO_H
2#define _ASM_X86_PROTO_H
3
4#include <asm/ldt.h>
5
6/* misc architecture specific prototypes */
7
8extern void early_idt_handler(void);
9
10extern void system_call(void);
11extern void syscall_init(void);
12
13extern void ia32_syscall(void);
14extern void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void);
16
17extern void syscall32_cpu_init(void);
18
19extern void check_efer(void);
20
21#ifdef CONFIG_X86_BIOS_REBOOT
22extern int reboot_force;
23#else
24static const int reboot_force = 0;
25#endif
26
27long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
28
29#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))
30#define round_down(x, y) ((x) & ~((y) - 1))
31
32#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
new file mode 100644
index 00000000000..25f1bb8fc62
--- /dev/null
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -0,0 +1,145 @@
1#ifndef _ASM_X86_PTRACE_ABI_H
2#define _ASM_X86_PTRACE_ABI_H
3
4#ifdef __i386__
5
6#define EBX 0
7#define ECX 1
8#define EDX 2
9#define ESI 3
10#define EDI 4
11#define EBP 5
12#define EAX 6
13#define DS 7
14#define ES 8
15#define FS 9
16#define GS 10
17#define ORIG_EAX 11
18#define EIP 12
19#define CS 13
20#define EFL 14
21#define UESP 15
22#define SS 16
23#define FRAME_SIZE 17
24
25#else /* __i386__ */
26
27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
28#define R15 0
29#define R14 8
30#define R13 16
31#define R12 24
32#define RBP 32
33#define RBX 40
34/* arguments: interrupts/non-tracing syscalls only save up to here */
35#define R11 48
36#define R10 56
37#define R9 64
38#define R8 72
39#define RAX 80
40#define RCX 88
41#define RDX 96
42#define RSI 104
43#define RDI 112
44#define ORIG_RAX 120 /* = ERROR */
45/* end of arguments */
46/* cpu exception frame or undefined in case of fast syscall. */
47#define RIP 128
48#define CS 136
49#define EFLAGS 144
50#define RSP 152
51#define SS 160
52#define ARGOFFSET R11
53#endif /* __ASSEMBLY__ */
54
55/* top of stack page */
56#define FRAME_SIZE 168
57
58#endif /* !__i386__ */
59
60/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
61#define PTRACE_GETREGS 12
62#define PTRACE_SETREGS 13
63#define PTRACE_GETFPREGS 14
64#define PTRACE_SETFPREGS 15
65#define PTRACE_GETFPXREGS 18
66#define PTRACE_SETFPXREGS 19
67
68#define PTRACE_OLDSETOPTIONS 21
69
70/* only useful for accessing 32-bit programs / kernels */
71#define PTRACE_GET_THREAD_AREA 25
72#define PTRACE_SET_THREAD_AREA 26
73
74#ifdef __x86_64__
75# define PTRACE_ARCH_PRCTL 30
76#endif
77
78#define PTRACE_SYSEMU 31
79#define PTRACE_SYSEMU_SINGLESTEP 32
80
81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
82
83#ifdef CONFIG_X86_PTRACE_BTS
84
85#ifndef __ASSEMBLY__
86#include <asm/types.h>
87
88/* configuration/status structure used in PTRACE_BTS_CONFIG and
89 PTRACE_BTS_STATUS commands.
90*/
91struct ptrace_bts_config {
92 /* requested or actual size of BTS buffer in bytes */
93 __u32 size;
94 /* bitmask of below flags */
95 __u32 flags;
96 /* buffer overflow signal */
97 __u32 signal;
98 /* actual size of bts_struct in bytes */
99 __u32 bts_size;
100};
101#endif /* __ASSEMBLY__ */
102
103#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
104#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
105#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
106 instead of wrapping around */
107#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
108
109#define PTRACE_BTS_CONFIG 40
110/* Configure branch trace recording.
111 ADDR points to a struct ptrace_bts_config.
112 DATA gives the size of that buffer.
113 A new buffer is allocated, if requested in the flags.
114 An overflow signal may only be requested for new buffers.
115 Returns the number of bytes read.
116*/
117#define PTRACE_BTS_STATUS 41
118/* Return the current configuration in a struct ptrace_bts_config
119 pointed to by ADDR; DATA gives the size of that buffer.
120 Returns the number of bytes written.
121*/
122#define PTRACE_BTS_SIZE 42
123/* Return the number of available BTS records for draining.
124 DATA and ADDR are ignored.
125*/
126#define PTRACE_BTS_GET 43
127/* Get a single BTS record.
128 DATA defines the index into the BTS array, where 0 is the newest
129 entry, and higher indices refer to older entries.
130 ADDR is pointing to struct bts_struct (see asm/ds.h).
131*/
132#define PTRACE_BTS_CLEAR 44
133/* Clear the BTS buffer.
134 DATA and ADDR are ignored.
135*/
136#define PTRACE_BTS_DRAIN 45
137/* Read all available BTS records and clear the buffer.
138 ADDR points to an array of struct bts_struct.
139 DATA gives the size of that buffer.
140 BTS records are read from oldest to newest.
141 Returns number of BTS records drained.
142*/
143#endif /* CONFIG_X86_PTRACE_BTS */
144
145#endif /* _ASM_X86_PTRACE_ABI_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 00000000000..d1531c8480b
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,280 @@
1#ifndef _ASM_X86_PTRACE_H
2#define _ASM_X86_PTRACE_H
3
4#include <linux/compiler.h> /* For __user */
5#include <asm/ptrace-abi.h>
6#include <asm/processor-flags.h>
7
8#ifdef __KERNEL__
9#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
10#include <asm/segment.h>
11#endif
12
13#ifndef __ASSEMBLY__
14
15#ifdef __i386__
16/* this struct defines the way the registers are stored on the
17 stack during a system call. */
18
19#ifndef __KERNEL__
20
21struct pt_regs {
22 long ebx;
23 long ecx;
24 long edx;
25 long esi;
26 long edi;
27 long ebp;
28 long eax;
29 int xds;
30 int xes;
31 int xfs;
32 /* int gs; */
33 long orig_eax;
34 long eip;
35 int xcs;
36 long eflags;
37 long esp;
38 int xss;
39};
40
41#else /* __KERNEL__ */
42
43struct pt_regs {
44 unsigned long bx;
45 unsigned long cx;
46 unsigned long dx;
47 unsigned long si;
48 unsigned long di;
49 unsigned long bp;
50 unsigned long ax;
51 unsigned long ds;
52 unsigned long es;
53 unsigned long fs;
54 /* int gs; */
55 unsigned long orig_ax;
56 unsigned long ip;
57 unsigned long cs;
58 unsigned long flags;
59 unsigned long sp;
60 unsigned long ss;
61};
62
63#endif /* __KERNEL__ */
64
65#else /* __i386__ */
66
67#ifndef __KERNEL__
68
69struct pt_regs {
70 unsigned long r15;
71 unsigned long r14;
72 unsigned long r13;
73 unsigned long r12;
74 unsigned long rbp;
75 unsigned long rbx;
76/* arguments: non-interrupt/non-tracing syscalls only save up to here */
77 unsigned long r11;
78 unsigned long r10;
79 unsigned long r9;
80 unsigned long r8;
81 unsigned long rax;
82 unsigned long rcx;
83 unsigned long rdx;
84 unsigned long rsi;
85 unsigned long rdi;
86 unsigned long orig_rax;
87/* end of arguments */
88/* cpu exception frame or undefined */
89 unsigned long rip;
90 unsigned long cs;
91 unsigned long eflags;
92 unsigned long rsp;
93 unsigned long ss;
94/* top of stack page */
95};
96
97#else /* __KERNEL__ */
98
99struct pt_regs {
100 unsigned long r15;
101 unsigned long r14;
102 unsigned long r13;
103 unsigned long r12;
104 unsigned long bp;
105 unsigned long bx;
106/* arguments: non-interrupt/non-tracing syscalls only save up to here */
107 unsigned long r11;
108 unsigned long r10;
109 unsigned long r9;
110 unsigned long r8;
111 unsigned long ax;
112 unsigned long cx;
113 unsigned long dx;
114 unsigned long si;
115 unsigned long di;
116 unsigned long orig_ax;
117/* end of arguments */
118/* cpu exception frame or undefined */
119 unsigned long ip;
120 unsigned long cs;
121 unsigned long flags;
122 unsigned long sp;
123 unsigned long ss;
124/* top of stack page */
125};
126
127#endif /* __KERNEL__ */
128#endif /* !__i386__ */
129
130
131#ifdef CONFIG_X86_PTRACE_BTS
132/* a branch trace record entry
133 *
134 * In order to unify the interface between various processor versions,
135 * we use the below data structure for all processors.
136 */
137enum bts_qualifier {
138 BTS_INVALID = 0,
139 BTS_BRANCH,
140 BTS_TASK_ARRIVES,
141 BTS_TASK_DEPARTS
142};
143
144struct bts_struct {
145 __u64 qualifier;
146 union {
147 /* BTS_BRANCH */
148 struct {
149 __u64 from_ip;
150 __u64 to_ip;
151 } lbr;
152 /* BTS_TASK_ARRIVES or
153 BTS_TASK_DEPARTS */
154 __u64 jiffies;
155 } variant;
156};
157#endif /* CONFIG_X86_PTRACE_BTS */
158
159#ifdef __KERNEL__
160
161#include <linux/init.h>
162
163struct cpuinfo_x86;
164struct task_struct;
165
166#ifdef CONFIG_X86_PTRACE_BTS
167extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
168extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
169#else
170#define ptrace_bts_init_intel(config) do {} while (0)
171#endif /* CONFIG_X86_PTRACE_BTS */
172
173extern unsigned long profile_pc(struct pt_regs *regs);
174
175extern unsigned long
176convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
177extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
178 int error_code, int si_code);
179void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
180
181extern long syscall_trace_enter(struct pt_regs *);
182extern void syscall_trace_leave(struct pt_regs *);
183
184static inline unsigned long regs_return_value(struct pt_regs *regs)
185{
186 return regs->ax;
187}
188
189/*
190 * user_mode_vm(regs) determines whether a register set came from user mode.
191 * This is true if V8086 mode was enabled OR if the register set was from
192 * protected mode with RPL-3 CS value. This tricky test checks that with
193 * one comparison. Many places in the kernel can bypass this full check
194 * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
195 */
196static inline int user_mode(struct pt_regs *regs)
197{
198#ifdef CONFIG_X86_32
199 return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
200#else
201 return !!(regs->cs & 3);
202#endif
203}
204
205static inline int user_mode_vm(struct pt_regs *regs)
206{
207#ifdef CONFIG_X86_32
208 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
209 USER_RPL;
210#else
211 return user_mode(regs);
212#endif
213}
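
The single comparison in user_mode_vm() works because X86_VM_MASK (the EFLAGS VM bit, 0x00020000) is numerically far above USER_RPL (3): OR-ing it into the RPL bits pushes the result past the threshold whenever V8086 mode is on, regardless of the CS value. A minimal sketch with the i386 constants written out (illustration only, not the kernel's code):

	/*
	 * kernel, ring 0, no VM86:  (0 | 0x00000) = 0    -> not user
	 * user,   ring 3, no VM86:  (3 | 0x00000) = 3    -> user
	 * VM86,   any RPL:          (x | 0x20000) >= 3   -> user
	 */
	static inline int user_mode_vm_sketch(unsigned long cs, unsigned long flags)
	{
		return ((cs & 0x3) | (flags & 0x00020000)) >= 0x3;
	}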
214
215static inline int v8086_mode(struct pt_regs *regs)
216{
217#ifdef CONFIG_X86_32
218 return (regs->flags & X86_VM_MASK);
219#else
220 return 0; /* No V86 mode support in long mode */
221#endif
222}
223
224/*
225 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
226 * when it traps. So regs will be the current sp.
227 *
228 * This is valid only for kernel mode traps.
229 */
230static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
231{
232#ifdef CONFIG_X86_32
233 return (unsigned long)regs;
234#else
235 return regs->sp;
236#endif
237}
238
239static inline unsigned long instruction_pointer(struct pt_regs *regs)
240{
241 return regs->ip;
242}
243
244static inline unsigned long frame_pointer(struct pt_regs *regs)
245{
246 return regs->bp;
247}
248
249static inline unsigned long user_stack_pointer(struct pt_regs *regs)
250{
251 return regs->sp;
252}
253
254/*
255 * These are defined as per linux/ptrace.h; see that header.
256 */
257#define arch_has_single_step() (1)
258extern void user_enable_single_step(struct task_struct *);
259extern void user_disable_single_step(struct task_struct *);
260
261extern void user_enable_block_step(struct task_struct *);
262#ifdef CONFIG_X86_DEBUGCTLMSR
263#define arch_has_block_step() (1)
264#else
265#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
266#endif
267
268struct user_desc;
269extern int do_get_thread_area(struct task_struct *p, int idx,
270 struct user_desc __user *info);
271extern int do_set_thread_area(struct task_struct *p, int idx,
272 struct user_desc __user *info, int can_allocate);
273
274#define __ARCH_WANT_COMPAT_SYS_PTRACE
275
276#endif /* __KERNEL__ */
277
278#endif /* !__ASSEMBLY__ */
279
280#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 00000000000..6d93508f262
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_PVCLOCK_ABI_H
2#define _ASM_X86_PVCLOCK_ABI_H
3#ifndef __ASSEMBLY__
4
5/*
6 * These structs MUST NOT be changed.
7 * They are the ABI between hypervisor and guest OS.
8 * Both Xen and KVM use this.
9 *
10 * pvclock_vcpu_time_info holds the system time and the tsc timestamp
11 * of the last update. So the guest can use the tsc delta to get a
12 * more precise system time. There is one per virtual cpu.
13 *
14 * pvclock_wall_clock references the point in time when the system
15 * time was zero (usually boot time), thus the guest calculates the
16 * current wall clock by adding the system time.
17 *
18 * The protocol for the "version" fields is: the hypervisor increments it
19 * (making it odd) before it starts updating the fields, and increments it
20 * again (making it even) when it is done. Thus the guest can make sure the
21 * time values it got are consistent by checking the version before
22 * and after reading them.
23 */
24
25struct pvclock_vcpu_time_info {
26 u32 version;
27 u32 pad0;
28 u64 tsc_timestamp;
29 u64 system_time;
30 u32 tsc_to_system_mul;
31 s8 tsc_shift;
32 u8 pad[3];
33} __attribute__((__packed__)); /* 32 bytes */
34
35struct pvclock_wall_clock {
36 u32 version;
37 u32 sec;
38 u32 nsec;
39} __attribute__((__packed__));
40
41#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H */
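
The even/odd version protocol described above amounts to a seqlock-style retry loop on the guest side. A minimal sketch of a consistent read (the real consumer is pvclock_clocksource_read(), declared in asm/pvclock.h below; the barrier placement here is illustrative):

	/* Sketch: retry until the version is even and unchanged across the reads. */
	static u64 pvclock_read_sketch(struct pvclock_vcpu_time_info *src)
	{
		u32 version;
		u64 time;

		do {
			version = src->version;
			rmb();				/* read fields only after the version */
			time = src->system_time;	/* the real code also adds a tsc delta */
			rmb();				/* re-check the version afterwards */
		} while ((version & 1) || version != src->version);

		return time;
	}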
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 00000000000..53235fd5f8c
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_PVCLOCK_H
2#define _ASM_X86_PVCLOCK_H
3
4#include <linux/clocksource.h>
5#include <asm/pvclock-abi.h>
6
7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
10void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
11 struct pvclock_vcpu_time_info *vcpu,
12 struct timespec *ts);
13
14#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 00000000000..df7710354f8
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_REBOOT_H
2#define _ASM_X86_REBOOT_H
3
4struct pt_regs;
5
6struct machine_ops {
7 void (*restart)(char *cmd);
8 void (*halt)(void);
9 void (*power_off)(void);
10 void (*shutdown)(void);
11 void (*crash_shutdown)(struct pt_regs *);
12 void (*emergency_restart)(void);
13};
14
15extern struct machine_ops machine_ops;
16
17void native_machine_crash_shutdown(struct pt_regs *regs);
18void native_machine_shutdown(void);
19void machine_real_restart(const unsigned char *code, int length);
20
21#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 00000000000..765debe4c54
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_REBOOT_FIXUPS_H
2#define _ASM_X86_REBOOT_FIXUPS_H
3
4extern void mach_reboot_fixups(void);
5
6#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 00000000000..d5cd6c58688
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,82 @@
1#ifndef _ASM_X86_REQUIRED_FEATURES_H
2#define _ASM_X86_REQUIRED_FEATURES_H
3
4/* Define the minimum CPUID feature set for the kernel. These bits are checked
5 really early to actually display a visible error message before the
6 kernel dies. Make sure to assign features to the proper mask!
7
8 Some requirements that are not in CPUID yet are also in the
9   CONFIG_X86_MINIMUM_CPU_FAMILY, which is checked too.
10
11   The real information is in arch/x86/Kconfig.cpu; this just converts
12 the CONFIGs into a bitmask */
13
14#ifndef CONFIG_MATH_EMULATION
15# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
16#else
17# define NEED_FPU 0
18#endif
19
20#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
21# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
22#else
23# define NEED_PAE 0
24#endif
25
26#ifdef CONFIG_X86_CMPXCHG64
27# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
28#else
29# define NEED_CX8 0
30#endif
31
32#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
33# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
34#else
35# define NEED_CMOV 0
36#endif
37
38#ifdef CONFIG_X86_USE_3DNOW
39# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
40#else
41# define NEED_3DNOW 0
42#endif
43
44#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
45# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
46#else
47# define NEED_NOPL 0
48#endif
49
50#ifdef CONFIG_X86_64
51#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE (1<<(X86_FEATURE_PGE & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
57#define NEED_LM (1<<(X86_FEATURE_LM & 31))
58#else
59#define NEED_PSE 0
60#define NEED_MSR 0
61#define NEED_PGE 0
62#define NEED_FXSR 0
63#define NEED_XMM 0
64#define NEED_XMM2 0
65#define NEED_LM 0
66#endif
67
68#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
69 NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
70 NEED_XMM|NEED_XMM2)
71#define SSE_MASK (NEED_XMM|NEED_XMM2)
72
73#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
74
75#define REQUIRED_MASK2 0
76#define REQUIRED_MASK3 (NEED_NOPL)
77#define REQUIRED_MASK4 0
78#define REQUIRED_MASK5 0
79#define REQUIRED_MASK6 0
80#define REQUIRED_MASK7 0
81
82#endif /* _ASM_X86_REQUIRED_FEATURES_H */
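
These masks are consumed very early in boot: each CPUID capability word is tested against the matching REQUIRED_MASK<n>, and any missing bit is fatal before the kernel proper starts. A hedged sketch of that test (the real code lives in the boot-time CPU validation; the function name here is illustrative):

	/* Illustrative only: nonzero means the CPU lacks a required feature. */
	static int missing_required_features(const u32 *cpu_caps)
	{
		static const u32 required[8] = {
			REQUIRED_MASK0, REQUIRED_MASK1, REQUIRED_MASK2, REQUIRED_MASK3,
			REQUIRED_MASK4, REQUIRED_MASK5, REQUIRED_MASK6, REQUIRED_MASK7,
		};
		int i;

		for (i = 0; i < 8; i++)
			if (required[i] & ~cpu_caps[i])
				return 1;	/* a mandatory feature bit is absent */
		return 0;
	}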
diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h
new file mode 100644
index 00000000000..04bc4db8921
--- /dev/null
+++ b/arch/x86/include/asm/resource.h
@@ -0,0 +1 @@
#include <asm-generic/resource.h>
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
new file mode 100644
index 00000000000..3ff1c2cb1da
--- /dev/null
+++ b/arch/x86/include/asm/resume-trace.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_RESUME_TRACE_H
2#define _ASM_X86_RESUME_TRACE_H
3
4#include <asm/asm.h>
5
6#define TRACE_RESUME(user) \
7do { \
8 if (pm_trace_enabled) { \
9 const void *tracedata; \
10 asm volatile(_ASM_MOV " $1f,%0\n" \
11 ".section .tracedata,\"a\"\n" \
12 "1:\t.word %c1\n\t" \
13 _ASM_PTR " %c2\n" \
14 ".previous" \
15 :"=r" (tracedata) \
16 : "i" (__LINE__), "i" (__FILE__)); \
17 generate_resume_trace(tracedata, user); \
18 } \
19} while (0)
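
Each expansion of TRACE_RESUME() stores the call site's file and line in the dedicated .tracedata section and passes that record to generate_resume_trace(), so a box that hangs during suspend/resume can be matched to the last trace point it reached. Usage is just the bare macro at the step of interest, the argument being forwarded unchanged, e.g.:

	TRACE_RESUME(0);	/* mark this point in the resume path */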
20
21#endif /* _ASM_X86_RESUME_TRACE_H */
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
new file mode 100644
index 00000000000..97bab6388a9
--- /dev/null
+++ b/arch/x86/include/asm/rio.h
@@ -0,0 +1,63 @@
1/*
2 * Derived from include/asm-x86/mach-summit/mach_mpparse.h
3 * and include/asm-x86/mach-default/bios_ebda.h
4 *
5 * Author: Laurent Vivier <Laurent.Vivier@bull.net>
6 */
7
8#ifndef _ASM_X86_RIO_H
9#define _ASM_X86_RIO_H
10
11#define RIO_TABLE_VERSION 3
12
13struct rio_table_hdr {
14 u8 version; /* Version number of this data structure */
15 u8 num_scal_dev; /* # of Scalability devices */
16 u8 num_rio_dev; /* # of RIO I/O devices */
17} __attribute__((packed));
18
19struct scal_detail {
20 u8 node_id; /* Scalability Node ID */
21 u32 CBAR; /* Address of 1MB register space */
22 u8 port0node; /* Node ID port connected to: 0xFF=None */
23 u8 port0port; /* Port num port connected to: 0,1,2, or */
24 /* 0xFF=None */
25 u8 port1node; /* Node ID port connected to: 0xFF = None */
26 u8 port1port; /* Port num port connected to: 0,1,2, or */
27 /* 0xFF=None */
28 u8 port2node; /* Node ID port connected to: 0xFF = None */
29 u8 port2port; /* Port num port connected to: 0,1,2, or */
30 /* 0xFF=None */
31 u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
32} __attribute__((packed));
33
34struct rio_detail {
35 u8 node_id; /* RIO Node ID */
36 u32 BBAR; /* Address of 1MB register space */
37 u8 type; /* Type of device */
38 u8 owner_id; /* Node ID of Hurricane that owns this */
39 /* node */
40 u8 port0node; /* Node ID port connected to: 0xFF=None */
41 u8 port0port; /* Port num port connected to: 0,1,2, or */
42 /* 0xFF=None */
43 u8 port1node; /* Node ID port connected to: 0xFF=None */
44 u8 port1port; /* Port num port connected to: 0,1,2, or */
45 /* 0xFF=None */
46 u8 first_slot; /* Lowest slot number below this Calgary */
47 u8 status; /* Bit 0 = 1 : the XAPIC is used */
48 /* = 0 : the XAPIC is not used, ie: */
49 /* ints fwded to another XAPIC */
50 /* Bits1:7 Reserved */
51 u8 WP_index; /* instance index - lower ones have */
52 /* lower slot numbers/PCI bus numbers */
53 u8 chassis_num; /* 1 based Chassis number */
54} __attribute__((packed));
55
56enum {
57 HURR_SCALABILTY = 0, /* Hurricane Scalability info */
58 HURR_RIOIB = 2, /* Hurricane RIOIB info */
59 COMPAT_CALGARY = 4, /* Compatibility Calgary */
60 ALT_CALGARY = 5, /* Second Planar Calgary */
61};
62
63#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
new file mode 100644
index 00000000000..f71c3b0ed36
--- /dev/null
+++ b/arch/x86/include/asm/rtc.h
@@ -0,0 +1 @@
#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
new file mode 100644
index 00000000000..6a8c0d64510
--- /dev/null
+++ b/arch/x86/include/asm/rwlock.h
@@ -0,0 +1,8 @@
1#ifndef _ASM_X86_RWLOCK_H
2#define _ASM_X86_RWLOCK_H
3
4#define RW_LOCK_BIAS 0x01000000
5
6/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
7
8#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 00000000000..ca7517d3377
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,265 @@
1/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 *
5 * Derived from asm-x86/semaphore.h
6 *
7 *
8 * The MSW of the count is the negated number of active writers and waiting
9 * lockers, and the LSW is the total number of active locks
10 *
11 * The lock count is initialized to 0 (no active and no waiting lockers).
12 *
13 * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
14 * uncontended lock. This can be determined because XADD returns the old value.
15 * Readers increment by 1 and see a positive value when uncontended, negative
16 * if there are writers (and maybe) readers waiting (in which case it goes to
17 * sleep).
18 *
19 * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
20 * be extended to 65534 by manually checking the whole MSW rather than relying
21 * on the S flag.
22 *
23 * The value of ACTIVE_BIAS supports up to 65535 active processes.
24 *
25 * This should be totally fair - if anything is waiting, a process that wants a
26 * lock will go to the back of the queue. When the currently active lock is
27 * released, if there's a writer at the front of the queue, then that and only
28 * that will be woken up; if there's a bunch of consecutive readers at the
29 * front, then they'll all be woken up, but no other readers will be.
30 */
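
Worked through with the bias values defined below, the common count transitions look like this (a comment-style sketch of the arithmetic, not running code):

	/*
	 * start        operation                     old (xadd result)   new count
	 * 0x00000000   reader: +ACTIVE_BIAS (1)      0x00000000          0x00000001
	 * 0x00000001   reader: +1                    0x00000001          0x00000002
	 * 0x00000000   writer: +ACTIVE_WRITE_BIAS    0x00000000          0xffff0001
	 * 0xffff0001   reader: +1 (contended)        0xffff0001 (< 0)    0xffff0002
	 */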
31
32#ifndef _ASM_X86_RWSEM_H
33#define _ASM_X86_RWSEM_H
34
35#ifndef _LINUX_RWSEM_H
36#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
37#endif
38
39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44
45struct rwsem_waiter;
46
47extern asmregparm struct rw_semaphore *
48 rwsem_down_read_failed(struct rw_semaphore *sem);
49extern asmregparm struct rw_semaphore *
50 rwsem_down_write_failed(struct rw_semaphore *sem);
51extern asmregparm struct rw_semaphore *
52 rwsem_wake(struct rw_semaphore *);
53extern asmregparm struct rw_semaphore *
54 rwsem_downgrade_wake(struct rw_semaphore *sem);
55
56/*
57 * the semaphore definition
58 */
59
60#define RWSEM_UNLOCKED_VALUE 0x00000000
61#define RWSEM_ACTIVE_BIAS 0x00000001
62#define RWSEM_ACTIVE_MASK 0x0000ffff
63#define RWSEM_WAITING_BIAS (-0x00010000)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66
67struct rw_semaphore {
68 signed long count;
69 spinlock_t wait_lock;
70 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC
72 struct lockdep_map dep_map;
73#endif
74};
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
78#else
79# define __RWSEM_DEP_MAP_INIT(lockname)
80#endif
81
82
83#define __RWSEM_INITIALIZER(name) \
84{ \
85 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
86 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
87}
88
89#define DECLARE_RWSEM(name) \
90 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
91
92extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
93 struct lock_class_key *key);
94
95#define init_rwsem(sem) \
96do { \
97 static struct lock_class_key __key; \
98 \
99 __init_rwsem((sem), #sem, &__key); \
100} while (0)
101
102/*
103 * lock for reading
104 */
105static inline void __down_read(struct rw_semaphore *sem)
106{
107 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " incl (%%eax)\n\t"
109 /* adds 0x00000001, returns the old value */
110 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n"
112 "1:\n\t"
113 "# ending down_read\n\t"
114 : "+m" (sem->count)
115 : "a" (sem)
116 : "memory", "cc");
117}
118
119/*
120 * trylock for reading -- returns 1 if successful, 0 if contention
121 */
122static inline int __down_read_trylock(struct rw_semaphore *sem)
123{
124 __s32 result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t"
126 " movl %0,%1\n\t"
127 "1:\n\t"
128 " movl %1,%2\n\t"
129 " addl %3,%2\n\t"
130 " jle 2f\n\t"
131 LOCK_PREFIX " cmpxchgl %2,%0\n\t"
132 " jnz 1b\n\t"
133 "2:\n\t"
134 "# ending __down_read_trylock\n\t"
135 : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
136 : "i" (RWSEM_ACTIVE_READ_BIAS)
137 : "memory", "cc");
138 return result >= 0 ? 1 : 0;
139}
140
141/*
142 * lock for writing
143 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{
146 int tmp;
147
148 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t"
150 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
151 /* subtract 0x0000ffff, returns the old value */
152 " testl %%edx,%%edx\n\t"
153 /* was the count 0 before? */
154 " jz 1f\n"
155 " call call_rwsem_down_write_failed\n"
156 "1:\n"
157 "# ending down_write"
158 : "+m" (sem->count), "=d" (tmp)
159 : "a" (sem), "1" (tmp)
160 : "memory", "cc");
161}
162
163static inline void __down_write(struct rw_semaphore *sem)
164{
165 __down_write_nested(sem, 0);
166}
167
168/*
169 * trylock for writing -- returns 1 if successful, 0 if contention
170 */
171static inline int __down_write_trylock(struct rw_semaphore *sem)
172{
173 signed long ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1;
178 return 0;
179}
180
181/*
182 * unlock after reading
183 */
184static inline void __up_read(struct rw_semaphore *sem)
185{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
189 /* subtracts 1, returns the old value */
190 " jns 1f\n\t"
191 " call call_rwsem_wake\n"
192 "1:\n"
193 "# ending __up_read\n"
194 : "+m" (sem->count), "=d" (tmp)
195 : "a" (sem), "1" (tmp)
196 : "memory", "cc");
197}
198
199/*
200 * unlock after writing
201 */
202static inline void __up_write(struct rw_semaphore *sem)
203{
204 asm volatile("# beginning __up_write\n\t"
205 " movl %2,%%edx\n\t"
206 LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
207 /* tries to transition
208 0xffff0001 -> 0x00000000 */
209 " jz 1f\n"
210 " call call_rwsem_wake\n"
211 "1:\n\t"
212 "# ending __up_write\n"
213 : "+m" (sem->count)
214 : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
215 : "memory", "cc", "edx");
216}
217
218/*
219 * downgrade write lock to read lock
220 */
221static inline void __downgrade_write(struct rw_semaphore *sem)
222{
223 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " addl %2,(%%eax)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
226 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t"
229 "# ending __downgrade_write\n"
230 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc");
233}
234
235/*
236 * implement atomic add functionality
237 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
239{
240 asm volatile(LOCK_PREFIX "addl %1,%0"
241 : "+m" (sem->count)
242 : "ir" (delta));
243}
244
245/*
246 * implement exchange and add functionality
247 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
249{
250 int tmp = delta;
251
252 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count)
254 : : "memory");
255
256 return tmp + delta;
257}
258
259static inline int rwsem_is_locked(struct rw_semaphore *sem)
260{
261 return (sem->count != 0);
262}
263
264#endif /* __KERNEL__ */
265#endif /* _ASM_X86_RWSEM_H */
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
new file mode 100644
index 00000000000..263d397d2ee
--- /dev/null
+++ b/arch/x86/include/asm/scatterlist.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H
3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 unsigned int length;
13 dma_addr_t dma_address;
14 unsigned int dma_length;
15};
16
17#define ARCH_HAS_SG_CHAIN
18#define ISA_DMA_THRESHOLD (0x00ffffff)
19
20/*
21 * These macros should be used after a pci_map_sg call has been done
22 * to get bus addresses of each of the SG entries and their lengths.
23 * You should only work with the number of sg entries pci_map_sg
24 * returns.
25 */
26#define sg_dma_address(sg) ((sg)->dma_address)
27#ifdef CONFIG_X86_32
28# define sg_dma_len(sg) ((sg)->length)
29#else
30# define sg_dma_len(sg) ((sg)->dma_length)
31#endif
32
33#endif /* _ASM_X86_SCATTERLIST_H */
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
new file mode 100644
index 00000000000..c62e58a5a90
--- /dev/null
+++ b/arch/x86/include/asm/seccomp.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "seccomp_32.h"
3#else
4# include "seccomp_64.h"
5#endif
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
new file mode 100644
index 00000000000..a6ad87b352c
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -0,0 +1,17 @@
1#ifndef _ASM_X86_SECCOMP_32_H
2#define _ASM_X86_SECCOMP_32_H
3
4#include <linux/thread_info.h>
5
6#ifdef TIF_32BIT
7#error "unexpected TIF_32BIT on i386"
8#endif
9
10#include <linux/unistd.h>
11
12#define __NR_seccomp_read __NR_read
13#define __NR_seccomp_write __NR_write
14#define __NR_seccomp_exit __NR_exit
15#define __NR_seccomp_sigreturn __NR_sigreturn
16
17#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
new file mode 100644
index 00000000000..4171bb794e9
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -0,0 +1,25 @@
1#ifndef _ASM_X86_SECCOMP_64_H
2#define _ASM_X86_SECCOMP_64_H
3
4#include <linux/thread_info.h>
5
6#ifdef TIF_32BIT
7#error "unexpected TIF_32BIT on x86_64"
8#else
9#define TIF_32BIT TIF_IA32
10#endif
11
12#include <linux/unistd.h>
13#include <asm/ia32_unistd.h>
14
15#define __NR_seccomp_read __NR_read
16#define __NR_seccomp_write __NR_write
17#define __NR_seccomp_exit __NR_exit
18#define __NR_seccomp_sigreturn __NR_rt_sigreturn
19
20#define __NR_seccomp_read_32 __NR_ia32_read
21#define __NR_seccomp_write_32 __NR_ia32_write
22#define __NR_seccomp_exit_32 __NR_ia32_exit
23#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
24
25#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
new file mode 100644
index 00000000000..2b8c5160388
--- /dev/null
+++ b/arch/x86/include/asm/sections.h
@@ -0,0 +1 @@
#include <asm-generic/sections.h>
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
new file mode 100644
index 00000000000..1dc1b51ac62
--- /dev/null
+++ b/arch/x86/include/asm/segment.h
@@ -0,0 +1,209 @@
1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H
3
4/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \
11 (((limit) & 0x0000ffffULL)))
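
As a worked example, the flat 4 GiB ring-0 code descriptor built during boot (flags 0xc09b, base 0, limit 0xfffff with 4 KiB granularity) packs like this (arithmetic sketch; the base terms drop out because base is 0):

	/* GDT_ENTRY(0xc09b, 0, 0xfffff):
	 *	  (0xc09b  & 0xf0ff)  << 40  = 0x00c09b0000000000
	 *	| (0xfffff & 0xf0000) << 32  = 0x000f000000000000
	 *	| (0xfffff & 0x0ffff)        = 0x000000000000ffff
	 *	                             = 0x00cf9b000000ffff
	 */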
12
13/* Simple and small GDT entries for booting only */
14
15#define GDT_ENTRY_BOOT_CS 2
16#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
17
18#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
19#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
20
21#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
22#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
23
24#ifdef CONFIG_X86_32
25/*
26 * The layout of the per-CPU GDT under Linux:
27 *
28 * 0 - null
29 * 1 - reserved
30 * 2 - reserved
31 * 3 - reserved
32 *
33 * 4 - unused <==== new cacheline
34 * 5 - unused
35 *
36 * ------- start of TLS (Thread-Local Storage) segments:
37 *
38 * 6 - TLS segment #1 [ glibc's TLS segment ]
39 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
40 * 8 - TLS segment #3
41 * 9 - reserved
42 * 10 - reserved
43 * 11 - reserved
44 *
45 * ------- start of kernel segments:
46 *
47 * 12 - kernel code segment <==== new cacheline
48 * 13 - kernel data segment
49 * 14 - default user CS
50 * 15 - default user DS
51 * 16 - TSS
52 * 17 - LDT
53 * 18 - PNPBIOS support (16->32 gate)
54 * 19 - PNPBIOS support
55 * 20 - PNPBIOS support
56 * 21 - PNPBIOS support
57 * 22 - PNPBIOS support
58 * 23 - APM BIOS support
59 * 24 - APM BIOS support
60 * 25 - APM BIOS support
61 *
62 * 26 - ESPFIX small SS
63 * 27 - per-cpu [ offset to per-cpu data area ]
64 * 28 - unused
65 * 29 - unused
66 * 30 - unused
67 * 31 - TSS for double fault handler
68 */
69#define GDT_ENTRY_TLS_MIN 6
70#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
71
72#define GDT_ENTRY_DEFAULT_USER_CS 14
73
74#define GDT_ENTRY_DEFAULT_USER_DS 15
75
76#define GDT_ENTRY_KERNEL_BASE 12
77
78#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
79
80#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
81
82#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
83#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
84
85#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
86#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
87
88#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
89#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
90
91#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
92#ifdef CONFIG_SMP
93#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
94#else
95#define __KERNEL_PERCPU 0
96#endif
97
98#define GDT_ENTRY_DOUBLEFAULT_TSS 31
99
100/*
101 * The GDT has 32 entries
102 */
103#define GDT_ENTRIES 32
104
105/* The PnP BIOS entries in the GDT */
106#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
107#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
108#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
109#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
110#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
111
112/* The PnP BIOS selectors */
113#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
114#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
115#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
116#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
117#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
118
119/* Bottom two bits of selector give the ring privilege level */
120#define SEGMENT_RPL_MASK 0x3
121/* Bit 2 is table indicator (LDT/GDT) */
122#define SEGMENT_TI_MASK 0x4
123
124/* User mode is privilege level 3 */
125#define USER_RPL 0x3
126/* LDT segment has TI set, GDT has it cleared */
127#define SEGMENT_LDT 0x4
128#define SEGMENT_GDT 0x0
129
130/*
131 * Matching rules for certain types of segments.
132 */
133
134/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
135#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
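
(The 0xf4 mask clears the two RPL bits and bit 3, so the two consecutive code selectors collapse onto the base: with GDT_ENTRY_PNPBIOS_BASE = 18, PNP_CS32 = 0x90 and PNP_CS16 = 0x98, and both & 0xf4 yield 0x90.)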
136
137
138#else
139#include <asm/cache.h>
140
141#define GDT_ENTRY_KERNEL32_CS 1
142#define GDT_ENTRY_KERNEL_CS 2
143#define GDT_ENTRY_KERNEL_DS 3
144
145#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
146
147/*
148 * we cannot use the same code segment descriptor for user and kernel
149 * -- not even in the long flat mode, because of different DPL /kkeil
150 * The segment offset needs to contain an RPL. Grr. -AK
151 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
152 */
153#define GDT_ENTRY_DEFAULT_USER32_CS 4
154#define GDT_ENTRY_DEFAULT_USER_DS 5
155#define GDT_ENTRY_DEFAULT_USER_CS 6
156#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
157#define __USER32_DS __USER_DS
158
159#define GDT_ENTRY_TSS 8 /* needs two entries */
160#define GDT_ENTRY_LDT 10 /* needs two entries */
161#define GDT_ENTRY_TLS_MIN 12
162#define GDT_ENTRY_TLS_MAX 14
163
164#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
165#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
166
167/* TLS indexes for 64bit - hardcoded in arch_prctl */
168#define FS_TLS 0
169#define GS_TLS 1
170
171#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
172#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
173
174#define GDT_ENTRIES 16
175
176#endif
177
178#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
179#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
180#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
181#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
182#ifndef CONFIG_PARAVIRT
183#define get_kernel_rpl() 0
184#endif
185
186/* User mode is privilege level 3 */
187#define USER_RPL 0x3
188/* LDT segment has TI set, GDT has it cleared */
189#define SEGMENT_LDT 0x4
190#define SEGMENT_GDT 0x0
191
192/* Bottom two bits of selector give the ring privilege level */
193#define SEGMENT_RPL_MASK 0x3
194/* Bit 2 is table indicator (LDT/GDT) */
195#define SEGMENT_TI_MASK 0x4
196
197#define IDT_ENTRIES 256
198#define NUM_EXCEPTION_VECTORS 32
199#define GDT_SIZE (GDT_ENTRIES * 8)
200#define GDT_ENTRY_TLS_ENTRIES 3
201#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
202
203#ifdef __KERNEL__
204#ifndef __ASSEMBLY__
205extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
206#endif
207#endif
208
209#endif /* _ASM_X86_SEGMENT_H */
diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h
new file mode 100644
index 00000000000..ee50c801f7b
--- /dev/null
+++ b/arch/x86/include/asm/sembuf.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_SEMBUF_H
2#define _ASM_X86_SEMBUF_H
3
4/*
5 * The semid64_ds structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 */
13struct semid64_ds {
14 struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
15 __kernel_time_t sem_otime; /* last semop time */
16 unsigned long __unused1;
17 __kernel_time_t sem_ctime; /* last change time */
18 unsigned long __unused2;
19 unsigned long sem_nsems; /* no. of semaphores in array */
20 unsigned long __unused3;
21 unsigned long __unused4;
22};
23
24#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
new file mode 100644
index 00000000000..628c801535e
--- /dev/null
+++ b/arch/x86/include/asm/serial.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_SERIAL_H
2#define _ASM_X86_SERIAL_H
3
4/*
5 * This assumes you have a 1.8432 MHz clock for your UART.
6 *
7 * It'd be nice if someone built a serial card with a 24.576 MHz
8 * clock, since the 16550A is capable of handling a top speed of 1.5
9 * megabits/second; but this requires the faster clock.
10 */
11#define BASE_BAUD ( 1843200 / 16 )
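
(For reference, 1843200 / 16 = 115200: BASE_BAUD is the familiar top standard rate of a 16550 fed by that clock, i.e. the baud rate at divisor 1.)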
12
13/* Standard COM flags (except for COM4, because of the 8514 problem) */
14#ifdef CONFIG_SERIAL_DETECT_IRQ
15#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
16#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
17#else
18#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
19#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
20#endif
21
22#define SERIAL_PORT_DFNS \
23 /* UART CLK PORT IRQ FLAGS */ \
24 { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
25 { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
26 { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
27 { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
28
29#endif /* _ASM_X86_SERIAL_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
new file mode 100644
index 00000000000..f12d3723746
--- /dev/null
+++ b/arch/x86/include/asm/setup.h
@@ -0,0 +1,105 @@
1#ifndef _ASM_X86_SETUP_H
2#define _ASM_X86_SETUP_H
3
4#define COMMAND_LINE_SIZE 2048
5
6#ifndef __ASSEMBLY__
7
8/* Interrupt control for vSMPowered x86_64 systems */
9void vsmp_init(void);
10
11#ifdef CONFIG_X86_VISWS
12extern void visws_early_detect(void);
13extern int is_visws_box(void);
14#else
15static inline void visws_early_detect(void) { }
16static inline int is_visws_box(void) { return 0; }
17#endif
18
19/*
20 * Any setup quirks to be performed?
21 */
22struct mpc_config_processor;
23struct mpc_config_bus;
24struct mp_config_oemtable;
25struct x86_quirks {
26 int (*arch_pre_time_init)(void);
27 int (*arch_time_init)(void);
28 int (*arch_pre_intr_init)(void);
29 int (*arch_intr_init)(void);
30 int (*arch_trap_init)(void);
31 char * (*arch_memory_setup)(void);
32 int (*mach_get_smp_config)(unsigned int early);
33 int (*mach_find_smp_config)(unsigned int reserve);
34
35 int *mpc_record;
36 int (*mpc_apic_id)(struct mpc_config_processor *m);
37 void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
38 void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
39 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
40 unsigned short oemsize);
41 int (*setup_ioapic_ids)(void);
42};
43
44extern struct x86_quirks *x86_quirks;
45extern unsigned long saved_video_mode;
46
47#ifndef CONFIG_PARAVIRT
48#define paravirt_post_allocator_init() do {} while (0)
49#endif
50#endif /* __ASSEMBLY__ */
51
52#ifdef __KERNEL__
53
54#ifdef __i386__
55
56#include <linux/pfn.h>
57/*
58 * Reserved space for vmalloc and iomap - defined in asm/page.h
59 */
60#define MAXMEM_PFN PFN_DOWN(MAXMEM)
61#define MAX_NONPAE_PFN (1 << 20)
62
63#endif /* __i386__ */
64
65#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */
66
67#define OLD_CL_MAGIC 0xA33F
68#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
69#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
70
71#ifndef __ASSEMBLY__
72#include <asm/bootparam.h>
73
74#ifndef _SETUP
75
76/*
77 * This is set up by the setup-routine at boot-time
78 */
79extern struct boot_params boot_params;
80
81/*
82 * Do NOT EVER look at the BIOS memory size location.
83 * It does not work on many machines.
84 */
85#define LOWMEMSIZE() (0x9f000)
86
87#ifdef __i386__
88
89void __init i386_start_kernel(void);
90extern void probe_roms(void);
91
92extern unsigned long init_pg_tables_start;
93extern unsigned long init_pg_tables_end;
94
95#else
96void __init x86_64_init_pda(void);
97void __init x86_64_start_kernel(char *real_mode);
98void __init x86_64_start_reservations(char *real_mode_data);
99
100#endif /* __i386__ */
101#endif /* _SETUP */
102#endif /* __ASSEMBLY__ */
103#endif /* __KERNEL__ */
104
105#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
new file mode 100644
index 00000000000..b51413b7497
--- /dev/null
+++ b/arch/x86/include/asm/shmbuf.h
@@ -0,0 +1,51 @@
1#ifndef _ASM_X86_SHMBUF_H
2#define _ASM_X86_SHMBUF_H
3
4/*
5 * The shmid64_ds structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on 32 bit is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on 64 bit is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16
17struct shmid64_ds {
18 struct ipc64_perm shm_perm; /* operation perms */
19 size_t shm_segsz; /* size of segment (bytes) */
20 __kernel_time_t shm_atime; /* last attach time */
21#ifdef __i386__
22 unsigned long __unused1;
23#endif
24 __kernel_time_t shm_dtime; /* last detach time */
25#ifdef __i386__
26 unsigned long __unused2;
27#endif
28 __kernel_time_t shm_ctime; /* last change time */
29#ifdef __i386__
30 unsigned long __unused3;
31#endif
32 __kernel_pid_t shm_cpid; /* pid of creator */
33 __kernel_pid_t shm_lpid; /* pid of last operator */
34 unsigned long shm_nattch; /* no. of current attaches */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39struct shminfo64 {
40 unsigned long shmmax;
41 unsigned long shmmin;
42 unsigned long shmmni;
43 unsigned long shmseg;
44 unsigned long shmall;
45 unsigned long __unused1;
46 unsigned long __unused2;
47 unsigned long __unused3;
48 unsigned long __unused4;
49};
50
51#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 00000000000..0880cf0917b
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_SHMPARAM_H
2#define _ASM_X86_SHMPARAM_H
3
4#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
5
6#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 00000000000..0afcb5e58ac
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,284 @@
1#ifndef _ASM_X86_SIGCONTEXT_H
2#define _ASM_X86_SIGCONTEXT_H
3
4#include <linux/compiler.h>
5#include <asm/types.h>
6
7#define FP_XSTATE_MAGIC1 0x46505853U
8#define FP_XSTATE_MAGIC2 0x46505845U
9#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
10
11/*
12 * bytes 464..511 in the current 512-byte layout of the fxsave/fxrstor frame
13 * are reserved for SW usage. On CPUs supporting xsave/xrstor, these bytes
14 * are used to extend the fpstate pointer in the sigcontext, which now
15 * includes the extended state information along with fpstate information.
16 *
17 * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
18 * area and FP_XSTATE_MAGIC2 at the end of memory layout
19 * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
20 * extended state information in the memory layout pointed by the fpstate
21 * pointer in sigcontext.
22 */
23struct _fpx_sw_bytes {
24 __u32 magic1; /* FP_XSTATE_MAGIC1 */
25 __u32 extended_size; /* total size of the layout referred by
26 * fpstate pointer in the sigcontext.
27 */
28 __u64 xstate_bv;
29 /* feature bit mask (including fp/sse/extended
30 * state) that is present in the memory
31 * layout.
32 */
33 __u32 xstate_size; /* actual xsave state size, based on the
34 * features saved in the layout.
35 * 'extended_size' will be greater than
36 * 'xstate_size'.
37 */
38 __u32 padding[7]; /* for future use. */
39};
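
Detecting the extended layout therefore means checking both magics: FP_XSTATE_MAGIC1 in these SW-reserved bytes and FP_XSTATE_MAGIC2 at the very end of the extended area. A hedged sketch of that test (user-copy and bounds checking omitted; the kernel's real check lives in the signal/xsave code):

	/* Sketch: a signal frame carries extended state iff both magics match. */
	static int frame_has_xstate(void *fpstate, struct _fpx_sw_bytes *sw)
	{
		__u32 *magic2 = (__u32 *)((char *)fpstate + sw->extended_size
					  - FP_XSTATE_MAGIC2_SIZE);

		return sw->magic1 == FP_XSTATE_MAGIC1 && *magic2 == FP_XSTATE_MAGIC2;
	}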
40
41#ifdef __i386__
42/*
43 * As documented in the iBCS2 standard..
44 *
45 * The first part of "struct _fpstate" is just the normal i387
46 * hardware setup, the extra "status" word is used to save the
47 * coprocessor status word before entering the handler.
48 *
49 * Pentium III FXSR, SSE support
50 * Gareth Hughes <gareth@valinux.com>, May 2000
51 *
52 * The FPU state data structure has had to grow to accommodate the
53 * extended FPU state required by the Streaming SIMD Extensions.
54 * There is no documented standard to accomplish this at the moment.
55 */
56struct _fpreg {
57 unsigned short significand[4];
58 unsigned short exponent;
59};
60
61struct _fpxreg {
62 unsigned short significand[4];
63 unsigned short exponent;
64 unsigned short padding[3];
65};
66
67struct _xmmreg {
68 unsigned long element[4];
69};
70
71struct _fpstate {
72 /* Regular FPU environment */
73 unsigned long cw;
74 unsigned long sw;
75 unsigned long tag;
76 unsigned long ipoff;
77 unsigned long cssel;
78 unsigned long dataoff;
79 unsigned long datasel;
80 struct _fpreg _st[8];
81 unsigned short status;
82 unsigned short magic; /* 0xffff = regular FPU data only */
83
84 /* FXSR FPU environment */
85 unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
86 unsigned long mxcsr;
87 unsigned long reserved;
88 struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
89 struct _xmmreg _xmm[8];
90 unsigned long padding1[44];
91
92 union {
93 unsigned long padding2[12];
94 struct _fpx_sw_bytes sw_reserved; /* represents the extended
95 * state info */
96 };
97};
98
99#define X86_FXSR_MAGIC 0x0000
100
101#ifdef __KERNEL__
102struct sigcontext {
103 unsigned short gs, __gsh;
104 unsigned short fs, __fsh;
105 unsigned short es, __esh;
106 unsigned short ds, __dsh;
107 unsigned long di;
108 unsigned long si;
109 unsigned long bp;
110 unsigned long sp;
111 unsigned long bx;
112 unsigned long dx;
113 unsigned long cx;
114 unsigned long ax;
115 unsigned long trapno;
116 unsigned long err;
117 unsigned long ip;
118 unsigned short cs, __csh;
119 unsigned long flags;
120 unsigned long sp_at_signal;
121 unsigned short ss, __ssh;
122
123 /*
124 * fpstate is really (struct _fpstate *) or (struct _xstate *)
125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
127 * of the extended memory layout. See comments at the definition of
128 * (struct _fpx_sw_bytes)
129 */
130 void __user *fpstate; /* zero when no FPU/extended context */
131 unsigned long oldmask;
132 unsigned long cr2;
133};
134#else /* __KERNEL__ */
135/*
136 * User-space might still rely on the old definition:
137 */
138struct sigcontext {
139 unsigned short gs, __gsh;
140 unsigned short fs, __fsh;
141 unsigned short es, __esh;
142 unsigned short ds, __dsh;
143 unsigned long edi;
144 unsigned long esi;
145 unsigned long ebp;
146 unsigned long esp;
147 unsigned long ebx;
148 unsigned long edx;
149 unsigned long ecx;
150 unsigned long eax;
151 unsigned long trapno;
152 unsigned long err;
153 unsigned long eip;
154 unsigned short cs, __csh;
155 unsigned long eflags;
156 unsigned long esp_at_signal;
157 unsigned short ss, __ssh;
158 struct _fpstate __user *fpstate;
159 unsigned long oldmask;
160 unsigned long cr2;
161};
162#endif /* !__KERNEL__ */
163
164#else /* __i386__ */
165
166/* FXSAVE frame */
167/* Note: reserved1/2 may someday contain valuable data. Always save/restore
168 them when you change signal frames. */
169struct _fpstate {
170 __u16 cwd;
171 __u16 swd;
172 __u16 twd; /* Note this is not the same as the
173 32bit/x87/FSAVE twd */
174 __u16 fop;
175 __u64 rip;
176 __u64 rdp;
177 __u32 mxcsr;
178 __u32 mxcsr_mask;
179 __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
180 __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
181 __u32 reserved2[12];
182 union {
183 __u32 reserved3[12];
184 struct _fpx_sw_bytes sw_reserved; /* represents the extended
185 * state information */
186 };
187};
188
189#ifdef __KERNEL__
190struct sigcontext {
191 unsigned long r8;
192 unsigned long r9;
193 unsigned long r10;
194 unsigned long r11;
195 unsigned long r12;
196 unsigned long r13;
197 unsigned long r14;
198 unsigned long r15;
199 unsigned long di;
200 unsigned long si;
201 unsigned long bp;
202 unsigned long bx;
203 unsigned long dx;
204 unsigned long ax;
205 unsigned long cx;
206 unsigned long sp;
207 unsigned long ip;
208 unsigned long flags;
209 unsigned short cs;
210 unsigned short gs;
211 unsigned short fs;
212 unsigned short __pad0;
213 unsigned long err;
214 unsigned long trapno;
215 unsigned long oldmask;
216 unsigned long cr2;
217
218 /*
219 * fpstate is really (struct _fpstate *) or (struct _xstate *)
220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
222 * of the extended memory layout. See comments at the definition of
223 * (struct _fpx_sw_bytes)
224 */
225 void __user *fpstate; /* zero when no FPU/extended context */
226 unsigned long reserved1[8];
227};
228#else /* __KERNEL__ */
229/*
230 * User-space might still rely on the old definition:
231 */
232struct sigcontext {
233 unsigned long r8;
234 unsigned long r9;
235 unsigned long r10;
236 unsigned long r11;
237 unsigned long r12;
238 unsigned long r13;
239 unsigned long r14;
240 unsigned long r15;
241 unsigned long rdi;
242 unsigned long rsi;
243 unsigned long rbp;
244 unsigned long rbx;
245 unsigned long rdx;
246 unsigned long rax;
247 unsigned long rcx;
248 unsigned long rsp;
249 unsigned long rip;
250 unsigned long eflags; /* RFLAGS */
251 unsigned short cs;
252 unsigned short gs;
253 unsigned short fs;
254 unsigned short __pad0;
255 unsigned long err;
256 unsigned long trapno;
257 unsigned long oldmask;
258 unsigned long cr2;
259 struct _fpstate __user *fpstate; /* zero when no FPU context */
260 unsigned long reserved1[8];
261};
262#endif /* !__KERNEL__ */
263
264#endif /* !__i386__ */
265
266struct _xsave_hdr {
267 __u64 xstate_bv;
268 __u64 reserved1[2];
269 __u64 reserved2[5];
270};
271
272/*
273 * Extended state pointed by the fpstate pointer in the sigcontext.
274 * In addition to the fpstate, information encoded in the xstate_hdr
275 * indicates the presence of other extended state information
276 * supported by the processor and OS.
277 */
278struct _xstate {
279 struct _fpstate fpstate;
280 struct _xsave_hdr xstate_hdr;
281 /* new processor state extensions go here */
282};
283
284#endif /* _ASM_X86_SIGCONTEXT_H */
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
new file mode 100644
index 00000000000..6126188cf3a
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext32.h
@@ -0,0 +1,75 @@
1#ifndef _ASM_X86_SIGCONTEXT32_H
2#define _ASM_X86_SIGCONTEXT32_H
3
4/* signal context for 32bit programs. */
5
6#define X86_FXSR_MAGIC 0x0000
7
8struct _fpreg {
9 unsigned short significand[4];
10 unsigned short exponent;
11};
12
13struct _fpxreg {
14 unsigned short significand[4];
15 unsigned short exponent;
16 unsigned short padding[3];
17};
18
19struct _xmmreg {
20 __u32 element[4];
21};
22
23/* FSAVE frame with extensions */
24struct _fpstate_ia32 {
25 /* Regular FPU environment */
26 __u32 cw;
27 __u32 sw;
28	__u32	tag;		/* not compatible with the 64bit twd */
29 __u32 ipoff;
30 __u32 cssel;
31 __u32 dataoff;
32 __u32 datasel;
33 struct _fpreg _st[8];
34 unsigned short status;
35 unsigned short magic; /* 0xffff = regular FPU data only */
36
37 /* FXSR FPU environment */
38 __u32 _fxsr_env[6];
39 __u32 mxcsr;
40 __u32 reserved;
41 struct _fpxreg _fxsr_st[8];
42 struct _xmmreg _xmm[8]; /* It's actually 16 */
43 __u32 padding[44];
44 union {
45 __u32 padding2[12];
46 struct _fpx_sw_bytes sw_reserved;
47 };
48};
49
50struct sigcontext_ia32 {
51 unsigned short gs, __gsh;
52 unsigned short fs, __fsh;
53 unsigned short es, __esh;
54 unsigned short ds, __dsh;
55 unsigned int di;
56 unsigned int si;
57 unsigned int bp;
58 unsigned int sp;
59 unsigned int bx;
60 unsigned int dx;
61 unsigned int cx;
62 unsigned int ax;
63 unsigned int trapno;
64 unsigned int err;
65 unsigned int ip;
66 unsigned short cs, __csh;
67 unsigned int flags;
68 unsigned int sp_at_signal;
69 unsigned short ss, __ssh;
70 unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
71 unsigned int oldmask;
72 unsigned int cr2;
73};
74
75#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h
new file mode 100644
index 00000000000..fc1aa553564
--- /dev/null
+++ b/arch/x86/include/asm/siginfo.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_SIGINFO_H
2#define _ASM_X86_SIGINFO_H
3
4#ifdef __x86_64__
5# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
6#endif
7
8#include <asm-generic/siginfo.h>
9
10#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
new file mode 100644
index 00000000000..96ac44f275d
--- /dev/null
+++ b/arch/x86/include/asm/signal.h
@@ -0,0 +1,262 @@
1#ifndef _ASM_X86_SIGNAL_H
2#define _ASM_X86_SIGNAL_H
3
4#ifndef __ASSEMBLY__
5#include <linux/types.h>
6#include <linux/time.h>
7#include <linux/compiler.h>
8
9/* Avoid too many header ordering problems. */
10struct siginfo;
11
12#ifdef __KERNEL__
13#include <linux/linkage.h>
14
15/* Most things should be clean enough to redefine this at will, if care
16 is taken to make libc match. */
17
18#define _NSIG 64
19
20#ifdef __i386__
21# define _NSIG_BPW 32
22#else
23# define _NSIG_BPW 64
24#endif
25
26#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
27
28typedef unsigned long old_sigset_t; /* at least 32 bits */
29
30typedef struct {
31 unsigned long sig[_NSIG_WORDS];
32} sigset_t;
33
34#else
35/* Here we must cater to libcs that poke about in kernel headers. */
36
37#define NSIG 32
38typedef unsigned long sigset_t;
39
40#endif /* __KERNEL__ */
41#endif /* __ASSEMBLY__ */
42
43#define SIGHUP 1
44#define SIGINT 2
45#define SIGQUIT 3
46#define SIGILL 4
47#define SIGTRAP 5
48#define SIGABRT 6
49#define SIGIOT 6
50#define SIGBUS 7
51#define SIGFPE 8
52#define SIGKILL 9
53#define SIGUSR1 10
54#define SIGSEGV 11
55#define SIGUSR2 12
56#define SIGPIPE 13
57#define SIGALRM 14
58#define SIGTERM 15
59#define SIGSTKFLT 16
60#define SIGCHLD 17
61#define SIGCONT 18
62#define SIGSTOP 19
63#define SIGTSTP 20
64#define SIGTTIN 21
65#define SIGTTOU 22
66#define SIGURG 23
67#define SIGXCPU 24
68#define SIGXFSZ 25
69#define SIGVTALRM 26
70#define SIGPROF 27
71#define SIGWINCH 28
72#define SIGIO 29
73#define SIGPOLL SIGIO
74/*
75#define SIGLOST 29
76*/
77#define SIGPWR 30
78#define SIGSYS 31
79#define SIGUNUSED 31
80
81/* These should not be considered constants from userland. */
82#define SIGRTMIN 32
83#define SIGRTMAX _NSIG
84
85/*
86 * SA_FLAGS values:
87 *
88 * SA_ONSTACK indicates that a registered stack_t will be used.
89 * SA_RESTART flag to get restarting signals (which were the default long ago)
90 * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
91 * SA_RESETHAND clears the handler when the signal is delivered.
92 * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
93 * SA_NODEFER prevents the current signal from being masked in the handler.
94 *
95 * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
96 * Unix names RESETHAND and NODEFER respectively.
97 */
98#define SA_NOCLDSTOP 0x00000001u
99#define SA_NOCLDWAIT 0x00000002u
100#define SA_SIGINFO 0x00000004u
101#define SA_ONSTACK 0x08000000u
102#define SA_RESTART 0x10000000u
103#define SA_NODEFER 0x40000000u
104#define SA_RESETHAND 0x80000000u
105
106#define SA_NOMASK SA_NODEFER
107#define SA_ONESHOT SA_RESETHAND
108
109#define SA_RESTORER 0x04000000
110
111/*
112 * sigaltstack controls
113 */
114#define SS_ONSTACK 1
115#define SS_DISABLE 2
116
117#define MINSIGSTKSZ 2048
118#define SIGSTKSZ 8192
119
120#include <asm-generic/signal.h>
121
122#ifndef __ASSEMBLY__
123
124#ifdef __i386__
125# ifdef __KERNEL__
126struct old_sigaction {
127 __sighandler_t sa_handler;
128 old_sigset_t sa_mask;
129 unsigned long sa_flags;
130 __sigrestore_t sa_restorer;
131};
132
133struct sigaction {
134 __sighandler_t sa_handler;
135 unsigned long sa_flags;
136 __sigrestore_t sa_restorer;
137 sigset_t sa_mask; /* mask last for extensibility */
138};
139
140struct k_sigaction {
141 struct sigaction sa;
142};
143
144extern void do_notify_resume(struct pt_regs *, void *, __u32);
145
146# else /* __KERNEL__ */
147/* Here we must cater to libcs that poke about in kernel headers. */
148
149struct sigaction {
150 union {
151 __sighandler_t _sa_handler;
152 void (*_sa_sigaction)(int, struct siginfo *, void *);
153 } _u;
154 sigset_t sa_mask;
155 unsigned long sa_flags;
156 void (*sa_restorer)(void);
157};
158
159#define sa_handler _u._sa_handler
160#define sa_sigaction _u._sa_sigaction
161
162# endif /* ! __KERNEL__ */
163#else /* __i386__ */
164
165struct sigaction {
166 __sighandler_t sa_handler;
167 unsigned long sa_flags;
168 __sigrestore_t sa_restorer;
169 sigset_t sa_mask; /* mask last for extensibility */
170};
171
172struct k_sigaction {
173 struct sigaction sa;
174};
175
176#endif /* !__i386__ */
177
178typedef struct sigaltstack {
179 void __user *ss_sp;
180 int ss_flags;
181 size_t ss_size;
182} stack_t;
183
184#ifdef __KERNEL__
185#include <asm/sigcontext.h>
186
187#ifdef __i386__
188
189#define __HAVE_ARCH_SIG_BITOPS
190
191#define sigaddset(set,sig) \
192 (__builtin_constant_p(sig) \
193 ? __const_sigaddset((set), (sig)) \
194 : __gen_sigaddset((set), (sig)))
195
196static inline void __gen_sigaddset(sigset_t *set, int _sig)
197{
198 asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
199}
200
201static inline void __const_sigaddset(sigset_t *set, int _sig)
202{
203 unsigned long sig = _sig - 1;
204 set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW);
205}
206
207#define sigdelset(set, sig) \
208 (__builtin_constant_p(sig) \
209 ? __const_sigdelset((set), (sig)) \
210 : __gen_sigdelset((set), (sig)))
211
212
213static inline void __gen_sigdelset(sigset_t *set, int _sig)
214{
215 asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
216}
217
218static inline void __const_sigdelset(sigset_t *set, int _sig)
219{
220 unsigned long sig = _sig - 1;
221 set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW));
222}
223
224static inline int __const_sigismember(sigset_t *set, int _sig)
225{
226 unsigned long sig = _sig - 1;
227 return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
228}
229
230static inline int __gen_sigismember(sigset_t *set, int _sig)
231{
232 int ret;
233 asm("btl %2,%1\n\tsbbl %0,%0"
234 : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
235 return ret;
236}
237
238#define sigismember(set, sig) \
239 (__builtin_constant_p(sig) \
240 ? __const_sigismember((set), (sig)) \
241 : __gen_sigismember((set), (sig)))
242
243static inline int sigfindinword(unsigned long word)
244{
245 asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
246 return word;
247}
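
The sigaddset/sigdelset/sigismember wrappers above dispatch at compile time: __builtin_constant_p(sig) folds to a constant, so a literal signal number picks the plain-C __const_* helper (which the compiler can reduce to a single immediate OR/AND/shift) while a runtime value picks the btsl/btrl/btl form. For example:

	sigset_t set = { { 0 } };
	int sig = get_user_signal();	/* hypothetical runtime value */

	sigaddset(&set, SIGKILL);	/* constant: plain C, folds at compile time */
	sigaddset(&set, sig);		/* variable: emits a btsl instruction */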
248
249struct pt_regs;
250
251#else /* __i386__ */
252
253#undef __HAVE_ARCH_SIG_BITOPS
254
255#endif /* !__i386__ */
256
257#define ptrace_signal_deliver(regs, cookie) do { } while (0)
258
259#endif /* __KERNEL__ */
260#endif /* __ASSEMBLY__ */
261
262#endif /* _ASM_X86_SIGNAL_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
new file mode 100644
index 00000000000..2766021aef8
--- /dev/null
+++ b/arch/x86/include/asm/smp.h
@@ -0,0 +1,229 @@
1#ifndef _ASM_X86_SMP_H
2#define _ASM_X86_SMP_H
3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h>
5#include <linux/init.h>
6#include <asm/percpu.h>
7
8/*
9 * We need the APIC definitions automatically as part of 'smp.h'
10 */
11#ifdef CONFIG_X86_LOCAL_APIC
12# include <asm/mpspec.h>
13# include <asm/apic.h>
14# ifdef CONFIG_X86_IO_APIC
15# include <asm/io_apic.h>
16# endif
17#endif
18#include <asm/pda.h>
19#include <asm/thread_info.h>
20
21extern cpumask_t cpu_callout_map;
22extern cpumask_t cpu_initialized;
23extern cpumask_t cpu_callin_map;
24
25extern void (*mtrr_hook)(void);
26extern void zap_low_mappings(void);
27
28extern int __cpuinit get_local_pda(int cpu);
29
30extern int smp_num_siblings;
31extern unsigned int num_processors;
32extern cpumask_t cpu_initialized;
33
34DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
35DECLARE_PER_CPU(cpumask_t, cpu_core_map);
36DECLARE_PER_CPU(u16, cpu_llc_id);
37#ifdef CONFIG_X86_32
38DECLARE_PER_CPU(int, cpu_number);
39#endif
40
41DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
42DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
43
44/* Static state in head.S used to set up a CPU */
45extern struct {
46 void *sp;
47 unsigned short ss;
48} stack_start;
49
50struct smp_ops {
51 void (*smp_prepare_boot_cpu)(void);
52 void (*smp_prepare_cpus)(unsigned max_cpus);
53 void (*smp_cpus_done)(unsigned max_cpus);
54
55 void (*smp_send_stop)(void);
56 void (*smp_send_reschedule)(int cpu);
57
58 int (*cpu_up)(unsigned cpu);
59 int (*cpu_disable)(void);
60 void (*cpu_die)(unsigned int cpu);
61 void (*play_dead)(void);
62
63 void (*send_call_func_ipi)(cpumask_t mask);
64 void (*send_call_func_single_ipi)(int cpu);
65};
66
67/* Globals due to paravirt */
68extern void set_cpu_sibling_map(int cpu);
69
70#ifdef CONFIG_SMP
71#ifndef CONFIG_PARAVIRT
72#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
73#endif
74extern struct smp_ops smp_ops;
75
76static inline void smp_send_stop(void)
77{
78 smp_ops.smp_send_stop();
79}
80
81static inline void smp_prepare_boot_cpu(void)
82{
83 smp_ops.smp_prepare_boot_cpu();
84}
85
86static inline void smp_prepare_cpus(unsigned int max_cpus)
87{
88 smp_ops.smp_prepare_cpus(max_cpus);
89}
90
91static inline void smp_cpus_done(unsigned int max_cpus)
92{
93 smp_ops.smp_cpus_done(max_cpus);
94}
95
96static inline int __cpu_up(unsigned int cpu)
97{
98 return smp_ops.cpu_up(cpu);
99}
100
101static inline int __cpu_disable(void)
102{
103 return smp_ops.cpu_disable();
104}
105
106static inline void __cpu_die(unsigned int cpu)
107{
108 smp_ops.cpu_die(cpu);
109}
110
111static inline void play_dead(void)
112{
113 smp_ops.play_dead();
114}
115
116static inline void smp_send_reschedule(int cpu)
117{
118 smp_ops.smp_send_reschedule(cpu);
119}
120
121static inline void arch_send_call_function_single_ipi(int cpu)
122{
123 smp_ops.send_call_func_single_ipi(cpu);
124}
125
126static inline void arch_send_call_function_ipi(cpumask_t mask)
127{
128 smp_ops.send_call_func_ipi(mask);
129}
130
131void cpu_disable_common(void);
132void native_smp_prepare_boot_cpu(void);
133void native_smp_prepare_cpus(unsigned int max_cpus);
134void native_smp_cpus_done(unsigned int max_cpus);
135int native_cpu_up(unsigned int cpunum);
136int native_cpu_disable(void);
137void native_cpu_die(unsigned int cpu);
138void native_play_dead(void);
139void play_dead_common(void);
140
141void native_send_call_func_ipi(cpumask_t mask);
142void native_send_call_func_single_ipi(int cpu);
143
144extern void prefill_possible_map(void);
145
146void smp_store_cpu_info(int id);
147#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
148
149/* We don't mark CPUs online until __cpu_up(), so we need another measure */
150static inline int num_booting_cpus(void)
151{
152 return cpus_weight(cpu_callout_map);
153}
154#else
155static inline void prefill_possible_map(void)
156{
157}
158#endif /* CONFIG_SMP */
159
160extern unsigned disabled_cpus __cpuinitdata;
161
162#ifdef CONFIG_X86_32_SMP
163/*
164 * This function is needed by all SMP systems. It must _always_ be valid
165 * from the initial startup. We map APIC_BASE very early in page_setup(),
166 * so this is correct in the x86 case.
167 */
168#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
169extern int safe_smp_processor_id(void);
170
171#elif defined(CONFIG_X86_64_SMP)
172#define raw_smp_processor_id() read_pda(cpunumber)
173
174#define stack_smp_processor_id() \
175({ \
176 struct thread_info *ti; \
177 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
178 ti->cpu; \
179})
180#define safe_smp_processor_id() smp_processor_id()
181
182#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
183#define cpu_physical_id(cpu) boot_cpu_physical_apicid
184#define safe_smp_processor_id() 0
185#define stack_smp_processor_id() 0
186#endif
187
188#ifdef CONFIG_X86_LOCAL_APIC
189
190#ifndef CONFIG_X86_64
191static inline int logical_smp_processor_id(void)
192{
193 /* we don't want to mark this access volatile - bad code generation */
194 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
195}
196
197#include <mach_apicdef.h>
198static inline unsigned int read_apic_id(void)
199{
200 unsigned int reg;
201
202 reg = *(u32 *)(APIC_BASE + APIC_ID);
203
204 return GET_APIC_ID(reg);
205}
206#endif
207
208
209# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
210extern int hard_smp_processor_id(void);
211# else
212#include <mach_apicdef.h>
213static inline int hard_smp_processor_id(void)
214{
215 /* we don't want to mark this access volatile - bad code generation */
216 return read_apic_id();
217}
218# endif /* APIC_DEFINITION */
219
220#else /* CONFIG_X86_LOCAL_APIC */
221
222# ifndef CONFIG_SMP
223# define hard_smp_processor_id() 0
224# endif
225
226#endif /* CONFIG_X86_LOCAL_APIC */
227
228#endif /* __ASSEMBLY__ */
229#endif /* _ASM_X86_SMP_H */
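
The smp_ops table above is the paravirt hook point: native boot installs the native_* functions, and a hypervisor can swap in its own implementations before the inline wrappers are ever called. A minimal user-space sketch of that indirection pattern follows; the struct and function names here are illustrative, not the kernel's:

/* Standalone sketch of the smp_ops function-pointer indirection;
 * names are made up for illustration. */
#include <stdio.h>

struct demo_smp_ops {
	void (*send_reschedule)(int cpu);
};

static void native_send_reschedule(int cpu)
{
	printf("native: reschedule IPI to CPU %d\n", cpu);
}

static void paravirt_send_reschedule(int cpu)
{
	printf("paravirt: hypercall reschedule for CPU %d\n", cpu);
}

static struct demo_smp_ops ops = { .send_reschedule = native_send_reschedule };

static void smp_send_reschedule_demo(int cpu)
{
	ops.send_reschedule(cpu);	/* mirrors smp_ops.smp_send_reschedule() */
}

int main(void)
{
	smp_send_reschedule_demo(1);
	ops.send_reschedule = paravirt_send_reschedule;	/* e.g. done at boot */
	smp_send_reschedule_demo(1);
	return 0;
}

The key property is that callers such as smp_send_reschedule() never change; only the table contents do.
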
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
new file mode 100644
index 00000000000..8ab9cc8b2ec
--- /dev/null
+++ b/arch/x86/include/asm/socket.h
@@ -0,0 +1,57 @@
1#ifndef _ASM_X86_SOCKET_H
2#define _ASM_X86_SOCKET_H
3
4#include <asm/sockios.h>
5
6/* For setsockopt(2) */
7#define SOL_SOCKET 1
8
9#define SO_DEBUG 1
10#define SO_REUSEADDR 2
11#define SO_TYPE 3
12#define SO_ERROR 4
13#define SO_DONTROUTE 5
14#define SO_BROADCAST 6
15#define SO_SNDBUF 7
16#define SO_RCVBUF 8
17#define SO_SNDBUFFORCE 32
18#define SO_RCVBUFFORCE 33
19#define SO_KEEPALIVE 9
20#define SO_OOBINLINE 10
21#define SO_NO_CHECK 11
22#define SO_PRIORITY 12
23#define SO_LINGER 13
24#define SO_BSDCOMPAT 14
25/* To add :#define SO_REUSEPORT 15 */
26#define SO_PASSCRED 16
27#define SO_PEERCRED 17
28#define SO_RCVLOWAT 18
29#define SO_SNDLOWAT 19
30#define SO_RCVTIMEO 20
31#define SO_SNDTIMEO 21
32
33/* Security levels - as per NRL IPv6 - don't actually do anything */
34#define SO_SECURITY_AUTHENTICATION 22
35#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
36#define SO_SECURITY_ENCRYPTION_NETWORK 24
37
38#define SO_BINDTODEVICE 25
39
40/* Socket filtering */
41#define SO_ATTACH_FILTER 26
42#define SO_DETACH_FILTER 27
43
44#define SO_PEERNAME 28
45#define SO_TIMESTAMP 29
46#define SCM_TIMESTAMP SO_TIMESTAMP
47
48#define SO_ACCEPTCONN 30
49
50#define SO_PEERSEC 31
51#define SO_PASSSEC 34
52#define SO_TIMESTAMPNS 35
53#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
54
55#define SO_MARK 36
56
57#endif /* _ASM_X86_SOCKET_H */
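
These SOL_SOCKET values are the ABI that userspace passes to setsockopt(2); the C library re-exports them from this header. A minimal usage example with the standard POSIX socket API (nothing here is x86-specific beyond the constant values themselves):

/* Minimal setsockopt(2) use of the SOL_SOCKET constants above. */
#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0)
		return 1;
	/* SO_REUSEADDR == 2 on x86, as defined in the header above */
	if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &one, sizeof(one)) < 0)
		perror("setsockopt");
	close(fd);
	return 0;
}
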
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
new file mode 100644
index 00000000000..49cc72b5d3c
--- /dev/null
+++ b/arch/x86/include/asm/sockios.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_X86_SOCKIOS_H
2#define _ASM_X86_SOCKIOS_H
3
4/* Socket-level I/O control calls. */
5#define FIOSETOWN 0x8901
6#define SIOCSPGRP 0x8902
7#define FIOGETOWN 0x8903
8#define SIOCGPGRP 0x8904
9#define SIOCATMARK 0x8905
10#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
11#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
12
13#endif /* _ASM_X86_SOCKIOS_H */
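
SIOCGSTAMP and SIOCGSTAMPNS return the kernel's receive timestamp of the last packet read from a socket. A hedged sketch of the ioctl follows; it only succeeds on a socket that has already received data, so a freshly created socket will report an error here:

/* Sketch: query the receive timestamp of the last datagram. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>
#include <linux/sockios.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_DGRAM, 0);
	struct timeval tv;

	/* succeeds only once the socket has received at least one packet */
	if (fd >= 0 && ioctl(fd, SIOCGSTAMP, &tv) == 0)
		printf("last packet at %ld.%06ld\n",
		       (long)tv.tv_sec, (long)tv.tv_usec);
	else
		perror("SIOCGSTAMP");
	if (fd >= 0)
		close(fd);
	return 0;
}
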
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
new file mode 100644
index 00000000000..be44f7dab39
--- /dev/null
+++ b/arch/x86/include/asm/sparsemem.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_SPARSEMEM_H
2#define _ASM_X86_SPARSEMEM_H
3
4#ifdef CONFIG_SPARSEMEM
5/*
6 * generic non-linear memory support:
7 *
8 * 1) we will not split memory into more chunks than will fit into the flags
9 * field of the struct page
10 *
11 * SECTION_SIZE_BITS 2^n: size of each section
12 * MAX_PHYSADDR_BITS 2^n: max size of physical address space
13 * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
14 *
15 */
16
17#ifdef CONFIG_X86_32
18# ifdef CONFIG_X86_PAE
19# define SECTION_SIZE_BITS 29
20# define MAX_PHYSADDR_BITS 36
21# define MAX_PHYSMEM_BITS 36
22# else
23# define SECTION_SIZE_BITS 26
24# define MAX_PHYSADDR_BITS 32
25# define MAX_PHYSMEM_BITS 32
26# endif
27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 MB sections are convenient right now */
29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44
31#endif
32
33#endif /* CONFIG_SPARSEMEM */
34#endif /* _ASM_X86_SPARSEMEM_H */
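
From these constants, the number of memory sections sparsemem must track is 2^(MAX_PHYSMEM_BITS - SECTION_SIZE_BITS). A standalone check of that arithmetic, with the x86_64 values copied from the header above:

/* Arithmetic check of the sparsemem constants (x86_64 values). */
#include <stdio.h>

#define SECTION_SIZE_BITS 27	/* 128 MB per section */
#define MAX_PHYSMEM_BITS  44	/* 16 TB of physical address space */

int main(void)
{
	unsigned long sections = 1UL << (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS);

	/* prints: section size: 128 MB, max sections: 131072 */
	printf("section size: %lu MB, max sections: %lu\n",
	       (1UL << SECTION_SIZE_BITS) >> 20, sections);
	return 0;
}
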
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
new file mode 100644
index 00000000000..d17c91981da
--- /dev/null
+++ b/arch/x86/include/asm/spinlock.h
@@ -0,0 +1,364 @@
1#ifndef _ASM_X86_SPINLOCK_H
2#define _ASM_X86_SPINLOCK_H
3
4#include <asm/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h>
7#include <asm/processor.h>
8#include <linux/compiler.h>
9#include <asm/paravirt.h>
10/*
11 * Your basic SMP spinlocks, allowing only a single CPU anywhere
12 *
13 * Simple spin lock operations. There are two variants, one clears IRQ's
14 * on the local processor, one does not.
15 *
16 * These are fair FIFO ticket locks, which are currently limited to 256
17 * CPUs.
18 *
19 * (the type definitions are in asm/spinlock_types.h)
20 */
21
22#ifdef CONFIG_X86_32
23# define LOCK_PTR_REG "a"
24# define REG_PTR_MODE "k"
25#else
26# define LOCK_PTR_REG "D"
27# define REG_PTR_MODE "q"
28#endif
29
30#if defined(CONFIG_X86_32) && \
31 (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
32/*
33 * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
34 * (PPro errata 66, 92)
35 */
36# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
37#else
38# define UNLOCK_LOCK_PREFIX
39#endif
40
41/*
42 * Ticket locks are conceptually two parts, one indicating the current head of
43 * the queue, and the other indicating the current tail. The lock is acquired
44 * by atomically noting the tail and incrementing it by one (thus adding
45 * ourselves to the queue and noting our position), then waiting until the head
46 * becomes equal to the initial value of the tail.
47 *
48 * We use an xadd covering *both* parts of the lock, to increment the tail and
49 * also load the position of the head, which takes care of memory ordering
50 * issues and should be optimal for the uncontended case. Note the tail must be
51 * in the high part, because a wide xadd increment of the low part would carry
52 * up and contaminate the high part.
53 *
54 * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
55 * save some instructions and make the code more elegant. There really isn't
56 * much between them in performance though, especially as locks are out of line.
57 */
58#if (NR_CPUS < 256)
59#define TICKET_SHIFT 8
60
61static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
62{
63 short inc = 0x0100;
64
65 asm volatile (
66 LOCK_PREFIX "xaddw %w0, %1\n"
67 "1:\t"
68 "cmpb %h0, %b0\n\t"
69 "je 2f\n\t"
70 "rep ; nop\n\t"
71 "movb %1, %b0\n\t"
72 /* don't need lfence here, because loads are in-order */
73 "jmp 1b\n"
74 "2:"
75 : "+Q" (inc), "+m" (lock->slock)
76 :
77 : "memory", "cc");
78}
79
80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
81{
82 int tmp, new;
83
84 asm volatile("movzwl %2, %0\n\t"
85 "cmpb %h0,%b0\n\t"
86 "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
87 "jne 1f\n\t"
88 LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
89 "1:"
90 "sete %b1\n\t"
91 "movzbl %b1,%0\n\t"
92 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
93 :
94 : "memory", "cc");
95
96 return tmp;
97}
98
99static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
100{
101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
102 : "+m" (lock->slock)
103 :
104 : "memory", "cc");
105}
106#else
107#define TICKET_SHIFT 16
108
109static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
110{
111 int inc = 0x00010000;
112 int tmp;
113
114 asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
115 "movzwl %w0, %2\n\t"
116 "shrl $16, %0\n\t"
117 "1:\t"
118 "cmpl %0, %2\n\t"
119 "je 2f\n\t"
120 "rep ; nop\n\t"
121 "movzwl %1, %2\n\t"
122 /* don't need lfence here, because loads are in-order */
123 "jmp 1b\n"
124 "2:"
125 : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
126 :
127 : "memory", "cc");
128}
129
130static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
131{
132 int tmp;
133 int new;
134
135 asm volatile("movl %2,%0\n\t"
136 "movl %0,%1\n\t"
137 "roll $16, %0\n\t"
138 "cmpl %0,%1\n\t"
139 "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
140 "jne 1f\n\t"
141 LOCK_PREFIX "cmpxchgl %1,%2\n\t"
142 "1:"
143 "sete %b1\n\t"
144 "movzbl %b1,%0\n\t"
145 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
146 :
147 : "memory", "cc");
148
149 return tmp;
150}
151
152static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
153{
154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
155 : "+m" (lock->slock)
156 :
157 : "memory", "cc");
158}
159#endif
160
161static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
162{
163 int tmp = ACCESS_ONCE(lock->slock);
164
165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
166}
167
168static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
169{
170 int tmp = ACCESS_ONCE(lock->slock);
171
172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
173}
174
175#ifdef CONFIG_PARAVIRT
176/*
177 * Define a virtualization-friendly old-style lock-byte lock, for use in
178 * pv_lock_ops if desired.
179 *
180 * This differs from the pre-2.6.24 spinlock by always using xchgb
181 * rather than decb to take the lock; this allows it to use a
182 * zero-initialized lock structure. It also maintains a 1-byte
183 * contention counter, so that we can implement
184 * __byte_spin_is_contended.
185 */
186struct __byte_spinlock {
187 s8 lock;
188 s8 spinners;
189};
190
191static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
192{
193 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
194 return bl->lock != 0;
195}
196
197static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
198{
199 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
200 return bl->spinners != 0;
201}
202
203static inline void __byte_spin_lock(raw_spinlock_t *lock)
204{
205 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
206 s8 val = 1;
207
208 asm("1: xchgb %1, %0\n"
209 " test %1,%1\n"
210 " jz 3f\n"
211 " " LOCK_PREFIX "incb %2\n"
212 "2: rep;nop\n"
213 " cmpb $1, %0\n"
214 " je 2b\n"
215 " " LOCK_PREFIX "decb %2\n"
216 " jmp 1b\n"
217 "3:"
218 : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
219}
220
221static inline int __byte_spin_trylock(raw_spinlock_t *lock)
222{
223 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
224 u8 old = 1;
225
226 asm("xchgb %1,%0"
227 : "+m" (bl->lock), "+q" (old) : : "memory");
228
229 return old == 0;
230}
231
232static inline void __byte_spin_unlock(raw_spinlock_t *lock)
233{
234 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
235 smp_wmb();
236 bl->lock = 0;
237}
238#else /* !CONFIG_PARAVIRT */
239static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
240{
241 return __ticket_spin_is_locked(lock);
242}
243
244static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
245{
246 return __ticket_spin_is_contended(lock);
247}
248
249static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
250{
251 __ticket_spin_lock(lock);
252}
253
254static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
255{
256 return __ticket_spin_trylock(lock);
257}
258
259static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
260{
261 __ticket_spin_unlock(lock);
262}
263
264static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
265 unsigned long flags)
266{
267 __raw_spin_lock(lock);
268}
269
270#endif /* CONFIG_PARAVIRT */
271
272static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
273{
274 while (__raw_spin_is_locked(lock))
275 cpu_relax();
276}
277
278/*
279 * Read-write spinlocks, allowing multiple readers
280 * but only one writer.
281 *
282 * NOTE! it is quite common to have readers in interrupts
283 * but no interrupt writers. For those circumstances we
284 * can "mix" irq-safe locks - any writer needs to get an
285 * irq-safe write-lock, but readers can get non-irqsafe
286 * read-locks.
287 *
288 * On x86, we implement read-write locks as a 32-bit counter
289 * with the high bit (sign) being the "contended" bit.
290 */
291
292/**
293 * read_can_lock - would read_trylock() succeed?
294 * @lock: the rwlock in question.
295 */
296static inline int __raw_read_can_lock(raw_rwlock_t *lock)
297{
298 return (int)(lock)->lock > 0;
299}
300
301/**
302 * write_can_lock - would write_trylock() succeed?
303 * @lock: the rwlock in question.
304 */
305static inline int __raw_write_can_lock(raw_rwlock_t *lock)
306{
307 return (lock)->lock == RW_LOCK_BIAS;
308}
309
310static inline void __raw_read_lock(raw_rwlock_t *rw)
311{
312 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
313 "jns 1f\n"
314 "call __read_lock_failed\n\t"
315 "1:\n"
316 ::LOCK_PTR_REG (rw) : "memory");
317}
318
319static inline void __raw_write_lock(raw_rwlock_t *rw)
320{
321 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
322 "jz 1f\n"
323 "call __write_lock_failed\n\t"
324 "1:\n"
325 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
326}
327
328static inline int __raw_read_trylock(raw_rwlock_t *lock)
329{
330 atomic_t *count = (atomic_t *)lock;
331
332 atomic_dec(count);
333 if (atomic_read(count) >= 0)
334 return 1;
335 atomic_inc(count);
336 return 0;
337}
338
339static inline int __raw_write_trylock(raw_rwlock_t *lock)
340{
341 atomic_t *count = (atomic_t *)lock;
342
343 if (atomic_sub_and_test(RW_LOCK_BIAS, count))
344 return 1;
345 atomic_add(RW_LOCK_BIAS, count);
346 return 0;
347}
348
349static inline void __raw_read_unlock(raw_rwlock_t *rw)
350{
351 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
352}
353
354static inline void __raw_write_unlock(raw_rwlock_t *rw)
355{
356 asm volatile(LOCK_PREFIX "addl %1, %0"
357 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
358}
359
360#define _raw_spin_relax(lock) cpu_relax()
361#define _raw_read_relax(lock) cpu_relax()
362#define _raw_write_relax(lock) cpu_relax()
363
364#endif /* _ASM_X86_SPINLOCK_H */
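
For reference, the NR_CPUS < 256 ticket lock above can be modeled in portable C: xadd hands out a ticket from the tail byte, and each waiter spins until the head byte reaches its ticket. This is a sketch using GCC __sync builtins in place of the LOCK-prefixed instructions; note the real code reads head and tail in a single xadd, which this model splits into two steps, and it glosses over memory-ordering details:

/* Portable model of the < 256-CPU ticket lock above. */
#include <stdint.h>

struct demo_ticket_lock {
	volatile uint8_t head;	/* ticket now being served (low byte of slock) */
	volatile uint8_t tail;	/* next ticket to hand out (high byte of slock) */
};

static void demo_ticket_lock(struct demo_ticket_lock *lock)
{
	/* atomically take a ticket, like "lock; xaddw" on the high byte */
	uint8_t ticket = __sync_fetch_and_add(&lock->tail, 1);

	while (lock->head != ticket)
		;	/* the kernel inserts rep;nop (pause) in this loop */
}

static void demo_ticket_unlock(struct demo_ticket_lock *lock)
{
	lock->head++;	/* matches the "incb" unlock: next waiter in FIFO order */
}

int main(void)
{
	struct demo_ticket_lock l = { 0, 0 };

	demo_ticket_lock(&l);
	demo_ticket_unlock(&l);
	return 0;
}
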
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
new file mode 100644
index 00000000000..845f81c8709
--- /dev/null
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_SPINLOCK_TYPES_H
2#define _ASM_X86_SPINLOCK_TYPES_H
3
4#ifndef __LINUX_SPINLOCK_TYPES_H
5# error "please don't include this file directly"
6#endif
7
8typedef struct raw_spinlock {
9 unsigned int slock;
10} raw_spinlock_t;
11
12#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
13
14typedef struct {
15 unsigned int lock;
16} raw_rwlock_t;
17
18#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
new file mode 100644
index 00000000000..b508d639d1a
--- /dev/null
+++ b/arch/x86/include/asm/srat.h
@@ -0,0 +1,39 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
new file mode 100644
index 00000000000..f517944b2b1
--- /dev/null
+++ b/arch/x86/include/asm/stacktrace.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_STACKTRACE_H
2#define _ASM_X86_STACKTRACE_H
3
4extern int kstack_depth_to_print;
5
6/* Generic stack tracer with callbacks */
7
8struct stacktrace_ops {
9 void (*warning)(void *data, char *msg);
10 /* msg must contain %s for the symbol */
11 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
12 void (*address)(void *data, unsigned long address, int reliable);
13 /* On negative return stop dumping */
14 int (*stack)(void *data, char *name);
15};
16
17void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
18 unsigned long *stack, unsigned long bp,
19 const struct stacktrace_ops *ops, void *data);
20
21#endif /* _ASM_X86_STACKTRACE_H */
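
dump_trace() walks the stack and invokes these callbacks once per item of interest; the kernel's oops printer is the main consumer. A sketch of a printing ops instance follows, with the struct re-declared locally so it builds standalone (the print formats are illustrative, not the kernel's):

/* Sketch of a stacktrace_ops consumer; struct layout mirrors the
 * header above, callbacks just print what they are handed. */
#include <stdio.h>

struct demo_stacktrace_ops {
	void (*address)(void *data, unsigned long address, int reliable);
	int (*stack)(void *data, char *name);
};

static void print_address(void *data, unsigned long address, int reliable)
{
	/* unreliable entries are guesses found by scanning the stack */
	printf(" %s[<%08lx>]\n", reliable ? "" : "? ", address);
}

static int print_stack_name(void *data, char *name)
{
	if (name)
		printf(" <%s>\n", name);	/* e.g. "IRQ" */
	return 0;				/* negative would stop the dump */
}

static const struct demo_stacktrace_ops print_ops = {
	.address = print_address,
	.stack	 = print_stack_name,
};

int main(void)
{
	/* stand-in for dump_trace() walking real frames: */
	print_ops.stack(NULL, "IRQ");
	print_ops.address(NULL, 0xc0123456UL, 1);
	print_ops.address(NULL, 0xc0654321UL, 0);
	return 0;
}
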
diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h
new file mode 100644
index 00000000000..e0b1d9bbcbc
--- /dev/null
+++ b/arch/x86/include/asm/stat.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_STAT_H
2#define _ASM_X86_STAT_H
3
4#define STAT_HAVE_NSEC 1
5
6#ifdef __i386__
7struct stat {
8 unsigned long st_dev;
9 unsigned long st_ino;
10 unsigned short st_mode;
11 unsigned short st_nlink;
12 unsigned short st_uid;
13 unsigned short st_gid;
14 unsigned long st_rdev;
15 unsigned long st_size;
16 unsigned long st_blksize;
17 unsigned long st_blocks;
18 unsigned long st_atime;
19 unsigned long st_atime_nsec;
20 unsigned long st_mtime;
21 unsigned long st_mtime_nsec;
22 unsigned long st_ctime;
23 unsigned long st_ctime_nsec;
24 unsigned long __unused4;
25 unsigned long __unused5;
26};
27
28#define STAT64_HAS_BROKEN_ST_INO 1
29
30/* This matches struct stat64 in glibc2.1, hence the absolutely
31 * insane amounts of padding around dev_t's.
32 */
33struct stat64 {
34 unsigned long long st_dev;
35 unsigned char __pad0[4];
36
37 unsigned long __st_ino;
38
39 unsigned int st_mode;
40 unsigned int st_nlink;
41
42 unsigned long st_uid;
43 unsigned long st_gid;
44
45 unsigned long long st_rdev;
46 unsigned char __pad3[4];
47
48 long long st_size;
49 unsigned long st_blksize;
50
51 /* Number of 512-byte blocks allocated. */
52 unsigned long long st_blocks;
53
54 unsigned long st_atime;
55 unsigned long st_atime_nsec;
56
57 unsigned long st_mtime;
58 unsigned int st_mtime_nsec;
59
60 unsigned long st_ctime;
61 unsigned long st_ctime_nsec;
62
63 unsigned long long st_ino;
64};
65
66#else /* __i386__ */
67
68struct stat {
69 unsigned long st_dev;
70 unsigned long st_ino;
71 unsigned long st_nlink;
72
73 unsigned int st_mode;
74 unsigned int st_uid;
75 unsigned int st_gid;
76 unsigned int __pad0;
77 unsigned long st_rdev;
78 long st_size;
79 long st_blksize;
80 long st_blocks; /* Number of 512-byte blocks allocated. */
81
82 unsigned long st_atime;
83 unsigned long st_atime_nsec;
84 unsigned long st_mtime;
85 unsigned long st_mtime_nsec;
86 unsigned long st_ctime;
87 unsigned long st_ctime_nsec;
88 long __unused[3];
89};
90#endif
91
92/* for 32bit emulation and 32 bit kernels */
93struct __old_kernel_stat {
94 unsigned short st_dev;
95 unsigned short st_ino;
96 unsigned short st_mode;
97 unsigned short st_nlink;
98 unsigned short st_uid;
99 unsigned short st_gid;
100 unsigned short st_rdev;
101#ifdef __i386__
102 unsigned long st_size;
103 unsigned long st_atime;
104 unsigned long st_mtime;
105 unsigned long st_ctime;
106#else
107 unsigned int st_size;
108 unsigned int st_atime;
109 unsigned int st_mtime;
110 unsigned int st_ctime;
111#endif
112};
113
114#endif /* _ASM_X86_STAT_H */
diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/asm/statfs.h
new file mode 100644
index 00000000000..2d0adbf99a8
--- /dev/null
+++ b/arch/x86/include/asm/statfs.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_STATFS_H
2#define _ASM_X86_STATFS_H
3
4/*
5 * We need compat_statfs64 to be packed, because the i386 ABI won't
6 * add padding at the end to bring it to a multiple of 8 bytes, but
7 * the x86_64 ABI will.
8 */
9#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
10
11#include <asm-generic/statfs.h>
12#endif /* _ASM_X86_STATFS_H */
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
new file mode 100644
index 00000000000..6dfd6d9373a
--- /dev/null
+++ b/arch/x86/include/asm/string.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "string_32.h"
3#else
4# include "string_64.h"
5#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
new file mode 100644
index 00000000000..0e0e3ba827f
--- /dev/null
+++ b/arch/x86/include/asm/string_32.h
@@ -0,0 +1,326 @@
1#ifndef _ASM_X86_STRING_32_H
2#define _ASM_X86_STRING_32_H
3
4#ifdef __KERNEL__
5
6/* Let gcc decide whether to inline or use the out-of-line functions */
7
8#define __HAVE_ARCH_STRCPY
9extern char *strcpy(char *dest, const char *src);
10
11#define __HAVE_ARCH_STRNCPY
12extern char *strncpy(char *dest, const char *src, size_t count);
13
14#define __HAVE_ARCH_STRCAT
15extern char *strcat(char *dest, const char *src);
16
17#define __HAVE_ARCH_STRNCAT
18extern char *strncat(char *dest, const char *src, size_t count);
19
20#define __HAVE_ARCH_STRCMP
21extern int strcmp(const char *cs, const char *ct);
22
23#define __HAVE_ARCH_STRNCMP
24extern int strncmp(const char *cs, const char *ct, size_t count);
25
26#define __HAVE_ARCH_STRCHR
27extern char *strchr(const char *s, int c);
28
29#define __HAVE_ARCH_STRLEN
30extern size_t strlen(const char *s);
31
32static __always_inline void *__memcpy(void *to, const void *from, size_t n)
33{
34 int d0, d1, d2;
35 asm volatile("rep ; movsl\n\t"
36 "movl %4,%%ecx\n\t"
37 "andl $3,%%ecx\n\t"
38 "jz 1f\n\t"
39 "rep ; movsb\n\t"
40 "1:"
41 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
42 : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
43 : "memory");
44 return to;
45}
46
47/*
48 * This looks ugly, but the compiler can optimize it totally,
49 * as the count is constant.
50 */
51static __always_inline void *__constant_memcpy(void *to, const void *from,
52 size_t n)
53{
54 long esi, edi;
55 if (!n)
56 return to;
57
58 switch (n) {
59 case 1:
60 *(char *)to = *(char *)from;
61 return to;
62 case 2:
63 *(short *)to = *(short *)from;
64 return to;
65 case 4:
66 *(int *)to = *(int *)from;
67 return to;
68
69 case 3:
70 *(short *)to = *(short *)from;
71 *((char *)to + 2) = *((char *)from + 2);
72 return to;
73 case 5:
74 *(int *)to = *(int *)from;
75 *((char *)to + 4) = *((char *)from + 4);
76 return to;
77 case 6:
78 *(int *)to = *(int *)from;
79 *((short *)to + 2) = *((short *)from + 2);
80 return to;
81 case 8:
82 *(int *)to = *(int *)from;
83 *((int *)to + 1) = *((int *)from + 1);
84 return to;
85 }
86
87 esi = (long)from;
88 edi = (long)to;
89 if (n >= 5 * 4) {
90 /* large block: use rep prefix */
91 int ecx;
92 asm volatile("rep ; movsl"
93 : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
94 : "0" (n / 4), "1" (edi), "2" (esi)
95 : "memory"
96 );
97 } else {
98 /* small block: don't clobber ecx + smaller code */
99 if (n >= 4 * 4)
100 asm volatile("movsl"
101 : "=&D"(edi), "=&S"(esi)
102 : "0"(edi), "1"(esi)
103 : "memory");
104 if (n >= 3 * 4)
105 asm volatile("movsl"
106 : "=&D"(edi), "=&S"(esi)
107 : "0"(edi), "1"(esi)
108 : "memory");
109 if (n >= 2 * 4)
110 asm volatile("movsl"
111 : "=&D"(edi), "=&S"(esi)
112 : "0"(edi), "1"(esi)
113 : "memory");
114 if (n >= 1 * 4)
115 asm volatile("movsl"
116 : "=&D"(edi), "=&S"(esi)
117 : "0"(edi), "1"(esi)
118 : "memory");
119 }
120 switch (n % 4) {
121 /* tail */
122 case 0:
123 return to;
124 case 1:
125 asm volatile("movsb"
126 : "=&D"(edi), "=&S"(esi)
127 : "0"(edi), "1"(esi)
128 : "memory");
129 return to;
130 case 2:
131 asm volatile("movsw"
132 : "=&D"(edi), "=&S"(esi)
133 : "0"(edi), "1"(esi)
134 : "memory");
135 return to;
136 default:
137 asm volatile("movsw\n\tmovsb"
138 : "=&D"(edi), "=&S"(esi)
139 : "0"(edi), "1"(esi)
140 : "memory");
141 return to;
142 }
143}
144
145#define __HAVE_ARCH_MEMCPY
146
147#ifdef CONFIG_X86_USE_3DNOW
148
149#include <asm/mmx.h>
150
151/*
152 * This CPU favours 3DNow strongly (e.g. AMD Athlon)
153 */
154
155static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
156{
157 if (len < 512)
158 return __constant_memcpy(to, from, len);
159 return _mmx_memcpy(to, from, len);
160}
161
162static inline void *__memcpy3d(void *to, const void *from, size_t len)
163{
164 if (len < 512)
165 return __memcpy(to, from, len);
166 return _mmx_memcpy(to, from, len);
167}
168
169#define memcpy(t, f, n) \
170 (__builtin_constant_p((n)) \
171 ? __constant_memcpy3d((t), (f), (n)) \
172 : __memcpy3d((t), (f), (n)))
173
174#else
175
176/*
177 * No 3D Now!
178 */
179
180#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n)))
184
185#endif
186
187#define __HAVE_ARCH_MEMMOVE
188void *memmove(void *dest, const void *src, size_t n);
189
190#define memcmp __builtin_memcmp
191
192#define __HAVE_ARCH_MEMCHR
193extern void *memchr(const void *cs, int c, size_t count);
194
195static inline void *__memset_generic(void *s, char c, size_t count)
196{
197 int d0, d1;
198 asm volatile("rep\n\t"
199 "stosb"
200 : "=&c" (d0), "=&D" (d1)
201 : "a" (c), "1" (s), "0" (count)
202 : "memory");
203 return s;
204}
205
206/* we might want to write optimized versions of these later */
207#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
208
209/*
210 * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
211 * things 32 bits at a time even when we don't know the size of the
213 * area at compile-time.
213 */
214static __always_inline
215void *__constant_c_memset(void *s, unsigned long c, size_t count)
216{
217 int d0, d1;
218 asm volatile("rep ; stosl\n\t"
219 "testb $2,%b3\n\t"
220 "je 1f\n\t"
221 "stosw\n"
222 "1:\ttestb $1,%b3\n\t"
223 "je 2f\n\t"
224 "stosb\n"
225 "2:"
226 : "=&c" (d0), "=&D" (d1)
227 : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
228 : "memory");
229 return s;
230}
231
232/* Added by Gertjan van Wingerde to make the minix and sysv modules work */
233#define __HAVE_ARCH_STRNLEN
234extern size_t strnlen(const char *s, size_t count);
235/* end of additional stuff */
236
237#define __HAVE_ARCH_STRSTR
238extern char *strstr(const char *cs, const char *ct);
239
240/*
241 * This looks horribly ugly, but the compiler can optimize it totally,
242 * as by now we know that both pattern and count are constant.
243 */
244static __always_inline
245void *__constant_c_and_count_memset(void *s, unsigned long pattern,
246 size_t count)
247{
248 switch (count) {
249 case 0:
250 return s;
251 case 1:
252 *(unsigned char *)s = pattern & 0xff;
253 return s;
254 case 2:
255 *(unsigned short *)s = pattern & 0xffff;
256 return s;
257 case 3:
258 *(unsigned short *)s = pattern & 0xffff;
259 *((unsigned char *)s + 2) = pattern & 0xff;
260 return s;
261 case 4:
262 *(unsigned long *)s = pattern;
263 return s;
264 }
265
266#define COMMON(x) \
267 asm volatile("rep ; stosl" \
268 x \
269 : "=&c" (d0), "=&D" (d1) \
270 : "a" (eax), "0" (count/4), "1" ((long)s) \
271 : "memory")
272
273 {
274 int d0, d1;
275#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
276 /* Workaround for broken gcc 4.0 */
277 register unsigned long eax asm("%eax") = pattern;
278#else
279 unsigned long eax = pattern;
280#endif
281
282 switch (count % 4) {
283 case 0:
284 COMMON("");
285 return s;
286 case 1:
287 COMMON("\n\tstosb");
288 return s;
289 case 2:
290 COMMON("\n\tstosw");
291 return s;
292 default:
293 COMMON("\n\tstosw\n\tstosb");
294 return s;
295 }
296 }
297
298#undef COMMON
299}
300
301#define __constant_c_x_memset(s, c, count) \
302 (__builtin_constant_p(count) \
303 ? __constant_c_and_count_memset((s), (c), (count)) \
304 : __constant_c_memset((s), (c), (count)))
305
306#define __memset(s, c, count) \
307 (__builtin_constant_p(count) \
308 ? __constant_count_memset((s), (c), (count)) \
309 : __memset_generic((s), (c), (count)))
310
311#define __HAVE_ARCH_MEMSET
312#define memset(s, c, count) \
313 (__builtin_constant_p(c) \
314 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
315 (count)) \
316 : __memset((s), (c), (count)))
317
318/*
319 * find the first occurrence of byte 'c', or 1 past the area if none
320 */
321#define __HAVE_ARCH_MEMSCAN
322extern void *memscan(void *addr, int c, size_t size);
323
324#endif /* __KERNEL__ */
325
326#endif /* _ASM_X86_STRING_32_H */
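
The memset() macro above widens a one-byte fill value with the multiplication 0x01010101UL * c, so the rep;stosl loop can store four identical bytes per iteration. A standalone check of that trick:

/* Byte-replication trick used by the memset() macro above. */
#include <stdio.h>

int main(void)
{
	unsigned char c = 0xab;
	unsigned long pattern = 0x01010101UL * c;

	printf("0x%02x -> 0x%08lx\n", c, pattern);	/* 0xab -> 0xabababab */
	return 0;
}
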
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
new file mode 100644
index 00000000000..2afe164bf1e
--- /dev/null
+++ b/arch/x86/include/asm/string_64.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_STRING_64_H
2#define _ASM_X86_STRING_64_H
3
4#ifdef __KERNEL__
5
6/* Written 2002 by Andi Kleen */
7
8/* Only used for special circumstances. Stolen from i386/string.h */
9static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
10{
11 unsigned long d0, d1, d2;
12 asm volatile("rep ; movsl\n\t"
13 "testb $2,%b4\n\t"
14 "je 1f\n\t"
15 "movsw\n"
16 "1:\ttestb $1,%b4\n\t"
17 "je 2f\n\t"
18 "movsb\n"
19 "2:"
20 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
21 : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
22 : "memory");
23 return to;
24}
25
26/* Even with __builtin_ the compiler may decide to use the out-of-line
27 function. */
28
29#define __HAVE_ARCH_MEMCPY 1
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len);
32#else
33extern void *__memcpy(void *to, const void *from, size_t len);
34#define memcpy(dst, src, len) \
35({ \
36 size_t __len = (len); \
37 void *__ret; \
38 if (__builtin_constant_p(len) && __len >= 64) \
39 __ret = __memcpy((dst), (src), __len); \
40 else \
41 __ret = __builtin_memcpy((dst), (src), __len); \
42 __ret; \
43})
44#endif
45
46#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n);
48
49#define __HAVE_ARCH_MEMMOVE
50void *memmove(void *dest, const void *src, size_t count);
51
52int memcmp(const void *cs, const void *ct, size_t count);
53size_t strlen(const char *s);
54char *strcpy(char *dest, const char *src);
55char *strcat(char *dest, const char *src);
56int strcmp(const char *cs, const char *ct);
57
58#endif /* __KERNEL__ */
59
60#endif /* _ASM_X86_STRING_64_H */
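
For older compilers, the memcpy() macro above dispatches at compile time: __builtin_constant_p(len) with len >= 64 routes to the out-of-line __memcpy, everything else to __builtin_memcpy. A user-space model of that dispatch, with my_memcpy_big standing in for the kernel's out-of-line copy (names here are made up):

/* Model of the compile-time memcpy dispatch above (GCC statement
 * expressions, as in the original macro). */
#include <stdio.h>
#include <string.h>

static void *my_memcpy_big(void *dst, const void *src, size_t len)
{
	return memcpy(dst, src, len);	/* pretend out-of-line version */
}

#define my_memcpy(dst, src, len)				\
({								\
	size_t __len = (len);					\
	void *__ret;						\
	if (__builtin_constant_p(len) && __len >= 64)		\
		__ret = my_memcpy_big((dst), (src), __len);	\
	else							\
		__ret = __builtin_memcpy((dst), (src), __len);	\
	__ret;							\
})

int main(void)
{
	char a[128] = "hello", b[128];

	my_memcpy(b, a, sizeof(a));	/* constant, >= 64: out-of-line path */
	my_memcpy(b, a, strlen(a) + 1);	/* variable length: builtin path */
	printf("%s\n", b);
	return 0;
}
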
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
new file mode 100644
index 00000000000..9b3070f1c2a
--- /dev/null
+++ b/arch/x86/include/asm/summit/apic.h
@@ -0,0 +1,184 @@
1#ifndef __ASM_SUMMIT_APIC_H
2#define __ASM_SUMMIT_APIC_H
3
4#include <asm/smp.h>
5
6#define esr_disable (1)
7#define NO_BALANCE_IRQ (0)
8
9/* In clustered mode, the high nibble of APIC ID is a cluster number.
10 * The low nibble is a 4-bit bitmap. */
11#define XAPIC_DEST_CPUS_SHIFT 4
12#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
13#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
14
15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
16
17static inline cpumask_t target_cpus(void)
18{
19 /* CPU_MASK_ALL (0xff) has undefined behaviour with
20 * dest_LowestPrio mode logical clustered apic interrupt routing.
21 * Just start on cpu 0; IRQ balancing will spread the load.
22 */
23 return cpumask_of_cpu(0);
24}
25
26#define INT_DELIVERY_MODE (dest_LowestPrio)
27#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
28
29static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
30{
31 return 0;
32}
33
34/* we don't use the phys_cpu_present_map to indicate apicid presence */
35static inline unsigned long check_apicid_present(int bit)
36{
37 return 1;
38}
39
40#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
41
42extern u8 cpu_2_logical_apicid[];
43
44static inline void init_apic_ldr(void)
45{
46 unsigned long val, id;
47 int count = 0;
48 u8 my_id = (u8)hard_smp_processor_id();
49 u8 my_cluster = (u8)apicid_cluster(my_id);
50#ifdef CONFIG_SMP
51 u8 lid;
52 int i;
53
54 /* Create logical APIC IDs by counting CPUs already in cluster. */
55 for (count = 0, i = NR_CPUS; --i >= 0; ) {
56 lid = cpu_2_logical_apicid[i];
57 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
58 ++count;
59 }
60#endif
61 /* We only have a 4 wide bitmap in cluster mode. If a deranged
62 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
63 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
64 id = my_cluster | (1UL << count);
65 apic_write(APIC_DFR, APIC_DFR_VALUE);
66 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
67 val |= SET_APIC_LOGICAL_ID(id);
68 apic_write(APIC_LDR, val);
69}
70
71static inline int multi_timer_check(int apic, int irq)
72{
73 return 0;
74}
75
76static inline int apic_id_registered(void)
77{
78 return 1;
79}
80
81static inline void setup_apic_routing(void)
82{
83 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
84 nr_ioapics);
85}
86
87static inline int apicid_to_node(int logical_apicid)
88{
89#ifdef CONFIG_SMP
90 return apicid_2_node[hard_smp_processor_id()];
91#else
92 return 0;
93#endif
94}
95
96/* Mapping from cpu number to logical apicid */
97static inline int cpu_to_logical_apicid(int cpu)
98{
99#ifdef CONFIG_SMP
100 if (cpu >= NR_CPUS)
101 return BAD_APICID;
102 return (int)cpu_2_logical_apicid[cpu];
103#else
104 return logical_smp_processor_id();
105#endif
106}
107
108static inline int cpu_present_to_apicid(int mps_cpu)
109{
110 if (mps_cpu < NR_CPUS)
111 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
112 else
113 return BAD_APICID;
114}
115
116static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
117{
118 /* For clustered systems we don't have a good way to do this yet - hack */
119 return physids_promote(0x0F);
120}
121
122static inline physid_mask_t apicid_to_cpu_present(int apicid)
123{
124 return physid_mask_of_physid(0);
125}
126
127static inline void setup_portio_remap(void)
128{
129}
130
131static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
132{
133 return 1;
134}
135
136static inline void enable_apic_mode(void)
137{
138}
139
140static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
141{
142 int num_bits_set;
143 int cpus_found = 0;
144 int cpu;
145 int apicid;
146
147 num_bits_set = cpus_weight(cpumask);
148 /* Return id to all */
149 if (num_bits_set == NR_CPUS)
150 return (int) 0xFF;
151 /*
152 * The cpus in the mask must all be on the same apicid cluster. If they
153 * are not, fall back to 0xFF, which targets all CPUs.
154 */
155 cpu = first_cpu(cpumask);
156 apicid = cpu_to_logical_apicid(cpu);
157 while (cpus_found < num_bits_set) {
158 if (cpu_isset(cpu, cpumask)) {
159 int new_apicid = cpu_to_logical_apicid(cpu);
160 if (apicid_cluster(apicid) !=
161 apicid_cluster(new_apicid)){
162 printk("%s: Not a valid mask!\n", __func__);
163 return 0xFF;
164 }
165 apicid = apicid | new_apicid;
166 cpus_found++;
167 }
168 cpu++;
169 }
170 return apicid;
171}
172
173/* cpuid returns the value latched in the HW at reset, not the APIC ID
174 * register's value. For any box whose BIOS changes APIC IDs, like
175 * clustered APIC systems, we must use hard_smp_processor_id.
176 *
177 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
178 */
179static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
180{
181 return hard_smp_processor_id() >> index_msb;
182}
183
184#endif /* __ASM_SUMMIT_APIC_H */
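
In Summit's clustered mode the 8-bit logical APIC ID is a cluster number in the high nibble plus a one-hot CPU bitmap in the low nibble, which is what init_apic_ldr() constructs above. A standalone demo of that arithmetic using the same masks as the header (the example IDs are made up):

/* Arithmetic behind init_apic_ldr(): cluster nibble | one-hot CPU bit. */
#include <stdio.h>

#define XAPIC_DEST_CPUS_SHIFT	4
#define XAPIC_DEST_CPUS_MASK	((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
#define XAPIC_DEST_CLUSTER_MASK	(XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
#define apicid_cluster(apicid)	((apicid) & XAPIC_DEST_CLUSTER_MASK)

int main(void)
{
	unsigned int hard_id = 0x23;	/* hypothetical hard APIC ID, cluster 2 */
	unsigned int count = 2;		/* third CPU already seen in this cluster */
	unsigned int id = apicid_cluster(hard_id) | (1u << count);

	/* prints: cluster 0x20, logical id 0x24 */
	printf("cluster 0x%02x, logical id 0x%02x\n",
	       apicid_cluster(hard_id), id);
	return 0;
}
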
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
new file mode 100644
index 00000000000..f3fbca1f61c
--- /dev/null
+++ b/arch/x86/include/asm/summit/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_SUMMIT_APICDEF_H
2#define __ASM_SUMMIT_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (x>>24)&0xFF;
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
new file mode 100644
index 00000000000..53bd1e7bd7b
--- /dev/null
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_SUMMIT_IPI_H
2#define __ASM_SUMMIT_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
new file mode 100644
index 00000000000..013ce6fab2d
--- /dev/null
+++ b/arch/x86/include/asm/summit/mpparse.h
@@ -0,0 +1,109 @@
1#ifndef __ASM_SUMMIT_MPPARSE_H
2#define __ASM_SUMMIT_MPPARSE_H
3
4#include <asm/tsc.h>
5
6extern int use_cyclone;
7
8#ifdef CONFIG_X86_SUMMIT_NUMA
9extern void setup_summit(void);
10#else
11#define setup_summit() {}
12#endif
13
14static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
15 char *productid)
16{
17 if (!strncmp(oem, "IBM ENSW", 8) &&
18 (!strncmp(productid, "VIGIL SMP", 9)
19 || !strncmp(productid, "EXA", 3)
20 || !strncmp(productid, "RUTHLESS SMP", 12))){
21 mark_tsc_unstable("Summit based system");
22 use_cyclone = 1; /* enable cyclone-timer */
23 setup_summit();
24 return 1;
25 }
26 return 0;
27}
28
29/* Hook from generic ACPI tables.c */
30static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
31{
32 if (!strncmp(oem_id, "IBM", 3) &&
33 (!strncmp(oem_table_id, "SERVIGIL", 8)
34 || !strncmp(oem_table_id, "EXA", 3))){
35 mark_tsc_unstable("Summit based system");
36 use_cyclone = 1; /* enable cyclone-timer */
37 setup_summit();
38 return 1;
39 }
40 return 0;
41}
42
43struct rio_table_hdr {
44 unsigned char version; /* Version number of this data structure */
45 /* Version 3 adds chassis_num & WP_index */
46 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
47 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
48} __attribute__((packed));
49
50struct scal_detail {
51 unsigned char node_id; /* Scalability Node ID */
52 unsigned long CBAR; /* Address of 1MB register space */
53 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
54 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
55 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
56 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
57 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
58 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
59 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
60} __attribute__((packed));
61
62struct rio_detail {
63 unsigned char node_id; /* RIO Node ID */
64 unsigned long BBAR; /* Address of 1MB register space */
65 unsigned char type; /* Type of device */
66 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
67 /* For CYC: Node ID of Twister that owns this CYC */
68 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
69 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
70 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
71 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
72 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
73 /* For CYC: 0 */
74 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
75 /* = 0 : the XAPIC is not used, ie:*/
76 /* ints fwded to another XAPIC */
77 /* Bits1:7 Reserved */
78 /* For CYC: Bits0:7 Reserved */
79 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
80 /* lower slot numbers/PCI bus numbers */
81 /* For CYC: No meaning */
82 unsigned char chassis_num; /* 1 based Chassis number */
83 /* For LookOut WPEGs this field indicates the */
84 /* Expansion Chassis #, enumerated from Boot */
85 /* Node WPEG external port, then Boot Node CYC */
86 /* external port, then Next Vigil chassis WPEG */
87 /* external port, etc. */
88 /* Shared Lookouts have only 1 chassis number (the */
89 /* first one assigned) */
90} __attribute__((packed));
91
92
93typedef enum {
94 CompatTwister = 0, /* Compatibility Twister */
95 AltTwister = 1, /* Alternate Twister of internal 8-way */
96 CompatCyclone = 2, /* Compatibility Cyclone */
97 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
98 CompatWPEG = 4, /* Compatibility WPEG */
99 AltWPEG = 5, /* Second Planar WPEG */
100 LookOutAWPEG = 6, /* LookOut WPEG */
101 LookOutBWPEG = 7, /* LookOut WPEG */
102} node_type;
103
104static inline int is_WPEG(struct rio_detail *rio){
105 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
106 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
107}
108
109#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
new file mode 100644
index 00000000000..9bd521fe457
--- /dev/null
+++ b/arch/x86/include/asm/suspend.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "suspend_32.h"
3#else
4# include "suspend_64.h"
5#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
new file mode 100644
index 00000000000..a5074bd0f8b
--- /dev/null
+++ b/arch/x86/include/asm/suspend_32.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
3 * Based on code
4 * Copyright 2001 Patrick Mochel <mochel@osdl.org>
5 */
6#ifndef _ASM_X86_SUSPEND_32_H
7#define _ASM_X86_SUSPEND_32_H
8
9#include <asm/desc.h>
10#include <asm/i387.h>
11
12static inline int arch_prepare_suspend(void) { return 0; }
13
14/* image of the saved processor state */
15struct saved_context {
16 u16 es, fs, gs, ss;
17 unsigned long cr0, cr2, cr3, cr4;
18 struct desc_ptr gdt;
19 struct desc_ptr idt;
20 u16 ldt;
21 u16 tss;
22 unsigned long tr;
23 unsigned long safety;
24 unsigned long return_address;
25} __attribute__((packed));
26
27#ifdef CONFIG_ACPI
28extern unsigned long saved_eip;
29extern unsigned long saved_esp;
30extern unsigned long saved_ebp;
31extern unsigned long saved_ebx;
32extern unsigned long saved_esi;
33extern unsigned long saved_edi;
34
35static inline void acpi_save_register_state(unsigned long return_point)
36{
37 saved_eip = return_point;
38 asm volatile("movl %%esp,%0" : "=m" (saved_esp));
39 asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
40 asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
41 asm volatile("movl %%edi,%0" : "=m" (saved_edi));
42 asm volatile("movl %%esi,%0" : "=m" (saved_esi));
43}
44
45#define acpi_restore_register_state() do {} while (0)
46
47/* routines for saving/restoring kernel state */
48extern int acpi_save_state_mem(void);
49#endif
50
51#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
new file mode 100644
index 00000000000..06284f42b75
--- /dev/null
+++ b/arch/x86/include/asm/suspend_64.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
3 * Based on code
4 * Copyright 2001 Patrick Mochel <mochel@osdl.org>
5 */
6#ifndef _ASM_X86_SUSPEND_64_H
7#define _ASM_X86_SUSPEND_64_H
8
9#include <asm/desc.h>
10#include <asm/i387.h>
11
12static inline int arch_prepare_suspend(void)
13{
14 return 0;
15}
16
17/*
18 * Image of the saved processor state, used by the low level ACPI suspend to
19 * RAM code and by the low level hibernation code.
20 *
21 * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
22 * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
23 * still work as required.
24 */
25struct saved_context {
26 struct pt_regs regs;
27 u16 ds, es, fs, gs, ss;
28 unsigned long gs_base, gs_kernel_base, fs_base;
29 unsigned long cr0, cr2, cr3, cr4, cr8;
30 unsigned long efer;
31 u16 gdt_pad;
32 u16 gdt_limit;
33 unsigned long gdt_base;
34 u16 idt_pad;
35 u16 idt_limit;
36 unsigned long idt_base;
37 u16 ldt;
38 u16 tss;
39 unsigned long tr;
40 unsigned long safety;
41 unsigned long return_address;
42} __attribute__((packed));
43
44#define loaddebug(thread,register) \
45 set_debugreg((thread)->debugreg##register, register)
46
47/* routines for saving/restoring kernel state */
48extern int acpi_save_state_mem(void);
49extern char core_restore_code;
50extern char restore_registers;
51
52#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
new file mode 100644
index 00000000000..51fb2c76ad7
--- /dev/null
+++ b/arch/x86/include/asm/swiotlb.h
@@ -0,0 +1,58 @@
1#ifndef _ASM_X86_SWIOTLB_H
2#define _ASM_X86_SWIOTLB_H
3
4#include <asm/dma-mapping.h>
5
6/* SWIOTLB interface */
7
8extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
9 size_t size, int dir);
10extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
11 dma_addr_t *dma_handle, gfp_t flags);
12extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
13 size_t size, int dir);
14extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
15 dma_addr_t dev_addr,
16 size_t size, int dir);
17extern void swiotlb_sync_single_for_device(struct device *hwdev,
18 dma_addr_t dev_addr,
19 size_t size, int dir);
20extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
21 dma_addr_t dev_addr,
22 unsigned long offset,
23 size_t size, int dir);
24extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
25 dma_addr_t dev_addr,
26 unsigned long offset,
27 size_t size, int dir);
28extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
29 struct scatterlist *sg, int nelems,
30 int dir);
31extern void swiotlb_sync_sg_for_device(struct device *hwdev,
32 struct scatterlist *sg, int nelems,
33 int dir);
34extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
35 int nents, int direction);
36extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
37 int nents, int direction);
38extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
39extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
40 void *vaddr, dma_addr_t dma_handle);
41extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
42extern void swiotlb_init(void);
43
44extern int swiotlb_force;
45
46#ifdef CONFIG_SWIOTLB
47extern int swiotlb;
48extern void pci_swiotlb_init(void);
49#else
50#define swiotlb 0
51static inline void pci_swiotlb_init(void)
52{
53}
54#endif
55
56static inline void dma_mark_clean(void *addr, size_t size) {}
57
58#endif /* _ASM_X86_SWIOTLB_H */
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
new file mode 100644
index 00000000000..9d09b4073b6
--- /dev/null
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -0,0 +1,130 @@
1#ifndef _ASM_X86_SYNC_BITOPS_H
2#define _ASM_X86_SYNC_BITOPS_H
3
4/*
5 * Copyright 1992, Linus Torvalds.
6 */
7
8/*
9 * These have to be done with inline assembly: that way the bit-setting
10 * is guaranteed to be atomic. All bit operations return 0 if the bit
11 * was cleared before the operation and != 0 if it was not.
12 *
13 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
14 */
15
16#define ADDR (*(volatile long *)addr)
17
18/**
19 * sync_set_bit - Atomically set a bit in memory
20 * @nr: the bit to set
21 * @addr: the address to start counting from
22 *
23 * This function is atomic and may not be reordered. See __set_bit()
24 * if you do not require the atomic guarantees.
25 *
26 * Note that @nr may be almost arbitrarily large; this function is not
27 * restricted to acting on a single-word quantity.
28 */
29static inline void sync_set_bit(int nr, volatile unsigned long *addr)
30{
31 asm volatile("lock; btsl %1,%0"
32 : "+m" (ADDR)
33 : "Ir" (nr)
34 : "memory");
35}
36
37/**
38 * sync_clear_bit - Clears a bit in memory
39 * @nr: Bit to clear
40 * @addr: Address to start counting from
41 *
42 * sync_clear_bit() is atomic and may not be reordered. However, it does
43 * not contain a memory barrier, so if it is used for locking purposes,
44 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
45 * in order to ensure changes are visible on other processors.
46 */
47static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
48{
49 asm volatile("lock; btrl %1,%0"
50 : "+m" (ADDR)
51 : "Ir" (nr)
52 : "memory");
53}
54
55/**
56 * sync_change_bit - Toggle a bit in memory
57 * @nr: Bit to change
58 * @addr: Address to start counting from
59 *
60 * sync_change_bit() is atomic and may not be reordered.
61 * Note that @nr may be almost arbitrarily large; this function is not
62 * restricted to acting on a single-word quantity.
63 */
64static inline void sync_change_bit(int nr, volatile unsigned long *addr)
65{
66 asm volatile("lock; btcl %1,%0"
67 : "+m" (ADDR)
68 : "Ir" (nr)
69 : "memory");
70}
71
72/**
73 * sync_test_and_set_bit - Set a bit and return its old value
74 * @nr: Bit to set
75 * @addr: Address to count from
76 *
77 * This operation is atomic and cannot be reordered.
78 * It also implies a memory barrier.
79 */
80static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
81{
82 int oldbit;
83
84 asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
85 : "=r" (oldbit), "+m" (ADDR)
86 : "Ir" (nr) : "memory");
87 return oldbit;
88}
89
90/**
91 * sync_test_and_clear_bit - Clear a bit and return its old value
92 * @nr: Bit to clear
93 * @addr: Address to count from
94 *
95 * This operation is atomic and cannot be reordered.
96 * It also implies a memory barrier.
97 */
98static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
99{
100 int oldbit;
101
102 asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
103 : "=r" (oldbit), "+m" (ADDR)
104 : "Ir" (nr) : "memory");
105 return oldbit;
106}
107
108/**
109 * sync_test_and_change_bit - Change a bit and return its old value
110 * @nr: Bit to change
111 * @addr: Address to count from
112 *
113 * This operation is atomic and cannot be reordered.
114 * It also implies a memory barrier.
115 */
116static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
117{
118 int oldbit;
119
120 asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
121 : "=r" (oldbit), "+m" (ADDR)
122 : "Ir" (nr) : "memory");
123 return oldbit;
124}
125
126#define sync_test_bit(nr, addr) test_bit(nr, addr)
127
128#undef ADDR
129
130#endif /* _ASM_X86_SYNC_BITOPS_H */
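
Because sync_test_and_set_bit() returns the bit's previous value, it is enough to build a trivial test-and-set lock, which is the sort of use hypervisor-facing code makes of these operations. A portable sketch with GCC __sync builtins standing in for the lock;btsl and lock;btrl instructions:

/* Test-and-set lock built on a sync_test_and_set_bit()-style primitive;
 * __sync builtins model the LOCK-prefixed bit instructions. */
static inline int demo_test_and_set_bit(int nr, volatile unsigned long *addr)
{
	unsigned long mask = 1UL << nr;
	return (__sync_fetch_and_or(addr, mask) & mask) != 0;
}

static inline void demo_clear_bit(int nr, volatile unsigned long *addr)
{
	__sync_fetch_and_and(addr, ~(1UL << nr));
}

static volatile unsigned long lock_word;

static void demo_lock(void)
{
	while (demo_test_and_set_bit(0, &lock_word))
		;	/* spin until we observed the bit as previously clear */
}

static void demo_unlock(void)
{
	demo_clear_bit(0, &lock_word);
}

int main(void)
{
	demo_lock();
	demo_unlock();
	return 0;
}
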
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
new file mode 100644
index 00000000000..d82f39bb790
--- /dev/null
+++ b/arch/x86/include/asm/syscall.h
@@ -0,0 +1,213 @@
1/*
2 * Access to user system call parameters and results
3 *
4 * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * See asm-generic/syscall.h for descriptions of what we must do here.
11 */
12
13#ifndef _ASM_X86_SYSCALL_H
14#define _ASM_X86_SYSCALL_H
15
16#include <linux/sched.h>
17#include <linux/err.h>
18
19static inline long syscall_get_nr(struct task_struct *task,
20 struct pt_regs *regs)
21{
22 /*
23 * We always sign-extend a -1 value being set here,
24 * so this is always either -1L or a syscall number.
25 */
26 return regs->orig_ax;
27}
28
29static inline void syscall_rollback(struct task_struct *task,
30 struct pt_regs *regs)
31{
32 regs->ax = regs->orig_ax;
33}
34
35static inline long syscall_get_error(struct task_struct *task,
36 struct pt_regs *regs)
37{
38 unsigned long error = regs->ax;
39#ifdef CONFIG_IA32_EMULATION
40 /*
41 * TS_COMPAT is set for 32-bit syscall entries and then
42 * remains set until we return to user mode.
43 */
44 if (task_thread_info(task)->status & TS_COMPAT)
45 /*
46 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
47 * and will match correctly in comparisons.
48 */
49 error = (long) (int) error;
50#endif
51 return IS_ERR_VALUE(error) ? error : 0;
52}
53
54static inline long syscall_get_return_value(struct task_struct *task,
55 struct pt_regs *regs)
56{
57 return regs->ax;
58}
59
60static inline void syscall_set_return_value(struct task_struct *task,
61 struct pt_regs *regs,
62 int error, long val)
63{
64 regs->ax = (long) error ?: val;
65}
66
67#ifdef CONFIG_X86_32
68
69static inline void syscall_get_arguments(struct task_struct *task,
70 struct pt_regs *regs,
71 unsigned int i, unsigned int n,
72 unsigned long *args)
73{
74 BUG_ON(i + n > 6);
75 memcpy(args, &regs->bx + i, n * sizeof(args[0]));
76}
77
78static inline void syscall_set_arguments(struct task_struct *task,
79 struct pt_regs *regs,
80 unsigned int i, unsigned int n,
81 const unsigned long *args)
82{
83 BUG_ON(i + n > 6);
84 memcpy(&regs->bx + i, args, n * sizeof(args[0]));
85}
86
87#else /* CONFIG_X86_64 */
88
89static inline void syscall_get_arguments(struct task_struct *task,
90 struct pt_regs *regs,
91 unsigned int i, unsigned int n,
92 unsigned long *args)
93{
94# ifdef CONFIG_IA32_EMULATION
95 if (task_thread_info(task)->status & TS_COMPAT)
96 switch (i) {
97 case 0:
98 if (!n--) break;
99 *args++ = regs->bx;
100 case 1:
101 if (!n--) break;
102 *args++ = regs->cx;
103 case 2:
104 if (!n--) break;
105 *args++ = regs->dx;
106 case 3:
107 if (!n--) break;
108 *args++ = regs->si;
109 case 4:
110 if (!n--) break;
111 *args++ = regs->di;
112 case 5:
113 if (!n--) break;
114 *args++ = regs->bp;
115 case 6:
116 if (!n--) break;
117 default:
118 BUG();
119 break;
120 }
121 else
122# endif
123 switch (i) {
124 case 0:
125 if (!n--) break;
126 *args++ = regs->di;
127 case 1:
128 if (!n--) break;
129 *args++ = regs->si;
130 case 2:
131 if (!n--) break;
132 *args++ = regs->dx;
133 case 3:
134 if (!n--) break;
135 *args++ = regs->r10;
136 case 4:
137 if (!n--) break;
138 *args++ = regs->r8;
139 case 5:
140 if (!n--) break;
141 *args++ = regs->r9;
142 case 6:
143 if (!n--) break;
144 default:
145 BUG();
146 break;
147 }
148}
149
150static inline void syscall_set_arguments(struct task_struct *task,
151 struct pt_regs *regs,
152 unsigned int i, unsigned int n,
153 const unsigned long *args)
154{
155# ifdef CONFIG_IA32_EMULATION
156 if (task_thread_info(task)->status & TS_COMPAT)
157 switch (i) {
158 case 0:
159 if (!n--) break;
160 regs->bx = *args++;
161 case 1:
162 if (!n--) break;
163 regs->cx = *args++;
164 case 2:
165 if (!n--) break;
166 regs->dx = *args++;
167 case 3:
168 if (!n--) break;
169 regs->si = *args++;
170 case 4:
171 if (!n--) break;
172 regs->di = *args++;
173 case 5:
174 if (!n--) break;
175 regs->bp = *args++;
176 case 6:
177 if (!n--) break;
178 default:
179 BUG();
180 break;
181 }
182 else
183# endif
184 switch (i) {
185 case 0:
186 if (!n--) break;
187 regs->di = *args++;
188 case 1:
189 if (!n--) break;
190 regs->si = *args++;
191 case 2:
192 if (!n--) break;
193 regs->dx = *args++;
194 case 3:
195 if (!n--) break;
196 regs->r10 = *args++;
197 case 4:
198 if (!n--) break;
199 regs->r8 = *args++;
200 case 5:
201 if (!n--) break;
202 regs->r9 = *args++;
203 case 6:
204 if (!n--) break;
205 default:
206 BUG();
207 break;
208 }
209}
210
211#endif /* CONFIG_X86_32 */
212
213#endif /* _ASM_X86_SYSCALL_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
new file mode 100644
index 00000000000..87803da4401
--- /dev/null
+++ b/arch/x86/include/asm/syscalls.h
@@ -0,0 +1,93 @@
1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 *
4 * Copyright (c) 2008 Jaswinder Singh
5 *
6 * This file is released under the GPLv2.
7 * See the file COPYING for more details.
8 */
9
10#ifndef _ASM_X86_SYSCALLS_H
11#define _ASM_X86_SYSCALLS_H
12
13#include <linux/compiler.h>
14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h>
17
18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21
22/* X86_32 only */
23#ifdef CONFIG_X86_32
24/* kernel/process_32.c */
25asmlinkage int sys_fork(struct pt_regs);
26asmlinkage int sys_clone(struct pt_regs);
27asmlinkage int sys_vfork(struct pt_regs);
28asmlinkage int sys_execve(struct pt_regs);
29
30/* kernel/signal_32.c */
31asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
32asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
33 struct old_sigaction __user *);
34asmlinkage int sys_sigaltstack(unsigned long);
35asmlinkage unsigned long sys_sigreturn(unsigned long);
36asmlinkage int sys_rt_sigreturn(unsigned long);
37
38/* kernel/ioport.c */
39asmlinkage long sys_iopl(unsigned long);
40
41/* kernel/ldt.c */
42asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
43
44/* kernel/sys_i386_32.c */
45asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
46 unsigned long, unsigned long, unsigned long);
47struct mmap_arg_struct;
48asmlinkage int old_mmap(struct mmap_arg_struct __user *);
49struct sel_arg_struct;
50asmlinkage int old_select(struct sel_arg_struct __user *);
51asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
52struct old_utsname;
53asmlinkage int sys_uname(struct old_utsname __user *);
54struct oldold_utsname;
55asmlinkage int sys_olduname(struct oldold_utsname __user *);
56
57/* kernel/tls.c */
58asmlinkage int sys_set_thread_area(struct user_desc __user *);
59asmlinkage int sys_get_thread_area(struct user_desc __user *);
60
61/* kernel/vm86_32.c */
62asmlinkage int sys_vm86old(struct pt_regs);
63asmlinkage int sys_vm86(struct pt_regs);
64
65#else /* CONFIG_X86_32 */
66
67/* X86_64 only */
68/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *,
72 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *,
76 struct pt_regs *);
77
78/* kernel/ioport.c */
79asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
80
81/* kernel/signal_64.c */
82asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
83 struct pt_regs *);
84asmlinkage long sys_rt_sigreturn(struct pt_regs *);
85
86/* kernel/sys_x86_64.c */
87asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
88 unsigned long, unsigned long, unsigned long);
89struct new_utsname;
90asmlinkage long sys_uname(struct new_utsname __user *);
91
92#endif /* CONFIG_X86_32 */
93#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
new file mode 100644
index 00000000000..2ed3f0f44ff
--- /dev/null
+++ b/arch/x86/include/asm/system.h
@@ -0,0 +1,425 @@
1#ifndef _ASM_X86_SYSTEM_H
2#define _ASM_X86_SYSTEM_H
3
4#include <asm/asm.h>
5#include <asm/segment.h>
6#include <asm/cpufeature.h>
7#include <asm/cmpxchg.h>
8#include <asm/nops.h>
9
10#include <linux/kernel.h>
11#include <linux/irqflags.h>
12
13/* entries in ARCH_DLINFO: */
14#ifdef CONFIG_IA32_EMULATION
15# define AT_VECTOR_SIZE_ARCH 2
16#else
17# define AT_VECTOR_SIZE_ARCH 1
18#endif
19
20#ifdef CONFIG_X86_32
21
22struct task_struct; /* one of the stranger aspects of C forward declarations */
23struct task_struct *__switch_to(struct task_struct *prev,
24 struct task_struct *next);
25
26/*
27 * Saving eflags is important. Not only does it switch IOPL between
28 * tasks, it also protects other tasks from NT leaking through sysenter etc.
29 */
30#define switch_to(prev, next, last) \
31do { \
32 /* \
33 * Context-switching clobbers all registers, so we clobber \
34 * them explicitly, via unused output variables. \
35	 * (EAX and EBP are not listed because EBP is saved/restored \
36 * explicitly for wchan access and EAX is the return value of \
37 * __switch_to()) \
38 */ \
39 unsigned long ebx, ecx, edx, esi, edi; \
40 \
41 asm volatile("pushfl\n\t" /* save flags */ \
42 "pushl %%ebp\n\t" /* save EBP */ \
43 "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
44 "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
45 "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
46 "pushl %[next_ip]\n\t" /* restore EIP */ \
47 "jmp __switch_to\n" /* regparm call */ \
48 "1:\t" \
49 "popl %%ebp\n\t" /* restore EBP */ \
50 "popfl\n" /* restore flags */ \
51 \
52 /* output parameters */ \
53 : [prev_sp] "=m" (prev->thread.sp), \
54 [prev_ip] "=m" (prev->thread.ip), \
55 "=a" (last), \
56 \
57 /* clobbered output registers: */ \
58 "=b" (ebx), "=c" (ecx), "=d" (edx), \
59 "=S" (esi), "=D" (edi) \
60 \
61 /* input parameters: */ \
62 : [next_sp] "m" (next->thread.sp), \
63 [next_ip] "m" (next->thread.ip), \
64 \
65 /* regparm parameters for __switch_to(): */ \
66 [prev] "a" (prev), \
67 [next] "d" (next) \
68 \
69 : /* reloaded segment registers */ \
70 "memory"); \
71} while (0)
72
73/*
74 * disable hlt during certain critical i/o operations
75 */
76#define HAVE_DISABLE_HLT
77#else
78#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
79#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
80
81/* frame pointer must be last for get_wchan */
82#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
83#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
84
85#define __EXTRA_CLOBBER \
86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
87 "r12", "r13", "r14", "r15"
88
89/* Save and restore flags to clear and handle the leaking NT flag */
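/*
 * After __switch_to() returns, "btr" atomically clears TIF_FORK in the
 * incoming task's thread flags and leaves its old value in CF, so a
 * freshly forked task takes the "jc" to ret_from_fork instead of
 * falling through to RESTORE_CONTEXT.
 */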
90#define switch_to(prev, next, last) \
91 asm volatile(SAVE_CONTEXT \
92 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
93 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
94 "call __switch_to\n\t" \
95 ".globl thread_return\n" \
96 "thread_return:\n\t" \
97 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
98 "movq %P[thread_info](%%rsi),%%r8\n\t" \
99 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
100 "movq %%rax,%%rdi\n\t" \
101 "jc ret_from_fork\n\t" \
102 RESTORE_CONTEXT \
103 : "=a" (last) \
104 : [next] "S" (next), [prev] "D" (prev), \
105 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
106 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
107 [tif_fork] "i" (TIF_FORK), \
108 [thread_info] "i" (offsetof(struct task_struct, stack)), \
109 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
110 : "memory", "cc" __EXTRA_CLOBBER)
111#endif
112
113#ifdef __KERNEL__
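/*
 * The hardware descriptor layout scatters the 32-bit base across
 * bytes 2, 3, 4 and 7 of the 8-byte entry; _set_base()/_set_limit()
 * below poke those bytes in place.
 */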
114#define _set_base(addr, base) do { unsigned long __pr; \
115__asm__ __volatile__ ("movw %%dx,%1\n\t" \
116 "rorl $16,%%edx\n\t" \
117 "movb %%dl,%2\n\t" \
118 "movb %%dh,%3" \
119 :"=&d" (__pr) \
120 :"m" (*((addr)+2)), \
121 "m" (*((addr)+4)), \
122 "m" (*((addr)+7)), \
123 "0" (base) \
124 ); } while (0)
125
126#define _set_limit(addr, limit) do { unsigned long __lr; \
127__asm__ __volatile__ ("movw %%dx,%1\n\t" \
128 "rorl $16,%%edx\n\t" \
129 "movb %2,%%dh\n\t" \
130 "andb $0xf0,%%dh\n\t" \
131 "orb %%dh,%%dl\n\t" \
132 "movb %%dl,%2" \
133 :"=&d" (__lr) \
134 :"m" (*(addr)), \
135 "m" (*((addr)+6)), \
136 "0" (limit) \
137 ); } while (0)
138
139#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
140#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
141
142extern void native_load_gs_index(unsigned);
143
144/*
145 * Load a segment. Fall back on loading the zero
146 * segment if something goes wrong..
147 */
148#define loadsegment(seg, value) \
149 asm volatile("\n" \
150 "1:\t" \
151 "movl %k0,%%" #seg "\n" \
152 "2:\n" \
153 ".section .fixup,\"ax\"\n" \
154 "3:\t" \
155 "movl %k1, %%" #seg "\n\t" \
156 "jmp 2b\n" \
157 ".previous\n" \
158 _ASM_EXTABLE(1b,3b) \
159 : :"r" (value), "r" (0) : "memory")
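/* Typical use is e.g. loadsegment(fs, 0) to (re)load %fs on context switch. */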
160
161
162/*
163 * Save a segment register away
164 */
165#define savesegment(seg, value) \
166 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
167
168static inline unsigned long get_limit(unsigned long segment)
169{
170 unsigned long __limit;
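	/* "lsl" loads the segment limit (last valid offset); +1 turns it into a size. */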
171 asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
172 return __limit + 1;
173}
174
175static inline void native_clts(void)
176{
177 asm volatile("clts");
178}
179
180/*
181 * Volatile isn't enough to prevent the compiler from reordering the
182 * read/write functions for the control registers and messing everything up.
183 * A memory clobber would solve the problem, but would prevent reordering of
184 * all loads/stores around it, which can hurt performance. The solution is to
185 * use a variable and mimic reads and writes to it to enforce serialization.
186 */
187static unsigned long __force_order;
188
189static inline unsigned long native_read_cr0(void)
190{
191 unsigned long val;
192 asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
193 return val;
194}
195
196static inline void native_write_cr0(unsigned long val)
197{
198 asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
199}
200
201static inline unsigned long native_read_cr2(void)
202{
203 unsigned long val;
204 asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
205 return val;
206}
207
208static inline void native_write_cr2(unsigned long val)
209{
210 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
211}
212
213static inline unsigned long native_read_cr3(void)
214{
215 unsigned long val;
216 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
217 return val;
218}
219
220static inline void native_write_cr3(unsigned long val)
221{
222 asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
223}
224
225static inline unsigned long native_read_cr4(void)
226{
227 unsigned long val;
228 asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
229 return val;
230}
231
232static inline unsigned long native_read_cr4_safe(void)
233{
234 unsigned long val;
235	/* This could fault if %cr4 does not exist. On x86_64, CR4 always
236	 * exists, so it can never fail. */
237#ifdef CONFIG_X86_32
238 asm volatile("1: mov %%cr4, %0\n"
239 "2:\n"
240 _ASM_EXTABLE(1b, 2b)
241 : "=r" (val), "=m" (__force_order) : "0" (0));
242#else
243 val = native_read_cr4();
244#endif
245 return val;
246}
247
248static inline void native_write_cr4(unsigned long val)
249{
250 asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
251}
252
253#ifdef CONFIG_X86_64
254static inline unsigned long native_read_cr8(void)
255{
256 unsigned long cr8;
257 asm volatile("movq %%cr8,%0" : "=r" (cr8));
258 return cr8;
259}
260
261static inline void native_write_cr8(unsigned long val)
262{
263 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
264}
265#endif
266
267static inline void native_wbinvd(void)
268{
269 asm volatile("wbinvd": : :"memory");
270}
271
272#ifdef CONFIG_PARAVIRT
273#include <asm/paravirt.h>
274#else
275#define read_cr0() (native_read_cr0())
276#define write_cr0(x) (native_write_cr0(x))
277#define read_cr2() (native_read_cr2())
278#define write_cr2(x) (native_write_cr2(x))
279#define read_cr3() (native_read_cr3())
280#define write_cr3(x) (native_write_cr3(x))
281#define read_cr4() (native_read_cr4())
282#define read_cr4_safe() (native_read_cr4_safe())
283#define write_cr4(x) (native_write_cr4(x))
284#define wbinvd() (native_wbinvd())
285#ifdef CONFIG_X86_64
286#define read_cr8() (native_read_cr8())
287#define write_cr8(x) (native_write_cr8(x))
288#define load_gs_index native_load_gs_index
289#endif
290
291/* Clear the 'TS' bit */
292#define clts() (native_clts())
293
294#endif /* CONFIG_PARAVIRT */
295
296#define stts() write_cr0(read_cr0() | X86_CR0_TS)
297
298#endif /* __KERNEL__ */
299
300static inline void clflush(volatile void *__p)
301{
302 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
303}
304
305#define nop() asm volatile ("nop")
306
307void disable_hlt(void);
308void enable_hlt(void);
309
310void cpu_idle_wait(void);
311
312extern unsigned long arch_align_stack(unsigned long sp);
313extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
314
315void default_idle(void);
316
317/*
318 * Force strict CPU ordering.
319 * And yes, this is required on UP too when we're talking
320 * to devices.
321 */
322#ifdef CONFIG_X86_32
323/*
324 * Some non-Intel clones support out of order store. wmb() ceases to be a
325 * nop for these.
326 */
327#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
328#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
329#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
330#else
331#define mb() asm volatile("mfence":::"memory")
332#define rmb() asm volatile("lfence":::"memory")
333#define wmb() asm volatile("sfence" ::: "memory")
334#endif
335
336/**
337 * read_barrier_depends - Flush all pending reads that subsequent reads
338 * depend on.
339 *
340 * No data-dependent reads from memory-like regions are ever reordered
341 * over this barrier. All reads preceding this primitive are guaranteed
342 * to access memory (but not necessarily other CPUs' caches) before any
343 * reads following this primitive that depend on the data returned by
344 * any of the preceding reads. This primitive is much lighter weight
345 * than rmb() on most CPUs, and is never heavier weight than
346 * rmb().
347 *
348 * These ordering constraints are respected by both the local CPU
349 * and the compiler.
350 *
351 * Ordering is not guaranteed by anything other than these primitives,
352 * not even by data dependencies. See the documentation for
353 * memory_barrier() for examples and URLs to more information.
354 *
355 * For example, the following code would force ordering (the initial
356 * value of "a" is zero, "b" is one, and "p" is "&a"):
357 *
358 * <programlisting>
359 * CPU 0 CPU 1
360 *
361 * b = 2;
362 * memory_barrier();
363 * p = &b; q = p;
364 * read_barrier_depends();
365 * d = *q;
366 * </programlisting>
367 *
368 * because the read of "*q" depends on the read of "p" and these
369 * two reads are separated by a read_barrier_depends(). However,
370 * the following code, with the same initial values for "a" and "b":
371 *
372 * <programlisting>
373 * CPU 0 CPU 1
374 *
375 * a = 2;
376 * memory_barrier();
377 * b = 3; y = b;
378 * read_barrier_depends();
379 * x = a;
380 * </programlisting>
381 *
382 * does not enforce ordering, since there is no data dependency between
383 * the read of "a" and the read of "b". Therefore, on some CPUs, such
384 * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
385 * in cases like this where there are no data dependencies.
386 **/
387
388#define read_barrier_depends() do { } while (0)
389
390#ifdef CONFIG_SMP
391#define smp_mb() mb()
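/* Pentium Pro era cores have memory-ordering errata, so reads need a real fence there. */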
392#ifdef CONFIG_X86_PPRO_FENCE
393# define smp_rmb() rmb()
394#else
395# define smp_rmb() barrier()
396#endif
397#ifdef CONFIG_X86_OOSTORE
398# define smp_wmb() wmb()
399#else
400# define smp_wmb() barrier()
401#endif
402#define smp_read_barrier_depends() read_barrier_depends()
403#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
404#else
405#define smp_mb() barrier()
406#define smp_rmb() barrier()
407#define smp_wmb() barrier()
408#define smp_read_barrier_depends() do { } while (0)
409#define set_mb(var, value) do { var = value; barrier(); } while (0)
410#endif
411
412/*
413 * Stop RDTSC speculation. This is needed when you need to use RDTSC
414 * (or get_cycles or vread that possibly accesses the TSC) in a defined
415 * code region.
416 *
417 * (Could use a three-way alternative() for this if there were one.)
418 */
419static inline void rdtsc_barrier(void)
420{
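	/* At patch time one NOP becomes the CPU's serializing fence
	   (MFENCE on AMD, LFENCE on Intel); the other stays a NOP. */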
421 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
422 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
423}
424
425#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
new file mode 100644
index 00000000000..1159e091ad0
--- /dev/null
+++ b/arch/x86/include/asm/system_64.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
new file mode 100644
index 00000000000..7a6677c1a71
--- /dev/null
+++ b/arch/x86/include/asm/tce.h
@@ -0,0 +1,48 @@
1/*
2 * This file is derived from asm-powerpc/tce.h.
3 *
4 * Copyright (C) IBM Corporation, 2006
5 *
6 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
7 * Author: Jon Mason <jdmason@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _ASM_X86_TCE_H
25#define _ASM_X86_TCE_H
26
27extern unsigned int specified_table_size;
28struct iommu_table;
29
30#define TCE_ENTRY_SIZE 8 /* in bytes */
31
32#define TCE_READ_SHIFT 0
33#define TCE_WRITE_SHIFT 1
34#define TCE_HUBID_SHIFT 2 /* unused */
35#define TCE_RSVD_SHIFT 8 /* unused */
36#define TCE_RPN_SHIFT 12
37#define TCE_UNUSED_SHIFT 48 /* unused */
38
39#define TCE_RPN_MASK 0x0000fffffffff000ULL
40
41extern void tce_build(struct iommu_table *tbl, unsigned long index,
42 unsigned int npages, unsigned long uaddr, int direction);
43extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
44extern void * __init alloc_tce_table(void);
45extern void __init free_tce_table(void *tbl);
46extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
47
48#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
new file mode 100644
index 00000000000..af1b70ea440
--- /dev/null
+++ b/arch/x86/include/asm/termbits.h
@@ -0,0 +1,198 @@
1#ifndef _ASM_X86_TERMBITS_H
2#define _ASM_X86_TERMBITS_H
3
4#include <linux/posix_types.h>
5
6typedef unsigned char cc_t;
7typedef unsigned int speed_t;
8typedef unsigned int tcflag_t;
9
10#define NCCS 19
11struct termios {
12 tcflag_t c_iflag; /* input mode flags */
13 tcflag_t c_oflag; /* output mode flags */
14 tcflag_t c_cflag; /* control mode flags */
15 tcflag_t c_lflag; /* local mode flags */
16 cc_t c_line; /* line discipline */
17 cc_t c_cc[NCCS]; /* control characters */
18};
19
20struct termios2 {
21 tcflag_t c_iflag; /* input mode flags */
22 tcflag_t c_oflag; /* output mode flags */
23 tcflag_t c_cflag; /* control mode flags */
24 tcflag_t c_lflag; /* local mode flags */
25 cc_t c_line; /* line discipline */
26 cc_t c_cc[NCCS]; /* control characters */
27 speed_t c_ispeed; /* input speed */
28 speed_t c_ospeed; /* output speed */
29};
30
31struct ktermios {
32 tcflag_t c_iflag; /* input mode flags */
33 tcflag_t c_oflag; /* output mode flags */
34 tcflag_t c_cflag; /* control mode flags */
35 tcflag_t c_lflag; /* local mode flags */
36 cc_t c_line; /* line discipline */
37 cc_t c_cc[NCCS]; /* control characters */
38 speed_t c_ispeed; /* input speed */
39 speed_t c_ospeed; /* output speed */
40};
41
42/* c_cc characters */
43#define VINTR 0
44#define VQUIT 1
45#define VERASE 2
46#define VKILL 3
47#define VEOF 4
48#define VTIME 5
49#define VMIN 6
50#define VSWTC 7
51#define VSTART 8
52#define VSTOP 9
53#define VSUSP 10
54#define VEOL 11
55#define VREPRINT 12
56#define VDISCARD 13
57#define VWERASE 14
58#define VLNEXT 15
59#define VEOL2 16
60
61/* c_iflag bits */
62#define IGNBRK 0000001
63#define BRKINT 0000002
64#define IGNPAR 0000004
65#define PARMRK 0000010
66#define INPCK 0000020
67#define ISTRIP 0000040
68#define INLCR 0000100
69#define IGNCR 0000200
70#define ICRNL 0000400
71#define IUCLC 0001000
72#define IXON 0002000
73#define IXANY 0004000
74#define IXOFF 0010000
75#define IMAXBEL 0020000
76#define IUTF8 0040000
77
78/* c_oflag bits */
79#define OPOST 0000001
80#define OLCUC 0000002
81#define ONLCR 0000004
82#define OCRNL 0000010
83#define ONOCR 0000020
84#define ONLRET 0000040
85#define OFILL 0000100
86#define OFDEL 0000200
87#define NLDLY 0000400
88#define NL0 0000000
89#define NL1 0000400
90#define CRDLY 0003000
91#define CR0 0000000
92#define CR1 0001000
93#define CR2 0002000
94#define CR3 0003000
95#define TABDLY 0014000
96#define TAB0 0000000
97#define TAB1 0004000
98#define TAB2 0010000
99#define TAB3 0014000
100#define XTABS 0014000
101#define BSDLY 0020000
102#define BS0 0000000
103#define BS1 0020000
104#define VTDLY 0040000
105#define VT0 0000000
106#define VT1 0040000
107#define FFDLY 0100000
108#define FF0 0000000
109#define FF1 0100000
110
111/* c_cflag bit meaning */
112#define CBAUD 0010017
113#define B0 0000000 /* hang up */
114#define B50 0000001
115#define B75 0000002
116#define B110 0000003
117#define B134 0000004
118#define B150 0000005
119#define B200 0000006
120#define B300 0000007
121#define B600 0000010
122#define B1200 0000011
123#define B1800 0000012
124#define B2400 0000013
125#define B4800 0000014
126#define B9600 0000015
127#define B19200 0000016
128#define B38400 0000017
129#define EXTA B19200
130#define EXTB B38400
131#define CSIZE 0000060
132#define CS5 0000000
133#define CS6 0000020
134#define CS7 0000040
135#define CS8 0000060
136#define CSTOPB 0000100
137#define CREAD 0000200
138#define PARENB 0000400
139#define PARODD 0001000
140#define HUPCL 0002000
141#define CLOCAL 0004000
142#define CBAUDEX 0010000
143#define BOTHER 0010000 /* non standard rate */
144#define B57600 0010001
145#define B115200 0010002
146#define B230400 0010003
147#define B460800 0010004
148#define B500000 0010005
149#define B576000 0010006
150#define B921600 0010007
151#define B1000000 0010010
152#define B1152000 0010011
153#define B1500000 0010012
154#define B2000000 0010013
155#define B2500000 0010014
156#define B3000000 0010015
157#define B3500000 0010016
158#define B4000000 0010017
159#define CIBAUD 002003600000 /* input baud rate */
160#define CMSPAR 010000000000 /* mark or space (stick) parity */
161#define CRTSCTS 020000000000 /* flow control */
162
163#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
164
165/* c_lflag bits */
166#define ISIG 0000001
167#define ICANON 0000002
168#define XCASE 0000004
169#define ECHO 0000010
170#define ECHOE 0000020
171#define ECHOK 0000040
172#define ECHONL 0000100
173#define NOFLSH 0000200
174#define TOSTOP 0000400
175#define ECHOCTL 0001000
176#define ECHOPRT 0002000
177#define ECHOKE 0004000
178#define FLUSHO 0010000
179#define PENDIN 0040000
180#define IEXTEN 0100000
181
182/* tcflow() and TCXONC use these */
183#define TCOOFF 0
184#define TCOON 1
185#define TCIOFF 2
186#define TCION 3
187
188/* tcflush() and TCFLSH use these */
189#define TCIFLUSH 0
190#define TCOFLUSH 1
191#define TCIOFLUSH 2
192
193/* tcsetattr uses these */
194#define TCSANOW 0
195#define TCSADRAIN 1
196#define TCSAFLUSH 2
197
198#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
new file mode 100644
index 00000000000..f72956331c4
--- /dev/null
+++ b/arch/x86/include/asm/termios.h
@@ -0,0 +1,113 @@
1#ifndef _ASM_X86_TERMIOS_H
2#define _ASM_X86_TERMIOS_H
3
4#include <asm/termbits.h>
5#include <asm/ioctls.h>
6
7struct winsize {
8 unsigned short ws_row;
9 unsigned short ws_col;
10 unsigned short ws_xpixel;
11 unsigned short ws_ypixel;
12};
13
14#define NCC 8
15struct termio {
16 unsigned short c_iflag; /* input mode flags */
17 unsigned short c_oflag; /* output mode flags */
18 unsigned short c_cflag; /* control mode flags */
19 unsigned short c_lflag; /* local mode flags */
20 unsigned char c_line; /* line discipline */
21 unsigned char c_cc[NCC]; /* control characters */
22};
23
24/* modem lines */
25#define TIOCM_LE 0x001
26#define TIOCM_DTR 0x002
27#define TIOCM_RTS 0x004
28#define TIOCM_ST 0x008
29#define TIOCM_SR 0x010
30#define TIOCM_CTS 0x020
31#define TIOCM_CAR 0x040
32#define TIOCM_RNG 0x080
33#define TIOCM_DSR 0x100
34#define TIOCM_CD TIOCM_CAR
35#define TIOCM_RI TIOCM_RNG
36#define TIOCM_OUT1 0x2000
37#define TIOCM_OUT2 0x4000
38#define TIOCM_LOOP 0x8000
39
40/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
41
42#ifdef __KERNEL__
43
44#include <asm/uaccess.h>
45
46/* intr=^C quit=^\ erase=del kill=^U
47   eof=^D	vtime=\0	vmin=\1	swtc=\0
48 start=^Q stop=^S susp=^Z eol=\0
49 reprint=^R discard=^U werase=^W lnext=^V
50 eol2=\0
51*/
52#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
53
54/*
55 * Translate a "termio" structure into a "termios". Ugh.
56 */
57#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
58 unsigned short __tmp; \
59 get_user(__tmp,&(termio)->x); \
60 *(unsigned short *) &(termios)->x = __tmp; \
61}
62
63static inline int user_termio_to_kernel_termios(struct ktermios *termios,
64 struct termio __user *termio)
65{
66 SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71}
72
73/*
74 * Translate a "termios" structure into a "termio". Ugh.
75 */
76static inline int kernel_termios_to_user_termio(struct termio __user *termio,
77 struct ktermios *termios)
78{
79 put_user((termios)->c_iflag, &(termio)->c_iflag);
80 put_user((termios)->c_oflag, &(termio)->c_oflag);
81 put_user((termios)->c_cflag, &(termio)->c_cflag);
82 put_user((termios)->c_lflag, &(termio)->c_lflag);
83 put_user((termios)->c_line, &(termio)->c_line);
84 return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
85}
86
87static inline int user_termios_to_kernel_termios(struct ktermios *k,
88 struct termios2 __user *u)
89{
90 return copy_from_user(k, u, sizeof(struct termios2));
91}
92
93static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
94 struct ktermios *k)
95{
96 return copy_to_user(u, k, sizeof(struct termios2));
97}
98
99static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
100 struct termios __user *u)
101{
102 return copy_from_user(k, u, sizeof(struct termios));
103}
104
105static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
106 struct ktermios *k)
107{
108 return copy_to_user(u, k, sizeof(struct termios));
109}
110
111#endif /* __KERNEL__ */
112
113#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
new file mode 100644
index 00000000000..c62349ee786
--- /dev/null
+++ b/arch/x86/include/asm/therm_throt.h
@@ -0,0 +1,9 @@
1#ifndef _ASM_X86_THERM_THROT_H
2#define _ASM_X86_THERM_THROT_H
3
4#include <asm/atomic.h>
5
6extern atomic_t therm_throt_en;
7int therm_throt_process(int curr);
8
9#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
new file mode 100644
index 00000000000..e44d379faad
--- /dev/null
+++ b/arch/x86/include/asm/thread_info.h
@@ -0,0 +1,264 @@
1/* thread_info.h: low-level thread information
2 *
3 * Copyright (C) 2002 David Howells (dhowells@redhat.com)
4 * - Incorporating suggestions made by Linus Torvalds and Dave Miller
5 */
6
7#ifndef _ASM_X86_THREAD_INFO_H
8#define _ASM_X86_THREAD_INFO_H
9
10#include <linux/compiler.h>
11#include <asm/page.h>
12#include <asm/types.h>
13
14/*
15 * low level task data that entry.S needs immediate access to
16 * - this struct should fit entirely inside of one cache line
17 * - this struct shares the supervisor stack pages
18 */
19#ifndef __ASSEMBLY__
20struct task_struct;
21struct exec_domain;
22#include <asm/processor.h>
23
24struct thread_info {
25 struct task_struct *task; /* main task structure */
26 struct exec_domain *exec_domain; /* execution domain */
27 unsigned long flags; /* low level flags */
28 __u32 status; /* thread synchronous flags */
29 __u32 cpu; /* current CPU */
30 int preempt_count; /* 0 => preemptable,
31 <0 => BUG */
32 mm_segment_t addr_limit;
33 struct restart_block restart_block;
34 void __user *sysenter_return;
35#ifdef CONFIG_X86_32
36 unsigned long previous_esp; /* ESP of the previous stack in
37 case of nested (IRQ) stacks
38 */
39 __u8 supervisor_stack[0];
40#endif
41};
42
43#define INIT_THREAD_INFO(tsk) \
44{ \
45 .task = &tsk, \
46 .exec_domain = &default_exec_domain, \
47 .flags = 0, \
48 .cpu = 0, \
49 .preempt_count = 1, \
50 .addr_limit = KERNEL_DS, \
51 .restart_block = { \
52 .fn = do_no_restart_syscall, \
53 }, \
54}
55
56#define init_thread_info (init_thread_union.thread_info)
57#define init_stack (init_thread_union.stack)
58
59#else /* !__ASSEMBLY__ */
60
61#include <asm/asm-offsets.h>
62
63#endif
64
65/*
66 * thread information flags
67 * - these are process state flags that various assembly files
68 * may need to access
69 * - pending work-to-be-done flags are in LSW
70 * - other flags in MSW
71 * Warning: layout of LSW is hardcoded in entry.S
72 */
73#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
74#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
75#define TIF_SIGPENDING 2 /* signal pending */
76#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
77#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
78#define TIF_IRET 5 /* force IRET */
79#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
80#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
81#define TIF_SECCOMP 8 /* secure computing */
82#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
83#define TIF_NOTSC 16 /* TSC is not accessible in userland */
84#define TIF_IA32 17 /* 32bit process */
85#define TIF_FORK 18 /* ret_from_fork */
86#define TIF_ABI_PENDING 19
87#define TIF_MEMDIE 20
88#define TIF_DEBUG 21 /* uses debug registers */
89#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
90#define TIF_FREEZE 23 /* is freezing for suspend */
91#define TIF_FORCED_TF		24	/* true if TF set in eflags artificially */
92#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
93#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
94#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
95
96#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
97#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
98#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
99#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
100#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
101#define _TIF_IRET (1 << TIF_IRET)
102#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
103#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
104#define _TIF_SECCOMP (1 << TIF_SECCOMP)
105#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
106#define _TIF_NOTSC (1 << TIF_NOTSC)
107#define _TIF_IA32 (1 << TIF_IA32)
108#define _TIF_FORK (1 << TIF_FORK)
109#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
110#define _TIF_DEBUG (1 << TIF_DEBUG)
111#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
112#define _TIF_FREEZE (1 << TIF_FREEZE)
113#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
114#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
115#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
116#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
117
118/* work to do in syscall_trace_enter() */
119#define _TIF_WORK_SYSCALL_ENTRY \
120 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
121 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
122
123/* work to do in syscall_trace_leave() */
124#define _TIF_WORK_SYSCALL_EXIT \
125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
126
127/* work to do on interrupt/exception return */
128#define _TIF_WORK_MASK \
129 (0x0000FFFF & \
130 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
131 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
132
133/* work to do on any return to user space */
134#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
135
136/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
139
140/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \
142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
143 _TIF_NOTSC)
144
145#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
146#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
147
148#define PREEMPT_ACTIVE 0x10000000
149
150/* thread information allocation */
151#ifdef CONFIG_DEBUG_STACK_USAGE
152#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
153#else
154#define THREAD_FLAGS GFP_KERNEL
155#endif
156
157#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
158
159#define alloc_thread_info(tsk) \
160 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
161
162#ifdef CONFIG_X86_32
163
164#define STACK_WARN (THREAD_SIZE/8)
165/*
166 * macros/functions for gaining access to the thread information structure
167 *
168 * preempt_count needs to be 1 initially, until the scheduler is functional.
169 */
170#ifndef __ASSEMBLY__
171
172
173/* how to get the current stack pointer from C */
174register unsigned long current_stack_pointer asm("esp") __used;
175
176/* how to get the thread information struct from C */
177static inline struct thread_info *current_thread_info(void)
178{
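	/* thread_info sits at the bottom of the THREAD_SIZE-aligned kernel
	   stack, so masking the low bits of %esp locates it. */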
179 return (struct thread_info *)
180 (current_stack_pointer & ~(THREAD_SIZE - 1));
181}
182
183#else /* !__ASSEMBLY__ */
184
185/* how to get the thread information struct from ASM */
186#define GET_THREAD_INFO(reg) \
187 movl $-THREAD_SIZE, reg; \
188 andl %esp, reg
189
190/* use this one if reg already contains %esp */
191#define GET_THREAD_INFO_WITH_ESP(reg) \
192 andl $-THREAD_SIZE, reg
193
194#endif
195
196#else /* X86_32 */
197
198#include <asm/pda.h>
199
200/*
201 * macros/functions for gaining access to the thread information structure
202 * preempt_count needs to be 1 initially, until the scheduler is functional.
203 */
204#ifndef __ASSEMBLY__
205static inline struct thread_info *current_thread_info(void)
206{
207 struct thread_info *ti;
208 ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
209 return ti;
210}
211
212/* do not use in interrupt context */
213static inline struct thread_info *stack_thread_info(void)
214{
215 struct thread_info *ti;
216 asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
217 return ti;
218}
219
220#else /* !__ASSEMBLY__ */
221
222/* how to get the thread information struct from ASM */
223#define GET_THREAD_INFO(reg) \
224 movq %gs:pda_kernelstack,reg ; \
225 subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
226
227#endif
228
229#endif /* !X86_32 */
230
231/*
232 * Thread-synchronous status.
233 *
234 * This is different from the flags in that nobody else
235 * ever touches our thread-synchronous status, so we don't
236 * have to worry about atomic accesses.
237 */
238#define TS_USEDFPU 0x0001 /* FPU was used by this task
239 this quantum (SMP) */
240#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
241#define TS_POLLING 0x0004 /* true if in idle loop
242 and not sleeping */
243#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
244#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
245
246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
247
248#ifndef __ASSEMBLY__
249#define HAVE_SET_RESTORE_SIGMASK 1
250static inline void set_restore_sigmask(void)
251{
252 struct thread_info *ti = current_thread_info();
253 ti->status |= TS_RESTORE_SIGMASK;
254 set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
255}
256#endif /* !__ASSEMBLY__ */
257
258#ifndef __ASSEMBLY__
259extern void arch_task_cache_init(void);
260extern void free_thread_info(struct thread_info *ti);
261extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
262#define arch_task_cache_init arch_task_cache_init
263#endif
264#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
new file mode 100644
index 00000000000..50c733aac42
--- /dev/null
+++ b/arch/x86/include/asm/time.h
@@ -0,0 +1,63 @@
1#ifndef _ASM_X86_TIME_H
2#define _ASM_X86_TIME_H
3
4extern void hpet_time_init(void);
5
6#include <asm/mc146818rtc.h>
7#ifdef CONFIG_X86_32
8#include <linux/efi.h>
9
10static inline unsigned long native_get_wallclock(void)
11{
12 unsigned long retval;
13
14 if (efi_enabled)
15 retval = efi_get_time();
16 else
17 retval = mach_get_cmos_time();
18
19 return retval;
20}
21
22static inline int native_set_wallclock(unsigned long nowtime)
23{
24 int retval;
25
26 if (efi_enabled)
27 retval = efi_set_rtc_mmss(nowtime);
28 else
29 retval = mach_set_rtc_mmss(nowtime);
30
31 return retval;
32}
33
34#else
35extern void native_time_init_hook(void);
36
37static inline unsigned long native_get_wallclock(void)
38{
39 return mach_get_cmos_time();
40}
41
42static inline int native_set_wallclock(unsigned long nowtime)
43{
44 return mach_set_rtc_mmss(nowtime);
45}
46
47#endif
48
49extern void time_init(void);
50
51#ifdef CONFIG_PARAVIRT
52#include <asm/paravirt.h>
53#else /* !CONFIG_PARAVIRT */
54
55#define get_wallclock() native_get_wallclock()
56#define set_wallclock(x) native_set_wallclock(x)
57#define choose_time_init() hpet_time_init
58
59#endif /* CONFIG_PARAVIRT */
60
61extern unsigned long __init calibrate_cpu(void);
62
63#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
new file mode 100644
index 00000000000..2bb6a835c45
--- /dev/null
+++ b/arch/x86/include/asm/timer.h
@@ -0,0 +1,66 @@
1#ifndef _ASM_X86_TIMER_H
2#define _ASM_X86_TIMER_H
3#include <linux/init.h>
4#include <linux/pm.h>
5#include <linux/percpu.h>
6
7#define TICK_SIZE (tick_nsec / 1000)
8
9unsigned long long native_sched_clock(void);
10unsigned long native_calibrate_tsc(void);
11
12#ifdef CONFIG_X86_32
13extern int timer_ack;
14extern int recalibrate_cpu_khz(void);
15#endif /* CONFIG_X86_32 */
16
17extern int no_timer_check;
18
19#ifndef CONFIG_PARAVIRT
20#define calibrate_tsc() native_calibrate_tsc()
21#endif
22
23/* Accelerators for sched_clock()
24 * convert from cycles(64bits) => nanoseconds (64bits)
25 * basic equation:
26 * ns = cycles / (freq / ns_per_sec)
27 * ns = cycles * (ns_per_sec / freq)
28 * ns = cycles * (10^9 / (cpu_khz * 10^3))
29 * ns = cycles * (10^6 / cpu_khz)
30 *
31 * Then we use scaling math (suggested by george@mvista.com) to get:
32 * ns = cycles * (10^6 * SC / cpu_khz) / SC
33 * ns = cycles * cyc2ns_scale / SC
34 *
35 * And since SC is a constant power of two, we can convert the div
36 * into a shift.
37 *
38 * We can use khz divisor instead of mhz to keep a better precision, since
39 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
40 * (mathieu.desnoyers@polymtl.ca)
41 *
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
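/*
 * Worked example (illustrative numbers): on a 2 GHz CPU, cpu_khz is
 * 2,000,000, so cyc2ns_scale = 10^6 * 2^10 / 2,000,000 = 512, and
 * 1000 cycles -> (1000 * 512) >> 10 = 500 ns, as expected.
 */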
44
45DECLARE_PER_CPU(unsigned long, cyc2ns);
46
47#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
48
49static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
50{
51 return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
52}
53
54static inline unsigned long long cycles_2_ns(unsigned long long cyc)
55{
56 unsigned long long ns;
57 unsigned long flags;
58
59 local_irq_save(flags);
60 ns = __cycles_2_ns(cyc);
61 local_irq_restore(flags);
62
63 return ns;
64}
65
66#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
new file mode 100644
index 00000000000..1287dc1347d
--- /dev/null
+++ b/arch/x86/include/asm/timex.h
@@ -0,0 +1,19 @@
1/* x86 architecture timex specifications */
2#ifndef _ASM_X86_TIMEX_H
3#define _ASM_X86_TIMEX_H
4
5#include <asm/processor.h>
6#include <asm/tsc.h>
7
8#ifdef CONFIG_X86_ELAN
9# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
10#elif defined(CONFIG_X86_RDC321X)
11# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
12#else
13# define PIT_TICK_RATE 1193182 /* Underlying HZ */
14#endif
15#define CLOCK_TICK_RATE PIT_TICK_RATE
16
17#define ARCH_HAS_READ_CURRENT_TIMER
18
19#endif /* _ASM_X86_TIMEX_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
new file mode 100644
index 00000000000..829215fef9e
--- /dev/null
+++ b/arch/x86/include/asm/tlb.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_TLB_H
2#define _ASM_X86_TLB_H
3
4#define tlb_start_vma(tlb, vma) do { } while (0)
5#define tlb_end_vma(tlb, vma) do { } while (0)
6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
7#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
8
9#include <asm-generic/tlb.h>
10
11#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
new file mode 100644
index 00000000000..0e7bbb54911
--- /dev/null
+++ b/arch/x86/include/asm/tlbflush.h
@@ -0,0 +1,178 @@
1#ifndef _ASM_X86_TLBFLUSH_H
2#define _ASM_X86_TLBFLUSH_H
3
4#include <linux/mm.h>
5#include <linux/sched.h>
6
7#include <asm/processor.h>
8#include <asm/system.h>
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define __flush_tlb() __native_flush_tlb()
14#define __flush_tlb_global() __native_flush_tlb_global()
15#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
16#endif
17
18static inline void __native_flush_tlb(void)
19{
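	/* Reloading CR3 flushes all non-global TLB entries. */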
20 write_cr3(read_cr3());
21}
22
23static inline void __native_flush_tlb_global(void)
24{
25 unsigned long flags;
26 unsigned long cr4;
27
28 /*
29 * Read-modify-write to CR4 - protect it from preemption and
30 * from interrupts. (Use the raw variant because this code can
31 * be called from deep inside debugging code.)
32 */
33 raw_local_irq_save(flags);
34
35 cr4 = read_cr4();
36 /* clear PGE */
37 write_cr4(cr4 & ~X86_CR4_PGE);
38 /* write old PGE again and flush TLBs */
39 write_cr4(cr4);
40
41 raw_local_irq_restore(flags);
42}
43
44static inline void __native_flush_tlb_single(unsigned long addr)
45{
46 asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
47}
48
49static inline void __flush_tlb_all(void)
50{
51 if (cpu_has_pge)
52 __flush_tlb_global();
53 else
54 __flush_tlb();
55}
56
57static inline void __flush_tlb_one(unsigned long addr)
58{
59 if (cpu_has_invlpg)
60 __flush_tlb_single(addr);
61 else
62 __flush_tlb();
63}
64
65#ifdef CONFIG_X86_32
66# define TLB_FLUSH_ALL 0xffffffff
67#else
68# define TLB_FLUSH_ALL -1ULL
69#endif
70
71/*
72 * TLB flushing:
73 *
74 * - flush_tlb() flushes the current mm struct TLBs
75 * - flush_tlb_all() flushes all processes TLBs
76 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
77 * - flush_tlb_page(vma, vmaddr) flushes one page
78 * - flush_tlb_range(vma, start, end) flushes a range of pages
79 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
80 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
81 *
82 * ..but the i386 has somewhat limited tlb flushing capabilities,
83 * and page-granular flushes are available only on i486 and up.
84 *
85 * x86-64 can only flush individual pages or full VMs. For a range flush
86 * we always do the full VM. It might be worth checking whether, for a
87 * small range, a few INVLPGs in a row are a win.
88 */
89
90#ifndef CONFIG_SMP
91
92#define flush_tlb() __flush_tlb()
93#define flush_tlb_all() __flush_tlb_all()
94#define local_flush_tlb() __flush_tlb()
95
96static inline void flush_tlb_mm(struct mm_struct *mm)
97{
98 if (mm == current->active_mm)
99 __flush_tlb();
100}
101
102static inline void flush_tlb_page(struct vm_area_struct *vma,
103 unsigned long addr)
104{
105 if (vma->vm_mm == current->active_mm)
106 __flush_tlb_one(addr);
107}
108
109static inline void flush_tlb_range(struct vm_area_struct *vma,
110 unsigned long start, unsigned long end)
111{
112 if (vma->vm_mm == current->active_mm)
113 __flush_tlb();
114}
115
116static inline void native_flush_tlb_others(const cpumask_t *cpumask,
117 struct mm_struct *mm,
118 unsigned long va)
119{
120}
121
122static inline void reset_lazy_tlbstate(void)
123{
124}
125
126#else /* SMP */
127
128#include <asm/smp.h>
129
130#define local_flush_tlb() __flush_tlb()
131
132extern void flush_tlb_all(void);
133extern void flush_tlb_current_task(void);
134extern void flush_tlb_mm(struct mm_struct *);
135extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
136
137#define flush_tlb() flush_tlb_current_task()
138
139static inline void flush_tlb_range(struct vm_area_struct *vma,
140 unsigned long start, unsigned long end)
141{
142 flush_tlb_mm(vma->vm_mm);
143}
144
145void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
146 unsigned long va);
147
148#define TLBSTATE_OK 1
149#define TLBSTATE_LAZY 2
150
151#ifdef CONFIG_X86_32
152struct tlb_state {
153 struct mm_struct *active_mm;
154 int state;
155 char __cacheline_padding[L1_CACHE_BYTES-8];
156};
157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void)
162{
163}
164#endif
165
166#endif /* SMP */
167
168#ifndef CONFIG_PARAVIRT
169#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
170#endif
171
172static inline void flush_tlb_kernel_range(unsigned long start,
173 unsigned long end)
174{
175 flush_tlb_all();
176}
177
178#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
new file mode 100644
index 00000000000..90ac7718469
--- /dev/null
+++ b/arch/x86/include/asm/topology.h
@@ -0,0 +1,258 @@
1/*
2 * Written by: Matthew Dobson, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <colpatch@us.ibm.com>
24 */
25#ifndef _ASM_X86_TOPOLOGY_H
26#define _ASM_X86_TOPOLOGY_H
27
28#ifdef CONFIG_X86_32
29# ifdef CONFIG_X86_HT
30# define ENABLE_TOPO_DEFINES
31# endif
32#else
33# ifdef CONFIG_SMP
34# define ENABLE_TOPO_DEFINES
35# endif
36#endif
37
38/* Node not present */
39#define NUMA_NO_NODE (-1)
40
41#ifdef CONFIG_NUMA
42#include <linux/cpumask.h>
43#include <asm/mpspec.h>
44
45#ifdef CONFIG_X86_32
46
47/* Mappings between node number and cpus on that node. */
48extern cpumask_t node_to_cpumask_map[];
49
50/* Mappings between logical cpu number and node number */
51extern int cpu_to_node_map[];
52
53/* Returns the number of the node containing CPU 'cpu' */
54static inline int cpu_to_node(int cpu)
55{
56 return cpu_to_node_map[cpu];
57}
58#define early_cpu_to_node(cpu) cpu_to_node(cpu)
59
60/* Returns a bitmask of CPUs on Node 'node'.
61 *
62 * Side note: this function creates the returned cpumask on the stack
63 * so with a high NR_CPUS count, excessive stack space is used. The
64 * node_to_cpumask_ptr function should be used whenever possible.
65 */
66static inline cpumask_t node_to_cpumask(int node)
67{
68 return node_to_cpumask_map[node];
69}
70
71#else /* CONFIG_X86_64 */
72
73/* Mappings between node number and cpus on that node. */
74extern cpumask_t *node_to_cpumask_map;
75
76/* Mappings between logical cpu number and node number */
77DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
78
79/* Returns the number of the current Node. */
80#define numa_node_id() read_pda(nodenumber)
81
82#ifdef CONFIG_DEBUG_PER_CPU_MAPS
83extern int cpu_to_node(int cpu);
84extern int early_cpu_to_node(int cpu);
85extern const cpumask_t *_node_to_cpumask_ptr(int node);
86extern cpumask_t node_to_cpumask(int node);
87
88#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
89
90/* Returns the number of the node containing CPU 'cpu' */
91static inline int cpu_to_node(int cpu)
92{
93 return per_cpu(x86_cpu_to_node_map, cpu);
94}
95
96/* Same function but used if called before per_cpu areas are setup */
97static inline int early_cpu_to_node(int cpu)
98{
99 if (early_per_cpu_ptr(x86_cpu_to_node_map))
100 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
101
102 return per_cpu(x86_cpu_to_node_map, cpu);
103}
104
105/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
106static inline const cpumask_t *_node_to_cpumask_ptr(int node)
107{
108 return &node_to_cpumask_map[node];
109}
110
111/* Returns a bitmask of CPUs on Node 'node'. */
112static inline cpumask_t node_to_cpumask(int node)
113{
114 return node_to_cpumask_map[node];
115}
116
117#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
118
119/* Replace default node_to_cpumask_ptr with optimized version */
120#define node_to_cpumask_ptr(v, node) \
121 const cpumask_t *v = _node_to_cpumask_ptr(node)
122
123#define node_to_cpumask_ptr_next(v, node) \
124 v = _node_to_cpumask_ptr(node)
125
126#endif /* CONFIG_X86_64 */
127
128/*
129 * Returns the number of the node containing Node 'node'. This
130 * architecture is flat, so it is a pretty simple function!
131 */
132#define parent_node(node) (node)
133
134#define pcibus_to_node(bus) __pcibus_to_node(bus)
135#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
136
137#ifdef CONFIG_X86_32
138extern unsigned long node_start_pfn[];
139extern unsigned long node_end_pfn[];
140extern unsigned long node_remap_size[];
141#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
142
143# define SD_CACHE_NICE_TRIES 1
144# define SD_IDLE_IDX 1
145# define SD_NEWIDLE_IDX 2
146# define SD_FORKEXEC_IDX 0
147
148#else
149
150# define SD_CACHE_NICE_TRIES 2
151# define SD_IDLE_IDX 2
152# define SD_NEWIDLE_IDX 2
153# define SD_FORKEXEC_IDX 1
154
155#endif
156
157/* sched_domains SD_NODE_INIT for NUMAQ machines */
158#define SD_NODE_INIT (struct sched_domain) { \
159 .min_interval = 8, \
160 .max_interval = 32, \
161 .busy_factor = 32, \
162 .imbalance_pct = 125, \
163 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
164 .busy_idx = 3, \
165 .idle_idx = SD_IDLE_IDX, \
166 .newidle_idx = SD_NEWIDLE_IDX, \
167 .wake_idx = 1, \
168 .forkexec_idx = SD_FORKEXEC_IDX, \
169 .flags = SD_LOAD_BALANCE \
170 | SD_BALANCE_EXEC \
171 | SD_BALANCE_FORK \
172 | SD_SERIALIZE \
173 | SD_WAKE_BALANCE, \
174 .last_balance = jiffies, \
175 .balance_interval = 1, \
176}
177
178#ifdef CONFIG_X86_64_ACPI_NUMA
179extern int __node_distance(int, int);
180#define node_distance(a, b) __node_distance(a, b)
181#endif
182
183#else /* !CONFIG_NUMA */
184
185#define numa_node_id() 0
186#define cpu_to_node(cpu) 0
187#define early_cpu_to_node(cpu) 0
188
189static inline const cpumask_t *_node_to_cpumask_ptr(int node)
190{
191 return &cpu_online_map;
192}
193static inline cpumask_t node_to_cpumask(int node)
194{
195 return cpu_online_map;
196}
197static inline int node_to_first_cpu(int node)
198{
199 return first_cpu(cpu_online_map);
200}
201
202/* Replace default node_to_cpumask_ptr with optimized version */
203#define node_to_cpumask_ptr(v, node) \
204 const cpumask_t *v = _node_to_cpumask_ptr(node)
205
206#define node_to_cpumask_ptr_next(v, node) \
207 v = _node_to_cpumask_ptr(node)
208#endif
209
210#include <asm-generic/topology.h>
211
212#ifdef CONFIG_NUMA
213/* Returns the number of the first CPU on Node 'node'. */
214static inline int node_to_first_cpu(int node)
215{
216 node_to_cpumask_ptr(mask, node);
217 return first_cpu(*mask);
218}
219#endif
220
221extern cpumask_t cpu_coregroup_map(int cpu);
222
223#ifdef ENABLE_TOPO_DEFINES
224#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
225#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
226#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
227#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
228
229/* indicates that pointers to the topology cpumask_t maps are valid */
230#define arch_provides_topology_pointers yes
231#endif
232
233static inline void arch_fix_phys_package_id(int num, u32 slot)
234{
235}
236
237struct pci_bus;
238void set_pci_bus_resources_arch_default(struct pci_bus *b);
239
240#ifdef CONFIG_SMP
241#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
242#define smt_capable() (smp_num_siblings > 1)
243#endif
244
245#ifdef CONFIG_NUMA
246extern int get_mp_bus_to_node(int busnum);
247extern void set_mp_bus_to_node(int busnum, int node);
248#else
249static inline int get_mp_bus_to_node(int busnum)
250{
251 return 0;
252}
253static inline void set_mp_bus_to_node(int busnum, int node)
254{
255}
256#endif
257
258#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
new file mode 100644
index 00000000000..fa0d79facdb
--- /dev/null
+++ b/arch/x86/include/asm/trampoline.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_TRAMPOLINE_H
2#define _ASM_X86_TRAMPOLINE_H
3
4#ifndef __ASSEMBLY__
5
6/*
7 * Trampoline 80x86 program as an array.
8 */
9extern const unsigned char trampoline_data[];
10extern const unsigned char trampoline_end[];
11extern unsigned char *trampoline_base;
12
13extern unsigned long init_rsp;
14extern unsigned long initial_code;
15
16#define TRAMPOLINE_BASE 0x6000
17extern unsigned long setup_trampoline(void);
18
19#endif /* __ASSEMBLY__ */
20
21#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
new file mode 100644
index 00000000000..45dee286e45
--- /dev/null
+++ b/arch/x86/include/asm/traps.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_TRAPS_H
2#define _ASM_X86_TRAPS_H
3
4#include <asm/debugreg.h>
5
6#ifdef CONFIG_X86_32
7#define dotraplinkage
8#else
9#define dotraplinkage asmlinkage
10#endif
11
12asmlinkage void divide_error(void);
13asmlinkage void debug(void);
14asmlinkage void nmi(void);
15asmlinkage void int3(void);
16asmlinkage void overflow(void);
17asmlinkage void bounds(void);
18asmlinkage void invalid_op(void);
19asmlinkage void device_not_available(void);
20#ifdef CONFIG_X86_64
21asmlinkage void double_fault(void);
22#endif
23asmlinkage void coprocessor_segment_overrun(void);
24asmlinkage void invalid_TSS(void);
25asmlinkage void segment_not_present(void);
26asmlinkage void stack_segment(void);
27asmlinkage void general_protection(void);
28asmlinkage void page_fault(void);
29asmlinkage void spurious_interrupt_bug(void);
30asmlinkage void coprocessor_error(void);
31asmlinkage void alignment_check(void);
32#ifdef CONFIG_X86_MCE
33asmlinkage void machine_check(void);
34#endif /* CONFIG_X86_MCE */
35asmlinkage void simd_coprocessor_error(void);
36
37dotraplinkage void do_divide_error(struct pt_regs *, long);
38dotraplinkage void do_debug(struct pt_regs *, long);
39dotraplinkage void do_nmi(struct pt_regs *, long);
40dotraplinkage void do_int3(struct pt_regs *, long);
41dotraplinkage void do_overflow(struct pt_regs *, long);
42dotraplinkage void do_bounds(struct pt_regs *, long);
43dotraplinkage void do_invalid_op(struct pt_regs *, long);
44dotraplinkage void do_device_not_available(struct pt_regs *, long);
45dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
46dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
47dotraplinkage void do_segment_not_present(struct pt_regs *, long);
48dotraplinkage void do_stack_segment(struct pt_regs *, long);
49dotraplinkage void do_general_protection(struct pt_regs *, long);
50dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
51dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
52dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
53dotraplinkage void do_alignment_check(struct pt_regs *, long);
54#ifdef CONFIG_X86_MCE
55dotraplinkage void do_machine_check(struct pt_regs *, long);
56#endif
57dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
58#ifdef CONFIG_X86_32
59dotraplinkage void do_iret_error(struct pt_regs *, long);
60#endif
61
62static inline int get_si_code(unsigned long condition)
63{
64 if (condition & DR_STEP)
65 return TRAP_TRACE;
66 else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
67 return TRAP_HWBKPT;
68 else
69 return TRAP_BRKPT;
70}
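/*
 * Usage sketch (illustrative; the handler plumbing is hypothetical and
 * only get_si_code() itself is defined here): a #DB handler reads %db6
 * and forwards the decoded si_code along with the fault address:
 *
 *	unsigned long dr6;
 *	siginfo_t info;
 *
 *	get_debugreg(dr6, 6);
 *	info.si_signo = SIGTRAP;
 *	info.si_code  = get_si_code(dr6);
 *	info.si_addr  = (void __user *)regs->ip;
 */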
71
72extern int panic_on_unrecovered_nmi;
73extern int kstack_depth_to_print;
74
75#ifdef CONFIG_X86_32
76void math_error(void __user *);
77unsigned long patch_espfix_desc(unsigned long, unsigned long);
78asmlinkage void math_emulate(long);
79#endif
80
81#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
new file mode 100644
index 00000000000..38ae163cc91
--- /dev/null
+++ b/arch/x86/include/asm/tsc.h
@@ -0,0 +1,62 @@
1/*
2 * x86 TSC related functions
3 */
4#ifndef _ASM_X86_TSC_H
5#define _ASM_X86_TSC_H
6
7#include <asm/processor.h>
8
9#define NS_SCALE 10 /* 2^10, carefully chosen */
10#define US_SCALE 32 /* 2^32, arbitrarily chosen */
11
12/*
13 * Standard way to access the cycle counter.
14 */
15typedef unsigned long long cycles_t;
16
17extern unsigned int cpu_khz;
18extern unsigned int tsc_khz;
19
20extern void disable_TSC(void);
21
22static inline cycles_t get_cycles(void)
23{
24 unsigned long long ret = 0;
25
26#ifndef CONFIG_X86_TSC
27 if (!cpu_has_tsc)
28 return 0;
29#endif
30 rdtscll(ret);
31
32 return ret;
33}
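/*
 * Usage sketch (illustrative; do_something() is hypothetical): timing a
 * short section of kernel code. Deltas are only meaningful on one CPU
 * and with a stable TSC -- see mark_tsc_unstable() below:
 *
 *	cycles_t t0, t1;
 *
 *	t0 = get_cycles();
 *	do_something();
 *	t1 = get_cycles();
 *	pr_debug("took %llu cycles\n", (unsigned long long)(t1 - t0));
 */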
34
35static __always_inline cycles_t vget_cycles(void)
36{
37 /*
37 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe):
40 */
41#ifndef CONFIG_X86_TSC
42 if (!cpu_has_tsc)
43 return 0;
44#endif
45 return (cycles_t)__native_read_tsc();
46}
47
48extern void tsc_init(void);
49extern void mark_tsc_unstable(char *reason);
50extern int unsynchronized_tsc(void);
51int check_tsc_unstable(void);
52
53/*
54 * Boot-time check whether the TSCs are synchronized across
55 * all CPUs/cores:
56 */
57extern void check_tsc_sync_source(int cpu);
58extern void check_tsc_sync_target(void);
59
60extern int notsc_setup(char *);
61
62#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
new file mode 100644
index 00000000000..e6f73632007
--- /dev/null
+++ b/arch/x86/include/asm/types.h
@@ -0,0 +1,36 @@
1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H
3
4#include <asm-generic/int-ll64.h>
5
6#ifndef __ASSEMBLY__
7
8typedef unsigned short umode_t;
9
10#endif /* __ASSEMBLY__ */
11
12/*
13 * These aren't exported outside the kernel to avoid name space clashes
14 */
15#ifdef __KERNEL__
16
17#ifdef CONFIG_X86_32
18# define BITS_PER_LONG 32
19#else
20# define BITS_PER_LONG 64
21#endif
22
23#ifndef __ASSEMBLY__
24
25typedef u64 dma64_addr_t;
26#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
27/* DMA addresses come in 32-bit and 64-bit flavours. */
28typedef u64 dma_addr_t;
29#else
30typedef u32 dma_addr_t;
31#endif
32
33#endif /* __ASSEMBLY__ */
34#endif /* __KERNEL__ */
35
36#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
new file mode 100644
index 00000000000..99192bb55a5
--- /dev/null
+++ b/arch/x86/include/asm/uaccess.h
@@ -0,0 +1,456 @@
1#ifndef _ASM_X86_UACCESS_H
2#define _ASM_X86_UACCESS_H
3/*
4 * User space memory access functions
5 */
6#include <linux/errno.h>
7#include <linux/compiler.h>
8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h>
11#include <asm/asm.h>
12#include <asm/page.h>
13
14#define VERIFY_READ 0
15#define VERIFY_WRITE 1
16
17/*
18 * The fs value determines whether argument validity checking should be
19 * performed or not. If get_fs() == USER_DS, checking is performed; with
20 * get_fs() == KERNEL_DS, checking is bypassed.
21 *
22 * For historical reasons, these macros are grossly misnamed.
23 */
24
25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
26
27#define KERNEL_DS MAKE_MM_SEG(-1UL)
28#define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
29
30#define get_ds() (KERNEL_DS)
31#define get_fs() (current_thread_info()->addr_limit)
32#define set_fs(x) (current_thread_info()->addr_limit = (x))
33
34#define segment_eq(a, b) ((a).seg == (b).seg)
35
36#define __addr_ok(addr) \
37 ((unsigned long __force)(addr) < \
38 (current_thread_info()->addr_limit.seg))
39
40/*
41 * Test whether a block of memory is a valid user space address.
42 * Returns 0 if the range is valid, nonzero otherwise.
43 *
44 * This is equivalent to the following test:
45 * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
46 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */
49
50#define __range_not_ok(addr, size) \
51({ \
52 unsigned long flag, roksum; \
53 __chk_user_ptr(addr); \
54 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
55 : "=&r" (flag), "=r" (roksum) \
56 : "1" (addr), "g" ((long)(size)), \
57 "rm" (current_thread_info()->addr_limit.seg)); \
58 flag; \
59})
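/*
 * The asm above is a branchless form of that 33-bit test: "flag" ends up
 * nonzero iff the addition wrapped or the sum exceeds the segment limit.
 * A portable C equivalent (illustrative only) would be:
 *
 *	static int range_not_ok(unsigned long addr, unsigned long size,
 *				unsigned long limit)
 *	{
 *		unsigned long sum = addr + size;
 *		return sum < addr || sum > limit;
 *	}
 *
 * where "sum < addr" stands in for the carry that the sbb instructions
 * fold into the result.
 */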
60
61/**
62 * access_ok: - Checks if a user space pointer is valid
63 * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
64 * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
65 * to write to a block, it is always safe to read from it.
66 * @addr: User space pointer to start of block to check
67 * @size: Size of block to check
68 *
69 * Context: User context only. This function may sleep.
70 *
71 * Checks if a pointer to a block of memory in user space is valid.
72 *
73 * Returns true (nonzero) if the memory block may be valid, false (zero)
74 * if it is definitely invalid.
75 *
76 * Note that, depending on architecture, this function probably just
77 * checks that the pointer is in the user space range - after calling
78 * this function, memory access functions may still return -EFAULT.
79 */
80#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
81
82/*
83 * The exception table consists of pairs of addresses: the first is the
84 * address of an instruction that is allowed to fault, and the second is
85 * the address at which the program should continue. No registers are
86 * modified, so it is entirely up to the continuation code to figure out
87 * what to do.
88 *
89 * All the routines below use bits of fixup code that are out of line
90 * with the main instruction path. This means when everything is well,
91 * we don't even have to jump over them. Further, they do not intrude
92 * on our cache or tlb entries.
93 */
94
95struct exception_table_entry {
96 unsigned long insn, fixup;
97};
98
99extern int fixup_exception(struct pt_regs *regs);
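/*
 * Illustrative sketch of what a fixup_exception() implementation does
 * (the real one lives in arch/x86/mm/extable.c and handles extra cases;
 * search_exception_tables() is the generic sorted-table lookup):
 *
 *	int fixup_exception(struct pt_regs *regs)
 *	{
 *		const struct exception_table_entry *e;
 *
 *		e = search_exception_tables(regs->ip);
 *		if (!e)
 *			return 0;
 *		regs->ip = e->fixup;
 *		return 1;
 *	}
 */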
100
101/*
102 * These are the main single-value transfer routines. They automatically
103 * use the right size if we just have the right pointer type.
104 *
105 * This gets kind of ugly. We want to return _two_ values in "get_user()"
106 * and yet we don't want to do any pointers, because that is too much
107 * of a performance impact. Thus we have a few rather ugly macros here,
108 * and hide all the ugliness from the user.
109 *
110 * The "__xxx" versions of the user access functions are versions that
111 * do not verify the address space, that must have been done previously
112 * with a separate "access_ok()" call (this is used when we do multiple
113 * accesses to the same area of user memory).
114 */
115
116extern int __get_user_1(void);
117extern int __get_user_2(void);
118extern int __get_user_4(void);
119extern int __get_user_8(void);
120extern int __get_user_bad(void);
121
122#define __get_user_x(size, ret, x, ptr) \
123 asm volatile("call __get_user_" #size \
124 : "=a" (ret),"=d" (x) \
125 : "0" (ptr)) \
126
127/* Careful: we have to cast the result to the type of the pointer
128 * for sign reasons */
129
130/**
131 * get_user: - Get a simple variable from user space.
132 * @x: Variable to store result.
133 * @ptr: Source address, in user space.
134 *
135 * Context: User context only. This function may sleep.
136 *
137 * This macro copies a single simple variable from user space to kernel
138 * space. It supports simple types like char and int, but not larger
139 * data types like structures or arrays.
140 *
141 * @ptr must have pointer-to-simple-variable type, and the result of
142 * dereferencing @ptr must be assignable to @x without a cast.
143 *
144 * Returns zero on success, or -EFAULT on error.
145 * On error, the variable @x is set to zero.
146 */
147#ifdef CONFIG_X86_32
148#define __get_user_8(__ret_gu, __val_gu, ptr) \
149 __get_user_x(X, __ret_gu, __val_gu, ptr)
150#else
151#define __get_user_8(__ret_gu, __val_gu, ptr) \
152 __get_user_x(8, __ret_gu, __val_gu, ptr)
153#endif
154
155#define get_user(x, ptr) \
156({ \
157 int __ret_gu; \
158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \
160 might_fault(); \
161 switch (sizeof(*(ptr))) { \
162 case 1: \
163 __get_user_x(1, __ret_gu, __val_gu, ptr); \
164 break; \
165 case 2: \
166 __get_user_x(2, __ret_gu, __val_gu, ptr); \
167 break; \
168 case 4: \
169 __get_user_x(4, __ret_gu, __val_gu, ptr); \
170 break; \
171 case 8: \
172 __get_user_8(__ret_gu, __val_gu, ptr); \
173 break; \
174 default: \
175 __get_user_x(X, __ret_gu, __val_gu, ptr); \
176 break; \
177 } \
178 (x) = (__typeof__(*(ptr)))__val_gu; \
179 __ret_gu; \
180})
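/*
 * Usage sketch (illustrative; the surrounding handler and "arg" are
 * hypothetical):
 *
 *	int val;
 *
 *	if (get_user(val, (int __user *)arg))
 *		return -EFAULT;
 *
 * On success val holds the user's int; on a faulting address get_user()
 * returned -EFAULT and val was zeroed.
 */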
181
182#define __put_user_x(size, x, ptr, __ret_pu) \
183 asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
184 :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
185
186
187
188#ifdef CONFIG_X86_32
189#define __put_user_u64(x, addr, err) \
190 asm volatile("1: movl %%eax,0(%2)\n" \
191 "2: movl %%edx,4(%2)\n" \
192 "3:\n" \
193 ".section .fixup,\"ax\"\n" \
194 "4: movl %3,%0\n" \
195 " jmp 3b\n" \
196 ".previous\n" \
197 _ASM_EXTABLE(1b, 4b) \
198 _ASM_EXTABLE(2b, 4b) \
199 : "=r" (err) \
200 : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
201
202#define __put_user_x8(x, ptr, __ret_pu) \
203 asm volatile("call __put_user_8" : "=a" (__ret_pu) \
204 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
205#else
206#define __put_user_u64(x, ptr, retval) \
207 __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
208#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
209#endif
210
211extern void __put_user_bad(void);
212
213/*
214 * Strange magic calling convention: pointer in %ecx,
215 * value in %eax(:%edx), return value in %eax. Clobbers %rbx.
216 */
217extern void __put_user_1(void);
218extern void __put_user_2(void);
219extern void __put_user_4(void);
220extern void __put_user_8(void);
221
222#ifdef CONFIG_X86_WP_WORKS_OK
223
224/**
225 * put_user: - Write a simple value into user space.
226 * @x: Value to copy to user space.
227 * @ptr: Destination address, in user space.
228 *
229 * Context: User context only. This function may sleep.
230 *
231 * This macro copies a single simple value from kernel space to user
232 * space. It supports simple types like char and int, but not larger
233 * data types like structures or arrays.
234 *
235 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
236 * to the result of dereferencing @ptr.
237 *
238 * Returns zero on success, or -EFAULT on error.
239 */
240#define put_user(x, ptr) \
241({ \
242 int __ret_pu; \
243 __typeof__(*(ptr)) __pu_val; \
244 __chk_user_ptr(ptr); \
245 might_fault(); \
246 __pu_val = x; \
247 switch (sizeof(*(ptr))) { \
248 case 1: \
249 __put_user_x(1, __pu_val, ptr, __ret_pu); \
250 break; \
251 case 2: \
252 __put_user_x(2, __pu_val, ptr, __ret_pu); \
253 break; \
254 case 4: \
255 __put_user_x(4, __pu_val, ptr, __ret_pu); \
256 break; \
257 case 8: \
258 __put_user_x8(__pu_val, ptr, __ret_pu); \
259 break; \
260 default: \
261 __put_user_x(X, __pu_val, ptr, __ret_pu); \
262 break; \
263 } \
264 __ret_pu; \
265})
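/*
 * Usage sketch (illustrative), mirroring the get_user() example above;
 * "result" and "arg" are hypothetical locals of the caller:
 *
 *	if (put_user(result, (int __user *)arg))
 *		return -EFAULT;
 */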
266
267#define __put_user_size(x, ptr, size, retval, errret) \
268do { \
269 retval = 0; \
270 __chk_user_ptr(ptr); \
271 switch (size) { \
272 case 1: \
273 __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
274 break; \
275 case 2: \
276 __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
277 break; \
278 case 4: \
279 __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
280 break; \
281 case 8: \
282 __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
283 break; \
284 default: \
285 __put_user_bad(); \
286 } \
287} while (0)
288
289#else
290
291#define __put_user_size(x, ptr, size, retval, errret) \
292do { \
293 __typeof__(*(ptr))__pus_tmp = x; \
294 retval = 0; \
295 \
296 if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
297 retval = errret; \
298} while (0)
299
300#define put_user(x, ptr) \
301({ \
302 int __ret_pu; \
303 __typeof__(*(ptr))__pus_tmp = x; \
304 __ret_pu = 0; \
305 if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
306 sizeof(*(ptr))) != 0)) \
307 __ret_pu = -EFAULT; \
308 __ret_pu; \
309})
310#endif
311
312#ifdef CONFIG_X86_32
313#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
314#else
315#define __get_user_asm_u64(x, ptr, retval, errret) \
316 __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
317#endif
318
319#define __get_user_size(x, ptr, size, retval, errret) \
320do { \
321 retval = 0; \
322 __chk_user_ptr(ptr); \
323 switch (size) { \
324 case 1: \
325 __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
326 break; \
327 case 2: \
328 __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
329 break; \
330 case 4: \
331 __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
332 break; \
333 case 8: \
334 __get_user_asm_u64(x, ptr, retval, errret); \
335 break; \
336 default: \
337 (x) = __get_user_bad(); \
338 } \
339} while (0)
340
341#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
342 asm volatile("1: mov"itype" %2,%"rtype"1\n" \
343 "2:\n" \
344 ".section .fixup,\"ax\"\n" \
345 "3: mov %3,%0\n" \
346 " xor"itype" %"rtype"1,%"rtype"1\n" \
347 " jmp 2b\n" \
348 ".previous\n" \
349 _ASM_EXTABLE(1b, 3b) \
350 : "=r" (err), ltype(x) \
351 : "m" (__m(addr)), "i" (errret), "0" (err))
352
353#define __put_user_nocheck(x, ptr, size) \
354({ \
355 long __pu_err; \
356 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
357 __pu_err; \
358})
359
360#define __get_user_nocheck(x, ptr, size) \
361({ \
362 long __gu_err; \
363 unsigned long __gu_val; \
364 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
365 (x) = (__force __typeof__(*(ptr)))__gu_val; \
366 __gu_err; \
367})
368
369/* FIXME: this hack is definitely wrong -AK */
370struct __large_struct { unsigned long buf[100]; };
371#define __m(x) (*(struct __large_struct __user *)(x))
372
373/*
374 * Tell gcc we read from memory instead of writing: this is because
375 * we do not write to any memory gcc knows about, so there are no
376 * aliasing issues.
377 */
378#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
379 asm volatile("1: mov"itype" %"rtype"1,%2\n" \
380 "2:\n" \
381 ".section .fixup,\"ax\"\n" \
382 "3: mov %3,%0\n" \
383 " jmp 2b\n" \
384 ".previous\n" \
385 _ASM_EXTABLE(1b, 3b) \
386 : "=r"(err) \
387 : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
388/**
389 * __get_user: - Get a simple variable from user space, with less checking.
390 * @x: Variable to store result.
391 * @ptr: Source address, in user space.
392 *
393 * Context: User context only. This function may sleep.
394 *
395 * This macro copies a single simple variable from user space to kernel
396 * space. It supports simple types like char and int, but not larger
397 * data types like structures or arrays.
398 *
399 * @ptr must have pointer-to-simple-variable type, and the result of
400 * dereferencing @ptr must be assignable to @x without a cast.
401 *
402 * Caller must check the pointer with access_ok() before calling this
403 * function.
404 *
405 * Returns zero on success, or -EFAULT on error.
406 * On error, the variable @x is set to zero.
407 */
408
409#define __get_user(x, ptr) \
410 __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
411/**
412 * __put_user: - Write a simple value into user space, with less checking.
413 * @x: Value to copy to user space.
414 * @ptr: Destination address, in user space.
415 *
416 * Context: User context only. This function may sleep.
417 *
418 * This macro copies a single simple value from kernel space to user
419 * space. It supports simple types like char and int, but not larger
420 * data types like structures or arrays.
421 *
422 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
423 * to the result of dereferencing @ptr.
424 *
425 * Caller must check the pointer with access_ok() before calling this
426 * function.
427 *
428 * Returns zero on success, or -EFAULT on error.
429 */
430
431#define __put_user(x, ptr) \
432 __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
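/*
 * Usage sketch (illustrative; "t" stands for a hypothetical user pointer
 * already known to the caller): one access_ok() check amortized over
 * several unchecked accesses to the same user structure:
 *
 *	struct utimbuf __user *t;
 *	time_t actime, modtime;
 *
 *	if (!access_ok(VERIFY_READ, t, sizeof(*t)))
 *		return -EFAULT;
 *	if (__get_user(actime, &t->actime) ||
 *	    __get_user(modtime, &t->modtime))
 *		return -EFAULT;
 */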
433
434#define __get_user_unaligned __get_user
435#define __put_user_unaligned __put_user
436
437/*
438 * movsl can be slow when source and dest are not both 8-byte aligned
439 */
440#ifdef CONFIG_X86_INTEL_USERCOPY
441extern struct movsl_mask {
442 int mask;
443} ____cacheline_aligned_in_smp movsl_mask;
444#endif
445
446#define ARCH_HAS_NOCACHE_UACCESS 1
447
448#ifdef CONFIG_X86_32
449# include "uaccess_32.h"
450#else
451# define ARCH_HAS_SEARCH_EXTABLE
452# include "uaccess_64.h"
453#endif
454
455#endif /* _ASM_X86_UACCESS_H */
456
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
new file mode 100644
index 00000000000..5e06259e90e
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_UACCESS_32_H
2#define _ASM_X86_UACCESS_32_H
3
4/*
5 * User space memory access functions
6 */
7#include <linux/errno.h>
8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h>
11#include <asm/asm.h>
12#include <asm/page.h>
13
14unsigned long __must_check __copy_to_user_ll
15 (void __user *to, const void *from, unsigned long n);
16unsigned long __must_check __copy_from_user_ll
17 (void *to, const void __user *from, unsigned long n);
18unsigned long __must_check __copy_from_user_ll_nozero
19 (void *to, const void __user *from, unsigned long n);
20unsigned long __must_check __copy_from_user_ll_nocache
21 (void *to, const void __user *from, unsigned long n);
22unsigned long __must_check __copy_from_user_ll_nocache_nozero
23 (void *to, const void __user *from, unsigned long n);
24
25/**
26 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
27 * @to: Destination address, in user space.
28 * @from: Source address, in kernel space.
29 * @n: Number of bytes to copy.
30 *
31 * Context: User context only.
32 *
33 * Copy data from kernel space to user space. Caller must check
34 * the specified block with access_ok() before calling this function.
35 * The caller should also make sure the user space address is pinned
36 * so that we don't page-fault and sleep.
37 *
38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
39 * we return the initial request size (1, 2 or 4), as copy_*_user should do.
40 * If a store crosses a page boundary and gets a fault, the x86 will not write
41 * anything, so this is accurate.
42 */
43
44static __always_inline unsigned long __must_check
45__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
46{
47 if (__builtin_constant_p(n)) {
48 unsigned long ret;
49
50 switch (n) {
51 case 1:
52 __put_user_size(*(u8 *)from, (u8 __user *)to,
53 1, ret, 1);
54 return ret;
55 case 2:
56 __put_user_size(*(u16 *)from, (u16 __user *)to,
57 2, ret, 2);
58 return ret;
59 case 4:
60 __put_user_size(*(u32 *)from, (u32 __user *)to,
61 4, ret, 4);
62 return ret;
63 }
64 }
65 return __copy_to_user_ll(to, from, n);
66}
67
68/**
69 * __copy_to_user: - Copy a block of data into user space, with less checking.
70 * @to: Destination address, in user space.
71 * @from: Source address, in kernel space.
72 * @n: Number of bytes to copy.
73 *
74 * Context: User context only. This function may sleep.
75 *
76 * Copy data from kernel space to user space. Caller must check
77 * the specified block with access_ok() before calling this function.
78 *
79 * Returns number of bytes that could not be copied.
80 * On success, this will be zero.
81 */
82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{
85 might_fault();
86 return __copy_to_user_inatomic(to, from, n);
87}
88
89static __always_inline unsigned long
90__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
91{
92 /* Avoid zeroing the tail if the copy fails.
93 * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
94 * but as the zeroing behaviour is only significant when n is not
95 * constant, that shouldn't be a problem.
96 */
97 if (__builtin_constant_p(n)) {
98 unsigned long ret;
99
100 switch (n) {
101 case 1:
102 __get_user_size(*(u8 *)to, from, 1, ret, 1);
103 return ret;
104 case 2:
105 __get_user_size(*(u16 *)to, from, 2, ret, 2);
106 return ret;
107 case 4:
108 __get_user_size(*(u32 *)to, from, 4, ret, 4);
109 return ret;
110 }
111 }
112 return __copy_from_user_ll_nozero(to, from, n);
113}
114
115/**
116 * __copy_from_user: - Copy a block of data from user space, with less checking.
117 * @to: Destination address, in kernel space.
118 * @from: Source address, in user space.
119 * @n: Number of bytes to copy.
120 *
121 * Context: User context only. This function may sleep.
122 *
123 * Copy data from user space to kernel space. Caller must check
124 * the specified block with access_ok() before calling this function.
125 *
126 * Returns number of bytes that could not be copied.
127 * On success, this will be zero.
128 *
129 * If some data could not be copied, this function will pad the copied
130 * data to the requested size using zero bytes.
131 *
132 * An alternate version - __copy_from_user_inatomic() - may be called from
133 * atomic context and will fail rather than sleep. In this case the
134 * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
135 * for explanation of why this is needed.
136 */
137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{
140 might_fault();
141 if (__builtin_constant_p(n)) {
142 unsigned long ret;
143
144 switch (n) {
145 case 1:
146 __get_user_size(*(u8 *)to, from, 1, ret, 1);
147 return ret;
148 case 2:
149 __get_user_size(*(u16 *)to, from, 2, ret, 2);
150 return ret;
151 case 4:
152 __get_user_size(*(u32 *)to, from, 4, ret, 4);
153 return ret;
154 }
155 }
156 return __copy_from_user_ll(to, from, n);
157}
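/*
 * Usage sketch (illustrative; buf, ubuf and len are hypothetical): both
 * this function and the checking copy_from_user() declared below return
 * the number of bytes that could NOT be copied, so callers usually do:
 *
 *	if (copy_from_user(buf, ubuf, len))
 *		return -EFAULT;
 */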
158
159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n)
161{
162 might_fault();
163 if (__builtin_constant_p(n)) {
164 unsigned long ret;
165
166 switch (n) {
167 case 1:
168 __get_user_size(*(u8 *)to, from, 1, ret, 1);
169 return ret;
170 case 2:
171 __get_user_size(*(u16 *)to, from, 2, ret, 2);
172 return ret;
173 case 4:
174 __get_user_size(*(u32 *)to, from, 4, ret, 4);
175 return ret;
176 }
177 }
178 return __copy_from_user_ll_nocache(to, from, n);
179}
180
181static __always_inline unsigned long
182__copy_from_user_inatomic_nocache(void *to, const void __user *from,
183 unsigned long n)
184{
185 return __copy_from_user_ll_nocache_nozero(to, from, n);
186}
187
188unsigned long __must_check copy_to_user(void __user *to,
189 const void *from, unsigned long n);
190unsigned long __must_check copy_from_user(void *to,
191 const void __user *from,
192 unsigned long n);
193long __must_check strncpy_from_user(char *dst, const char __user *src,
194 long count);
195long __must_check __strncpy_from_user(char *dst,
196 const char __user *src, long count);
197
198/**
199 * strlen_user: - Get the size of a string in user space.
200 * @str: The string to measure.
201 *
202 * Context: User context only. This function may sleep.
203 *
204 * Get the size of a NUL-terminated string in user space.
205 *
206 * Returns the size of the string INCLUDING the terminating NUL.
207 * On exception, returns 0.
208 *
209 * If there is a limit on the length of a valid string, you may wish to
210 * consider using strnlen_user() instead.
211 */
212#define strlen_user(str) strnlen_user(str, LONG_MAX)
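/*
 * Usage sketch (illustrative; upath is a hypothetical user pointer):
 * bounding the scan with strnlen_user(), as recommended above:
 *
 *	long n = strnlen_user(upath, PATH_MAX);
 *
 *	if (n == 0)
 *		return -EFAULT;
 *	if (n > PATH_MAX)
 *		return -ENAMETOOLONG;
 */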
213
214long strnlen_user(const char __user *str, long n);
215unsigned long __must_check clear_user(void __user *mem, unsigned long len);
216unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
217
218#endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
new file mode 100644
index 00000000000..543ba883cc6
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -0,0 +1,208 @@
1#ifndef _ASM_X86_UACCESS_64_H
2#define _ASM_X86_UACCESS_64_H
3
4/*
5 * User space memory access functions
6 */
7#include <linux/compiler.h>
8#include <linux/errno.h>
9#include <linux/prefetch.h>
10#include <linux/lockdep.h>
11#include <asm/page.h>
12
13/*
14 * Copy To/From Userspace
15 */
16
17/* Handles exceptions in both to and from, but doesn't do access_ok */
18__must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len);
20
21__must_check unsigned long
22copy_to_user(void __user *to, const void *from, unsigned len);
23__must_check unsigned long
24copy_from_user(void *to, const void __user *from, unsigned len);
25__must_check unsigned long
26copy_in_user(void __user *to, const void __user *from, unsigned len);
27
28static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{
31 int ret = 0;
32
33 might_fault();
34 if (!__builtin_constant_p(size))
35 return copy_user_generic(dst, (__force void *)src, size);
36 switch (size) {
37 case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
38 ret, "b", "b", "=q", 1);
39 return ret;
40 case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
41 ret, "w", "w", "=r", 2);
42 return ret;
43 case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
44 ret, "l", "k", "=r", 4);
45 return ret;
46 case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
47 ret, "q", "", "=r", 8);
48 return ret;
49 case 10:
50 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
51 ret, "q", "", "=r", 16);
52 if (unlikely(ret))
53 return ret;
54 __get_user_asm(*(u16 *)(8 + (char *)dst),
55 (u16 __user *)(8 + (char __user *)src),
56 ret, "w", "w", "=r", 2);
57 return ret;
58 case 16:
59 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
60 ret, "q", "", "=r", 16);
61 if (unlikely(ret))
62 return ret;
63 __get_user_asm(*(u64 *)(8 + (char *)dst),
64 (u64 __user *)(8 + (char __user *)src),
65 ret, "q", "", "=r", 8);
66 return ret;
67 default:
68 return copy_user_generic(dst, (__force void *)src, size);
69 }
70}
71
72static __always_inline __must_check
73int __copy_to_user(void __user *dst, const void *src, unsigned size)
74{
75 int ret = 0;
76
77 might_fault();
78 if (!__builtin_constant_p(size))
79 return copy_user_generic((__force void *)dst, src, size);
80 switch (size) {
81 case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
82 ret, "b", "b", "iq", 1);
83 return ret;
84 case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
85 ret, "w", "w", "ir", 2);
86 return ret;
87 case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
88 ret, "l", "k", "ir", 4);
89 return ret;
90 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 8);
92 return ret;
93 case 10:
94 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
95 ret, "q", "", "ir", 10);
96 if (unlikely(ret))
97 return ret;
98 asm("":::"memory");
99 __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
100 ret, "w", "w", "ir", 2);
101 return ret;
102 case 16:
103 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
104 ret, "q", "", "ir", 16);
105 if (unlikely(ret))
106 return ret;
107 asm("":::"memory");
108 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
109 ret, "q", "", "ir", 8);
110 return ret;
111 default:
112 return copy_user_generic((__force void *)dst, src, size);
113 }
114}
115
116static __always_inline __must_check
117int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
118{
119 int ret = 0;
120
121 might_fault();
122 if (!__builtin_constant_p(size))
123 return copy_user_generic((__force void *)dst,
124 (__force void *)src, size);
125 switch (size) {
126 case 1: {
127 u8 tmp;
128 __get_user_asm(tmp, (u8 __user *)src,
129 ret, "b", "b", "=q", 1);
130 if (likely(!ret))
131 __put_user_asm(tmp, (u8 __user *)dst,
132 ret, "b", "b", "iq", 1);
133 return ret;
134 }
135 case 2: {
136 u16 tmp;
137 __get_user_asm(tmp, (u16 __user *)src,
138 ret, "w", "w", "=r", 2);
139 if (likely(!ret))
140 __put_user_asm(tmp, (u16 __user *)dst,
141 ret, "w", "w", "ir", 2);
142 return ret;
143 }
144
145 case 4: {
146 u32 tmp;
147 __get_user_asm(tmp, (u32 __user *)src,
148 ret, "l", "k", "=r", 4);
149 if (likely(!ret))
150 __put_user_asm(tmp, (u32 __user *)dst,
151 ret, "l", "k", "ir", 4);
152 return ret;
153 }
154 case 8: {
155 u64 tmp;
156 __get_user_asm(tmp, (u64 __user *)src,
157 ret, "q", "", "=r", 8);
158 if (likely(!ret))
159 __put_user_asm(tmp, (u64 __user *)dst,
160 ret, "q", "", "ir", 8);
161 return ret;
162 }
163 default:
164 return copy_user_generic((__force void *)dst,
165 (__force void *)src, size);
166 }
167}
168
169__must_check long
170strncpy_from_user(char *dst, const char __user *src, long count);
171__must_check long
172__strncpy_from_user(char *dst, const char __user *src, long count);
173__must_check long strnlen_user(const char __user *str, long n);
174__must_check long __strnlen_user(const char __user *str, long n);
175__must_check long strlen_user(const char __user *str);
176__must_check unsigned long clear_user(void __user *mem, unsigned long len);
177__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
178
179__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
180 unsigned size);
181
182static __must_check __always_inline int
183__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
184{
185 return copy_user_generic((__force void *)dst, src, size);
186}
187
188extern long __copy_user_nocache(void *dst, const void __user *src,
189 unsigned size, int zerorest);
190
191static inline int __copy_from_user_nocache(void *dst, const void __user *src,
192 unsigned size)
193{
194 might_sleep();
195 return __copy_user_nocache(dst, src, size, 1);
196}
197
198static inline int __copy_from_user_inatomic_nocache(void *dst,
199 const void __user *src,
200 unsigned size)
201{
202 return __copy_user_nocache(dst, src, size, 0);
203}
204
205unsigned long
206copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
207
208#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
new file mode 100644
index 00000000000..87324cf439d
--- /dev/null
+++ b/arch/x86/include/asm/ucontext.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_UCONTEXT_H
2#define _ASM_X86_UCONTEXT_H
3
4#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state
5 * information in the memory layout pointed
6 * by the fpstate pointer in the ucontext's
7 * sigcontext struct (uc_mcontext).
8 */
9
10struct ucontext {
11 unsigned long uc_flags;
12 struct ucontext *uc_link;
13 stack_t uc_stack;
14 struct sigcontext uc_mcontext;
15 sigset_t uc_sigmask; /* mask last for extensibility */
16};
17
18#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
new file mode 100644
index 00000000000..a7bd416b476
--- /dev/null
+++ b/arch/x86/include/asm/unaligned.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_UNALIGNED_H
2#define _ASM_X86_UNALIGNED_H
3
4/*
5 * The x86 can do unaligned accesses itself.
6 */
7
8#include <linux/unaligned/access_ok.h>
9#include <linux/unaligned/generic.h>
10
11#define get_unaligned __get_unaligned_le
12#define put_unaligned __put_unaligned_le
13
14#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
new file mode 100644
index 00000000000..2a58ed3e51d
--- /dev/null
+++ b/arch/x86/include/asm/unistd.h
@@ -0,0 +1,13 @@
1#ifdef __KERNEL__
2# ifdef CONFIG_X86_32
3# include "unistd_32.h"
4# else
5# include "unistd_64.h"
6# endif
7#else
8# ifdef __i386__
9# include "unistd_32.h"
10# else
11# include "unistd_64.h"
12# endif
13#endif
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
new file mode 100644
index 00000000000..f2bba78430a
--- /dev/null
+++ b/arch/x86/include/asm/unistd_32.h
@@ -0,0 +1,379 @@
1#ifndef _ASM_X86_UNISTD_32_H
2#define _ASM_X86_UNISTD_32_H
3
4/*
5 * This file contains the system call numbers.
6 */
7
8#define __NR_restart_syscall 0
9#define __NR_exit 1
10#define __NR_fork 2
11#define __NR_read 3
12#define __NR_write 4
13#define __NR_open 5
14#define __NR_close 6
15#define __NR_waitpid 7
16#define __NR_creat 8
17#define __NR_link 9
18#define __NR_unlink 10
19#define __NR_execve 11
20#define __NR_chdir 12
21#define __NR_time 13
22#define __NR_mknod 14
23#define __NR_chmod 15
24#define __NR_lchown 16
25#define __NR_break 17
26#define __NR_oldstat 18
27#define __NR_lseek 19
28#define __NR_getpid 20
29#define __NR_mount 21
30#define __NR_umount 22
31#define __NR_setuid 23
32#define __NR_getuid 24
33#define __NR_stime 25
34#define __NR_ptrace 26
35#define __NR_alarm 27
36#define __NR_oldfstat 28
37#define __NR_pause 29
38#define __NR_utime 30
39#define __NR_stty 31
40#define __NR_gtty 32
41#define __NR_access 33
42#define __NR_nice 34
43#define __NR_ftime 35
44#define __NR_sync 36
45#define __NR_kill 37
46#define __NR_rename 38
47#define __NR_mkdir 39
48#define __NR_rmdir 40
49#define __NR_dup 41
50#define __NR_pipe 42
51#define __NR_times 43
52#define __NR_prof 44
53#define __NR_brk 45
54#define __NR_setgid 46
55#define __NR_getgid 47
56#define __NR_signal 48
57#define __NR_geteuid 49
58#define __NR_getegid 50
59#define __NR_acct 51
60#define __NR_umount2 52
61#define __NR_lock 53
62#define __NR_ioctl 54
63#define __NR_fcntl 55
64#define __NR_mpx 56
65#define __NR_setpgid 57
66#define __NR_ulimit 58
67#define __NR_oldolduname 59
68#define __NR_umask 60
69#define __NR_chroot 61
70#define __NR_ustat 62
71#define __NR_dup2 63
72#define __NR_getppid 64
73#define __NR_getpgrp 65
74#define __NR_setsid 66
75#define __NR_sigaction 67
76#define __NR_sgetmask 68
77#define __NR_ssetmask 69
78#define __NR_setreuid 70
79#define __NR_setregid 71
80#define __NR_sigsuspend 72
81#define __NR_sigpending 73
82#define __NR_sethostname 74
83#define __NR_setrlimit 75
84#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
85#define __NR_getrusage 77
86#define __NR_gettimeofday 78
87#define __NR_settimeofday 79
88#define __NR_getgroups 80
89#define __NR_setgroups 81
90#define __NR_select 82
91#define __NR_symlink 83
92#define __NR_oldlstat 84
93#define __NR_readlink 85
94#define __NR_uselib 86
95#define __NR_swapon 87
96#define __NR_reboot 88
97#define __NR_readdir 89
98#define __NR_mmap 90
99#define __NR_munmap 91
100#define __NR_truncate 92
101#define __NR_ftruncate 93
102#define __NR_fchmod 94
103#define __NR_fchown 95
104#define __NR_getpriority 96
105#define __NR_setpriority 97
106#define __NR_profil 98
107#define __NR_statfs 99
108#define __NR_fstatfs 100
109#define __NR_ioperm 101
110#define __NR_socketcall 102
111#define __NR_syslog 103
112#define __NR_setitimer 104
113#define __NR_getitimer 105
114#define __NR_stat 106
115#define __NR_lstat 107
116#define __NR_fstat 108
117#define __NR_olduname 109
118#define __NR_iopl 110
119#define __NR_vhangup 111
120#define __NR_idle 112
121#define __NR_vm86old 113
122#define __NR_wait4 114
123#define __NR_swapoff 115
124#define __NR_sysinfo 116
125#define __NR_ipc 117
126#define __NR_fsync 118
127#define __NR_sigreturn 119
128#define __NR_clone 120
129#define __NR_setdomainname 121
130#define __NR_uname 122
131#define __NR_modify_ldt 123
132#define __NR_adjtimex 124
133#define __NR_mprotect 125
134#define __NR_sigprocmask 126
135#define __NR_create_module 127
136#define __NR_init_module 128
137#define __NR_delete_module 129
138#define __NR_get_kernel_syms 130
139#define __NR_quotactl 131
140#define __NR_getpgid 132
141#define __NR_fchdir 133
142#define __NR_bdflush 134
143#define __NR_sysfs 135
144#define __NR_personality 136
145#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
146#define __NR_setfsuid 138
147#define __NR_setfsgid 139
148#define __NR__llseek 140
149#define __NR_getdents 141
150#define __NR__newselect 142
151#define __NR_flock 143
152#define __NR_msync 144
153#define __NR_readv 145
154#define __NR_writev 146
155#define __NR_getsid 147
156#define __NR_fdatasync 148
157#define __NR__sysctl 149
158#define __NR_mlock 150
159#define __NR_munlock 151
160#define __NR_mlockall 152
161#define __NR_munlockall 153
162#define __NR_sched_setparam 154
163#define __NR_sched_getparam 155
164#define __NR_sched_setscheduler 156
165#define __NR_sched_getscheduler 157
166#define __NR_sched_yield 158
167#define __NR_sched_get_priority_max 159
168#define __NR_sched_get_priority_min 160
169#define __NR_sched_rr_get_interval 161
170#define __NR_nanosleep 162
171#define __NR_mremap 163
172#define __NR_setresuid 164
173#define __NR_getresuid 165
174#define __NR_vm86 166
175#define __NR_query_module 167
176#define __NR_poll 168
177#define __NR_nfsservctl 169
178#define __NR_setresgid 170
179#define __NR_getresgid 171
180#define __NR_prctl 172
181#define __NR_rt_sigreturn 173
182#define __NR_rt_sigaction 174
183#define __NR_rt_sigprocmask 175
184#define __NR_rt_sigpending 176
185#define __NR_rt_sigtimedwait 177
186#define __NR_rt_sigqueueinfo 178
187#define __NR_rt_sigsuspend 179
188#define __NR_pread64 180
189#define __NR_pwrite64 181
190#define __NR_chown 182
191#define __NR_getcwd 183
192#define __NR_capget 184
193#define __NR_capset 185
194#define __NR_sigaltstack 186
195#define __NR_sendfile 187
196#define __NR_getpmsg 188 /* some people actually want streams */
197#define __NR_putpmsg 189 /* some people actually want streams */
198#define __NR_vfork 190
199#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
200#define __NR_mmap2 192
201#define __NR_truncate64 193
202#define __NR_ftruncate64 194
203#define __NR_stat64 195
204#define __NR_lstat64 196
205#define __NR_fstat64 197
206#define __NR_lchown32 198
207#define __NR_getuid32 199
208#define __NR_getgid32 200
209#define __NR_geteuid32 201
210#define __NR_getegid32 202
211#define __NR_setreuid32 203
212#define __NR_setregid32 204
213#define __NR_getgroups32 205
214#define __NR_setgroups32 206
215#define __NR_fchown32 207
216#define __NR_setresuid32 208
217#define __NR_getresuid32 209
218#define __NR_setresgid32 210
219#define __NR_getresgid32 211
220#define __NR_chown32 212
221#define __NR_setuid32 213
222#define __NR_setgid32 214
223#define __NR_setfsuid32 215
224#define __NR_setfsgid32 216
225#define __NR_pivot_root 217
226#define __NR_mincore 218
227#define __NR_madvise 219
228#define __NR_madvise1 219 /* delete when C lib stub is removed */
229#define __NR_getdents64 220
230#define __NR_fcntl64 221
231/* 223 is unused */
232#define __NR_gettid 224
233#define __NR_readahead 225
234#define __NR_setxattr 226
235#define __NR_lsetxattr 227
236#define __NR_fsetxattr 228
237#define __NR_getxattr 229
238#define __NR_lgetxattr 230
239#define __NR_fgetxattr 231
240#define __NR_listxattr 232
241#define __NR_llistxattr 233
242#define __NR_flistxattr 234
243#define __NR_removexattr 235
244#define __NR_lremovexattr 236
245#define __NR_fremovexattr 237
246#define __NR_tkill 238
247#define __NR_sendfile64 239
248#define __NR_futex 240
249#define __NR_sched_setaffinity 241
250#define __NR_sched_getaffinity 242
251#define __NR_set_thread_area 243
252#define __NR_get_thread_area 244
253#define __NR_io_setup 245
254#define __NR_io_destroy 246
255#define __NR_io_getevents 247
256#define __NR_io_submit 248
257#define __NR_io_cancel 249
258#define __NR_fadvise64 250
259/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
260#define __NR_exit_group 252
261#define __NR_lookup_dcookie 253
262#define __NR_epoll_create 254
263#define __NR_epoll_ctl 255
264#define __NR_epoll_wait 256
265#define __NR_remap_file_pages 257
266#define __NR_set_tid_address 258
267#define __NR_timer_create 259
268#define __NR_timer_settime (__NR_timer_create+1)
269#define __NR_timer_gettime (__NR_timer_create+2)
270#define __NR_timer_getoverrun (__NR_timer_create+3)
271#define __NR_timer_delete (__NR_timer_create+4)
272#define __NR_clock_settime (__NR_timer_create+5)
273#define __NR_clock_gettime (__NR_timer_create+6)
274#define __NR_clock_getres (__NR_timer_create+7)
275#define __NR_clock_nanosleep (__NR_timer_create+8)
276#define __NR_statfs64 268
277#define __NR_fstatfs64 269
278#define __NR_tgkill 270
279#define __NR_utimes 271
280#define __NR_fadvise64_64 272
281#define __NR_vserver 273
282#define __NR_mbind 274
283#define __NR_get_mempolicy 275
284#define __NR_set_mempolicy 276
285#define __NR_mq_open 277
286#define __NR_mq_unlink (__NR_mq_open+1)
287#define __NR_mq_timedsend (__NR_mq_open+2)
288#define __NR_mq_timedreceive (__NR_mq_open+3)
289#define __NR_mq_notify (__NR_mq_open+4)
290#define __NR_mq_getsetattr (__NR_mq_open+5)
291#define __NR_kexec_load 283
292#define __NR_waitid 284
293/* #define __NR_sys_setaltroot 285 */
294#define __NR_add_key 286
295#define __NR_request_key 287
296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
299#define __NR_inotify_init 291
300#define __NR_inotify_add_watch 292
301#define __NR_inotify_rm_watch 293
302#define __NR_migrate_pages 294
303#define __NR_openat 295
304#define __NR_mkdirat 296
305#define __NR_mknodat 297
306#define __NR_fchownat 298
307#define __NR_futimesat 299
308#define __NR_fstatat64 300
309#define __NR_unlinkat 301
310#define __NR_renameat 302
311#define __NR_linkat 303
312#define __NR_symlinkat 304
313#define __NR_readlinkat 305
314#define __NR_fchmodat 306
315#define __NR_faccessat 307
316#define __NR_pselect6 308
317#define __NR_ppoll 309
318#define __NR_unshare 310
319#define __NR_set_robust_list 311
320#define __NR_get_robust_list 312
321#define __NR_splice 313
322#define __NR_sync_file_range 314
323#define __NR_tee 315
324#define __NR_vmsplice 316
325#define __NR_move_pages 317
326#define __NR_getcpu 318
327#define __NR_epoll_pwait 319
328#define __NR_utimensat 320
329#define __NR_signalfd 321
330#define __NR_timerfd_create 322
331#define __NR_eventfd 323
332#define __NR_fallocate 324
333#define __NR_timerfd_settime 325
334#define __NR_timerfd_gettime 326
335#define __NR_signalfd4 327
336#define __NR_eventfd2 328
337#define __NR_epoll_create1 329
338#define __NR_dup3 330
339#define __NR_pipe2 331
340#define __NR_inotify_init1 332
341
342#ifdef __KERNEL__
343
344#define __ARCH_WANT_IPC_PARSE_VERSION
345#define __ARCH_WANT_OLD_READDIR
346#define __ARCH_WANT_OLD_STAT
347#define __ARCH_WANT_STAT64
348#define __ARCH_WANT_SYS_ALARM
349#define __ARCH_WANT_SYS_GETHOSTNAME
350#define __ARCH_WANT_SYS_PAUSE
351#define __ARCH_WANT_SYS_SGETMASK
352#define __ARCH_WANT_SYS_SIGNAL
353#define __ARCH_WANT_SYS_TIME
354#define __ARCH_WANT_SYS_UTIME
355#define __ARCH_WANT_SYS_WAITPID
356#define __ARCH_WANT_SYS_SOCKETCALL
357#define __ARCH_WANT_SYS_FADVISE64
358#define __ARCH_WANT_SYS_GETPGRP
359#define __ARCH_WANT_SYS_LLSEEK
360#define __ARCH_WANT_SYS_NICE
361#define __ARCH_WANT_SYS_OLD_GETRLIMIT
362#define __ARCH_WANT_SYS_OLDUMOUNT
363#define __ARCH_WANT_SYS_SIGPENDING
364#define __ARCH_WANT_SYS_SIGPROCMASK
365#define __ARCH_WANT_SYS_RT_SIGACTION
366#define __ARCH_WANT_SYS_RT_SIGSUSPEND
367
368/*
369 * "Conditional" syscalls
370 *
371 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
372 * but it doesn't work on all toolchains, so we just do it by hand
373 */
374#ifndef cond_syscall
375#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
376#endif
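/*
 * Illustrative expansion: cond_syscall(sys_quotactl) emits
 *
 *	asm(".weak\tsys_quotactl\n\t.set\tsys_quotactl,sys_ni_syscall");
 *
 * so the symbol weakly aliases sys_ni_syscall() (which returns -ENOSYS)
 * unless a strong definition is linked in.
 */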
377
378#endif /* __KERNEL__ */
379#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
new file mode 100644
index 00000000000..834b2c1d89f
--- /dev/null
+++ b/arch/x86/include/asm/unistd_64.h
@@ -0,0 +1,693 @@
1#ifndef _ASM_X86_UNISTD_64_H
2#define _ASM_X86_UNISTD_64_H
3
4#ifndef __SYSCALL
5#define __SYSCALL(a, b)
6#endif
7
8/*
9 * This file contains the system call numbers.
10 *
11 * Note: holes are not allowed.
12 */
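/*
 * The __SYSCALL() hook makes this header an X-macro table: the syscall
 * table is built by re-including it with __SYSCALL redefined, roughly
 * as arch/x86/kernel/syscall_64.c does (sketch, details elided):
 *
 *	#define __SYSCALL(nr, sym) [nr] = sym,
 *	const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
 *		[0 ... __NR_syscall_max] = sys_ni_syscall,
 *	#include <asm/unistd_64.h>
 *	};
 */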
13
14/* at least 8 syscalls per cacheline */
15#define __NR_read 0
16__SYSCALL(__NR_read, sys_read)
17#define __NR_write 1
18__SYSCALL(__NR_write, sys_write)
19#define __NR_open 2
20__SYSCALL(__NR_open, sys_open)
21#define __NR_close 3
22__SYSCALL(__NR_close, sys_close)
23#define __NR_stat 4
24__SYSCALL(__NR_stat, sys_newstat)
25#define __NR_fstat 5
26__SYSCALL(__NR_fstat, sys_newfstat)
27#define __NR_lstat 6
28__SYSCALL(__NR_lstat, sys_newlstat)
29#define __NR_poll 7
30__SYSCALL(__NR_poll, sys_poll)
31
32#define __NR_lseek 8
33__SYSCALL(__NR_lseek, sys_lseek)
34#define __NR_mmap 9
35__SYSCALL(__NR_mmap, sys_mmap)
36#define __NR_mprotect 10
37__SYSCALL(__NR_mprotect, sys_mprotect)
38#define __NR_munmap 11
39__SYSCALL(__NR_munmap, sys_munmap)
40#define __NR_brk 12
41__SYSCALL(__NR_brk, sys_brk)
42#define __NR_rt_sigaction 13
43__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
44#define __NR_rt_sigprocmask 14
45__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
46#define __NR_rt_sigreturn 15
47__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
48
49#define __NR_ioctl 16
50__SYSCALL(__NR_ioctl, sys_ioctl)
51#define __NR_pread64 17
52__SYSCALL(__NR_pread64, sys_pread64)
53#define __NR_pwrite64 18
54__SYSCALL(__NR_pwrite64, sys_pwrite64)
55#define __NR_readv 19
56__SYSCALL(__NR_readv, sys_readv)
57#define __NR_writev 20
58__SYSCALL(__NR_writev, sys_writev)
59#define __NR_access 21
60__SYSCALL(__NR_access, sys_access)
61#define __NR_pipe 22
62__SYSCALL(__NR_pipe, sys_pipe)
63#define __NR_select 23
64__SYSCALL(__NR_select, sys_select)
65
66#define __NR_sched_yield 24
67__SYSCALL(__NR_sched_yield, sys_sched_yield)
68#define __NR_mremap 25
69__SYSCALL(__NR_mremap, sys_mremap)
70#define __NR_msync 26
71__SYSCALL(__NR_msync, sys_msync)
72#define __NR_mincore 27
73__SYSCALL(__NR_mincore, sys_mincore)
74#define __NR_madvise 28
75__SYSCALL(__NR_madvise, sys_madvise)
76#define __NR_shmget 29
77__SYSCALL(__NR_shmget, sys_shmget)
78#define __NR_shmat 30
79__SYSCALL(__NR_shmat, sys_shmat)
80#define __NR_shmctl 31
81__SYSCALL(__NR_shmctl, sys_shmctl)
82
83#define __NR_dup 32
84__SYSCALL(__NR_dup, sys_dup)
85#define __NR_dup2 33
86__SYSCALL(__NR_dup2, sys_dup2)
87#define __NR_pause 34
88__SYSCALL(__NR_pause, sys_pause)
89#define __NR_nanosleep 35
90__SYSCALL(__NR_nanosleep, sys_nanosleep)
91#define __NR_getitimer 36
92__SYSCALL(__NR_getitimer, sys_getitimer)
93#define __NR_alarm 37
94__SYSCALL(__NR_alarm, sys_alarm)
95#define __NR_setitimer 38
96__SYSCALL(__NR_setitimer, sys_setitimer)
97#define __NR_getpid 39
98__SYSCALL(__NR_getpid, sys_getpid)
99
100#define __NR_sendfile 40
101__SYSCALL(__NR_sendfile, sys_sendfile64)
102#define __NR_socket 41
103__SYSCALL(__NR_socket, sys_socket)
104#define __NR_connect 42
105__SYSCALL(__NR_connect, sys_connect)
106#define __NR_accept 43
107__SYSCALL(__NR_accept, sys_accept)
108#define __NR_sendto 44
109__SYSCALL(__NR_sendto, sys_sendto)
110#define __NR_recvfrom 45
111__SYSCALL(__NR_recvfrom, sys_recvfrom)
112#define __NR_sendmsg 46
113__SYSCALL(__NR_sendmsg, sys_sendmsg)
114#define __NR_recvmsg 47
115__SYSCALL(__NR_recvmsg, sys_recvmsg)
116
117#define __NR_shutdown 48
118__SYSCALL(__NR_shutdown, sys_shutdown)
119#define __NR_bind 49
120__SYSCALL(__NR_bind, sys_bind)
121#define __NR_listen 50
122__SYSCALL(__NR_listen, sys_listen)
123#define __NR_getsockname 51
124__SYSCALL(__NR_getsockname, sys_getsockname)
125#define __NR_getpeername 52
126__SYSCALL(__NR_getpeername, sys_getpeername)
127#define __NR_socketpair 53
128__SYSCALL(__NR_socketpair, sys_socketpair)
129#define __NR_setsockopt 54
130__SYSCALL(__NR_setsockopt, sys_setsockopt)
131#define __NR_getsockopt 55
132__SYSCALL(__NR_getsockopt, sys_getsockopt)
133
134#define __NR_clone 56
135__SYSCALL(__NR_clone, stub_clone)
136#define __NR_fork 57
137__SYSCALL(__NR_fork, stub_fork)
138#define __NR_vfork 58
139__SYSCALL(__NR_vfork, stub_vfork)
140#define __NR_execve 59
141__SYSCALL(__NR_execve, stub_execve)
142#define __NR_exit 60
143__SYSCALL(__NR_exit, sys_exit)
144#define __NR_wait4 61
145__SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_uname)
150
151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget)
153#define __NR_semop 65
154__SYSCALL(__NR_semop, sys_semop)
155#define __NR_semctl 66
156__SYSCALL(__NR_semctl, sys_semctl)
157#define __NR_shmdt 67
158__SYSCALL(__NR_shmdt, sys_shmdt)
159#define __NR_msgget 68
160__SYSCALL(__NR_msgget, sys_msgget)
161#define __NR_msgsnd 69
162__SYSCALL(__NR_msgsnd, sys_msgsnd)
163#define __NR_msgrcv 70
164__SYSCALL(__NR_msgrcv, sys_msgrcv)
165#define __NR_msgctl 71
166__SYSCALL(__NR_msgctl, sys_msgctl)
167
168#define __NR_fcntl 72
169__SYSCALL(__NR_fcntl, sys_fcntl)
170#define __NR_flock 73
171__SYSCALL(__NR_flock, sys_flock)
172#define __NR_fsync 74
173__SYSCALL(__NR_fsync, sys_fsync)
174#define __NR_fdatasync 75
175__SYSCALL(__NR_fdatasync, sys_fdatasync)
176#define __NR_truncate 76
177__SYSCALL(__NR_truncate, sys_truncate)
178#define __NR_ftruncate 77
179__SYSCALL(__NR_ftruncate, sys_ftruncate)
180#define __NR_getdents 78
181__SYSCALL(__NR_getdents, sys_getdents)
182#define __NR_getcwd 79
183__SYSCALL(__NR_getcwd, sys_getcwd)
184
185#define __NR_chdir 80
186__SYSCALL(__NR_chdir, sys_chdir)
187#define __NR_fchdir 81
188__SYSCALL(__NR_fchdir, sys_fchdir)
189#define __NR_rename 82
190__SYSCALL(__NR_rename, sys_rename)
191#define __NR_mkdir 83
192__SYSCALL(__NR_mkdir, sys_mkdir)
193#define __NR_rmdir 84
194__SYSCALL(__NR_rmdir, sys_rmdir)
195#define __NR_creat 85
196__SYSCALL(__NR_creat, sys_creat)
197#define __NR_link 86
198__SYSCALL(__NR_link, sys_link)
199#define __NR_unlink 87
200__SYSCALL(__NR_unlink, sys_unlink)
201
202#define __NR_symlink 88
203__SYSCALL(__NR_symlink, sys_symlink)
204#define __NR_readlink 89
205__SYSCALL(__NR_readlink, sys_readlink)
206#define __NR_chmod 90
207__SYSCALL(__NR_chmod, sys_chmod)
208#define __NR_fchmod 91
209__SYSCALL(__NR_fchmod, sys_fchmod)
210#define __NR_chown 92
211__SYSCALL(__NR_chown, sys_chown)
212#define __NR_fchown 93
213__SYSCALL(__NR_fchown, sys_fchown)
214#define __NR_lchown 94
215__SYSCALL(__NR_lchown, sys_lchown)
216#define __NR_umask 95
217__SYSCALL(__NR_umask, sys_umask)
218
219#define __NR_gettimeofday 96
220__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
221#define __NR_getrlimit 97
222__SYSCALL(__NR_getrlimit, sys_getrlimit)
223#define __NR_getrusage 98
224__SYSCALL(__NR_getrusage, sys_getrusage)
225#define __NR_sysinfo 99
226__SYSCALL(__NR_sysinfo, sys_sysinfo)
227#define __NR_times 100
228__SYSCALL(__NR_times, sys_times)
229#define __NR_ptrace 101
230__SYSCALL(__NR_ptrace, sys_ptrace)
231#define __NR_getuid 102
232__SYSCALL(__NR_getuid, sys_getuid)
233#define __NR_syslog 103
234__SYSCALL(__NR_syslog, sys_syslog)
235
236/* at the very end, the stuff that never runs during the benchmarks */
237#define __NR_getgid 104
238__SYSCALL(__NR_getgid, sys_getgid)
239#define __NR_setuid 105
240__SYSCALL(__NR_setuid, sys_setuid)
241#define __NR_setgid 106
242__SYSCALL(__NR_setgid, sys_setgid)
243#define __NR_geteuid 107
244__SYSCALL(__NR_geteuid, sys_geteuid)
245#define __NR_getegid 108
246__SYSCALL(__NR_getegid, sys_getegid)
247#define __NR_setpgid 109
248__SYSCALL(__NR_setpgid, sys_setpgid)
249#define __NR_getppid 110
250__SYSCALL(__NR_getppid, sys_getppid)
251#define __NR_getpgrp 111
252__SYSCALL(__NR_getpgrp, sys_getpgrp)
253
254#define __NR_setsid 112
255__SYSCALL(__NR_setsid, sys_setsid)
256#define __NR_setreuid 113
257__SYSCALL(__NR_setreuid, sys_setreuid)
258#define __NR_setregid 114
259__SYSCALL(__NR_setregid, sys_setregid)
260#define __NR_getgroups 115
261__SYSCALL(__NR_getgroups, sys_getgroups)
262#define __NR_setgroups 116
263__SYSCALL(__NR_setgroups, sys_setgroups)
264#define __NR_setresuid 117
265__SYSCALL(__NR_setresuid, sys_setresuid)
266#define __NR_getresuid 118
267__SYSCALL(__NR_getresuid, sys_getresuid)
268#define __NR_setresgid 119
269__SYSCALL(__NR_setresgid, sys_setresgid)
270
271#define __NR_getresgid 120
272__SYSCALL(__NR_getresgid, sys_getresgid)
273#define __NR_getpgid 121
274__SYSCALL(__NR_getpgid, sys_getpgid)
275#define __NR_setfsuid 122
276__SYSCALL(__NR_setfsuid, sys_setfsuid)
277#define __NR_setfsgid 123
278__SYSCALL(__NR_setfsgid, sys_setfsgid)
279#define __NR_getsid 124
280__SYSCALL(__NR_getsid, sys_getsid)
281#define __NR_capget 125
282__SYSCALL(__NR_capget, sys_capget)
283#define __NR_capset 126
284__SYSCALL(__NR_capset, sys_capset)
285
286#define __NR_rt_sigpending 127
287__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
288#define __NR_rt_sigtimedwait 128
289__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
290#define __NR_rt_sigqueueinfo 129
291__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
292#define __NR_rt_sigsuspend 130
293__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
294#define __NR_sigaltstack 131
295__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
296#define __NR_utime 132
297__SYSCALL(__NR_utime, sys_utime)
298#define __NR_mknod 133
299__SYSCALL(__NR_mknod, sys_mknod)
300
301/* Only needed for a.out */
302#define __NR_uselib 134
303__SYSCALL(__NR_uselib, sys_ni_syscall)
304#define __NR_personality 135
305__SYSCALL(__NR_personality, sys_personality)
306
307#define __NR_ustat 136
308__SYSCALL(__NR_ustat, sys_ustat)
309#define __NR_statfs 137
310__SYSCALL(__NR_statfs, sys_statfs)
311#define __NR_fstatfs 138
312__SYSCALL(__NR_fstatfs, sys_fstatfs)
313#define __NR_sysfs 139
314__SYSCALL(__NR_sysfs, sys_sysfs)
315
316#define __NR_getpriority 140
317__SYSCALL(__NR_getpriority, sys_getpriority)
318#define __NR_setpriority 141
319__SYSCALL(__NR_setpriority, sys_setpriority)
320#define __NR_sched_setparam 142
321__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
322#define __NR_sched_getparam 143
323__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
324#define __NR_sched_setscheduler 144
325__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
326#define __NR_sched_getscheduler 145
327__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
328#define __NR_sched_get_priority_max 146
329__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
330#define __NR_sched_get_priority_min 147
331__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
332#define __NR_sched_rr_get_interval 148
333__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
334
335#define __NR_mlock 149
336__SYSCALL(__NR_mlock, sys_mlock)
337#define __NR_munlock 150
338__SYSCALL(__NR_munlock, sys_munlock)
339#define __NR_mlockall 151
340__SYSCALL(__NR_mlockall, sys_mlockall)
341#define __NR_munlockall 152
342__SYSCALL(__NR_munlockall, sys_munlockall)
343
344#define __NR_vhangup 153
345__SYSCALL(__NR_vhangup, sys_vhangup)
346
347#define __NR_modify_ldt 154
348__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
349
350#define __NR_pivot_root 155
351__SYSCALL(__NR_pivot_root, sys_pivot_root)
352
353#define __NR__sysctl 156
354__SYSCALL(__NR__sysctl, sys_sysctl)
355
356#define __NR_prctl 157
357__SYSCALL(__NR_prctl, sys_prctl)
358#define __NR_arch_prctl 158
359__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
360
361#define __NR_adjtimex 159
362__SYSCALL(__NR_adjtimex, sys_adjtimex)
363
364#define __NR_setrlimit 160
365__SYSCALL(__NR_setrlimit, sys_setrlimit)
366
367#define __NR_chroot 161
368__SYSCALL(__NR_chroot, sys_chroot)
369
370#define __NR_sync 162
371__SYSCALL(__NR_sync, sys_sync)
372
373#define __NR_acct 163
374__SYSCALL(__NR_acct, sys_acct)
375
376#define __NR_settimeofday 164
377__SYSCALL(__NR_settimeofday, sys_settimeofday)
378
379#define __NR_mount 165
380__SYSCALL(__NR_mount, sys_mount)
381#define __NR_umount2 166
382__SYSCALL(__NR_umount2, sys_umount)
383
384#define __NR_swapon 167
385__SYSCALL(__NR_swapon, sys_swapon)
386#define __NR_swapoff 168
387__SYSCALL(__NR_swapoff, sys_swapoff)
388
389#define __NR_reboot 169
390__SYSCALL(__NR_reboot, sys_reboot)
391
392#define __NR_sethostname 170
393__SYSCALL(__NR_sethostname, sys_sethostname)
394#define __NR_setdomainname 171
395__SYSCALL(__NR_setdomainname, sys_setdomainname)
396
397#define __NR_iopl 172
398__SYSCALL(__NR_iopl, stub_iopl)
399#define __NR_ioperm 173
400__SYSCALL(__NR_ioperm, sys_ioperm)
401
402#define __NR_create_module 174
403__SYSCALL(__NR_create_module, sys_ni_syscall)
404#define __NR_init_module 175
405__SYSCALL(__NR_init_module, sys_init_module)
406#define __NR_delete_module 176
407__SYSCALL(__NR_delete_module, sys_delete_module)
408#define __NR_get_kernel_syms 177
409__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
410#define __NR_query_module 178
411__SYSCALL(__NR_query_module, sys_ni_syscall)
412
413#define __NR_quotactl 179
414__SYSCALL(__NR_quotactl, sys_quotactl)
415
416#define __NR_nfsservctl 180
417__SYSCALL(__NR_nfsservctl, sys_nfsservctl)
418
419/* reserved for LiS/STREAMS */
420#define __NR_getpmsg 181
421__SYSCALL(__NR_getpmsg, sys_ni_syscall)
422#define __NR_putpmsg 182
423__SYSCALL(__NR_putpmsg, sys_ni_syscall)
424
425/* reserved for AFS */
426#define __NR_afs_syscall 183
427__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
428
429/* reserved for tux */
430#define __NR_tuxcall 184
431__SYSCALL(__NR_tuxcall, sys_ni_syscall)
432
433#define __NR_security 185
434__SYSCALL(__NR_security, sys_ni_syscall)
435
436#define __NR_gettid 186
437__SYSCALL(__NR_gettid, sys_gettid)
438
439#define __NR_readahead 187
440__SYSCALL(__NR_readahead, sys_readahead)
441#define __NR_setxattr 188
442__SYSCALL(__NR_setxattr, sys_setxattr)
443#define __NR_lsetxattr 189
444__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
445#define __NR_fsetxattr 190
446__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
447#define __NR_getxattr 191
448__SYSCALL(__NR_getxattr, sys_getxattr)
449#define __NR_lgetxattr 192
450__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
451#define __NR_fgetxattr 193
452__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
453#define __NR_listxattr 194
454__SYSCALL(__NR_listxattr, sys_listxattr)
455#define __NR_llistxattr 195
456__SYSCALL(__NR_llistxattr, sys_llistxattr)
457#define __NR_flistxattr 196
458__SYSCALL(__NR_flistxattr, sys_flistxattr)
459#define __NR_removexattr 197
460__SYSCALL(__NR_removexattr, sys_removexattr)
461#define __NR_lremovexattr 198
462__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
463#define __NR_fremovexattr 199
464__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
465#define __NR_tkill 200
466__SYSCALL(__NR_tkill, sys_tkill)
467#define __NR_time 201
468__SYSCALL(__NR_time, sys_time)
469#define __NR_futex 202
470__SYSCALL(__NR_futex, sys_futex)
471#define __NR_sched_setaffinity 203
472__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
473#define __NR_sched_getaffinity 204
474__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
475#define __NR_set_thread_area 205
476__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
477#define __NR_io_setup 206
478__SYSCALL(__NR_io_setup, sys_io_setup)
479#define __NR_io_destroy 207
480__SYSCALL(__NR_io_destroy, sys_io_destroy)
481#define __NR_io_getevents 208
482__SYSCALL(__NR_io_getevents, sys_io_getevents)
483#define __NR_io_submit 209
484__SYSCALL(__NR_io_submit, sys_io_submit)
485#define __NR_io_cancel 210
486__SYSCALL(__NR_io_cancel, sys_io_cancel)
487#define __NR_get_thread_area 211
488__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
489#define __NR_lookup_dcookie 212
490__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
491#define __NR_epoll_create 213
492__SYSCALL(__NR_epoll_create, sys_epoll_create)
493#define __NR_epoll_ctl_old 214
494__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
495#define __NR_epoll_wait_old 215
496__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
497#define __NR_remap_file_pages 216
498__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
499#define __NR_getdents64 217
500__SYSCALL(__NR_getdents64, sys_getdents64)
501#define __NR_set_tid_address 218
502__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
503#define __NR_restart_syscall 219
504__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
505#define __NR_semtimedop 220
506__SYSCALL(__NR_semtimedop, sys_semtimedop)
507#define __NR_fadvise64 221
508__SYSCALL(__NR_fadvise64, sys_fadvise64)
509#define __NR_timer_create 222
510__SYSCALL(__NR_timer_create, sys_timer_create)
511#define __NR_timer_settime 223
512__SYSCALL(__NR_timer_settime, sys_timer_settime)
513#define __NR_timer_gettime 224
514__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
515#define __NR_timer_getoverrun 225
516__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
517#define __NR_timer_delete 226
518__SYSCALL(__NR_timer_delete, sys_timer_delete)
519#define __NR_clock_settime 227
520__SYSCALL(__NR_clock_settime, sys_clock_settime)
521#define __NR_clock_gettime 228
522__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
523#define __NR_clock_getres 229
524__SYSCALL(__NR_clock_getres, sys_clock_getres)
525#define __NR_clock_nanosleep 230
526__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
527#define __NR_exit_group 231
528__SYSCALL(__NR_exit_group, sys_exit_group)
529#define __NR_epoll_wait 232
530__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
531#define __NR_epoll_ctl 233
532__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
533#define __NR_tgkill 234
534__SYSCALL(__NR_tgkill, sys_tgkill)
535#define __NR_utimes 235
536__SYSCALL(__NR_utimes, sys_utimes)
537#define __NR_vserver 236
538__SYSCALL(__NR_vserver, sys_ni_syscall)
539#define __NR_mbind 237
540__SYSCALL(__NR_mbind, sys_mbind)
541#define __NR_set_mempolicy 238
542__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
543#define __NR_get_mempolicy 239
544__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
545#define __NR_mq_open 240
546__SYSCALL(__NR_mq_open, sys_mq_open)
547#define __NR_mq_unlink 241
548__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
549#define __NR_mq_timedsend 242
550__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
551#define __NR_mq_timedreceive 243
552__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
553#define __NR_mq_notify 244
554__SYSCALL(__NR_mq_notify, sys_mq_notify)
555#define __NR_mq_getsetattr 245
556__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
557#define __NR_kexec_load 246
558__SYSCALL(__NR_kexec_load, sys_kexec_load)
559#define __NR_waitid 247
560__SYSCALL(__NR_waitid, sys_waitid)
561#define __NR_add_key 248
562__SYSCALL(__NR_add_key, sys_add_key)
563#define __NR_request_key 249
564__SYSCALL(__NR_request_key, sys_request_key)
565#define __NR_keyctl 250
566__SYSCALL(__NR_keyctl, sys_keyctl)
567#define __NR_ioprio_set 251
568__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
569#define __NR_ioprio_get 252
570__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
571#define __NR_inotify_init 253
572__SYSCALL(__NR_inotify_init, sys_inotify_init)
573#define __NR_inotify_add_watch 254
574__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
575#define __NR_inotify_rm_watch 255
576__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
577#define __NR_migrate_pages 256
578__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
579#define __NR_openat 257
580__SYSCALL(__NR_openat, sys_openat)
581#define __NR_mkdirat 258
582__SYSCALL(__NR_mkdirat, sys_mkdirat)
583#define __NR_mknodat 259
584__SYSCALL(__NR_mknodat, sys_mknodat)
585#define __NR_fchownat 260
586__SYSCALL(__NR_fchownat, sys_fchownat)
587#define __NR_futimesat 261
588__SYSCALL(__NR_futimesat, sys_futimesat)
589#define __NR_newfstatat 262
590__SYSCALL(__NR_newfstatat, sys_newfstatat)
591#define __NR_unlinkat 263
592__SYSCALL(__NR_unlinkat, sys_unlinkat)
593#define __NR_renameat 264
594__SYSCALL(__NR_renameat, sys_renameat)
595#define __NR_linkat 265
596__SYSCALL(__NR_linkat, sys_linkat)
597#define __NR_symlinkat 266
598__SYSCALL(__NR_symlinkat, sys_symlinkat)
599#define __NR_readlinkat 267
600__SYSCALL(__NR_readlinkat, sys_readlinkat)
601#define __NR_fchmodat 268
602__SYSCALL(__NR_fchmodat, sys_fchmodat)
603#define __NR_faccessat 269
604__SYSCALL(__NR_faccessat, sys_faccessat)
605#define __NR_pselect6 270
606__SYSCALL(__NR_pselect6, sys_pselect6)
607#define __NR_ppoll 271
608__SYSCALL(__NR_ppoll, sys_ppoll)
609#define __NR_unshare 272
610__SYSCALL(__NR_unshare, sys_unshare)
611#define __NR_set_robust_list 273
612__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
613#define __NR_get_robust_list 274
614__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
615#define __NR_splice 275
616__SYSCALL(__NR_splice, sys_splice)
617#define __NR_tee 276
618__SYSCALL(__NR_tee, sys_tee)
619#define __NR_sync_file_range 277
620__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
621#define __NR_vmsplice 278
622__SYSCALL(__NR_vmsplice, sys_vmsplice)
623#define __NR_move_pages 279
624__SYSCALL(__NR_move_pages, sys_move_pages)
625#define __NR_utimensat 280
626__SYSCALL(__NR_utimensat, sys_utimensat)
627#define __IGNORE_getcpu /* implemented as a vsyscall */
628#define __NR_epoll_pwait 281
629__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
630#define __NR_signalfd 282
631__SYSCALL(__NR_signalfd, sys_signalfd)
632#define __NR_timerfd_create 283
633__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
634#define __NR_eventfd 284
635__SYSCALL(__NR_eventfd, sys_eventfd)
636#define __NR_fallocate 285
637__SYSCALL(__NR_fallocate, sys_fallocate)
638#define __NR_timerfd_settime 286
639__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
640#define __NR_timerfd_gettime 287
641__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
642#define __NR_paccept 288
643__SYSCALL(__NR_paccept, sys_paccept)
644#define __NR_signalfd4 289
645__SYSCALL(__NR_signalfd4, sys_signalfd4)
646#define __NR_eventfd2 290
647__SYSCALL(__NR_eventfd2, sys_eventfd2)
648#define __NR_epoll_create1 291
649__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
650#define __NR_dup3 292
651__SYSCALL(__NR_dup3, sys_dup3)
652#define __NR_pipe2 293
653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656
657
658#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR
660#define __ARCH_WANT_OLD_STAT
661#define __ARCH_WANT_SYS_ALARM
662#define __ARCH_WANT_SYS_GETHOSTNAME
663#define __ARCH_WANT_SYS_PAUSE
664#define __ARCH_WANT_SYS_SGETMASK
665#define __ARCH_WANT_SYS_SIGNAL
666#define __ARCH_WANT_SYS_UTIME
667#define __ARCH_WANT_SYS_WAITPID
668#define __ARCH_WANT_SYS_SOCKETCALL
669#define __ARCH_WANT_SYS_FADVISE64
670#define __ARCH_WANT_SYS_GETPGRP
671#define __ARCH_WANT_SYS_LLSEEK
672#define __ARCH_WANT_SYS_NICE
673#define __ARCH_WANT_SYS_OLD_GETRLIMIT
674#define __ARCH_WANT_SYS_OLDUMOUNT
675#define __ARCH_WANT_SYS_SIGPENDING
676#define __ARCH_WANT_SYS_SIGPROCMASK
677#define __ARCH_WANT_SYS_RT_SIGACTION
678#define __ARCH_WANT_SYS_RT_SIGSUSPEND
679#define __ARCH_WANT_SYS_TIME
680#define __ARCH_WANT_COMPAT_SYS_TIME
681#endif /* __NO_STUBS */
682
683#ifdef __KERNEL__
684/*
685 * "Conditional" syscalls
686 *
687 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
688 * but it doesn't work on all toolchains, so we just do it by hand
689 */
690#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
691#endif /* __KERNEL__ */
692
693#endif /* _ASM_X86_UNISTD_64_H */
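
The __SYSCALL() lines above deliberately leave the macro undefined so each consumer can expand them differently. A minimal sketch of the usual x86-64 table-building idiom follows; sys_call_ptr_t and __NR_syscall_max are assumptions here (the latter is normally generated at build time), not quotes from this patch:

	/* First expansion: declare every handler named in the header. */
	#define __SYSCALL(nr, sym) extern asmlinkage void sym(void);
	#include <asm/unistd_64.h>
	#undef __SYSCALL

	/* Second expansion: build the dispatch table. The include guard
	 * must be dropped so the header expands a second time. */
	#undef _ASM_X86_UNISTD_64_H
	#define __SYSCALL(nr, sym) [nr] = sym,

	typedef void (*sys_call_ptr_t)(void);
	extern void sys_ni_syscall(void);

	const sys_call_ptr_t sys_call_table[__NR_syscall_max + 1] = {
		[0 ... __NR_syscall_max] = sys_ni_syscall, /* default for holes */
	#include <asm/unistd_64.h>
	};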
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 00000000000..8b064bd9c55
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_X86_UNWIND_H
2#define _ASM_X86_UNWIND_H
3
4#define UNW_PC(frame) ((void)(frame), 0UL)
5#define UNW_SP(frame) ((void)(frame), 0UL)
6#define UNW_FP(frame) ((void)(frame), 0UL)
7
8static inline int arch_unw_user_mode(const void *info)
9{
10 return 0;
11}
12
13#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
new file mode 100644
index 00000000000..999873b22e7
--- /dev/null
+++ b/arch/x86/include/asm/user.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "user_32.h"
3#else
4# include "user_64.h"
5#endif
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
new file mode 100644
index 00000000000..14cbb73ebcb
--- /dev/null
+++ b/arch/x86/include/asm/user32.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_USER32_H
2#define _ASM_X86_USER32_H
3
4/* IA32 compatible user structures for ptrace.
5 * These should be used for 32bit coredumps too. */
6
7struct user_i387_ia32_struct {
8 u32 cwd;
9 u32 swd;
10 u32 twd;
11 u32 fip;
12 u32 fcs;
13 u32 foo;
14 u32 fos;
15 u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
16};
17
18/* FSAVE frame with extensions */
19struct user32_fxsr_struct {
20 unsigned short cwd;
21 unsigned short swd;
22 unsigned short twd; /* not compatible with 64bit twd */
23 unsigned short fop;
24 int fip;
25 int fcs;
26 int foo;
27 int fos;
28 int mxcsr;
29 int reserved;
30 int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
31 int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
32 int padding[56];
33};
34
35struct user_regs_struct32 {
36 __u32 ebx, ecx, edx, esi, edi, ebp, eax;
37 unsigned short ds, __ds, es, __es;
38 unsigned short fs, __fs, gs, __gs;
39 __u32 orig_eax, eip;
40 unsigned short cs, __cs;
41 __u32 eflags, esp;
42 unsigned short ss, __ss;
43};
44
45struct user32 {
46 struct user_regs_struct32 regs; /* Where the registers are actually stored */
47 int u_fpvalid; /* True if math co-processor being used. */
48 /* for this mess. Not yet used. */
49 struct user_i387_ia32_struct i387; /* Math Co-processor registers. */
50/* The rest of this junk is to help gdb figure out what goes where */
51 __u32 u_tsize; /* Text segment size (pages). */
52 __u32 u_dsize; /* Data segment size (pages). */
53 __u32 u_ssize; /* Stack segment size (pages). */
54 __u32 start_code; /* Starting virtual address of text. */
55 __u32 start_stack; /* Starting virtual address of stack area.
56 This is actually the bottom of the stack,
57 the top of the stack is always found in the
58 esp register. */
59 __u32 signal; /* Signal that caused the core dump. */
60 int reserved; /* No longer used */
61 __u32 u_ar0; /* Used by gdb to help find the values for */
62 /* the registers. */
63 __u32 u_fpstate; /* Math Co-processor pointer. */
64 __u32 magic; /* To uniquely identify a core file */
65 char u_comm[32]; /* User command that was responsible */
66 int u_debugreg[8];
67};
68
69
70#endif /* _ASM_X86_USER32_H */
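
The structures above have to match the legacy i387 save images byte for byte: the FNSAVE image is 108 bytes (27 * 4) and the FXSAVE image is 512 bytes. A compile-time sanity check, editorial and not part of the patch, assuming BUILD_BUG_ON from <linux/kernel.h>:

	static inline void user32_layout_checks(void)
	{
		BUILD_BUG_ON(sizeof(struct user_i387_ia32_struct) != 108);
		BUILD_BUG_ON(sizeof(struct user32_fxsr_struct) != 512);
	}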
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
new file mode 100644
index 00000000000..bebfd864401
--- /dev/null
+++ b/arch/x86/include/asm/user_32.h
@@ -0,0 +1,131 @@
1#ifndef _ASM_X86_USER_32_H
2#define _ASM_X86_USER_32_H
3
4#include <asm/page.h>
5/* Core file format: The core file is written in such a way that gdb
6 can understand it and provide useful information to the user (under
7 linux we use the 'trad-core' bfd). There are quite a number of
8 obstacles to being able to view the contents of the floating point
9 registers, and until these are solved you will not be able to view the
10 contents of them. Actually, you can read in the core file and look at
11 the contents of the user struct to find out what the floating point
12 registers contain.
13 The actual file contents are as follows:
14 UPAGE: 1 page consisting of a user struct that tells gdb what is present
15 in the file. Directly after this is a copy of the task_struct, which
16 is currently not used by gdb, but it may come in useful at some point.
17 All of the registers are stored as part of the upage. The upage should
18 always be only one page.
19 DATA: The data area is stored. We use current->end_text to
20 current->brk to pick up all of the user variables, plus any memory
21 that may have been malloced. No attempt is made to determine if a page
22 is demand-zero or if a page is totally unused, we just cover the entire
23 range. All of the addresses are rounded in such a way that an integral
24 number of pages is written.
25 STACK: We need the stack information in order to get a meaningful
26 backtrace. We need to write the data from (esp) to
27 current->start_stack, so we round each of these off in order to be able
28 to write an integer number of pages.
29 The minimum core file size is 3 pages, or 12288 bytes.
30*/
31
32/*
33 * Pentium III FXSR, SSE support
34 * Gareth Hughes <gareth@valinux.com>, May 2000
35 *
36 * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
37 * interacting with the FXSR-format floating point environment. Floating
38 * point data can be accessed in the regular format in the usual manner,
39 * and both the standard and SIMD floating point data can be accessed via
40 * the new ptrace requests. In either case, changes to the FPU environment
41 * will be reflected in the task's state as expected.
42 */
43
44struct user_i387_struct {
45 long cwd;
46 long swd;
47 long twd;
48 long fip;
49 long fcs;
50 long foo;
51 long fos;
52 long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
53};
54
55struct user_fxsr_struct {
56 unsigned short cwd;
57 unsigned short swd;
58 unsigned short twd;
59 unsigned short fop;
60 long fip;
61 long fcs;
62 long foo;
63 long fos;
64 long mxcsr;
65 long reserved;
66 long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
67 long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
68 long padding[56];
69};
70
71/*
72 * This is the old layout of "struct pt_regs", and
73 * is still the layout used by user mode (the new
74 * pt_regs doesn't have all registers as the kernel
75 * doesn't use the extra segment registers)
76 */
77struct user_regs_struct {
78 unsigned long bx;
79 unsigned long cx;
80 unsigned long dx;
81 unsigned long si;
82 unsigned long di;
83 unsigned long bp;
84 unsigned long ax;
85 unsigned long ds;
86 unsigned long es;
87 unsigned long fs;
88 unsigned long gs;
89 unsigned long orig_ax;
90 unsigned long ip;
91 unsigned long cs;
92 unsigned long flags;
93 unsigned long sp;
94 unsigned long ss;
95};
96
97/* When the kernel dumps core, it starts by dumping the user struct -
98 this will be used by gdb to figure out where the data and stack segments
99 are within the file, and what virtual addresses to use. */
100struct user {
101/* We start with the registers, to mimic the way that "memory" is returned
102 from the ptrace(3,...) function. */
103 struct user_regs_struct regs; /* Where the registers are actually stored */
104/* ptrace does not yet supply these. Someday.... */
105 int u_fpvalid; /* True if math co-processor being used. */
106 /* for this mess. Not yet used. */
107 struct user_i387_struct i387; /* Math Co-processor registers. */
108/* The rest of this junk is to help gdb figure out what goes where */
109 unsigned long int u_tsize; /* Text segment size (pages). */
110 unsigned long int u_dsize; /* Data segment size (pages). */
111 unsigned long int u_ssize; /* Stack segment size (pages). */
112 unsigned long start_code; /* Starting virtual address of text. */
113 unsigned long start_stack; /* Starting virtual address of stack area.
114 This is actually the bottom of the stack,
115 the top of the stack is always found in the
116 esp register. */
117 long int signal; /* Signal that caused the core dump. */
118 int reserved; /* No longer used */
119 unsigned long u_ar0; /* Used by gdb to help find the values for */
120 /* the registers. */
121 struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
122 unsigned long magic; /* To uniquely identify a core file */
123 char u_comm[32]; /* User command that was responsible */
124 int u_debugreg[8];
125};
126#define NBPG PAGE_SIZE
127#define UPAGES 1
128#define HOST_TEXT_START_ADDR (u.start_code)
129#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
130
131#endif /* _ASM_X86_USER_32_H */
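
HOST_TEXT_START_ADDR and HOST_STACK_END_ADDR expand against a variable that must literally be named u. A sketch of how a dump writer would use them (the wrapper function is illustrative): with 4 KB pages, the minimum dump described in the comment above is 3 * 4096 = 12288 bytes.

	static unsigned long dump_stack_end(struct user u)
	{
		/* u_ssize is in pages, NBPG == PAGE_SIZE, so this expands
		 * to u.start_stack + u.u_ssize * PAGE_SIZE. */
		return HOST_STACK_END_ADDR;
	}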
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
new file mode 100644
index 00000000000..faf2cd3e0d7
--- /dev/null
+++ b/arch/x86/include/asm/user_64.h
@@ -0,0 +1,137 @@
1#ifndef _ASM_X86_USER_64_H
2#define _ASM_X86_USER_64_H
3
4#include <asm/types.h>
5#include <asm/page.h>
6/* Core file format: The core file is written in such a way that gdb
7 can understand it and provide useful information to the user.
8 There are quite a number of obstacles to being able to view the
9 contents of the floating point registers, and until these are
10 solved you will not be able to view the contents of them.
11 Actually, you can read in the core file and look at the contents of
12 the user struct to find out what the floating point registers
13 contain.
14
15 The actual file contents are as follows:
16 UPAGE: 1 page consisting of a user struct that tells gdb what is present
17 in the file. Directly after this is a copy of the task_struct, which
18 is currently not used by gdb, but it may come in useful at some point.
19 All of the registers are stored as part of the upage. The upage should
20 always be only one page.
21 DATA: The data area is stored. We use current->end_text to
22 current->brk to pick up all of the user variables, plus any memory
23 that may have been malloced. No attempt is made to determine if a page
24 is demand-zero or if a page is totally unused, we just cover the entire
25 range. All of the addresses are rounded in such a way that an integral
26 number of pages is written.
27 STACK: We need the stack information in order to get a meaningful
28 backtrace. We need to write the data from (esp) to
29 current->start_stack, so we round each of these off in order to be able
30 to write an integer number of pages.
31 The minimum core file size is 3 pages, or 12288 bytes. */
32
33/*
34 * Pentium III FXSR, SSE support
35 * Gareth Hughes <gareth@valinux.com>, May 2000
36 *
37 * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
38 * interacting with the FXSR-format floating point environment. Floating
39 * point data can be accessed in the regular format in the usual manner,
40 * and both the standard and SIMD floating point data can be accessed via
41 * the new ptrace requests. In either case, changes to the FPU environment
42 * will be reflected in the task's state as expected.
43 *
44 * x86-64 support by Andi Kleen.
45 */
46
47/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
48 as the 32bit format defined by Intel, except that the selector:offset pairs
49 for data and eip are replaced with flat 64bit pointers. */
50struct user_i387_struct {
51 unsigned short cwd;
52 unsigned short swd;
53 unsigned short twd; /* Note this is not the same as
54 the 32bit/x87/FSAVE twd */
55 unsigned short fop;
56 __u64 rip;
57 __u64 rdp;
58 __u32 mxcsr;
59 __u32 mxcsr_mask;
60 __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
61 __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
62 __u32 padding[24];
63};
64
65/*
66 * Segment register layout in coredumps.
67 */
68struct user_regs_struct {
69 unsigned long r15;
70 unsigned long r14;
71 unsigned long r13;
72 unsigned long r12;
73 unsigned long bp;
74 unsigned long bx;
75 unsigned long r11;
76 unsigned long r10;
77 unsigned long r9;
78 unsigned long r8;
79 unsigned long ax;
80 unsigned long cx;
81 unsigned long dx;
82 unsigned long si;
83 unsigned long di;
84 unsigned long orig_ax;
85 unsigned long ip;
86 unsigned long cs;
87 unsigned long flags;
88 unsigned long sp;
89 unsigned long ss;
90 unsigned long fs_base;
91 unsigned long gs_base;
92 unsigned long ds;
93 unsigned long es;
94 unsigned long fs;
95 unsigned long gs;
96};
97
98/* When the kernel dumps core, it starts by dumping the user struct -
99 this will be used by gdb to figure out where the data and stack segments
100 are within the file, and what virtual addresses to use. */
101
102struct user {
103/* We start with the registers, to mimic the way that "memory" is returned
104 from the ptrace(3,...) function. */
105 struct user_regs_struct regs; /* Where the registers are actually stored */
106/* ptrace does not yet supply these. Someday.... */
107 int u_fpvalid; /* True if math co-processor being used. */
108 /* for this mess. Not yet used. */
109 int pad0;
110 struct user_i387_struct i387; /* Math Co-processor registers. */
111/* The rest of this junk is to help gdb figure out what goes where */
112 unsigned long int u_tsize; /* Text segment size (pages). */
113 unsigned long int u_dsize; /* Data segment size (pages). */
114 unsigned long int u_ssize; /* Stack segment size (pages). */
115 unsigned long start_code; /* Starting virtual address of text. */
116 unsigned long start_stack; /* Starting virtual address of stack area.
117 This is actually the bottom of the stack,
118 the top of the stack is always found in the
119 esp register. */
120 long int signal; /* Signal that caused the core dump. */
121 int reserved; /* No longer used */
122 int pad1;
123 unsigned long u_ar0; /* Used by gdb to help find the values for */
124 /* the registers. */
125 struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
126 unsigned long magic; /* To uniquely identify a core file */
127 char u_comm[32]; /* User command that was responsible */
128 unsigned long u_debugreg[8];
129 unsigned long error_code; /* CPU error code or 0 */
130 unsigned long fault_address; /* CR3 or 0 */
131};
132#define NBPG PAGE_SIZE
133#define UPAGES 1
134#define HOST_TEXT_START_ADDR (u.start_code)
135#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
136
137#endif /* _ASM_X86_USER_64_H */
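
Per the comment above, struct user_i387_struct mirrors the 64-bit FXSAVE image, which is exactly 512 bytes. An editorial compile-time check, assuming BUILD_BUG_ON from <linux/kernel.h>:

	static inline void user64_layout_check(void)
	{
		/* 4*2 + 2*8 + 2*4 + 128 + 256 + 96 == 512 bytes */
		BUILD_BUG_ON(sizeof(struct user_i387_struct) != 512);
	}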
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
new file mode 100644
index 00000000000..d931d3b7e6f
--- /dev/null
+++ b/arch/x86/include/asm/uv/bios.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_UV_BIOS_H
2#define _ASM_X86_UV_BIOS_H
3
4/*
5 * UV BIOS layer definitions.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson
23 */
24
25#include <linux/rtc.h>
26
27/*
28 * Values for the BIOS calls. It is passed as the first argument in the
29 * BIOS call. Passing any other value in the first argument will result
30 * in a BIOS_STATUS_UNIMPLEMENTED return status.
31 */
32enum uv_bios_cmd {
33 UV_BIOS_COMMON,
34 UV_BIOS_GET_SN_INFO,
35 UV_BIOS_FREQ_BASE
36};
37
38/*
39 * Status values returned from a BIOS call.
40 */
41enum {
42 BIOS_STATUS_SUCCESS = 0,
43 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
44 BIOS_STATUS_EINVAL = -EINVAL,
45 BIOS_STATUS_UNAVAIL = -EBUSY
46};
47
48/*
49 * The UV system table describes specific firmware
50 * capabilities available to the Linux kernel at runtime.
51 */
52struct uv_systab {
53 char signature[4]; /* must be "UVST" */
54 u32 revision; /* distinguish different firmware revs */
55 u64 function; /* BIOS runtime callback function ptr */
56};
57
58enum {
59 BIOS_FREQ_BASE_PLATFORM = 0,
60 BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
61 BIOS_FREQ_BASE_REALTIME_CLOCK = 2
62};
63
64union partition_info_u {
65 u64 val;
66 struct {
67 u64 hub_version : 8,
68 partition_id : 16,
69 coherence_id : 16,
70 region_size : 24;
71 };
72};
73
74/*
75 * bios calls have 6 parameters
76 */
77extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
78extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
79extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
80
81extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
82extern s64 uv_bios_freq_base(u64, u64 *);
83
84extern void uv_bios_init(void);
85
86extern int uv_type;
87extern long sn_partition_id;
88extern long uv_coherency_id;
89extern long uv_region_size;
90#define partition_coherence_id() (uv_coherency_id)
91
92extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
93
94#endif /* _ASM_X86_UV_BIOS_H */
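
A sketch of a caller for the interface above, querying the realtime-clock base frequency; the wrapper function itself is hypothetical:

	static u64 uv_rtc_ticks_per_sec(void)
	{
		u64 ticks = 0;
		s64 status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
					       &ticks);
		return (status == BIOS_STATUS_SUCCESS) ? ticks : 0;
	}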
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
new file mode 100644
index 00000000000..e2363253bbb
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -0,0 +1,332 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV Broadcast Assist Unit definitions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_BAU_H
12#define _ASM_X86_UV_UV_BAU_H
13
14#include <linux/bitmap.h>
15#define BITSPERBYTE 8
16
17/*
18 * Broadcast Assist Unit messaging structures
19 *
20 * Selective Broadcast activations are induced by software action
21 * specifying a particular 8-descriptor "set" via a 6-bit index written
22 * to an MMR.
23 * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
24 * each 6-bit index value. These descriptor sets are mapped in sequence
25 * starting with set 0 located at the address specified in the
26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
28 *
29 * We will use 31 sets, one for sending BAU messages from each of the 32
30 * cpu's on the node.
31 *
32 * TLB shootdown will use the first of the 8 descriptors of each set.
33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
34 */
35
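Restating the set layout the comment above describes as code (a sketch; base is whatever the BAU_SB_DESCRIPTOR_BASE register holds): set i starts at base + i*512 and holds eight 64-byte descriptors.

	static inline unsigned long sb_descriptor_addr(unsigned long base,
						       int set, int desc)
	{
		/* 8 descriptors of 64 bytes each: 8 * 64 = 512 bytes per set */
		return base + set * 512UL + desc * 64UL;
	}
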
36#define UV_ITEMS_PER_DESCRIPTOR 8
37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38
44#define UV_DESC_BASE_PNODE_SHIFT 49
45#define UV_PAYLOADQ_PNODE_SHIFT 49
46#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
47#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
48
49/*
50 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
51 */
52#define DESC_STATUS_IDLE 0
53#define DESC_STATUS_ACTIVE 1
54#define DESC_STATUS_DESTINATION_TIMEOUT 2
55#define DESC_STATUS_SOURCE_TIMEOUT 3
56
57/*
58 * source side thresholds at which message retries print a warning
59 */
60#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20
62
63/*
64 * number of entries in the destination side payload queue
65 */
66#define DEST_Q_SIZE 17
67/*
68 * number of destination side software ack resources
69 */
70#define DEST_NUM_RESOURCES 8
71#define MAX_CPUS_PER_NODE 32
72/*
73 * completion statuses for sending a TLB flush message
74 */
75#define FLUSH_RETRY 1
76#define FLUSH_GIVEUP 2
77#define FLUSH_COMPLETE 3
78
79/*
80 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
81 * If the 'multilevel' flag in the header portion of the descriptor
82 * has been set to 0, then endpoint multi-unicast mode is selected.
83 * The distribution specification (32 bytes) is interpreted as a 256-bit
84 * distribution vector. Adjacent bits correspond to consecutive even numbered
85 * nodeIDs. The result of adding the index of a given bit to the 15-bit
86 * 'base_dest_nodeid' field of the header corresponds to the
87 * destination nodeID associated with that specified bit.
88 */
89struct bau_target_nodemask {
90 unsigned long bits[BITS_TO_LONGS(256)];
91};
92
93/*
94 * mask of cpu's on a node
95 * (during initialization we need to check that unsigned long has
96 * enough bits for max. cpu's per node)
97 */
98struct bau_local_cpumask {
99 unsigned long bits;
100};
101
102/*
103 * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
104 * only 12 bytes (96 bits) of the payload area are usable.
105 * An additional 3 bytes (bits 27:4) of the header address are carried
106 * to the next bytes of the destination payload queue.
107 * And an additional 2 bytes of the header Suppl_A field are also
108 * carried to the destination payload queue.
109 * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
110 * of the destination payload queue, which is written by the hardware
111 * with the s/w ack resource bit vector.
112 * [ effective message contents (16 bytes (128 bits) maximum), not counting
113 * the s/w ack bit vector ]
114 */
115
116/*
117 * The payload is software-defined for INTD transactions
118 */
119struct bau_msg_payload {
120 unsigned long address; /* signifies a page or all TLB's
121 of the cpu */
122 /* 64 bits */
123 unsigned short sending_cpu; /* filled in by sender */
124 /* 16 bits */
125 unsigned short acknowledge_count;/* filled in by destination */
126 /* 16 bits */
127 unsigned int reserved1:32; /* not usable */
128};
129
130
131/*
132 * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
133 * see table 4.2.3.0.1 in broadcast_assist spec.
134 */
135struct bau_msg_header {
136 int dest_subnodeid:6; /* must be zero */
137 /* bits 5:0 */
138 int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
139 /* bits 20:6 */
140 int command:8; /* message type */
141 /* bits 28:21 */
142 /* 0x38: SN3net EndPoint Message */
143 int rsvd_1:3; /* must be zero */
144 /* bits 31:29 */
145 /* int will align on 32 bits */
146 int rsvd_2:9; /* must be zero */
147 /* bits 40:32 */
148 /* Suppl_A is 56-41 */
149 int payload_2a:8; /* becomes byte 16 of msg */
150 /* bits 48:41 */ /* not currently using */
151 int payload_2b:8; /* becomes byte 17 of msg */
152 /* bits 56:49 */ /* not currently using */
153 /* Address field (96:57) is never used as an
154 address (these are address bits 42:3) */
155 int rsvd_3:1; /* must be zero */
156 /* bit 57 */
157 /* address bits 27:4 are payload */
158 /* these 24 bits become bytes 12-14 of msg */
159 int replied_to:1; /* sent as 0 by the source to byte 12 */
160 /* bit 58 */
161
162 int payload_1a:5; /* not currently used */
163 /* bits 63:59 */
164 int payload_1b:8; /* not currently used */
165 /* bits 71:64 */
166 int payload_1c:8; /* not currently used */
167 /* bits 79:72 */
168 int payload_1d:2; /* not currently used */
169 /* bits 81:80 */
170
171 int rsvd_4:7; /* must be zero */
172 /* bits 88:82 */
173 int sw_ack_flag:1; /* software acknowledge flag */
174 /* bit 89 */
175 /* INTD transactions at destination are to
176 wait for software acknowledge */
177 int rsvd_5:6; /* must be zero */
178 /* bits 95:90 */
179 int rsvd_6:5; /* must be zero */
180 /* bits 100:96 */
181 int int_both:1; /* if 1, interrupt both sockets on the blade */
182 /* bit 101*/
183 int fairness:3; /* usually zero */
184 /* bits 104:102 */
185 int multilevel:1; /* multi-level multicast format */
186 /* bit 105 */
187 /* 0 for TLB: endpoint multi-unicast messages */
188 int chaining:1; /* next descriptor is part of this activation*/
189 /* bit 106 */
190 int rsvd_7:21; /* must be zero */
191 /* bits 127:107 */
192};
193
194/*
195 * The activation descriptor:
196 * The format of the message to send, plus all accompanying control
197 * Should be 64 bytes
198 */
199struct bau_desc {
200 struct bau_target_nodemask distribution;
201 /*
202 * message template, consisting of header and payload:
203 */
204 struct bau_msg_header header;
205 struct bau_msg_payload payload;
206};
207/*
208 * -payload-- ---------header------
209 * bytes 0-11 bits 41-56 bits 58-81
210 * A B (2) C (3)
211 *
212 * A/B/C are moved to:
213 * A C B
214 * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
215 * ------------payload queue-----------
216 */
217
218/*
219 * The payload queue on the destination side is an array of these.
220 * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
221 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
222 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
223 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
224 * sw_ack_vector and payload_2)
225 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
226 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
227 * operation."
228 */
229struct bau_payload_queue_entry {
230 unsigned long address; /* signifies a page or all TLB's
231 of the cpu */
232 /* 64 bits, bytes 0-7 */
233
234 unsigned short sending_cpu; /* cpu that sent the message */
235 /* 16 bits, bytes 8-9 */
236
237 unsigned short acknowledge_count; /* filled in by destination */
238 /* 16 bits, bytes 10-11 */
239
240 unsigned short replied_to:1; /* sent as 0 by the source */
241 /* 1 bit */
242 unsigned short unused1:7; /* not currently using */
243 /* 7 bits: byte 12) */
244
245 unsigned char unused2[2]; /* not currently using */
246 /* bytes 13-14 */
247
248 unsigned char sw_ack_vector; /* filled in by the hardware */
249 /* byte 15 (bits 127:120) */
250
251 unsigned char unused4[3]; /* not currently using bytes 16-18 */
252 /* bytes 16-18 */
253
254 int number_of_cpus; /* filled in at destination */
255 /* 32 bits, bytes 20-23 (aligned) */
256
257 unsigned char unused5[8]; /* not using */
258 /* bytes 24-31 */
259};
260
261/*
262 * one for every slot in the destination payload queue
263 */
264struct bau_msg_status {
265 struct bau_local_cpumask seen_by; /* map of cpu's */
266};
267
268/*
269 * one for every slot in the destination software ack resources
270 */
271struct bau_sw_ack_status {
272 struct bau_payload_queue_entry *msg; /* associated message */
273 int watcher; /* cpu monitoring, or -1 */
274};
275
276/*
277 * one on every node and per-cpu; to locate the software tables
278 */
279struct bau_control {
280 struct bau_desc *descriptor_base;
281 struct bau_payload_queue_entry *bau_msg_head;
282 struct bau_payload_queue_entry *va_queue_first;
283 struct bau_payload_queue_entry *va_queue_last;
284 struct bau_msg_status *msg_statuses;
285 int *watching; /* pointer to array */
286};
287
288/*
289 * This structure is allocated per_cpu for UV TLB shootdown statistics.
290 */
291struct ptc_stats {
292 unsigned long ptc_i; /* number of IPI-style flushes */
293 unsigned long requestor; /* number of nodes this cpu sent to */
294 unsigned long requestee; /* times cpu was remotely requested */
295 unsigned long alltlb; /* times all tlb's on this cpu were flushed */
296 unsigned long onetlb; /* times just one tlb on this cpu was flushed */
297 unsigned long s_retry; /* retries on source side timeouts */
298 unsigned long d_retry; /* retries on destination side timeouts */
299 unsigned long sflush; /* cycles spent in uv_flush_tlb_others */
300 unsigned long dflush; /* cycles spent on destination side */
301 unsigned long retriesok; /* successes on retries */
302 unsigned long nomsg; /* interrupts with no message */
303 unsigned long multmsg; /* interrupts with multiple messages */
304 unsigned long ntargeted;/* nodes targeted */
305};
306
307static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
308{
309 return constant_test_bit(node, &dstp->bits[0]);
310}
311static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
312{
313 __set_bit(node, &dstp->bits[0]);
314}
315static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
316{
317 bitmap_zero(&dstp->bits[0], nbits);
318}
319
320static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
321{
322 bitmap_zero(&dstp->bits, nbits);
323}
324
325#define cpubit_isset(cpu, bau_local_cpumask) \
326 test_bit((cpu), (bau_local_cpumask).bits)
327
328extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
329extern void uv_bau_message_intr1(void);
330extern void uv_bau_timeout_intr1(void);
331
332#endif /* _ASM_X86_UV_UV_BAU_H */
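
A sketch of how the distribution helpers above combine when aiming an activation descriptor at a single pnode (the function name is illustrative):

	static void bau_target_one_node(struct bau_desc *desc, int pnode)
	{
		/* 256-bit vector, per struct bau_target_nodemask above */
		bau_nodes_clear(&desc->distribution, 256);
		bau_node_set(pnode, &desc->distribution);
	}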
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
new file mode 100644
index 00000000000..c6ad93e315c
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -0,0 +1,354 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV architectural definitions
7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_HUB_H
12#define _ASM_X86_UV_UV_HUB_H
13
14#include <linux/numa.h>
15#include <linux/percpu.h>
16#include <asm/types.h>
17#include <asm/percpu.h>
18
19
20/*
21 * Addressing Terminology
22 *
23 * M - The low M bits of a physical address represent the offset
24 * into the blade local memory. RAM memory on a blade is physically
25 * contiguous (although various IO spaces may punch holes in
26 * it)..
27 *
28 * N - Number of bits in the node portion of a socket physical
29 * address.
30 *
31 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
32 * routers always have low bit of 1, C/MBricks have low bit
33 * equal to 0. Most addressing macros that target UV hub chips
34 * right shift the NASID by 1 to exclude the always-zero bit.
35 * NASIDs contain up to 15 bits.
36 *
37 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
38 * of nasids.
39 *
40 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
41 * of the nasid for socket usage.
42 *
43 *
44 * NumaLink Global Physical Address Format:
45 * +--------------------------------+---------------------+
46 * |00..000| GNODE | NodeOffset |
47 * +--------------------------------+---------------------+
48 * |<-------53 - M bits --->|<--------M bits ----->
49 *
50 * M - number of node offset bits (35 .. 40)
51 *
52 *
53 * Memory/UV-HUB Processor Socket Address Format:
54 * +----------------+---------------+---------------------+
55 * |00..000000000000| PNODE | NodeOffset |
56 * +----------------+---------------+---------------------+
57 * <--- N bits --->|<--------M bits ----->
58 *
59 * M - number of node offset bits (35 .. 40)
60 * N - number of PNODE bits (0 .. 10)
61 *
62 * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
63 * The actual values are configuration dependent and are set at
64 * boot time. M & N values are set by the hardware/BIOS at boot.
65 *
66 *
67 * APICID format
68 * NOTE!!!!!! This is the current format of the APICID. However, code
69 * should assume that this will change in the future. Use functions
70 * in this file for all APICID bit manipulations and conversion.
71 *
72 * 1111110000000000
73 * 5432109876543210
74 * pppppppppplc0cch
75 * sssssssssss
76 *
77 * p = pnode bits
78 * l = socket number on board
79 * c = core
80 * h = hyperthread
81 * s = bits that are in the SOCKET_ID CSR
82 *
83 * Note: Processor only supports 12 bits in the APICID register. The ACPI
84 * tables hold all 16 bits. Software needs to be aware of this.
85 *
86 * Unless otherwise specified, all references to APICID refer to
87 * the FULL value contained in ACPI tables, not the subset in the
88 * processor APICID register.
89 */
90
91
92/*
93 * Maximum number of bricks in all partitions and in all coherency domains.
94 * This is the total number of bricks accessible in the numalink fabric. It
95 * includes all C & M bricks. Routers are NOT included.
96 *
97 * This value is also the value of the maximum number of non-router NASIDs
98 * in the numalink fabric.
99 *
100 * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
101 */
102#define UV_MAX_NUMALINK_BLADES 16384
103
104/*
105 * Maximum number of C/Mbricks within a software SSI (hardware may support
106 * more).
107 */
108#define UV_MAX_SSI_BLADES 256
109
110/*
111 * The largest possible NASID of a C or M brick (+ 2)
112 */
113#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_BLADES * 2)
114
115/*
116 * The following defines attributes of the HUB chip. These attributes are
117 * frequently referenced and are kept in the per-cpu data areas of each cpu.
118 * They are kept together in a struct to minimize cache misses.
119 */
120struct uv_hub_info_s {
121 unsigned long global_mmr_base;
122 unsigned long gpa_mask;
123 unsigned long gnode_upper;
124 unsigned long lowmem_remap_top;
125 unsigned long lowmem_remap_base;
126 unsigned short pnode;
127 unsigned short pnode_mask;
128 unsigned short coherency_domain_number;
129 unsigned short numa_blade_id;
130 unsigned char blade_processor_id;
131 unsigned char m_val;
132 unsigned char n_val;
133};
134DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
135#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
136#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
137
138/*
139 * Local & Global MMR space macros.
140 * Note: macros are intended to be used ONLY by inline functions
141 * in this file - not by other kernel code.
142 * n - NASID (full 15-bit global nasid)
143 * g - GNODE (full 15-bit global nasid, right shifted 1)
144 * p - PNODE (local part of nasids, right shifted 1)
145 */
146#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
147#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
148
149#define UV_LOCAL_MMR_BASE 0xf4000000UL
150#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
151#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
152#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
153#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
154
155#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
156#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
157
158#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
159
160#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
161 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
162
163#define UV_APIC_PNODE_SHIFT 6
164
165/*
166 * Macros for converting between kernel virtual addresses, socket local physical
167 * addresses, and UV global physical addresses.
168 * Note: use the standard __pa() & __va() macros for converting
169 * between socket virtual and socket physical addresses.
170 */
171
172/* socket phys RAM --> UV global physical address */
173static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
174{
175 if (paddr < uv_hub_info->lowmem_remap_top)
176 paddr += uv_hub_info->lowmem_remap_base;
177 return paddr | uv_hub_info->gnode_upper;
178}
179
180
181/* socket virtual --> UV global physical address */
182static inline unsigned long uv_gpa(void *v)
183{
184 return __pa(v) | uv_hub_info->gnode_upper;
185}
186
187/* socket virtual --> UV global physical address */
188static inline void *uv_vgpa(void *v)
189{
190 return (void *)uv_gpa(v);
191}
192
193/* UV global physical address --> socket virtual */
194static inline void *uv_va(unsigned long gpa)
195{
196 return __va(gpa & uv_hub_info->gpa_mask);
197}
198
199/* pnode, offset --> socket virtual */
200static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
201{
202 return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
203}
204
205
206/*
207 * Extract a PNODE from an APICID (full apicid, not processor subset)
208 */
209static inline int uv_apicid_to_pnode(int apicid)
210{
211 return (apicid >> UV_APIC_PNODE_SHIFT);
212}
213
214/*
215 * Access global MMRs using the low memory MMR32 space. This region supports
216 * faster MMR access but not all MMRs are accessible in this space.
217 */
218static inline unsigned long *uv_global_mmr32_address(int pnode,
219 unsigned long offset)
220{
221 return __va(UV_GLOBAL_MMR32_BASE |
222 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
223}
224
225static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
226 unsigned long val)
227{
228 *uv_global_mmr32_address(pnode, offset) = val;
229}
230
231static inline unsigned long uv_read_global_mmr32(int pnode,
232 unsigned long offset)
233{
234 return *uv_global_mmr32_address(pnode, offset);
235}
236
237/*
238 * Access Global MMR space using the MMR space located at the top of physical
239 * memory.
240 */
241static inline unsigned long *uv_global_mmr64_address(int pnode,
242 unsigned long offset)
243{
244 return __va(UV_GLOBAL_MMR64_BASE |
245 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
246}
247
248static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
249 unsigned long val)
250{
251 *uv_global_mmr64_address(pnode, offset) = val;
252}
253
254static inline unsigned long uv_read_global_mmr64(int pnode,
255 unsigned long offset)
256{
257 return *uv_global_mmr64_address(pnode, offset);
258}
259
260/*
261 * Access hub local MMRs. Faster than using global space but only local MMRs
262 * are accessible.
263 */
264static inline unsigned long *uv_local_mmr_address(unsigned long offset)
265{
266 return __va(UV_LOCAL_MMR_BASE | offset);
267}
268
269static inline unsigned long uv_read_local_mmr(unsigned long offset)
270{
271 return *uv_local_mmr_address(offset);
272}
273
274static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
275{
276 *uv_local_mmr_address(offset) = val;
277}
278
279/*
280 * Structures and definitions for converting between cpu, node, pnode, and blade
281 * numbers.
282 */
283struct uv_blade_info {
284 unsigned short nr_possible_cpus;
285 unsigned short nr_online_cpus;
286 unsigned short pnode;
287};
288extern struct uv_blade_info *uv_blade_info;
289extern short *uv_node_to_blade;
290extern short *uv_cpu_to_blade;
291extern short uv_possible_blades;
292
293/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
294static inline int uv_blade_processor_id(void)
295{
296 return uv_hub_info->blade_processor_id;
297}
298
299/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
300static inline int uv_numa_blade_id(void)
301{
302 return uv_hub_info->numa_blade_id;
303}
304
305/* Convert a cpu number to the UV blade number */
306static inline int uv_cpu_to_blade_id(int cpu)
307{
308 return uv_cpu_to_blade[cpu];
309}
310
311/* Convert linux node number to the UV blade number */
312static inline int uv_node_to_blade_id(int nid)
313{
314 return uv_node_to_blade[nid];
315}
316
317/* Convert a blade id to the PNODE of the blade */
318static inline int uv_blade_to_pnode(int bid)
319{
320 return uv_blade_info[bid].pnode;
321}
322
323/* Determine the number of possible cpus on a blade */
324static inline int uv_blade_nr_possible_cpus(int bid)
325{
326 return uv_blade_info[bid].nr_possible_cpus;
327}
328
329/* Determine the number of online cpus on a blade */
330static inline int uv_blade_nr_online_cpus(int bid)
331{
332 return uv_blade_info[bid].nr_online_cpus;
333}
334
335/* Convert a cpu id to the PNODE of the blade containing the cpu */
336static inline int uv_cpu_to_pnode(int cpu)
337{
338 return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
339}
340
341/* Convert a linux node number to the PNODE of the blade */
342static inline int uv_node_to_pnode(int nid)
343{
344 return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
345}
346
347/* Maximum possible number of blades */
348static inline int uv_num_possible_blades(void)
349{
350 return uv_possible_blades;
351}
352
353#endif /* _ASM_X86_UV_UV_HUB_H */
354
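The conversion helpers above compose naturally. A sketch (illustrative function) that reads a global MMR on the blade owning a given cpu:

	static unsigned long read_mmr_for_cpu(int cpu, unsigned long offset)
	{
		int pnode = uv_cpu_to_pnode(cpu);	/* cpu -> blade -> pnode */
		return uv_read_global_mmr64(pnode, offset);
	}
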
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
new file mode 100644
index 00000000000..9613c8c0b64
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -0,0 +1,36 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ definitions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_IRQ_H
12#define _ASM_X86_UV_UV_IRQ_H
13
14/* If a generic version of this structure gets defined, eliminate this one. */
15struct uv_IO_APIC_route_entry {
16 __u64 vector : 8,
17 delivery_mode : 3,
18 dest_mode : 1,
19 delivery_status : 1,
20 polarity : 1,
21 __reserved_1 : 1,
22 trigger : 1,
23 mask : 1,
24 __reserved_2 : 15,
25 dest : 32;
26};
27
28extern struct irq_chip uv_irq_chip;
29
30extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
31extern void arch_disable_uv_irq(int, unsigned long);
32
33extern int uv_setup_irq(char *, int, int, unsigned long);
34extern void uv_teardown_irq(unsigned int, int, unsigned long);
35
36#endif /* _ASM_X86_UV_UV_IRQ_H */
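
A sketch of filling in the bitfield entry above for an edge-triggered, physically addressed, unmasked interrupt; the concrete values are illustrative, not taken from the patch:

	struct uv_IO_APIC_route_entry entry = {
		.vector		= 0x40,	/* hypothetical vector number */
		.delivery_mode	= 0,	/* fixed delivery */
		.dest_mode	= 0,	/* physical destination */
		.polarity	= 0,
		.trigger	= 0,	/* edge */
		.mask		= 0,	/* not masked */
		.dest		= 0,	/* hypothetical target APIC id */
	};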
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
new file mode 100644
index 00000000000..dd627793a23
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -0,0 +1,1295 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV MMR definitions
7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_MMRS_H
12#define _ASM_X86_UV_UV_MMRS_H
13
14#define UV_MMR_ENABLE (1UL << 63)
15
16/* ========================================================================= */
17/* UVH_BAU_DATA_CONFIG */
18/* ========================================================================= */
19#define UVH_BAU_DATA_CONFIG 0x61680UL
20#define UVH_BAU_DATA_CONFIG_32 0x0438
21
22#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
23#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
24#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
25#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
26#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
27#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
28#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
29#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
30#define UVH_BAU_DATA_CONFIG_P_SHFT 13
31#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
32#define UVH_BAU_DATA_CONFIG_T_SHFT 15
33#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
34#define UVH_BAU_DATA_CONFIG_M_SHFT 16
35#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
36#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
37#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
38
39union uvh_bau_data_config_u {
40 unsigned long v;
41 struct uvh_bau_data_config_s {
42 unsigned long vector_ : 8; /* RW */
43 unsigned long dm : 3; /* RW */
44 unsigned long destmode : 1; /* RW */
45 unsigned long status : 1; /* RO */
46 unsigned long p : 1; /* RO */
47 unsigned long rsvd_14 : 1; /* */
48 unsigned long t : 1; /* RO */
49 unsigned long m : 1; /* RW */
50 unsigned long rsvd_17_31: 15; /* */
51 unsigned long apic_id : 32; /* RW */
52 } s;
53};
54
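These unions support the usual read-modify-write idiom via the local-MMR accessors declared in uv_hub.h. A sketch with illustrative field values:

	static void bau_intd_config_sketch(void)
	{
		union uvh_bau_data_config_u cfg;

		cfg.v = uv_read_local_mmr(UVH_BAU_DATA_CONFIG);
		cfg.s.vector_ = UV_NET_ENDPOINT_INTD;	/* 0x38, from uv_bau.h */
		cfg.s.m = 0;				/* unmask */
		uv_write_local_mmr(UVH_BAU_DATA_CONFIG, cfg.v);
	}
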
55/* ========================================================================= */
56/* UVH_EVENT_OCCURRED0 */
57/* ========================================================================= */
58#define UVH_EVENT_OCCURRED0 0x70000UL
59#define UVH_EVENT_OCCURRED0_32 0x005e8
60
61#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
62#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
63#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
64#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
65#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
66#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
67#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
68#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
69#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
70#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
71#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
72#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
73#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
74#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
75#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
76#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
77#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
78#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
79#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
80#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
81#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
82#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
83#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
84#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
85#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
86#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
87#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
88#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
89#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
90#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
91#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
92#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
93#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
94#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
95#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
96#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
97#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
98#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
99#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
100#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
101#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
102#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
103#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
104#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
105#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
106#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
107#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
108#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
109#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
110#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
111#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
112#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
113#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
114#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
115#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
116#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
117#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
118#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
119#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
120#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
121#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
122#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
123#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
124#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
125#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
126#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
127#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
128#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
129#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
130#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
131#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
132#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
133#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
134#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
135#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
136#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
137#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
138#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
139#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
140#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
141#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
142#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
143#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
144#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
145#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
146#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
147#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
148#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
149#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
150#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
151#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
152#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
153#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
154#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
155#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
156#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
157#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
158#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
159#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
160#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
161#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
162#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
163#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
164#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
165#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
166#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
167#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
168#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
169#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
170#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
171#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
172#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
173#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
174#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
175union uvh_event_occurred0_u {
176 unsigned long v;
177 struct uvh_event_occurred0_s {
178 unsigned long lb_hcerr : 1; /* RW, W1C */
179 unsigned long gr0_hcerr : 1; /* RW, W1C */
180 unsigned long gr1_hcerr : 1; /* RW, W1C */
181 unsigned long lh_hcerr : 1; /* RW, W1C */
182 unsigned long rh_hcerr : 1; /* RW, W1C */
183 unsigned long xn_hcerr : 1; /* RW, W1C */
184 unsigned long si_hcerr : 1; /* RW, W1C */
185 unsigned long lb_aoerr0 : 1; /* RW, W1C */
186 unsigned long gr0_aoerr0 : 1; /* RW, W1C */
187 unsigned long gr1_aoerr0 : 1; /* RW, W1C */
188 unsigned long lh_aoerr0 : 1; /* RW, W1C */
189 unsigned long rh_aoerr0 : 1; /* RW, W1C */
190 unsigned long xn_aoerr0 : 1; /* RW, W1C */
191 unsigned long si_aoerr0 : 1; /* RW, W1C */
192 unsigned long lb_aoerr1 : 1; /* RW, W1C */
193 unsigned long gr0_aoerr1 : 1; /* RW, W1C */
194 unsigned long gr1_aoerr1 : 1; /* RW, W1C */
195 unsigned long lh_aoerr1 : 1; /* RW, W1C */
196 unsigned long rh_aoerr1 : 1; /* RW, W1C */
197 unsigned long xn_aoerr1 : 1; /* RW, W1C */
198 unsigned long si_aoerr1 : 1; /* RW, W1C */
199 unsigned long rh_vpi_int : 1; /* RW, W1C */
200 unsigned long system_shutdown_int : 1; /* RW, W1C */
201 unsigned long lb_irq_int_0 : 1; /* RW, W1C */
202 unsigned long lb_irq_int_1 : 1; /* RW, W1C */
203 unsigned long lb_irq_int_2 : 1; /* RW, W1C */
204 unsigned long lb_irq_int_3 : 1; /* RW, W1C */
205 unsigned long lb_irq_int_4 : 1; /* RW, W1C */
206 unsigned long lb_irq_int_5 : 1; /* RW, W1C */
207 unsigned long lb_irq_int_6 : 1; /* RW, W1C */
208 unsigned long lb_irq_int_7 : 1; /* RW, W1C */
209 unsigned long lb_irq_int_8 : 1; /* RW, W1C */
210 unsigned long lb_irq_int_9 : 1; /* RW, W1C */
211 unsigned long lb_irq_int_10 : 1; /* RW, W1C */
212 unsigned long lb_irq_int_11 : 1; /* RW, W1C */
213 unsigned long lb_irq_int_12 : 1; /* RW, W1C */
214 unsigned long lb_irq_int_13 : 1; /* RW, W1C */
215 unsigned long lb_irq_int_14 : 1; /* RW, W1C */
216 unsigned long lb_irq_int_15 : 1; /* RW, W1C */
217 unsigned long l1_nmi_int : 1; /* RW, W1C */
218 unsigned long stop_clock : 1; /* RW, W1C */
219 unsigned long asic_to_l1 : 1; /* RW, W1C */
220 unsigned long l1_to_asic : 1; /* RW, W1C */
221 unsigned long ltc_int : 1; /* RW, W1C */
222 unsigned long la_seq_trigger : 1; /* RW, W1C */
223 unsigned long ipi_int : 1; /* RW, W1C */
224 unsigned long extio_int0 : 1; /* RW, W1C */
225 unsigned long extio_int1 : 1; /* RW, W1C */
226 unsigned long extio_int2 : 1; /* RW, W1C */
227 unsigned long extio_int3 : 1; /* RW, W1C */
228 unsigned long profile_int : 1; /* RW, W1C */
229 unsigned long rtc0 : 1; /* RW, W1C */
230 unsigned long rtc1 : 1; /* RW, W1C */
231 unsigned long rtc2 : 1; /* RW, W1C */
232 unsigned long rtc3 : 1; /* RW, W1C */
233 unsigned long bau_data : 1; /* RW, W1C */
234 unsigned long power_management_req : 1; /* RW, W1C */
235 unsigned long rsvd_57_63 : 7; /* */
236 } s;
237};
238
239/* ========================================================================= */
240/* UVH_EVENT_OCCURRED0_ALIAS */
241/* ========================================================================= */
242#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
243#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
244
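/*
 * Illustrative sketch, not part of the generated register list: the event
 * bits above are write-1-to-clear (W1C), so a pending event is
 * acknowledged by writing its *_MASK back, conventionally through the
 * ALIAS register defined above (uv_write_local_mmr() is assumed here from
 * <asm/uv/uv_hub.h>):
 */
static inline void uvh_event_occurred0_clear_rtc0(void)
{
	uv_write_local_mmr(UVH_EVENT_OCCURRED0_ALIAS,
			   UVH_EVENT_OCCURRED0_RTC0_MASK);
}
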
245/* ========================================================================= */
246/* UVH_INT_CMPB */
247/* ========================================================================= */
248#define UVH_INT_CMPB 0x22080UL
249
250#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
251#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
252
253union uvh_int_cmpb_u {
254 unsigned long v;
255 struct uvh_int_cmpb_s {
256 unsigned long real_time_cmpb : 56; /* RW */
257 unsigned long rsvd_56_63 : 8; /* */
258 } s;
259};
260
261/* ========================================================================= */
262/* UVH_INT_CMPC */
263/* ========================================================================= */
264#define UVH_INT_CMPC 0x22100UL
265
266#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
267#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
268
269union uvh_int_cmpc_u {
270 unsigned long v;
271 struct uvh_int_cmpc_s {
272 unsigned long real_time_cmpc : 56; /* RW */
273 unsigned long rsvd_56_63 : 8; /* */
274 } s;
275};
276
277/* ========================================================================= */
278/* UVH_INT_CMPD */
279/* ========================================================================= */
280#define UVH_INT_CMPD 0x22180UL
281
282#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
283#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
284
285union uvh_int_cmpd_u {
286 unsigned long v;
287 struct uvh_int_cmpd_s {
288 unsigned long real_time_cmpd : 56; /* RW */
289 unsigned long rsvd_56_63 : 8; /* */
290 } s;
291};
292
293/* ========================================================================= */
294/* UVH_IPI_INT */
295/* ========================================================================= */
296#define UVH_IPI_INT 0x60500UL
297#define UVH_IPI_INT_32 0x0348
298
299#define UVH_IPI_INT_VECTOR_SHFT 0
300#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
301#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
302#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
303#define UVH_IPI_INT_DESTMODE_SHFT 11
304#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
305#define UVH_IPI_INT_APIC_ID_SHFT 16
306#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
307#define UVH_IPI_INT_SEND_SHFT 63
308#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
309
310union uvh_ipi_int_u {
311 unsigned long v;
312 struct uvh_ipi_int_s {
313 unsigned long vector_ : 8; /* RW */
314 unsigned long delivery_mode : 3; /* RW */
315 unsigned long destmode : 1; /* RW */
316 unsigned long rsvd_12_15 : 4; /* */
317 unsigned long apic_id : 32; /* RW */
318 unsigned long rsvd_48_62 : 15; /* */
319 unsigned long send : 1; /* WP */
320 } s;
321};
322
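/*
 * Illustrative sketch, not part of the generated register list: an IPI is
 * sent by composing the fields above and setting the write-pulsed SEND
 * bit in a single write (uv_write_global_mmr64() assumed from
 * <asm/uv/uv_hub.h>):
 */
static inline void uvh_ipi_int_send(int pnode, int apicid, int vector)
{
	unsigned long val;

	val = (1UL << UVH_IPI_INT_SEND_SHFT) |
	      ((unsigned long)apicid << UVH_IPI_INT_APIC_ID_SHFT) |
	      ((unsigned long)vector << UVH_IPI_INT_VECTOR_SHFT);
	uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
}
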
323/* ========================================================================= */
324/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
325/* ========================================================================= */
326#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
327#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
328
329#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
330#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
331#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
332#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
333
334union uvh_lb_bau_intd_payload_queue_first_u {
335 unsigned long v;
336 struct uvh_lb_bau_intd_payload_queue_first_s {
337 unsigned long rsvd_0_3: 4; /* */
338 unsigned long address : 39; /* RW */
339 unsigned long rsvd_43_48: 6; /* */
340 unsigned long node_id : 14; /* RW */
341 unsigned long rsvd_63 : 1; /* */
342 } s;
343};
344
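/*
 * Illustrative sketch, not part of the generated register list: the same
 * macros serve field extraction, masking the raw MMR word first and then
 * shifting down:
 */
static inline unsigned long
uvh_payload_queue_first_address(unsigned long v)
{
	return (v & UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK) >>
		UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT;
}
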
345/* ========================================================================= */
346/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
347/* ========================================================================= */
348#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
349#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
350
351#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
352#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
353
354union uvh_lb_bau_intd_payload_queue_last_u {
355 unsigned long v;
356 struct uvh_lb_bau_intd_payload_queue_last_s {
357 unsigned long rsvd_0_3: 4; /* */
358 unsigned long address : 39; /* RW */
359 unsigned long rsvd_43_63: 21; /* */
360 } s;
361};
362
363/* ========================================================================= */
364/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
365/* ========================================================================= */
366#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
367#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
368
369#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
370#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
371
372union uvh_lb_bau_intd_payload_queue_tail_u {
373 unsigned long v;
374 struct uvh_lb_bau_intd_payload_queue_tail_s {
375 unsigned long rsvd_0_3: 4; /* */
376 unsigned long address : 39; /* RW */
377 unsigned long rsvd_43_63: 21; /* */
378 } s;
379};
380
381/* ========================================================================= */
382/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
383/* ========================================================================= */
384#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
385#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
386
387#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
388#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
389#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
390#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
391#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
392#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
393#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
394#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
395#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
396#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
397#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
398#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
399#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
400#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
401#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
402#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
403#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
404#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
405#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
406#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
407#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
408#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
409#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
410#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
411#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
412#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
413#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
414#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
415#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
416#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
417#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
418#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
419union uvh_lb_bau_intd_software_acknowledge_u {
420 unsigned long v;
421 struct uvh_lb_bau_intd_software_acknowledge_s {
422 unsigned long pending_0 : 1; /* RW, W1C */
423 unsigned long pending_1 : 1; /* RW, W1C */
424 unsigned long pending_2 : 1; /* RW, W1C */
425 unsigned long pending_3 : 1; /* RW, W1C */
426 unsigned long pending_4 : 1; /* RW, W1C */
427 unsigned long pending_5 : 1; /* RW, W1C */
428 unsigned long pending_6 : 1; /* RW, W1C */
429 unsigned long pending_7 : 1; /* RW, W1C */
430 unsigned long timeout_0 : 1; /* RW, W1C */
431 unsigned long timeout_1 : 1; /* RW, W1C */
432 unsigned long timeout_2 : 1; /* RW, W1C */
433 unsigned long timeout_3 : 1; /* RW, W1C */
434 unsigned long timeout_4 : 1; /* RW, W1C */
435 unsigned long timeout_5 : 1; /* RW, W1C */
436 unsigned long timeout_6 : 1; /* RW, W1C */
437 unsigned long timeout_7 : 1; /* RW, W1C */
438 unsigned long rsvd_16_63: 48; /* */
439 } s;
440};
441
442/* ========================================================================= */
443/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
444/* ========================================================================= */
445#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
446#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
447
448/* ========================================================================= */
449/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
450/* ========================================================================= */
451#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
452#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
453
454#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
455#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
456#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
457#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
458#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
459#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
460
461union uvh_lb_bau_sb_activation_control_u {
462 unsigned long v;
463 struct uvh_lb_bau_sb_activation_control_s {
464 unsigned long index : 6; /* RW */
465 unsigned long rsvd_6_61: 56; /* */
466 unsigned long push : 1; /* WP */
467 unsigned long init : 1; /* WP */
468 } s;
469};
470
471/* ========================================================================= */
472/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
473/* ========================================================================= */
474#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
475#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
476
477#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
478#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
479
480union uvh_lb_bau_sb_activation_status_0_u {
481 unsigned long v;
482 struct uvh_lb_bau_sb_activation_status_0_s {
483 unsigned long status : 64; /* RW */
484 } s;
485};
486
487/* ========================================================================= */
488/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
489/* ========================================================================= */
490#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
491#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
492
493#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
494#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
495
496union uvh_lb_bau_sb_activation_status_1_u {
497 unsigned long v;
498 struct uvh_lb_bau_sb_activation_status_1_s {
499 unsigned long status : 64; /* RW */
500 } s;
501};
502
503/* ========================================================================= */
504/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
505/* ========================================================================= */
506#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
507#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
508
509#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
510#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
511#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
512#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
513
514union uvh_lb_bau_sb_descriptor_base_u {
515 unsigned long v;
516 struct uvh_lb_bau_sb_descriptor_base_s {
517 unsigned long rsvd_0_11 : 12; /* */
518 unsigned long page_address : 31; /* RW */
519 unsigned long rsvd_43_48 : 6; /* */
520 unsigned long node_id : 14; /* RW */
521 unsigned long rsvd_63 : 1; /* */
522 } s;
523};
524
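/*
 * Illustrative sketch, not part of the generated register list:
 * programming the descriptor base from a physical address and node id via
 * the bitfield view; the page_address field holds physical address bits
 * 42:12 (uv_write_local_mmr() assumed as above):
 */
static inline void uvh_set_sb_descriptor_base(unsigned long paddr, int node)
{
	union uvh_lb_bau_sb_descriptor_base_u base;

	base.v = 0;
	base.s.page_address = paddr >>
		UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT;
	base.s.node_id = node;
	uv_write_local_mmr(UVH_LB_BAU_SB_DESCRIPTOR_BASE, base.v);
}
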
525/* ========================================================================= */
526/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
527/* ========================================================================= */
528#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
529
530#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
531#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
532#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
533#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
534#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
535#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
536#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
537#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
538#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
539#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
540#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
541#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
542#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
543#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
544#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
545#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
546#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
547#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
548#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
549#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
550#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
551#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
552#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
553#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
554#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
555#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
556#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
557#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
558#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
559#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
560#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
561#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
562#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
563#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
564#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
565#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
566#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
567#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
568#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
569#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
570#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
571#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
572#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
573#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
574#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
575#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
576#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
577#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
578#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
579#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
580#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
581#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
582#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
583#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
584#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
585#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
586#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
587#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
588#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
589#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
590#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
591#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
592#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
593#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
594#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
595#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
596#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
597#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
598#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
599#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
600#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
601#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
602#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
603#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
604#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
605#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
606#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
607#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
608#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
609#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
610#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
611#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
612#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
613#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
614#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
615#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
616
617union uvh_lb_mcast_aoerr0_rpt_enable_u {
618 unsigned long v;
619 struct uvh_lb_mcast_aoerr0_rpt_enable_s {
620 unsigned long mcast_obese_msg : 1; /* RW */
621 unsigned long mcast_data_sb_err : 1; /* RW */
622 unsigned long mcast_nack_buff_parity : 1; /* RW */
623 unsigned long mcast_timeout : 1; /* RW */
624 unsigned long mcast_inactive_reply : 1; /* RW */
625 unsigned long mcast_upgrade_error : 1; /* RW */
626 unsigned long mcast_reg_count_underflow : 1; /* RW */
627 unsigned long mcast_rep_obese_msg : 1; /* RW */
628 unsigned long ucache_req_runt_msg : 1; /* RW */
629 unsigned long ucache_req_obese_msg : 1; /* RW */
630 unsigned long ucache_req_data_sb_err : 1; /* RW */
631 unsigned long ucache_rep_runt_msg : 1; /* RW */
632 unsigned long ucache_rep_obese_msg : 1; /* RW */
633 unsigned long ucache_rep_data_sb_err : 1; /* RW */
634 unsigned long ucache_rep_command_err : 1; /* RW */
635 unsigned long ucache_pend_timeout : 1; /* RW */
636 unsigned long macc_req_runt_msg : 1; /* RW */
637 unsigned long macc_req_obese_msg : 1; /* RW */
638 unsigned long macc_req_data_sb_err : 1; /* RW */
639 unsigned long macc_rep_runt_msg : 1; /* RW */
640 unsigned long macc_rep_obese_msg : 1; /* RW */
641 unsigned long macc_rep_data_sb_err : 1; /* RW */
642 unsigned long macc_amo_timeout : 1; /* RW */
643 unsigned long macc_put_timeout : 1; /* RW */
644 unsigned long macc_spurious_event : 1; /* RW */
645 unsigned long ioh_destination_table_parity : 1; /* RW */
646 unsigned long get_had_error_reply : 1; /* RW */
647 unsigned long get_timeout : 1; /* RW */
648 unsigned long lock_manager_had_error_reply : 1; /* RW */
649 unsigned long put_had_error_reply : 1; /* RW */
650 unsigned long put_timeout : 1; /* RW */
651 unsigned long sb_activation_overrun : 1; /* RW */
652 unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
653 unsigned long completed_gb_activation_timeout : 1; /* RW */
654 unsigned long descriptor_buffer_0_parity : 1; /* RW */
655 unsigned long descriptor_buffer_1_parity : 1; /* RW */
656 unsigned long socket_destination_table_parity : 1; /* RW */
657 unsigned long bau_reply_payload_corruption : 1; /* RW */
658 unsigned long io_port_destination_table_parity : 1; /* RW */
659 unsigned long intd_soft_ack_timeout : 1; /* RW */
660 unsigned long int_rep_obese_msg : 1; /* RW */
661 unsigned long int_rep_command_err : 1; /* RW */
662 unsigned long int_timeout : 1; /* RW */
663 unsigned long rsvd_43_63 : 21; /* */
664 } s;
665};
666
667/* ========================================================================= */
668/* UVH_LOCAL_INT0_CONFIG */
669/* ========================================================================= */
670#define UVH_LOCAL_INT0_CONFIG 0x61000UL
671
672#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
673#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
674#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
675#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
676#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
677#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
678#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
679#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
680#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
681#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
682#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
683#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
684#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
685#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
686#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
687#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
688
689union uvh_local_int0_config_u {
690 unsigned long v;
691 struct uvh_local_int0_config_s {
692 unsigned long vector_ : 8; /* RW */
693 unsigned long dm : 3; /* RW */
694 unsigned long destmode : 1; /* RW */
695 unsigned long status : 1; /* RO */
696 unsigned long p : 1; /* RO */
697 unsigned long rsvd_14 : 1; /* */
698 unsigned long t : 1; /* RO */
699 unsigned long m : 1; /* RW */
700 unsigned long rsvd_17_31: 15; /* */
701 unsigned long apic_id : 32; /* RW */
702 } s;
703};
704
705/* ========================================================================= */
706/* UVH_LOCAL_INT0_ENABLE */
707/* ========================================================================= */
708#define UVH_LOCAL_INT0_ENABLE 0x65000UL
709
710#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
711#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
712#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
713#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
714#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
715#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
716#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
717#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
718#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
719#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
720#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
721#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
722#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
723#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
724#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
725#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
726#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
727#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
728#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
729#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
730#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
731#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
732#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
733#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
734#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
735#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
736#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
737#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
738#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
739#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
740#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
741#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
742#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
743#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
744#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
745#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
746#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
747#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
748#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
749#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
750#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
751#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
752#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
753#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
754#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
755#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
756#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
757#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
758#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
759#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
760#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
761#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
762#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
763#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
764#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
765#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
766#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
767#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
768#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
769#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
770#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
771#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
772#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
773#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
774#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
775#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
776#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
777#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
778#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
779#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
780#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
781#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
782#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
783#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
784#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
785#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
786#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
787#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
788#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
789#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
790#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
791#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
792#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
793#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
794#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
795#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
796#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
797#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
798#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
799#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
800
801union uvh_local_int0_enable_u {
802 unsigned long v;
803 struct uvh_local_int0_enable_s {
804 unsigned long lb_hcerr : 1; /* RW */
805 unsigned long gr0_hcerr : 1; /* RW */
806 unsigned long gr1_hcerr : 1; /* RW */
807 unsigned long lh_hcerr : 1; /* RW */
808 unsigned long rh_hcerr : 1; /* RW */
809 unsigned long xn_hcerr : 1; /* RW */
810 unsigned long si_hcerr : 1; /* RW */
811 unsigned long lb_aoerr0 : 1; /* RW */
812 unsigned long gr0_aoerr0 : 1; /* RW */
813 unsigned long gr1_aoerr0 : 1; /* RW */
814 unsigned long lh_aoerr0 : 1; /* RW */
815 unsigned long rh_aoerr0 : 1; /* RW */
816 unsigned long xn_aoerr0 : 1; /* RW */
817 unsigned long si_aoerr0 : 1; /* RW */
818 unsigned long lb_aoerr1 : 1; /* RW */
819 unsigned long gr0_aoerr1 : 1; /* RW */
820 unsigned long gr1_aoerr1 : 1; /* RW */
821 unsigned long lh_aoerr1 : 1; /* RW */
822 unsigned long rh_aoerr1 : 1; /* RW */
823 unsigned long xn_aoerr1 : 1; /* RW */
824 unsigned long si_aoerr1 : 1; /* RW */
825 unsigned long rh_vpi_int : 1; /* RW */
826 unsigned long system_shutdown_int : 1; /* RW */
827 unsigned long lb_irq_int_0 : 1; /* RW */
828 unsigned long lb_irq_int_1 : 1; /* RW */
829 unsigned long lb_irq_int_2 : 1; /* RW */
830 unsigned long lb_irq_int_3 : 1; /* RW */
831 unsigned long lb_irq_int_4 : 1; /* RW */
832 unsigned long lb_irq_int_5 : 1; /* RW */
833 unsigned long lb_irq_int_6 : 1; /* RW */
834 unsigned long lb_irq_int_7 : 1; /* RW */
835 unsigned long lb_irq_int_8 : 1; /* RW */
836 unsigned long lb_irq_int_9 : 1; /* RW */
837 unsigned long lb_irq_int_10 : 1; /* RW */
838 unsigned long lb_irq_int_11 : 1; /* RW */
839 unsigned long lb_irq_int_12 : 1; /* RW */
840 unsigned long lb_irq_int_13 : 1; /* RW */
841 unsigned long lb_irq_int_14 : 1; /* RW */
842 unsigned long lb_irq_int_15 : 1; /* RW */
843 unsigned long l1_nmi_int : 1; /* RW */
844 unsigned long stop_clock : 1; /* RW */
845 unsigned long asic_to_l1 : 1; /* RW */
846 unsigned long l1_to_asic : 1; /* RW */
847 unsigned long ltc_int : 1; /* RW */
848 unsigned long la_seq_trigger : 1; /* RW */
849 unsigned long rsvd_45_63 : 19; /* */
850 } s;
851};
852
853/* ========================================================================= */
854/* UVH_NODE_ID */
855/* ========================================================================= */
856#define UVH_NODE_ID 0x0UL
857
858#define UVH_NODE_ID_FORCE1_SHFT 0
859#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
860#define UVH_NODE_ID_MANUFACTURER_SHFT 1
861#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
862#define UVH_NODE_ID_PART_NUMBER_SHFT 12
863#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
864#define UVH_NODE_ID_REVISION_SHFT 28
865#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
866#define UVH_NODE_ID_NODE_ID_SHFT 32
867#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
868#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
869#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
870#define UVH_NODE_ID_NI_PORT_SHFT 56
871#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
872
873union uvh_node_id_u {
874 unsigned long v;
875 struct uvh_node_id_s {
876 unsigned long force1 : 1; /* RO */
877 unsigned long manufacturer : 11; /* RO */
878 unsigned long part_number : 16; /* RO */
879 unsigned long revision : 4; /* RO */
880 unsigned long node_id : 15; /* RW */
881 unsigned long rsvd_47 : 1; /* */
882 unsigned long nodes_per_bit : 7; /* RW */
883 unsigned long rsvd_55 : 1; /* */
884 unsigned long ni_port : 4; /* RO */
885 unsigned long rsvd_60_63 : 4; /* */
886 } s;
887};
888
889/* ========================================================================= */
890/* UVH_NODE_PRESENT_TABLE */
891/* ========================================================================= */
892#define UVH_NODE_PRESENT_TABLE 0x1400UL
893#define UVH_NODE_PRESENT_TABLE_DEPTH 16
894
895#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
896#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
897
898union uvh_node_present_table_u {
899 unsigned long v;
900 struct uvh_node_present_table_s {
901 unsigned long nodes : 64; /* RW */
902 } s;
903};
904
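/*
 * Illustrative sketch, not part of the generated register list: the
 * node-present table is UVH_NODE_PRESENT_TABLE_DEPTH consecutive 64-bit
 * MMRs with one bit per node (uv_read_local_mmr() assumed from
 * <asm/uv/uv_hub.h>):
 */
static inline int uvh_node_present(int node)
{
	unsigned long bits;

	bits = uv_read_local_mmr(UVH_NODE_PRESENT_TABLE + (node / 64) * 8);
	return (bits >> (node % 64)) & 1;
}
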
905/* ========================================================================= */
906/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
907/* ========================================================================= */
908#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
909
910#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
911#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
912
913union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
914 unsigned long v;
915 struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
916 unsigned long rsvd_0_23 : 24; /* */
917 unsigned long dest_base : 22; /* RW */
918 unsigned long rsvd_46_63: 18; /* */
919 } s;
920};
921
922/* ========================================================================= */
923/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
924/* ========================================================================= */
925#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
926
927#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
928#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
929
930union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
931 unsigned long v;
932 struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
933 unsigned long rsvd_0_23 : 24; /* */
934 unsigned long dest_base : 22; /* RW */
935 unsigned long rsvd_46_63: 18; /* */
936 } s;
937};
938
939/* ========================================================================= */
940/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
941/* ========================================================================= */
942#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
943
944#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
945#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
946
947union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
948 unsigned long v;
949 struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
950 unsigned long rsvd_0_23 : 24; /* */
951 unsigned long dest_base : 22; /* RW */
952 unsigned long rsvd_46_63: 18; /* */
953 } s;
954};
955
956/* ========================================================================= */
957/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
958/* ========================================================================= */
959#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
960
961#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
962#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
963#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
964#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
965
966union uvh_rh_gam_cfg_overlay_config_mmr_u {
967 unsigned long v;
968 struct uvh_rh_gam_cfg_overlay_config_mmr_s {
969 unsigned long rsvd_0_25: 26; /* */
970 unsigned long base : 20; /* RW */
971 unsigned long rsvd_46_62: 17; /* */
972 unsigned long enable : 1; /* RW */
973 } s;
974};
975
976/* ========================================================================= */
977/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
978/* ========================================================================= */
979#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
980
981#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
982#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
983#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
984#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
985#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
986#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
987#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
988#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
989
990union uvh_rh_gam_gru_overlay_config_mmr_u {
991 unsigned long v;
992 struct uvh_rh_gam_gru_overlay_config_mmr_s {
993 unsigned long rsvd_0_27: 28; /* */
994 unsigned long base : 18; /* RW */
995 unsigned long rsvd_46_47: 2; /* */
996 unsigned long gr4 : 1; /* RW */
997 unsigned long rsvd_49_51: 3; /* */
998 unsigned long n_gru : 4; /* RW */
999 unsigned long rsvd_56_62: 7; /* */
1000 unsigned long enable : 1; /* RW */
1001 } s;
1002};
1003
1004/* ========================================================================= */
1005/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
1006/* ========================================================================= */
1007#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
1008
1009#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
1010#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
1011#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1012#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1013#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1014#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1015#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1016#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1017
1018union uvh_rh_gam_mmioh_overlay_config_mmr_u {
1019 unsigned long v;
1020 struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
1021 unsigned long rsvd_0_29: 30; /* */
1022 unsigned long base : 16; /* RW */
1023 unsigned long m_io : 6; /* RW */
1024 unsigned long n_io : 4; /* RW */
1025 unsigned long rsvd_56_62: 7; /* */
1026 unsigned long enable : 1; /* RW */
1027 } s;
1028};
1029
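/*
 * Illustrative sketch, not part of the generated register list: the
 * overlay windows above are meaningful only while their enable bit is
 * set, and each base field holds a physical address shifted down by its
 * own *_BASE_SHFT (uv_read_local_mmr() assumed as above):
 */
static inline unsigned long uvh_mmioh_overlay_base(void)
{
	union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;

	mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
	if (!mmioh.s.enable)
		return 0;
	return (unsigned long)mmioh.s.base <<
		UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
}
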
1030/* ========================================================================= */
1031/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
1032/* ========================================================================= */
1033#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
1034
1035#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1036#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1037#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
1038#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
1039#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1040#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1041
1042union uvh_rh_gam_mmr_overlay_config_mmr_u {
1043 unsigned long v;
1044 struct uvh_rh_gam_mmr_overlay_config_mmr_s {
1045 unsigned long rsvd_0_25: 26; /* */
1046 unsigned long base : 20; /* RW */
1047 unsigned long dual_hub : 1; /* RW */
1048 unsigned long rsvd_47_62: 16; /* */
1049 unsigned long enable : 1; /* RW */
1050 } s;
1051};
1052
1053/* ========================================================================= */
1054/* UVH_RTC */
1055/* ========================================================================= */
1056#define UVH_RTC 0x340000UL
1057
1058#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
1059#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
1060
1061union uvh_rtc_u {
1062 unsigned long v;
1063 struct uvh_rtc_s {
1064 unsigned long real_time_clock : 56; /* RW */
1065 unsigned long rsvd_56_63 : 8; /* */
1066 } s;
1067};
1068
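/*
 * Illustrative sketch, not part of the generated register list: the RTC
 * is a free-running 56-bit counter, so reads mask the raw word down to
 * the implemented bits (uv_read_local_mmr() assumed as above):
 */
static inline unsigned long uvh_read_rtc(void)
{
	return uv_read_local_mmr(UVH_RTC) & UVH_RTC_REAL_TIME_CLOCK_MASK;
}
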
1069/* ========================================================================= */
1070/* UVH_RTC1_INT_CONFIG */
1071/* ========================================================================= */
1072#define UVH_RTC1_INT_CONFIG 0x615c0UL
1073
1074#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
1075#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1076#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
1077#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
1078#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
1079#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1080#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
1081#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1082#define UVH_RTC1_INT_CONFIG_P_SHFT 13
1083#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
1084#define UVH_RTC1_INT_CONFIG_T_SHFT 15
1085#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
1086#define UVH_RTC1_INT_CONFIG_M_SHFT 16
1087#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
1088#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
1089#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1090
1091union uvh_rtc1_int_config_u {
1092 unsigned long v;
1093 struct uvh_rtc1_int_config_s {
1094 unsigned long vector_ : 8; /* RW */
1095 unsigned long dm : 3; /* RW */
1096 unsigned long destmode : 1; /* RW */
1097 unsigned long status : 1; /* RO */
1098 unsigned long p : 1; /* RO */
1099 unsigned long rsvd_14 : 1; /* */
1100 unsigned long t : 1; /* RO */
1101 unsigned long m : 1; /* RW */
1102 unsigned long rsvd_17_31: 15; /* */
1103 unsigned long apic_id : 32; /* RW */
1104 } s;
1105};
1106
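/*
 * Illustrative sketch, not part of the generated register list: routing
 * the RTC1 comparator interrupt to a cpu; the vector lands in bits 7:0
 * and the target in the APIC_ID field (uv_write_global_mmr64() assumed as
 * above):
 */
static inline void uvh_rtc1_route(int pnode, int apicid, int vector)
{
	uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
		((unsigned long)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT) |
		vector);
}
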
1107/* ========================================================================= */
1108/* UVH_RTC2_INT_CONFIG */
1109/* ========================================================================= */
1110#define UVH_RTC2_INT_CONFIG 0x61600UL
1111
1112#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
1113#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1114#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
1115#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
1116#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
1117#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1118#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
1119#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1120#define UVH_RTC2_INT_CONFIG_P_SHFT 13
1121#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
1122#define UVH_RTC2_INT_CONFIG_T_SHFT 15
1123#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
1124#define UVH_RTC2_INT_CONFIG_M_SHFT 16
1125#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
1126#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
1127#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1128
1129union uvh_rtc2_int_config_u {
1130 unsigned long v;
1131 struct uvh_rtc2_int_config_s {
1132 unsigned long vector_ : 8; /* RW */
1133 unsigned long dm : 3; /* RW */
1134 unsigned long destmode : 1; /* RW */
1135 unsigned long status : 1; /* RO */
1136 unsigned long p : 1; /* RO */
1137 unsigned long rsvd_14 : 1; /* */
1138 unsigned long t : 1; /* RO */
1139 unsigned long m : 1; /* RW */
1140 unsigned long rsvd_17_31: 15; /* */
1141 unsigned long apic_id : 32; /* RW */
1142 } s;
1143};
1144
1145/* ========================================================================= */
1146/* UVH_RTC3_INT_CONFIG */
1147/* ========================================================================= */
1148#define UVH_RTC3_INT_CONFIG 0x61640UL
1149
1150#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
1151#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1152#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
1153#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
1154#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
1155#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1156#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
1157#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1158#define UVH_RTC3_INT_CONFIG_P_SHFT 13
1159#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
1160#define UVH_RTC3_INT_CONFIG_T_SHFT 15
1161#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
1162#define UVH_RTC3_INT_CONFIG_M_SHFT 16
1163#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
1164#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
1165#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1166
1167union uvh_rtc3_int_config_u {
1168 unsigned long v;
1169 struct uvh_rtc3_int_config_s {
1170 unsigned long vector_ : 8; /* RW */
1171 unsigned long dm : 3; /* RW */
1172 unsigned long destmode : 1; /* RW */
1173 unsigned long status : 1; /* RO */
1174 unsigned long p : 1; /* RO */
1175 unsigned long rsvd_14 : 1; /* */
1176 unsigned long t : 1; /* RO */
1177 unsigned long m : 1; /* RW */
1178 unsigned long rsvd_17_31: 15; /* */
1179 unsigned long apic_id : 32; /* RW */
1180 } s;
1181};
1182
1183/* ========================================================================= */
1184/* UVH_RTC_INC_RATIO */
1185/* ========================================================================= */
1186#define UVH_RTC_INC_RATIO 0x350000UL
1187
1188#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
1189#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
1190#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
1191#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
1192
1193union uvh_rtc_inc_ratio_u {
1194 unsigned long v;
1195 struct uvh_rtc_inc_ratio_s {
1196 unsigned long fraction : 20; /* RW */
1197 unsigned long ratio : 3; /* RW */
1198 unsigned long rsvd_23_63: 41; /* */
1199 } s;
1200};
1201
1202/* ========================================================================= */
1203/* UVH_SI_ADDR_MAP_CONFIG */
1204/* ========================================================================= */
1205#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
1206
1207#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
1208#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
1209#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
1210#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
1211
1212union uvh_si_addr_map_config_u {
1213 unsigned long v;
1214 struct uvh_si_addr_map_config_s {
1215 unsigned long m_skt : 6; /* RW */
1216 unsigned long rsvd_6_7: 2; /* */
1217 unsigned long n_skt : 4; /* RW */
1218 unsigned long rsvd_12_63: 52; /* */
1219 } s;
1220};
1221
1222/* ========================================================================= */
1223/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
1224/* ========================================================================= */
1225#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
1226
1227#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
1228#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1229#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1230#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1231#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
1232#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1233
1234union uvh_si_alias0_overlay_config_u {
1235 unsigned long v;
1236 struct uvh_si_alias0_overlay_config_s {
1237 unsigned long rsvd_0_23: 24; /* */
1238 unsigned long base : 8; /* RW */
1239 unsigned long rsvd_32_47: 16; /* */
1240 unsigned long m_alias : 5; /* RW */
1241 unsigned long rsvd_53_62: 10; /* */
1242 unsigned long enable : 1; /* RW */
1243 } s;
1244};
1245
1246/* ========================================================================= */
1247/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
1248/* ========================================================================= */
1249#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
1250
1251#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
1252#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1253#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1254#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1255#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
1256#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1257
1258union uvh_si_alias1_overlay_config_u {
1259 unsigned long v;
1260 struct uvh_si_alias1_overlay_config_s {
1261 unsigned long rsvd_0_23: 24; /* */
1262 unsigned long base : 8; /* RW */
1263 unsigned long rsvd_32_47: 16; /* */
1264 unsigned long m_alias : 5; /* RW */
1265 unsigned long rsvd_53_62: 10; /* */
1266 unsigned long enable : 1; /* RW */
1267 } s;
1268};
1269
1270/* ========================================================================= */
1271/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
1272/* ========================================================================= */
1273#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
1274
1275#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
1276#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1277#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1278#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1279#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
1280#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1281
1282union uvh_si_alias2_overlay_config_u {
1283 unsigned long v;
1284 struct uvh_si_alias2_overlay_config_s {
1285 unsigned long rsvd_0_23: 24; /* */
1286 unsigned long base : 8; /* RW */
1287 unsigned long rsvd_32_47: 16; /* */
1288 unsigned long m_alias : 5; /* RW */
1289 unsigned long rsvd_53_62: 10; /* */
1290 unsigned long enable : 1; /* RW */
1291 } s;
1292};
1293
1294
1295#endif /* _ASM_X86_UV_UV_MMRS_H */
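
For illustration only: each MMR in this file can be decoded either through its union's bitfield view or with the matching _SHFT/_MASK constants. A minimal sketch, assuming the raw 64-bit value has already been read from the UVH_SI_ADDR_MAP_CONFIG register:

	static void decode_addr_map_config(unsigned long v)
	{
		union uvh_si_addr_map_config_u cfg;
		unsigned long m_skt, n_skt;

		cfg.v = v;
		m_skt = cfg.s.m_skt;			/* bitfield view, bits 5:0 */
		n_skt = (v & UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK)
			>> UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT;
							/* mask/shift view, bits 11:8 */
	}

Both forms yield the same field values; the union form is clearer in C, while the mask/shift constants work in contexts where bitfields are awkward.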
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
new file mode 100644
index 00000000000..9064052b73d
--- /dev/null
+++ b/arch/x86/include/asm/vdso.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H
3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) \
12({ \
13 extern const char VDSO64_##name[]; \
14 (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
15})
16#endif
17
18#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
19extern const char VDSO32_PRELINK[];
20
21/*
22 * Given a pointer to the vDSO image, find the pointer to VDSO32_name
23 * as that symbol is defined in the vDSO sources or linker script.
24 */
25#define VDSO32_SYMBOL(base, name) \
26({ \
27 extern const char VDSO32_##name[]; \
28 (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \
29})
30#endif
31
32/*
33 * These symbols are defined with the addresses in the vsyscall page.
34 * See vsyscall-sigreturn.S.
35 */
36extern void __user __kernel_sigreturn;
37extern void __user __kernel_rt_sigreturn;
38
39/*
40 * These symbols are defined by vdso32.S to mark the bounds
41 * of the ELF DSO images included therein.
42 */
43extern const char vdso32_int80_start, vdso32_int80_end;
44extern const char vdso32_syscall_start, vdso32_syscall_end;
45extern const char vdso32_sysenter_start, vdso32_sysenter_end;
46
47#endif /* _ASM_X86_VDSO_H */
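
A usage sketch (not part of this header): the SYMBOL macros rebase a prelinked vDSO symbol onto wherever the image actually gets mapped. This assumes a 32-bit/compat configuration and that the image exports VDSO32_sigreturn, as the in-tree 32-bit vDSO linker script does:

	/* Resolve the sigreturn trampoline inside a freshly mapped vDSO. */
	static void *vdso32_sigreturn_addr(void *vdso_base)
	{
		return VDSO32_SYMBOL(vdso_base, sigreturn);
	}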
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
new file mode 100644
index 00000000000..c4b9dc2f67c
--- /dev/null
+++ b/arch/x86/include/asm/vga.h
@@ -0,0 +1,20 @@
1/*
2 * Access to VGA videoram
3 *
4 * (c) 1998 Martin Mares <mj@ucw.cz>
5 */
6
7#ifndef _ASM_X86_VGA_H
8#define _ASM_X86_VGA_H
9
10/*
11 * On the PC, we can just recalculate addresses and then
12 * access the videoram directly without any black magic.
13 */
14
15#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
16
17#define vga_readb(x) (*(x))
18#define vga_writeb(x, y) (*(y) = (x))
19
20#endif /* _ASM_X86_VGA_H */
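
For illustration, a minimal sketch of a text-mode write through these wrappers; 0xb8000 is the standard color text buffer, and each character cell is a character byte followed by an attribute byte:

	static void vga_putc0_sketch(void)
	{
		char *vram = (char *)VGA_MAP_MEM(0xb8000, 0x8000);

		vga_writeb('A', vram);		/* character byte of cell 0 */
		vga_writeb(0x07, vram + 1);	/* light-grey-on-black attribute */
	}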
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
new file mode 100644
index 00000000000..dc27a69e5d2
--- /dev/null
+++ b/arch/x86/include/asm/vgtod.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_VGTOD_H
2#define _ASM_X86_VGTOD_H
3
4#include <asm/vsyscall.h>
5#include <linux/clocksource.h>
6
7struct vsyscall_gtod_data {
8 seqlock_t lock;
9
10 /* open coded 'struct timespec' */
11 time_t wall_time_sec;
12 u32 wall_time_nsec;
13
14 int sysctl_enabled;
15 struct timezone sys_tz;
16 struct { /* extract of a clocksource struct */
17 cycle_t (*vread)(void);
18 cycle_t cycle_last;
19 cycle_t mask;
20 u32 mult;
21 u32 shift;
22 } clock;
23 struct timespec wall_to_monotonic;
24};
25extern struct vsyscall_gtod_data __vsyscall_gtod_data
26__section_vsyscall_gtod_data;
27extern struct vsyscall_gtod_data vsyscall_gtod_data;
28
29#endif /* _ASM_X86_VGTOD_H */
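
For context, a sketch of the cycles-to-nanoseconds step the vsyscall-side reader performs with the clock extract above; the real code additionally retries the whole read under read_seqbegin()/read_seqretry() on gtod->lock:

	static unsigned long vgetns_sketch(struct vsyscall_gtod_data *gtod)
	{
		cycle_t delta;

		delta = (gtod->clock.vread() - gtod->clock.cycle_last)
			& gtod->clock.mask;
		return (delta * gtod->clock.mult) >> gtod->clock.shift;
	}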
diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h
new file mode 100644
index 00000000000..53100f35361
--- /dev/null
+++ b/arch/x86/include/asm/vic.h
@@ -0,0 +1,61 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager Interrupt Controller */
6
7/* The eight CPI vectors. To activate a CPI, you write a bit mask
8 * corresponding to the processor set to be interrupted into the
9 * relevant register. That set of CPUs will then be interrupted with
10 * the CPI */
11static const int VIC_CPI_Registers[] =
12 {0xFC00, 0xFC01, 0xFC08, 0xFC09,
13 0xFC10, 0xFC11, 0xFC18, 0xFC19 };
14
15#define VIC_PROC_WHO_AM_I 0xfc29
16# define QUAD_IDENTIFIER 0xC0
17# define EIGHT_SLOT_IDENTIFIER 0xE0
18#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72
19#define VIC_CPI_BASE_REGISTER 0xFC41
20#define VIC_PROCESSOR_ID 0xFC21
21# define VIC_CPU_MASQUERADE_ENABLE 0x8
22
23#define VIC_CLAIM_REGISTER_0 0xFC38
24#define VIC_CLAIM_REGISTER_1 0xFC39
25#define VIC_REDIRECT_REGISTER_0 0xFC60
26#define VIC_REDIRECT_REGISTER_1 0xFC61
27#define VIC_PRIORITY_REGISTER 0xFC20
28
29#define VIC_PRIMARY_MC_BASE 0xFC48
30#define VIC_SECONDARY_MC_BASE 0xFC49
31
32#define QIC_PROCESSOR_ID 0xFC71
33# define QIC_CPUID_ENABLE 0x08
34
35#define QIC_VIC_CPI_BASE_REGISTER 0xFC79
36#define QIC_CPI_BASE_REGISTER 0xFC7A
37
38#define QIC_MASK_REGISTER0 0xFC80
39/* NOTE: these are masked high, enabled low */
40# define QIC_PERF_TIMER 0x01
41# define QIC_LPE 0x02
42# define QIC_SYS_INT 0x04
43# define QIC_CMN_INT 0x08
44/* at the moment, just enable CMN_INT, disable SYS_INT */
45# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */))
46#define QIC_MASK_REGISTER1 0xFC81
47# define QIC_BOOT_CPI_MASK 0xFE
48/* Enable CPI's 1-6 inclusive */
49# define QIC_CPI_ENABLE 0x81
50
51#define QIC_INTERRUPT_CLEAR0 0xFC8A
52#define QIC_INTERRUPT_CLEAR1 0xFC8B
53
54/* this is where we place the CPI vectors */
55#define VIC_DEFAULT_CPI_BASE 0xC0
56/* this is where we place the QIC CPI vectors */
57#define QIC_DEFAULT_CPI_BASE 0xD0
58
59#define VIC_BOOT_INTERRUPT_MASK 0xfe
60
61extern void smp_vic_timer_interrupt(void);
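
To make the bitmask scheme in the comment at the top of this file concrete, a hedged sketch of sending one CPI to CPUs 0 and 2 (outb() comes from <asm/io.h>):

	static void send_cpi_sketch(int cpi)
	{
		__u8 cpuset = (1 << 0) | (1 << 2);	/* interrupt CPUs 0 and 2 */

		outb(cpuset, VIC_CPI_Registers[cpi]);
	}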
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
new file mode 100644
index 00000000000..166adf61e77
--- /dev/null
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -0,0 +1,125 @@
1#ifndef _ASM_X86_VISWS_COBALT_H
2#define _ASM_X86_VISWS_COBALT_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Cobalt SGI Visual Workstation system ASIC
8 */
9
10#define CO_CPU_NUM_PHYS 0x1e00
11#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
12
13#define CO_CPU_MAX 4
14
15#define CO_CPU_PHYS 0xc2000000
16#define CO_APIC_PHYS 0xc4000000
17
18/* see set_fixmap() and asm/fixmap.h */
19#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
20#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
21
22/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
23#define CO_CPU_REV 0x08
24#define CO_CPU_CTRL 0x10
25#define CO_CPU_STAT 0x20
26#define CO_CPU_TIMEVAL 0x30
27
28/* CO_CPU_CTRL bits */
29#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
30#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
31
32/* CO_CPU_STATUS bits */
33#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
34
35/* CO_CPU_TIMEVAL value */
36#define CO_TIME_HZ 100000000 /* Cobalt core rate */
37
38/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
39#define CO_APIC_HI(n) (((n) * 0x10) + 4)
40#define CO_APIC_LO(n) ((n) * 0x10)
41#define CO_APIC_ID 0x0ffc
42
43/* CO_APIC_ID bits */
44#define CO_APIC_ENABLE 0x00000100
45
46/* CO_APIC_LO bits */
47#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
48#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
49
50/*
51 * Where things are physically wired to Cobalt
52 * #defines with no board _<type>_<rev>_ are common to all (thus far)
53 */
54#define CO_APIC_IDE0 4
55#define CO_APIC_IDE1 2 /* Only on 320 */
56
57#define CO_APIC_8259 12 /* serial, floppy, par-l-l */
58
59/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
60#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
61#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
62
63#define CO_APIC_PIIX4_USB 7 /* this one is weird */
64
65/* Lithium PCI Bridge B -- "the one with PIIX4" */
66#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */
67#define CO_APIC_PCIB_BASE123 13 /* and 14, 15 */ /* slot 0, line 1 */
68
69#define CO_APIC_VIDOUT0 16
70#define CO_APIC_VIDOUT1 17
71#define CO_APIC_VIDIN0 18
72#define CO_APIC_VIDIN1 19
73
74#define CO_APIC_LI_AUDIO 22
75
76#define CO_APIC_AS 24
77#define CO_APIC_RE 25
78
79#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
80#define CO_APIC_NMI 29
81#define CO_APIC_LAST CO_APIC_NMI
82
83/*
84 * This is how irqs are assigned on the Visual Workstation.
85 * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU).
86 * All other devices (including PCI) go to Cobalt and are irq's 16 on up.
87 */
88#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
89#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
90#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
91#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
92#define CO_IRQ_IDE0 14 /* knowledge of... */
93#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
94#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
95
96#ifdef CONFIG_X86_VISWS_APIC
97static inline void co_cpu_write(unsigned long reg, unsigned long v)
98{
99 *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
100}
101
102static inline unsigned long co_cpu_read(unsigned long reg)
103{
104 return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
105}
106
107static inline void co_apic_write(unsigned long reg, unsigned long v)
108{
109 *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
110}
111
112static inline unsigned long co_apic_read(unsigned long reg)
113{
114 return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
115}
116#endif
117
118extern char visws_board_type;
119
120#define VISWS_320 0
121#define VISWS_540 1
122
123extern char visws_board_rev;
124
125#endif /* _ASM_X86_VISWS_COBALT_H */
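
An illustrative sketch tying the pieces above together (assumes CONFIG_X86_VISWS_APIC so the accessors exist): translate an irq to its Cobalt APIC entry, mask it, and clear a pending timer interrupt, per the CO_STAT_TIMEINTR "(w) 0 == clear" comment:

	static void cobalt_mask_and_ack_sketch(unsigned int irq)
	{
		int entry = CO_APIC(irq);	/* irq -> apic redirection entry */

		co_apic_write(CO_APIC_LO(entry),
			      co_apic_read(CO_APIC_LO(entry)) | CO_APIC_MASK);
		co_cpu_write(CO_CPU_STAT,
			     co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR);
	}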
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
new file mode 100644
index 00000000000..a10d89bc127
--- /dev/null
+++ b/arch/x86/include/asm/visws/lithium.h
@@ -0,0 +1,53 @@
1#ifndef _ASM_X86_VISWS_LITHIUM_H
2#define _ASM_X86_VISWS_LITHIUM_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Lithium is the SGI Visual Workstation I/O ASIC
8 */
9
10#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
11#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
12
13/* see set_fixmap() and asm/fixmap.h */
14#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
15#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
16
17/* Not a standard PCI? (not in linux/pci.h) */
18#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
19#define LI_PCI_INTEN 0x46
20
21/* LI_PCI_INTEN bits */
22#define LI_INTA_0 0x0001
23#define LI_INTA_1 0x0002
24#define LI_INTA_2 0x0004
25#define LI_INTA_3 0x0008
26#define LI_INTA_4 0x0010
27#define LI_INTB 0x0020
28#define LI_INTC 0x0040
29#define LI_INTD 0x0080
30
31/* More special purpose macros... */
32static inline void li_pcia_write16(unsigned long reg, unsigned short v)
33{
34 *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
35}
36
37static inline unsigned short li_pcia_read16(unsigned long reg)
38{
39 return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
40}
41
42static inline void li_pcib_write16(unsigned long reg, unsigned short v)
43{
44 *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
45}
46
47static inline unsigned short li_pcib_read16(unsigned long reg)
48{
49 return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
50}
51
52#endif /* _ASM_X86_VISWS_LITHIUM_H */
53
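
A small usage sketch, assuming the caller wants to unmask the first INTA line on bridge A through the 16-bit accessors above:

	static void li_enable_inta0_sketch(void)
	{
		li_pcia_write16(LI_PCI_INTEN,
				li_pcia_read16(LI_PCI_INTEN) | LI_INTA_0);
	}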
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
new file mode 100644
index 00000000000..d0af4d338e7
--- /dev/null
+++ b/arch/x86/include/asm/visws/piix4.h
@@ -0,0 +1,107 @@
1#ifndef _ASM_X86_VISWS_PIIX4_H
2#define _ASM_X86_VISWS_PIIX4_H
3
4/*
5 * PIIX4 as used on SGI Visual Workstations
6 */
7
8#define PIIX_PM_START 0x0F80
9
10#define SIO_GPIO_START 0x0FC0
11
12#define SIO_PM_START 0x0FC8
13
14#define PMBASE PIIX_PM_START
15#define GPIREG0 (PMBASE+0x30)
16#define GPIREG(x) (GPIREG0+((x)/8))
17#define GPIBIT(x) (1 << ((x)%8))
18
19#define PIIX_GPI_BD_ID1 18
20#define PIIX_GPI_BD_ID2 19
21#define PIIX_GPI_BD_ID3 20
22#define PIIX_GPI_BD_ID4 21
23#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
24#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
25 GPIBIT(PIIX_GPI_BD_ID2) | \
26 GPIBIT(PIIX_GPI_BD_ID3) | \
27 GPIBIT(PIIX_GPI_BD_ID4) )
28
29#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
30
31#define SIO_INDEX 0x2e
32#define SIO_DATA 0x2f
33
34#define SIO_DEV_SEL 0x7
35#define SIO_DEV_ENB 0x30
36#define SIO_DEV_MSB 0x60
37#define SIO_DEV_LSB 0x61
38
39#define SIO_GP_DEV 0x7
40
41#define SIO_GP_BASE SIO_GPIO_START
42#define SIO_GP_MSB (SIO_GP_BASE>>8)
43#define SIO_GP_LSB (SIO_GP_BASE&0xff)
44
45#define SIO_GP_DATA1 (SIO_GP_BASE+0)
46
47#define SIO_PM_DEV 0x8
48
49#define SIO_PM_BASE SIO_PM_START
50#define SIO_PM_MSB (SIO_PM_BASE>>8)
51#define SIO_PM_LSB (SIO_PM_BASE&0xff)
52#define SIO_PM_INDEX (SIO_PM_BASE+0)
53#define SIO_PM_DATA (SIO_PM_BASE+1)
54
55#define SIO_PM_FER2 0x1
56
57#define SIO_PM_GP_EN 0x80
58
59
60
61/*
62 * This is the dev/reg where generating a config cycle will
63 * result in a PCI special cycle.
64 */
65#define SPECIAL_DEV 0xff
66#define SPECIAL_REG 0x00
67
68/*
69 * PIIX4 needs to see a special cycle with the following data
70 * to be convinced the processor has gone into the stop grant
71 * state. PIIX4 insists on seeing this before it will power
72 * down a system.
73 */
74#define PIIX_SPECIAL_STOP 0x00120002
75
76#define PIIX4_RESET_PORT 0xcf9
77#define PIIX4_RESET_VAL 0x6
78
79#define PMSTS_PORT 0xf80 // 2 bytes PM Status
80#define PMEN_PORT 0xf82 // 2 bytes PM Enable
81#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
82
83#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
84
85/*
86 * PMSTS and PMEN I/O bit definitions.
87 * (Bits are the same in both registers)
88 */
89#define PM_STS_RSM (1<<15) // Resume Status
90#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
91#define PM_STS_RTC (1<<10) // RTC status
92#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
93#define PM_STS_GBL (1<<5) // Global Status
94#define PM_STS_BM (1<<4) // Bus Master Status
95#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
96
97/*
98 * Stop clock GPI register
99 */
100#define PIIX_GPIREG0 (0xf80 + 0x30)
101
102/*
103 * Stop clock GPI bit in GPIREG0
104 */
105#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
106
107#endif /* _ASM_X86_VISWS_PIIX4_H */
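
For illustration, the board-identification GPI bits above combine like this, mirroring the pattern the VISWS setup code uses (inb() from <asm/io.h>):

	static __u8 visws_board_id_sketch(void)
	{
		return (inb(PIIX_GPI_BD_REG) & PIIX_GPI_BD_MASK)
			>> PIIX_GPI_BD_SHIFT;
	}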
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
new file mode 100644
index 00000000000..5fbf63e1003
--- /dev/null
+++ b/arch/x86/include/asm/visws/sgivw.h
@@ -0,0 +1,5 @@
1/*
2 * Frame buffer position and size:
3 */
4extern unsigned long sgivwfb_mem_phys;
5extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
new file mode 100644
index 00000000000..f9303602fbc
--- /dev/null
+++ b/arch/x86/include/asm/vm86.h
@@ -0,0 +1,208 @@
1#ifndef _ASM_X86_VM86_H
2#define _ASM_X86_VM86_H
3
4/*
5 * I'm guessing at the VIF/VIP flag usage, but hope that this is how
6 * the Pentium uses them. Linux will return from vm86 mode when both
7 * VIF and VIP are set.
8 *
9 * On a Pentium, we could probably optimize the virtual flags directly
10 * in the eflags register instead of doing it "by hand" in vflags...
11 *
12 * Linus
13 */
14
15#include <asm/processor-flags.h>
16
17#define BIOSSEG 0x0f000
18
19#define CPU_086 0
20#define CPU_186 1
21#define CPU_286 2
22#define CPU_386 3
23#define CPU_486 4
24#define CPU_586 5
25
26/*
27 * Return values for the 'vm86()' system call
28 */
29#define VM86_TYPE(retval) ((retval) & 0xff)
30#define VM86_ARG(retval) ((retval) >> 8)
31
32#define VM86_SIGNAL 0 /* return due to signal */
33#define VM86_UNKNOWN 1 /* unhandled GP fault
34 - IO-instruction or similar */
35#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
36#define VM86_STI 3 /* sti/popf/iret instruction enabled
37 virtual interrupts */
38
39/*
40 * Additional return values when invoking new vm86()
41 */
42#define VM86_PICRETURN 4 /* return due to pending PIC request */
43#define VM86_TRAP 6 /* return due to DOS-debugger request */
44
45/*
46 * function codes when invoking new vm86()
47 */
48#define VM86_PLUS_INSTALL_CHECK 0
49#define VM86_ENTER 1
50#define VM86_ENTER_NO_BYPASS 2
51#define VM86_REQUEST_IRQ 3
52#define VM86_FREE_IRQ 4
53#define VM86_GET_IRQ_BITS 5
54#define VM86_GET_AND_RESET_IRQ 6
55
56/*
57 * This is the stack-layout seen by the user space program when we have
58 * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
59 * is 'kernel_vm86_regs' (see below).
60 */
61
62struct vm86_regs {
63/*
64 * normal regs, with special meaning for the segment descriptors..
65 */
66 long ebx;
67 long ecx;
68 long edx;
69 long esi;
70 long edi;
71 long ebp;
72 long eax;
73 long __null_ds;
74 long __null_es;
75 long __null_fs;
76 long __null_gs;
77 long orig_eax;
78 long eip;
79 unsigned short cs, __csh;
80 long eflags;
81 long esp;
82 unsigned short ss, __ssh;
83/*
84 * these are specific to v86 mode:
85 */
86 unsigned short es, __esh;
87 unsigned short ds, __dsh;
88 unsigned short fs, __fsh;
89 unsigned short gs, __gsh;
90};
91
92struct revectored_struct {
93 unsigned long __map[8]; /* 256 bits */
94};
95
96struct vm86_struct {
97 struct vm86_regs regs;
98 unsigned long flags;
99 unsigned long screen_bitmap;
100 unsigned long cpu_type;
101 struct revectored_struct int_revectored;
102 struct revectored_struct int21_revectored;
103};
104
105/*
106 * flags masks
107 */
108#define VM86_SCREEN_BITMAP 0x0001
109
110struct vm86plus_info_struct {
111 unsigned long force_return_for_pic:1;
112 unsigned long vm86dbg_active:1; /* for debugger */
113 unsigned long vm86dbg_TFpendig:1; /* for debugger */
114 unsigned long unused:28;
115 unsigned long is_vm86pus:1; /* for vm86 internal use */
116 unsigned char vm86dbg_intxxtab[32]; /* for debugger */
117};
118struct vm86plus_struct {
119 struct vm86_regs regs;
120 unsigned long flags;
121 unsigned long screen_bitmap;
122 unsigned long cpu_type;
123 struct revectored_struct int_revectored;
124 struct revectored_struct int21_revectored;
125 struct vm86plus_info_struct vm86plus;
126};
127
128#ifdef __KERNEL__
129
130#include <asm/ptrace.h>
131
132/*
133 * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
134 * mode - the main change is that the old segment descriptors aren't
135 * useful any more and are forced to be zero by the kernel (and the
136 * hardware when a trap occurs), and the real segment descriptors are
137 * at the end of the structure. Look at ptrace.h to see the "normal"
138 * setup. For user space layout see 'struct vm86_regs' above.
139 */
140
141struct kernel_vm86_regs {
142/*
143 * normal regs, with special meaning for the segment descriptors..
144 */
145 struct pt_regs pt;
146/*
147 * these are specific to v86 mode:
148 */
149 unsigned short es, __esh;
150 unsigned short ds, __dsh;
151 unsigned short fs, __fsh;
152 unsigned short gs, __gsh;
153};
154
155struct kernel_vm86_struct {
156 struct kernel_vm86_regs regs;
157/*
158 * the below part remains on the kernel stack while we are in VM86 mode.
159 * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
160 * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
161 * 'struct kernel_vm86_regs' with the then actual values.
162 * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
163 * in kernelspace, hence we need not re-fetch the data from userspace.
164 */
165#define VM86_TSS_ESP0 flags
166 unsigned long flags;
167 unsigned long screen_bitmap;
168 unsigned long cpu_type;
169 struct revectored_struct int_revectored;
170 struct revectored_struct int21_revectored;
171 struct vm86plus_info_struct vm86plus;
172 struct pt_regs *regs32; /* here we save the pointer to the old regs */
173/*
174 * The below is not part of the structure, but the stack layout continues
175 * this way. In front of 'return-eip' may be some data, depending on
176 * compilation, so we don't rely on this and save the pointer to 'oldregs'
177 * in 'regs32' above.
178 * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
179
180 long return-eip; from call to vm86()
181 struct pt_regs oldregs; user space registers as saved by syscall
182 */
183};
184
185#ifdef CONFIG_VM86
186
187void handle_vm86_fault(struct kernel_vm86_regs *, long);
188int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
189struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
190
191struct task_struct;
192void release_vm86_irqs(struct task_struct *);
193
194#else
195
196#define handle_vm86_fault(a, b)
197#define release_vm86_irqs(a)
198
199static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
200{
201 return 0;
202}
203
204#endif /* CONFIG_VM86 */
205
206#endif /* __KERNEL__ */
207
208#endif /* _ASM_X86_VM86_H */
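
A user-space-style sketch of decoding the vm86() return value with the VM86_TYPE()/VM86_ARG() accessors defined above; the actual dispatch is left to the caller:

	static void handle_vm86_return_sketch(int retval)
	{
		switch (VM86_TYPE(retval)) {
		case VM86_INTx:
			/* guest executed "int n"; VM86_ARG(retval) is n */
			break;
		case VM86_SIGNAL:	/* returned due to a pending signal */
		case VM86_STI:		/* guest re-enabled virtual interrupts */
		default:
			break;
		}
	}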
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
new file mode 100644
index 00000000000..b7c0dea119f
--- /dev/null
+++ b/arch/x86/include/asm/vmi.h
@@ -0,0 +1,263 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_IODelay 65
101#define VMI_CALL_SetLazyMode 73
102
103/*
104 *---------------------------------------------------------------------
105 *
106 * MMU operation flags
107 *
108 *---------------------------------------------------------------------
109 */
110
111/* Flags used by VMI_{Allocate|Release}Page call */
112#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
113#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
114#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
115
116
117/* Flags shared by Allocate|Release Page and PTE updates */
118#define VMI_PAGE_PT 0x01
119#define VMI_PAGE_PD 0x02
120#define VMI_PAGE_PDP 0x04
121#define VMI_PAGE_PML4 0x08
122
123#define VMI_PAGE_NORMAL 0x00 /* for debugging */
124
125/* Flags used by PTE updates */
126#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
127#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
128#define VMI_PAGE_VA_MASK 0xfffff000
129
130#ifdef CONFIG_X86_PAE
131#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
133#else
134#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
135#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
136#endif
137
138/* Flags used by VMI_FlushTLB call */
139#define VMI_FLUSH_TLB 0x01
140#define VMI_FLUSH_GLOBAL 0x02
141
142/*
143 *---------------------------------------------------------------------
144 *
145 * VMI relocation definitions for ROM call get_reloc
146 *
147 *---------------------------------------------------------------------
148 */
149
150/* VMI Relocation types */
151#define VMI_RELOCATION_NONE 0
152#define VMI_RELOCATION_CALL_REL 1
153#define VMI_RELOCATION_JUMP_REL 2
154#define VMI_RELOCATION_NOP 3
155
156#ifndef __ASSEMBLY__
157struct vmi_relocation_info {
158 unsigned char *eip;
159 unsigned char type;
160 unsigned char reserved[3];
161};
162#endif
163
164
165/*
166 *---------------------------------------------------------------------
167 *
168 * Generic ROM structures and definitions
169 *
170 *---------------------------------------------------------------------
171 */
172
173#ifndef __ASSEMBLY__
174
175struct vrom_header {
176 u16 rom_signature; /* option ROM signature */
177 u8 rom_length; /* ROM length in 512 byte chunks */
178 u8 rom_entry[4]; /* 16-bit code entry point */
179 u8 rom_pad0; /* 4-byte align pad */
180 u32 vrom_signature; /* VROM identification signature */
181 u8 api_version_min;/* Minor version of API */
182 u8 api_version_maj;/* Major version of API */
183 u8 jump_slots; /* Number of jump slots */
184 u8 reserved1; /* Reserved for expansion */
185 u32 virtual_top; /* Hypervisor virtual address start */
186 u16 reserved2; /* Reserved for expansion */
187 u16 license_offs; /* Offset to License string */
188 u16 pci_header_offs;/* Offset to PCI OPROM header */
189 u16 pnp_header_offs;/* Offset to PnP OPROM header */
190 u32 rom_pad3; /* PnP reserved / VMI reserved */
191 u8 reserved[96]; /* Reserved for headers */
192 char vmi_init[8]; /* VMI_Init jump point */
193 char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
194} __attribute__((packed));
195
196struct pnp_header {
197 char sig[4];
198 char rev;
199 char size;
200 short next;
201 short res;
202 long devID;
203 unsigned short manufacturer_offset;
204 unsigned short product_offset;
205} __attribute__((packed));
206
207struct pci_header {
208 char sig[4];
209 short vendorID;
210 short deviceID;
211 short vpdData;
212 short size;
213 char rev;
214 char class;
215 char subclass;
216 char interface;
217 short chunks;
218 char rom_version_min;
219 char rom_version_maj;
220 char codetype;
221 char lastRom;
222 short reserved;
223} __attribute__((packed));
224
225/* Function prototypes for bootstrapping */
226extern void vmi_init(void);
227extern void vmi_bringup(void);
228extern void vmi_apply_boot_page_allocations(void);
229
230/* State needed to start an application processor in an SMP system. */
231struct vmi_ap_state {
232 u32 cr0;
233 u32 cr2;
234 u32 cr3;
235 u32 cr4;
236
237 u64 efer;
238
239 u32 eip;
240 u32 eflags;
241 u32 eax;
242 u32 ebx;
243 u32 ecx;
244 u32 edx;
245 u32 esp;
246 u32 ebp;
247 u32 esi;
248 u32 edi;
249 u16 cs;
250 u16 ss;
251 u16 ds;
252 u16 es;
253 u16 fs;
254 u16 gs;
255 u16 ldtr;
256
257 u16 gdtr_limit;
258 u32 gdtr_base;
259 u32 idtr_base;
260 u16 idtr_limit;
261};
262
263#endif
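
An illustrative consumer of the relocation records returned by the ROM's get_reloc entry point; the actual call-site patching lives elsewhere, so this only shows the type dispatch over VMI_RELOCATION_*:

	static void *vmi_reloc_target_sketch(const struct vmi_relocation_info *rel)
	{
		switch (rel->type) {
		case VMI_RELOCATION_CALL_REL:
		case VMI_RELOCATION_JUMP_REL:
			return rel->eip;	/* patch a call/jmp to this address */
		case VMI_RELOCATION_NOP:	/* call site may be nopped out */
		case VMI_RELOCATION_NONE:
		default:
			return NULL;
		}
	}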
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
new file mode 100644
index 00000000000..c6e0bee93e3
--- /dev/null
+++ b/arch/x86/include/asm/vmi_time.h
@@ -0,0 +1,98 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef _ASM_X86_VMI_TIME_H
26#define _ASM_X86_VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_tsc_khz(void);
54
55#ifdef CONFIG_X86_LOCAL_APIC
56extern void __devinit vmi_time_bsp_init(void);
57extern void __devinit vmi_time_ap_init(void);
58#endif
59
60/*
61 * When run under a hypervisor, a vcpu is always in one of three states:
62 * running, halted, or ready. The vcpu is in the 'running' state if it
63 * is executing. When the vcpu executes the halt interface, the vcpu
64 * enters the 'halted' state and remains halted until there is some work
65 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
66 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
67 * state (waiting for the hypervisor to reschedule it). Finally, at any
68 * time when the vcpu is not in the 'running' state nor the 'halted'
69 * state, it is in the 'ready' state.
70 *
71 * Real time advances while the vcpu is 'running', 'ready', or
72 * 'halted'. Stolen time is the time in which the vcpu is in the
73 * 'ready' state. Available time is the remaining time -- the vcpu is
74 * either 'running' or 'halted'.
75 *
76 * All three views of time are accessible through the VMI cycle
77 * counters.
78 */
79
80/* The cycle counters. */
81#define VMI_CYCLES_REAL 0
82#define VMI_CYCLES_AVAILABLE 1
83#define VMI_CYCLES_STOLEN 2
84
85/* The alarm interface 'flags' bits */
86#define VMI_ALARM_COUNTERS 2
87
88#define VMI_ALARM_COUNTER_MASK 0x000000ff
89
90#define VMI_ALARM_WIRED_IRQ0 0x00000000
91#define VMI_ALARM_WIRED_LVTT 0x00010000
92
93#define VMI_ALARM_IS_ONESHOT 0x00000000
94#define VMI_ALARM_IS_PERIODIC 0x00000100
95
96#define CONFIG_VMI_ALARM_HZ 100
97
98#endif /* _ASM_X86_VMI_TIME_H */
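
To ground the stolen-time accounting described above, a sketch that samples the stolen-cycle counter through the cached timer ops and converts it to nanoseconds; real code would guard the multiplication against overflow:

	static u64 vmi_stolen_ns_sketch(void)
	{
		u64 cycles = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);

		/* cycles -> ns; naive form, overflows for large counts */
		return cycles * 1000000000ULL /
			vmi_timer_ops.get_cycle_frequency();
	}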
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
new file mode 100644
index 00000000000..9c811d2e6f9
--- /dev/null
+++ b/arch/x86/include/asm/voyager.h
@@ -0,0 +1,528 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager system */
6
7#undef VOYAGER_DEBUG
8#undef VOYAGER_CAT_DEBUG
9
10#ifdef VOYAGER_DEBUG
11#define VDEBUG(x) printk x
12#else
13#define VDEBUG(x)
14#endif
15
16/* There are three levels of voyager machine: 3,4 and 5. The rule is
17 * if it's less than 3435 it's a Level 3 except for a 3360 which is
18 * a Level 4. A 3435 or above is a Level 5 */
19#define VOYAGER_LEVEL5_AND_ABOVE 0x3435
20#define VOYAGER_LEVEL4 0x3360
21
22/* The L4 DINO ASIC */
23#define VOYAGER_DINO 0x43
24
25/* voyager ports in standard I/O space */
26#define VOYAGER_MC_SETUP 0x96
27
28
29#define VOYAGER_CAT_CONFIG_PORT 0x97
30# define VOYAGER_CAT_DESELECT 0xff
31#define VOYAGER_SSPB_RELOCATION_PORT 0x98
32
33/* Valid CAT controller commands */
34/* start instruction register cycle */
35#define VOYAGER_CAT_IRCYC 0x01
36/* start data register cycle */
37#define VOYAGER_CAT_DRCYC 0x02
38/* move to execute state */
39#define VOYAGER_CAT_RUN 0x0F
40/* end operation */
41#define VOYAGER_CAT_END 0x80
42/* hold in idle state */
43#define VOYAGER_CAT_HOLD 0x90
44/* single step an "intest" vector */
45#define VOYAGER_CAT_STEP 0xE0
46/* return cat controller to CLEMSON mode */
47#define VOYAGER_CAT_CLEMSON 0xFF
48
49/* the default cat command header */
50#define VOYAGER_CAT_HEADER 0x7F
51
52/* the range of possible CAT module ids in the system */
53#define VOYAGER_MIN_MODULE 0x10
54#define VOYAGER_MAX_MODULE 0x1f
55
56/* The voyager registers per asic */
57#define VOYAGER_ASIC_ID_REG 0x00
58#define VOYAGER_ASIC_TYPE_REG 0x01
59/* the sub address registers can be made auto incrementing on reads */
60#define VOYAGER_AUTO_INC_REG 0x02
61# define VOYAGER_AUTO_INC 0x04
62# define VOYAGER_NO_AUTO_INC 0xfb
63#define VOYAGER_SUBADDRDATA 0x03
64#define VOYAGER_SCANPATH 0x05
65# define VOYAGER_CONNECT_ASIC 0x01
66# define VOYAGER_DISCONNECT_ASIC 0xfe
67#define VOYAGER_SUBADDRLO 0x06
68#define VOYAGER_SUBADDRHI 0x07
69#define VOYAGER_SUBMODSELECT 0x08
70#define VOYAGER_SUBMODPRESENT 0x09
71
72#define VOYAGER_SUBADDR_LO 0xff
73#define VOYAGER_SUBADDR_HI 0xffff
74
75/* the maximum size of a scan path -- used to form instructions */
76#define VOYAGER_MAX_SCAN_PATH 0x100
77/* the biggest possible register size (in bytes) */
78#define VOYAGER_MAX_REG_SIZE 4
79
80/* Total number of possible modules (including submodules) */
81#define VOYAGER_MAX_MODULES 16
82/* Largest number of asics per module */
83#define VOYAGER_MAX_ASICS_PER_MODULE 7
84
85/* the CAT asic of each module is always the first one */
86#define VOYAGER_CAT_ID 0
87#define VOYAGER_PSI 0x1a
88
89/* voyager instruction operations and registers */
90#define VOYAGER_READ_CONFIG 0x1
91#define VOYAGER_WRITE_CONFIG 0x2
92#define VOYAGER_BYPASS 0xff
93
94typedef struct voyager_asic {
95 __u8 asic_addr; /* ASIC address; Level 4 */
96 __u8 asic_type; /* ASIC type */
97 __u8 asic_id; /* ASIC id */
98 __u8 jtag_id[4]; /* JTAG id */
99 __u8 asic_location; /* Location within scan path; start w/ 0 */
100 __u8 bit_location; /* Location within bit stream; start w/ 0 */
101 __u8 ireg_length; /* Instruction register length */
102 __u16 subaddr; /* Amount of sub address space */
103 struct voyager_asic *next; /* Next asic in linked list */
104} voyager_asic_t;
105
106typedef struct voyager_module {
107 __u8 module_addr; /* Module address */
108 __u8 scan_path_connected; /* Scan path connected */
109 __u16 ee_size; /* Size of the EEPROM */
110 __u16 num_asics; /* Number of Asics */
111 __u16 inst_bits; /* Instruction bits in the scan path */
112 __u16 largest_reg; /* Largest register in the scan path */
113 __u16 smallest_reg; /* Smallest register in the scan path */
114 voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */
115 struct voyager_module *submodule; /* Submodule pointer */
116 struct voyager_module *next; /* Next module in linked list */
117} voyager_module_t;
118
119typedef struct voyager_eeprom_hdr {
120 __u8 module_id[4];
121 __u8 version_id;
122 __u8 config_id;
123 __u16 boundry_id; /* boundary scan id */
124 __u16 ee_size; /* size of EEPROM */
125 __u8 assembly[11]; /* assembly # */
126 __u8 assembly_rev; /* assembly rev */
127 __u8 tracer[4]; /* tracer number */
128 __u16 assembly_cksum; /* asm checksum */
129 __u16 power_consump; /* pwr requirements */
130 __u16 num_asics; /* number of asics */
131 __u16 bist_time; /* min. bist time */
132 __u16 err_log_offset; /* error log offset */
133 __u16 scan_path_offset;/* scan path offset */
134 __u16 cct_offset;
135 __u16 log_length; /* length of err log */
136 __u16 xsum_end; /* offset to end of
137 checksum */
138 __u8 reserved[4];
139 __u8 sflag; /* starting sentinel */
140 __u8 part_number[13]; /* prom part number */
141 __u8 version[10]; /* version number */
142 __u8 signature[8];
143 __u16 eeprom_chksum;
144 __u32 data_stamp_offset;
145 __u8 eflag; /* ending sentinel */
146} __attribute__((packed)) voyager_eprom_hdr_t;
147
148
149
150#define VOYAGER_EPROM_SIZE_OFFSET \
151 ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size)))
152#define VOYAGER_XSUM_END_OFFSET 0x2a
153
154/* the following three definitions are for internal table layouts
155 * in the module EPROMs. We really only care about the IDs and
156 * offsets */
157typedef struct voyager_sp_table {
158 __u8 asic_id;
159 __u8 bypass_flag;
160 __u16 asic_data_offset;
161 __u16 config_data_offset;
162} __attribute__((packed)) voyager_sp_table_t;
163
164typedef struct voyager_jtag_table {
165 __u8 icode[4];
166 __u8 runbist[4];
167 __u8 intest[4];
168 __u8 samp_preld[4];
169 __u8 ireg_len;
170} __attribute__((packed)) voyager_jtt_t;
171
172typedef struct voyager_asic_data_table {
173 __u8 jtag_id[4];
174 __u16 length_bsr;
175 __u16 length_bist_reg;
176 __u32 bist_clk;
177 __u16 subaddr_bits;
178 __u16 seed_bits;
179 __u16 sig_bits;
180 __u16 jtag_offset;
181} __attribute__((packed)) voyager_at_t;
182
183/* Voyager Interrupt Controller (VIC) registers */
184
185/* Base to add to Cross Processor Interrupts (CPIs) when triggering
186 * the CPU IRQ line */
187/* register defines for the WCBICs (one per processor) */
188#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */
189#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */
190#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */
191#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */
192#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */
193#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */
194#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */
195#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */
196
197
198/* top of memory registers */
199#define VOYAGER_WCBIC_TOM_L 0x4
200#define VOYAGER_WCBIC_TOM_H 0x5
201
202/* register defines for Voyager Memory Control (VMC)
203 * these are present on L4 machines only */
204#define VOYAGER_VMC1 0x81
205#define VOYAGER_VMC2 0x91
206#define VOYAGER_VMC3 0xa1
207#define VOYAGER_VMC4 0xb1
208
209/* VMC Ports */
210#define VOYAGER_VMC_MEMORY_SETUP 0x9
211# define VMC_Interleaving 0x01
212# define VMC_4Way 0x02
213# define VMC_EvenCacheLines 0x04
214# define VMC_HighLine 0x08
215# define VMC_Start0_Enable 0x20
216# define VMC_Start1_Enable 0x40
217# define VMC_Vremap 0x80
218#define VOYAGER_VMC_BANK_DENSITY 0xa
219# define VMC_BANK_EMPTY 0
220# define VMC_BANK_4MB 1
221# define VMC_BANK_16MB 2
222# define VMC_BANK_64MB 3
223# define VMC_BANK0_MASK 0x03
224# define VMC_BANK1_MASK 0x0C
225# define VMC_BANK2_MASK 0x30
226# define VMC_BANK3_MASK 0xC0
227
228/* Magellan Memory Controller (MMC) defines - present on L5 */
229#define VOYAGER_MMC_ASIC_ID 1
230/* the two memory modules corresponding to memory cards in the system */
231#define VOYAGER_MMC_MEMORY0_MODULE 0x14
232#define VOYAGER_MMC_MEMORY1_MODULE 0x15
233/* the Magellan Memory Address (MMA) defines */
234#define VOYAGER_MMA_ASIC_ID 2
235
236/* Submodule number for the Quad Baseboard */
237#define VOYAGER_QUAD_BASEBOARD 1
238
239/* ASIC defines for the Quad Baseboard */
240#define VOYAGER_QUAD_QDATA0 1
241#define VOYAGER_QUAD_QDATA1 2
242#define VOYAGER_QUAD_QABC 3
243
244/* Useful areas in extended CMOS */
245#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a
246#define VOYAGER_MEMORY_CLICKMAP 0xa23
247#define VOYAGER_DUMP_LOCATION 0xb1a
248
249/* SUS In Control bit - used to tell SUS that we don't need to be
250 * babysat anymore */
251#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff
252# define VOYAGER_IN_CONTROL_FLAG 0x80
253
254/* Voyager PSI defines */
255#define VOYAGER_PSI_STATUS_REG 0x08
256# define PSI_DC_FAIL 0x01
257# define PSI_MON 0x02
258# define PSI_FAULT 0x04
259# define PSI_ALARM 0x08
260# define PSI_CURRENT 0x10
261# define PSI_DVM 0x20
262# define PSI_PSCFAULT 0x40
263# define PSI_STAT_CHG 0x80
264
265#define VOYAGER_PSI_SUPPLY_REG 0x8000
266 /* read */
267# define PSI_FAIL_DC 0x01
268# define PSI_FAIL_AC 0x02
269# define PSI_MON_INT 0x04
270# define PSI_SWITCH_OFF 0x08
271# define PSI_HX_OFF 0x10
272# define PSI_SECURITY 0x20
273# define PSI_CMOS_BATT_LOW 0x40
274# define PSI_CMOS_BATT_FAIL 0x80
275 /* write */
276# define PSI_CLR_SWITCH_OFF 0x13
277# define PSI_CLR_HX_OFF 0x14
278# define PSI_CLR_CMOS_BATT_FAIL 0x17
279
280#define VOYAGER_PSI_MASK 0x8001
281# define PSI_MASK_MASK 0x10
282
283#define VOYAGER_PSI_AC_FAIL_REG 0x8004
284#define AC_FAIL_STAT_CHANGE 0x80
285
286#define VOYAGER_PSI_GENERAL_REG 0x8007
287 /* read */
288# define PSI_SWITCH_ON 0x01
289# define PSI_SWITCH_ENABLED 0x02
290# define PSI_ALARM_ENABLED 0x08
291# define PSI_SECURE_ENABLED 0x10
292# define PSI_COLD_RESET 0x20
293# define PSI_COLD_START 0x80
294 /* write */
295# define PSI_POWER_DOWN 0x10
296# define PSI_SWITCH_DISABLE 0x01
297# define PSI_SWITCH_ENABLE 0x11
298# define PSI_CLEAR 0x12
299# define PSI_ALARM_DISABLE 0x03
300# define PSI_ALARM_ENABLE 0x13
301# define PSI_CLEAR_COLD_RESET 0x05
302# define PSI_SET_COLD_RESET 0x15
303# define PSI_CLEAR_COLD_START 0x07
304# define PSI_SET_COLD_START 0x17
305
306
307
308struct voyager_bios_info {
309 __u8 len;
310 __u8 major;
311 __u8 minor;
312 __u8 debug;
313 __u8 num_classes;
314 __u8 class_1;
315 __u8 class_2;
316};
317
318/* The following structures and definitions are for the Kernel/SUS
319 * interface these are needed to find out how SUS initialised any Quad
320 * boards in the system */
321
322#define NUMBER_OF_MC_BUSSES 2
323#define SLOTS_PER_MC_BUS 8
324#define MAX_CPUS 16 /* 16 way CPU system */
325#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */
326#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */
327#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */
328#define NUMBER_OF_POS_REGS 8
329
330typedef struct {
331 __u8 MC_Slot;
332 __u8 POS_Values[NUMBER_OF_POS_REGS];
333} __attribute__((packed)) MC_SlotInformation_t;
334
335struct QuadDescription {
336 __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields
337 * will be zero except for slot */
338 __u8 StructureVersion;
339 __u32 CPI_BaseAddress;
340 __u32 LARC_BankSize;
341 __u32 LocalMemoryStateBits;
342 __u8 Slot; /* Processor slots 1 - 4 */
343} __attribute__((packed));
344
345struct ProcBoardInfo {
346 __u8 Type;
347 __u8 StructureVersion;
348 __u8 NumberOfBoards;
349 struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS];
350} __attribute__((packed));
351
352struct CacheDescription {
353 __u8 Level;
354 __u32 TotalSize;
355 __u16 LineSize;
356 __u8 Associativity;
357 __u8 CacheType;
358 __u8 WriteType;
359 __u8 Number_CPUs_SharedBy;
360 __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS];
361
362} __attribute__((packed));
363
364struct CPU_Description {
365 __u8 CPU_HardwareId;
366 char *FRU_String;
367 __u8 NumberOfCacheLevels;
368 struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS];
369} __attribute__((packed));
370
371struct CPU_Info {
372 __u8 Type;
373 __u8 StructureVersion;
374 __u8 NumberOf_CPUs;
375 struct CPU_Description CPU_Data[MAX_CPUS];
376} __attribute__((packed));
377
378
379/*
380 * This structure will be used by SUS and the OS.
381 * The assumption about this structure is that no blank space is
382 * packed in it by our friend the compiler.
383 */
384typedef struct {
385 __u8 Mailbox_SUS; /* Written to by SUS to give
386 commands/response to the OS */
387 __u8 Mailbox_OS; /* Written to by the OS to give
388 commands/response to SUS */
389 __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the
390 interface SUS supports */
391 __u8 OS_MailboxVersion; /* Tells SUS which iteration of the
392 interface the OS supports */
393 __u32 OS_Flags; /* Flags set by the OS as info for
394 SUS */
395 __u32 SUS_Flags; /* Flags set by SUS as info
396 for the OS */
397 __u32 WatchDogPeriod; /* Watchdog period (in seconds) which
398 the DP uses to see if the OS
399 is dead */
400 __u32 WatchDogCount; /* Updated by the OS on every tic. */
401 __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS
402 where to stuff the SUS error log
403 on a dump */
404 MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS];
405 /* Storage for MCA POS data */
406 /* All new SECOND_PASS_INTERFACE fields added from this point */
407 struct ProcBoardInfo *BoardData;
408 struct CPU_Info *CPU_Data;
409 /* All new fields must be added from this point */
410} Voyager_KernelSUS_Mbox_t;
411
412/* structure for finding the right memory address to send a QIC CPI to */
413struct voyager_qic_cpi {
414 /* Each cache line (32 bytes) can trigger a cpi. The cpi
415 * read/write may occur anywhere in the cache line---pick the
416 * middle to be safe */
417 struct {
418 __u32 pad1[3];
419 __u32 cpi;
420 __u32 pad2[4];
421 } qic_cpi[8];
422};
423
424struct voyager_status {
425 __u32 power_fail:1;
426 __u32 switch_off:1;
427 __u32 request_from_kernel:1;
428};
429
430struct voyager_psi_regs {
431 __u8 cat_id;
432 __u8 cat_dev;
433 __u8 cat_control;
434 __u8 subaddr;
435 __u8 dummy4;
436 __u8 checkbit;
437 __u8 subaddr_low;
438 __u8 subaddr_high;
439 __u8 intstatus;
440 __u8 stat1;
441 __u8 stat3;
442 __u8 fault;
443 __u8 tms;
444 __u8 gen;
445 __u8 sysconf;
446 __u8 dummy15;
447};
448
449struct voyager_psi_subregs {
450 __u8 supply;
451 __u8 mask;
452 __u8 present;
453 __u8 DCfail;
454 __u8 ACfail;
455 __u8 fail;
456 __u8 UPSfail;
457 __u8 genstatus;
458};
459
460struct voyager_psi {
461 struct voyager_psi_regs regs;
462 struct voyager_psi_subregs subregs;
463};
464
465struct voyager_SUS {
466#define VOYAGER_DUMP_BUTTON_NMI 0x1
467#define VOYAGER_SUS_VALID 0x2
468#define VOYAGER_SYSINT_COMPLETE 0x3
469 __u8 SUS_mbox;
470#define VOYAGER_NO_COMMAND 0x0
471#define VOYAGER_IGNORE_DUMP 0x1
472#define VOYAGER_DO_DUMP 0x2
473#define VOYAGER_SYSINT_HANDSHAKE 0x3
474#define VOYAGER_DO_MEM_DUMP 0x4
475#define VOYAGER_SYSINT_WAS_RECOVERED 0x5
476 __u8 kernel_mbox;
477#define VOYAGER_MAILBOX_VERSION 0x10
478 __u8 SUS_version;
479 __u8 kernel_version;
480#define VOYAGER_OS_HAS_SYSINT 0x1
481#define VOYAGER_OS_IN_PROGRESS 0x2
482#define VOYAGER_UPDATING_WDPERIOD 0x4
483 __u32 kernel_flags;
484#define VOYAGER_SUS_BOOTING 0x1
485#define VOYAGER_SUS_IN_PROGRESS 0x2
486 __u32 SUS_flags;
487 __u32 watchdog_period;
488 __u32 watchdog_count;
489 __u32 SUS_errorlog;
490 /* lots of system configuration stuff under here */
491};
492
493/* Variables exported by voyager_smp */
494extern __u32 voyager_extended_vic_processors;
495extern __u32 voyager_allowed_boot_processors;
496extern __u32 voyager_quad_processors;
497extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS];
498extern struct voyager_SUS *voyager_SUS;
499
500/* variables exported always */
501extern struct task_struct *voyager_thread;
502extern int voyager_level;
503extern struct voyager_status voyager_status;
504
505/* functions exported by the voyager and voyager_smp modules */
506extern int voyager_cat_readb(__u8 module, __u8 asic, int reg);
507extern void voyager_cat_init(void);
508extern void voyager_detect(struct voyager_bios_info *);
509extern void voyager_trap_init(void);
510extern void voyager_setup_irqs(void);
511extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length);
512extern void voyager_smp_intr_init(void);
513extern __u8 voyager_extended_cmos_read(__u16 cmos_address);
514extern void voyager_smp_dump(void);
515extern void voyager_timer_interrupt(void);
516extern void smp_local_timer_interrupt(void);
517extern void voyager_power_off(void);
518extern void smp_voyager_power_off(void *dummy);
519extern void voyager_restart(void);
520extern void voyager_cat_power_off(void);
521extern void voyager_cat_do_common_interrupt(void);
522extern void voyager_handle_nmi(void);
523/* Command codes for the voyager_cat_psi() call below */
524#define VOYAGER_PSI_READ 0
525#define VOYAGER_PSI_WRITE 1
526#define VOYAGER_PSI_SUBREAD 2
527#define VOYAGER_PSI_SUBWRITE 3
528extern void voyager_cat_psi(__u8, __u16, __u8 *);
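
As a worked example of the level rule stated at the top of this file (below 0x3435 is Level 3, except the 0x3360, which is Level 4; 0x3435 and above is Level 5):

	static int voyager_level_sketch(__u16 model)
	{
		if (model >= VOYAGER_LEVEL5_AND_ABOVE)
			return 5;
		if (model == VOYAGER_LEVEL4)
			return 4;
		return 3;
	}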
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
new file mode 100644
index 00000000000..d0983d255fb
--- /dev/null
+++ b/arch/x86/include/asm/vsyscall.h
@@ -0,0 +1,44 @@
1#ifndef _ASM_X86_VSYSCALL_H
2#define _ASM_X86_VSYSCALL_H
3
4enum vsyscall_num {
5 __NR_vgettimeofday,
6 __NR_vtime,
7 __NR_vgetcpu,
8};
9
10#define VSYSCALL_START (-10UL << 20)
11#define VSYSCALL_SIZE 1024
12#define VSYSCALL_END (-2UL << 20)
13#define VSYSCALL_MAPPED_PAGES 1
14#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
15
16#ifdef __KERNEL__
17#include <linux/seqlock.h>
18
19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
21
22/* Section definitions used with CONFIG_GENERIC_TIME */
23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
29
30#define VGETCPU_RDTSCP 1
31#define VGETCPU_LSL 2
32
33extern int __vgetcpu_mode;
34extern volatile unsigned long __jiffies;
35
36/* kernel space (writeable) */
37extern int vgetcpu_mode;
38extern struct timezone sys_tz;
39
40extern void map_vsyscall(void);
41
42#endif /* __KERNEL__ */
43
44#endif /* _ASM_X86_VSYSCALL_H */
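
For illustration, VSYSCALL_ADDR() places each entry at a fixed 1024-byte slot above VSYSCALL_START:

	static void *vsyscall_addr_sketch(int nr)
	{
		/* __NR_vgettimeofday -> 0xffffffffff600000,
		 * __NR_vgetcpu       -> base + 2 * VSYSCALL_SIZE */
		return (void *)VSYSCALL_ADDR(nr);
	}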
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
new file mode 100644
index 00000000000..f2cba4e79a2
--- /dev/null
+++ b/arch/x86/include/asm/xcr.h
@@ -0,0 +1,49 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2008 rPath, Inc. - All Rights Reserved
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * asm-x86/xcr.h
13 *
14 * Definitions for the eXtended Control Register instructions
15 */
16
17#ifndef _ASM_X86_XCR_H
18#define _ASM_X86_XCR_H
19
20#define XCR_XFEATURE_ENABLED_MASK 0x00000000
21
22#ifdef __KERNEL__
23# ifndef __ASSEMBLY__
24
25#include <linux/types.h>
26
27static inline u64 xgetbv(u32 index)
28{
29 u32 eax, edx;
30
31 asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
32 : "=a" (eax), "=d" (edx)
33 : "c" (index));
34 return eax + ((u64)edx << 32);
35}
36
37static inline void xsetbv(u32 index, u64 value)
38{
39 u32 eax = value;
40 u32 edx = value >> 32;
41
42 asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
43 : : "a" (eax), "d" (edx), "c" (index));
44}
45
46# endif /* __ASSEMBLY__ */
47#endif /* __KERNEL__ */
48
49#endif /* _ASM_X86_XCR_H */
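
A short usage sketch: reading XCR0 (index XCR_XFEATURE_ENABLED_MASK) reports which extended states the OS has enabled. This assumes CR4.OSXSAVE is already set; otherwise xgetbv() faults:

	static int xsave_sse_enabled_sketch(void)
	{
		u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

		return (xcr0 & 0x2) != 0;	/* bit 0 = x87, bit 1 = SSE */
	}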
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
new file mode 100644
index 00000000000..19144184983
--- /dev/null
+++ b/arch/x86/include/asm/xen/events.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_XEN_EVENTS_H
2#define _ASM_X86_XEN_EVENTS_H
3
4enum ipi_vector {
5 XEN_RESCHEDULE_VECTOR,
6 XEN_CALL_FUNCTION_VECTOR,
7 XEN_CALL_FUNCTION_SINGLE_VECTOR,
8 XEN_SPIN_UNLOCK_VECTOR,
9
10 XEN_NR_IPIS,
11};
12
13static inline int xen_irqs_disabled(struct pt_regs *regs)
14{
15 return raw_irqs_disabled_flags(regs->flags);
16}
17
18static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
19{
20 regs->orig_ax = ~irq;
21 do_IRQ(regs);
22}
23
24#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h
new file mode 100644
index 00000000000..fdbbb45767a
--- /dev/null
+++ b/arch/x86/include/asm/xen/grant_table.h
@@ -0,0 +1,7 @@
1#ifndef _ASM_X86_XEN_GRANT_TABLE_H
2#define _ASM_X86_XEN_GRANT_TABLE_H
3
4#define xen_alloc_vm_area(size) alloc_vm_area(size)
5#define xen_free_vm_area(area) free_vm_area(area)
6
7#endif /* _ASM_X86_XEN_GRANT_TABLE_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
new file mode 100644
index 00000000000..3f6000d95fe
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -0,0 +1,527 @@
1/******************************************************************************
2 * hypercall.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef _ASM_X86_XEN_HYPERCALL_H
34#define _ASM_X86_XEN_HYPERCALL_H
35
36#include <linux/errno.h>
37#include <linux/string.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/sched.h>
41#include <xen/interface/physdev.h>
42
43/*
44 * The hypercall asms have to meet several constraints:
45 * - Work on 32- and 64-bit.
46 * The two architectures put their arguments in different sets of
47 * registers.
48 *
49 * - Work around asm syntax quirks
50 * It isn't possible to specify one of the rNN registers in a
51 * constraint, so we use explicit register variables to get the
52 * args into the right place.
53 *
54 * - Mark all registers as potentially clobbered
55 * Even unused parameters can be clobbered by the hypervisor, so we
56 * need to make sure gcc knows it.
57 *
58 * - Avoid compiler bugs.
59 * This is the tricky part. Because x86_32 has such a constrained
60 * register set, gcc versions below 4.3 have trouble generating
61 * code when all the arg registers and memory are trashed by the
62 * asm. There are syntactically simpler ways of achieving the
63 * semantics below, but they cause the compiler to crash.
64 *
65 * The only combination I found which works is:
66 * - assign the __argX variables first
67 * - list all actually used parameters as "+r" (__argX)
68 * - clobber the rest
69 *
70 * The result certainly isn't pretty, and it really shows up cpp's
71 * weakness as a macro language. Sorry. (But let's just give thanks
72 * there aren't more than 5 arguments...)
73 */
74
75extern struct { char _entry[32]; } hypercall_page[];
76
77#define __HYPERCALL "call hypercall_page+%c[offset]"
78#define __HYPERCALL_ENTRY(x) \
79 [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
80
81#ifdef CONFIG_X86_32
82#define __HYPERCALL_RETREG "eax"
83#define __HYPERCALL_ARG1REG "ebx"
84#define __HYPERCALL_ARG2REG "ecx"
85#define __HYPERCALL_ARG3REG "edx"
86#define __HYPERCALL_ARG4REG "esi"
87#define __HYPERCALL_ARG5REG "edi"
88#else
89#define __HYPERCALL_RETREG "rax"
90#define __HYPERCALL_ARG1REG "rdi"
91#define __HYPERCALL_ARG2REG "rsi"
92#define __HYPERCALL_ARG3REG "rdx"
93#define __HYPERCALL_ARG4REG "r10"
94#define __HYPERCALL_ARG5REG "r8"
95#endif
96
97#define __HYPERCALL_DECLS \
98 register unsigned long __res asm(__HYPERCALL_RETREG); \
99 register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
100 register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
101 register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
102 register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
103 register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
104
105#define __HYPERCALL_0PARAM "=r" (__res)
106#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
107#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
108#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
109#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
110#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
111
112#define __HYPERCALL_0ARG()
113#define __HYPERCALL_1ARG(a1) \
114 __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
115#define __HYPERCALL_2ARG(a1,a2) \
116 __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
117#define __HYPERCALL_3ARG(a1,a2,a3) \
118 __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
119#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
120 __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
121#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
122 __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
123
124#define __HYPERCALL_CLOBBER5 "memory"
125#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
126#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
127#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
128#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
129#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
130
131#define _hypercall0(type, name) \
132({ \
133 __HYPERCALL_DECLS; \
134 __HYPERCALL_0ARG(); \
135 asm volatile (__HYPERCALL \
136 : __HYPERCALL_0PARAM \
137 : __HYPERCALL_ENTRY(name) \
138 : __HYPERCALL_CLOBBER0); \
139 (type)__res; \
140})
141
142#define _hypercall1(type, name, a1) \
143({ \
144 __HYPERCALL_DECLS; \
145 __HYPERCALL_1ARG(a1); \
146 asm volatile (__HYPERCALL \
147 : __HYPERCALL_1PARAM \
148 : __HYPERCALL_ENTRY(name) \
149 : __HYPERCALL_CLOBBER1); \
150 (type)__res; \
151})
152
153#define _hypercall2(type, name, a1, a2) \
154({ \
155 __HYPERCALL_DECLS; \
156 __HYPERCALL_2ARG(a1, a2); \
157 asm volatile (__HYPERCALL \
158 : __HYPERCALL_2PARAM \
159 : __HYPERCALL_ENTRY(name) \
160 : __HYPERCALL_CLOBBER2); \
161 (type)__res; \
162})
163
164#define _hypercall3(type, name, a1, a2, a3) \
165({ \
166 __HYPERCALL_DECLS; \
167 __HYPERCALL_3ARG(a1, a2, a3); \
168 asm volatile (__HYPERCALL \
169 : __HYPERCALL_3PARAM \
170 : __HYPERCALL_ENTRY(name) \
171 : __HYPERCALL_CLOBBER3); \
172 (type)__res; \
173})
174
175#define _hypercall4(type, name, a1, a2, a3, a4) \
176({ \
177 __HYPERCALL_DECLS; \
178 __HYPERCALL_4ARG(a1, a2, a3, a4); \
179 asm volatile (__HYPERCALL \
180 : __HYPERCALL_4PARAM \
181 : __HYPERCALL_ENTRY(name) \
182 : __HYPERCALL_CLOBBER4); \
183 (type)__res; \
184})
185
186#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
187({ \
188 __HYPERCALL_DECLS; \
189 __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
190 asm volatile (__HYPERCALL \
191 : __HYPERCALL_5PARAM \
192 : __HYPERCALL_ENTRY(name) \
193 : __HYPERCALL_CLOBBER5); \
194 (type)__res; \
195})
196
197static inline int
198HYPERVISOR_set_trap_table(struct trap_info *table)
199{
200 return _hypercall1(int, set_trap_table, table);
201}
202
203static inline int
204HYPERVISOR_mmu_update(struct mmu_update *req, int count,
205 int *success_count, domid_t domid)
206{
207 return _hypercall4(int, mmu_update, req, count, success_count, domid);
208}
209
210static inline int
211HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
212 int *success_count, domid_t domid)
213{
214 return _hypercall4(int, mmuext_op, op, count, success_count, domid);
215}
216
217static inline int
218HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
219{
220 return _hypercall2(int, set_gdt, frame_list, entries);
221}
222
223static inline int
224HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
225{
226 return _hypercall2(int, stack_switch, ss, esp);
227}
228
229#ifdef CONFIG_X86_32
230static inline int
231HYPERVISOR_set_callbacks(unsigned long event_selector,
232 unsigned long event_address,
233 unsigned long failsafe_selector,
234 unsigned long failsafe_address)
235{
236 return _hypercall4(int, set_callbacks,
237 event_selector, event_address,
238 failsafe_selector, failsafe_address);
239}
240#else /* CONFIG_X86_64 */
241static inline int
242HYPERVISOR_set_callbacks(unsigned long event_address,
243 unsigned long failsafe_address,
244 unsigned long syscall_address)
245{
246 return _hypercall3(int, set_callbacks,
247 event_address, failsafe_address,
248 syscall_address);
249}
250#endif /* CONFIG_X86_{32,64} */
251
252static inline int
253HYPERVISOR_callback_op(int cmd, void *arg)
254{
255 return _hypercall2(int, callback_op, cmd, arg);
256}
257
258static inline int
259HYPERVISOR_fpu_taskswitch(int set)
260{
261 return _hypercall1(int, fpu_taskswitch, set);
262}
263
264static inline int
265HYPERVISOR_sched_op(int cmd, void *arg)
266{
267 return _hypercall2(int, sched_op_new, cmd, arg);
268}
269
270static inline long
271HYPERVISOR_set_timer_op(u64 timeout)
272{
273 unsigned long timeout_hi = (unsigned long)(timeout>>32);
274 unsigned long timeout_lo = (unsigned long)timeout;
275 return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
276}
277
278static inline int
279HYPERVISOR_set_debugreg(int reg, unsigned long value)
280{
281 return _hypercall2(int, set_debugreg, reg, value);
282}
283
284static inline unsigned long
285HYPERVISOR_get_debugreg(int reg)
286{
287 return _hypercall1(unsigned long, get_debugreg, reg);
288}
289
290static inline int
291HYPERVISOR_update_descriptor(u64 ma, u64 desc)
292{
293 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
294}
295
296static inline int
297HYPERVISOR_memory_op(unsigned int cmd, void *arg)
298{
299 return _hypercall2(int, memory_op, cmd, arg);
300}
301
302static inline int
303HYPERVISOR_multicall(void *call_list, int nr_calls)
304{
305 return _hypercall2(int, multicall, call_list, nr_calls);
306}
307
308static inline int
309HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
310 unsigned long flags)
311{
312 if (sizeof(new_val) == sizeof(long))
313 return _hypercall3(int, update_va_mapping, va,
314 new_val.pte, flags);
315 else
316 return _hypercall4(int, update_va_mapping, va,
317 new_val.pte, new_val.pte >> 32, flags);
318}
319
320static inline int
321HYPERVISOR_event_channel_op(int cmd, void *arg)
322{
323 int rc = _hypercall2(int, event_channel_op, cmd, arg);
324 if (unlikely(rc == -ENOSYS)) {
325 struct evtchn_op op;
326 op.cmd = cmd;
327 memcpy(&op.u, arg, sizeof(op.u));
328 rc = _hypercall1(int, event_channel_op_compat, &op);
329 memcpy(arg, &op.u, sizeof(op.u));
330 }
331 return rc;
332}
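The -ENOSYS fallback above keeps the kernel working on older hypervisors that only understand the single-struct event_channel_op_compat form; callers never see the difference. As a hedged illustration (EVTCHNOP_alloc_unbound and struct evtchn_alloc_unbound come from xen/interface/event_channel.h, which a real caller would include; the helper itself is hypothetical):

    /* Sketch: ask Xen for an unbound event-channel port we can later
     * offer to a remote domain. */
    static int example_alloc_evtchn(domid_t remote_dom, int *port)
    {
    	struct evtchn_alloc_unbound alloc = {
    		.dom        = DOMID_SELF,
    		.remote_dom = remote_dom,
    	};
    	int err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, &alloc);

    	if (err == 0)
    		*port = alloc.port;	/* filled in by the hypervisor */
    	return err;
    }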
333
334static inline int
335HYPERVISOR_xen_version(int cmd, void *arg)
336{
337 return _hypercall2(int, xen_version, cmd, arg);
338}
339
340static inline int
341HYPERVISOR_console_io(int cmd, int count, char *str)
342{
343 return _hypercall3(int, console_io, cmd, count, str);
344}
345
346static inline int
347HYPERVISOR_physdev_op(int cmd, void *arg)
348{
349 int rc = _hypercall2(int, physdev_op, cmd, arg);
350 if (unlikely(rc == -ENOSYS)) {
351 struct physdev_op op;
352 op.cmd = cmd;
353 memcpy(&op.u, arg, sizeof(op.u));
354 rc = _hypercall1(int, physdev_op_compat, &op);
355 memcpy(arg, &op.u, sizeof(op.u));
356 }
357 return rc;
358}
359
360static inline int
361HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
362{
363 return _hypercall3(int, grant_table_op, cmd, uop, count);
364}
365
366static inline int
367HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
368 unsigned long flags, domid_t domid)
369{
370 if (sizeof(new_val) == sizeof(long))
371 return _hypercall4(int, update_va_mapping_otherdomain, va,
372 new_val.pte, flags, domid);
373 else
374 return _hypercall5(int, update_va_mapping_otherdomain, va,
375 new_val.pte, new_val.pte >> 32,
376 flags, domid);
377}
378
379static inline int
380HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
381{
382 return _hypercall2(int, vm_assist, cmd, type);
383}
384
385static inline int
386HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
387{
388 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
389}
390
391#ifdef CONFIG_X86_64
392static inline int
393HYPERVISOR_set_segment_base(int reg, unsigned long value)
394{
395 return _hypercall2(int, set_segment_base, reg, value);
396}
397#endif
398
399static inline int
400HYPERVISOR_suspend(unsigned long srec)
401{
402 return _hypercall3(int, sched_op, SCHEDOP_shutdown,
403 SHUTDOWN_suspend, srec);
404}
405
406static inline int
407HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
408{
409 return _hypercall2(int, nmi_op, op, arg);
410}
411
412static inline void
413MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
414{
415 mcl->op = __HYPERVISOR_fpu_taskswitch;
416 mcl->args[0] = set;
417}
418
419static inline void
420MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
421 pte_t new_val, unsigned long flags)
422{
423 mcl->op = __HYPERVISOR_update_va_mapping;
424 mcl->args[0] = va;
425 if (sizeof(new_val) == sizeof(long)) {
426 mcl->args[1] = new_val.pte;
427 mcl->args[2] = flags;
428 } else {
429 mcl->args[1] = new_val.pte;
430 mcl->args[2] = new_val.pte >> 32;
431 mcl->args[3] = flags;
432 }
433}
434
435static inline void
436MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
437 void *uop, unsigned int count)
438{
439 mcl->op = __HYPERVISOR_grant_table_op;
440 mcl->args[0] = cmd;
441 mcl->args[1] = (unsigned long)uop;
442 mcl->args[2] = count;
443}
444
445static inline void
446MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
447 pte_t new_val, unsigned long flags,
448 domid_t domid)
449{
450 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
451 mcl->args[0] = va;
452 if (sizeof(new_val) == sizeof(long)) {
453 mcl->args[1] = new_val.pte;
454 mcl->args[2] = flags;
455 mcl->args[3] = domid;
456 } else {
457 mcl->args[1] = new_val.pte;
458 mcl->args[2] = new_val.pte >> 32;
459 mcl->args[3] = flags;
460 mcl->args[4] = domid;
461 }
462}
463
464static inline void
465MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
466 struct desc_struct desc)
467{
468 mcl->op = __HYPERVISOR_update_descriptor;
469 if (sizeof(maddr) == sizeof(long)) {
470 mcl->args[0] = maddr;
471 mcl->args[1] = *(unsigned long *)&desc;
472 } else {
473 mcl->args[0] = maddr;
474 mcl->args[1] = maddr >> 32;
475 mcl->args[2] = desc.a;
476 mcl->args[3] = desc.b;
477 }
478}
479
480static inline void
481MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
482{
483 mcl->op = __HYPERVISOR_memory_op;
484 mcl->args[0] = cmd;
485 mcl->args[1] = (unsigned long)arg;
486}
487
488static inline void
489MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
490 int count, int *success_count, domid_t domid)
491{
492 mcl->op = __HYPERVISOR_mmu_update;
493 mcl->args[0] = (unsigned long)req;
494 mcl->args[1] = count;
495 mcl->args[2] = (unsigned long)success_count;
496 mcl->args[3] = domid;
497}
498
499static inline void
500MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
501 int *success_count, domid_t domid)
502{
503 mcl->op = __HYPERVISOR_mmuext_op;
504 mcl->args[0] = (unsigned long)op;
505 mcl->args[1] = count;
506 mcl->args[2] = (unsigned long)success_count;
507 mcl->args[3] = domid;
508}
509
510static inline void
511MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
512{
513 mcl->op = __HYPERVISOR_set_gdt;
514 mcl->args[0] = (unsigned long)frames;
515 mcl->args[1] = entries;
516}
517
518static inline void
519MULTI_stack_switch(struct multicall_entry *mcl,
520 unsigned long ss, unsigned long esp)
521{
522 mcl->op = __HYPERVISOR_stack_switch;
523 mcl->args[0] = ss;
524 mcl->args[1] = esp;
525}
526
527#endif /* _ASM_X86_XEN_HYPERCALL_H */
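To make the macro machinery above concrete, here is roughly what _hypercall2(int, xen_version, cmd, arg) expands to on x86-64 once the argument, parameter, and clobber macros are substituted. This is a hand-expanded sketch for illustration only, not code from this patch:

    static inline int expanded_xen_version(int cmd, void *arg)
    {
    	register unsigned long __res  asm("rax");
    	register unsigned long __arg1 asm("rdi") = (unsigned long)cmd;
    	register unsigned long __arg2 asm("rsi") = (unsigned long)arg;

    	asm volatile("call hypercall_page+%c[offset]"
    		     : "=r" (__res), "+r" (__arg1), "+r" (__arg2)
    		     : [offset] "i" (__HYPERVISOR_xen_version *
    				     sizeof(hypercall_page[0]))
    		     : "memory", "rdx", "r10", "r8");
    	return (int)__res;
    }

Each hypercall owns a 32-byte stub in hypercall_page, so the "i" constraint folds the stub offset into the call target at compile time, and the unused argument registers rdx, r10 and r8 are listed as clobbers exactly as the comment at the top of the file demands.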
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
new file mode 100644
index 00000000000..a38d25ac87d
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -0,0 +1,82 @@
1/******************************************************************************
2 * hypervisor.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef _ASM_X86_XEN_HYPERVISOR_H
34#define _ASM_X86_XEN_HYPERVISOR_H
35
36#include <linux/types.h>
37#include <linux/kernel.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/version.h>
41
42#include <asm/ptrace.h>
43#include <asm/page.h>
44#include <asm/desc.h>
45#if defined(__i386__)
46# ifdef CONFIG_X86_PAE
47# include <asm-generic/pgtable-nopud.h>
48# else
49# include <asm-generic/pgtable-nopmd.h>
50# endif
51#endif
52#include <asm/xen/hypercall.h>
53
54/* arch/i386/kernel/setup.c */
55extern struct shared_info *HYPERVISOR_shared_info;
56extern struct start_info *xen_start_info;
57
58/* arch/i386/mach-xen/evtchn.c */
59/* Force a proper event-channel callback from Xen. */
60extern void force_evtchn_callback(void);
61
62/* Turn jiffies into Xen system time. */
63u64 jiffies_to_st(unsigned long jiffies);
64
65
66#define MULTI_UVMFLAGS_INDEX 3
67#define MULTI_UVMDOMID_INDEX 4
68
69enum xen_domain_type {
70 XEN_NATIVE,
71 XEN_PV_DOMAIN,
72 XEN_HVM_DOMAIN,
73};
74
75extern enum xen_domain_type xen_domain_type;
76
77#define xen_domain() (xen_domain_type != XEN_NATIVE)
78#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
79#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
80#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
81
82#endif /* _ASM_X86_XEN_HYPERVISOR_H */
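The xen_domain_type predicates above are the usual guards in code shared between bare-metal and Xen builds. A hypothetical consumer, sketched for illustration:

    static int example_init(void)
    {
    	if (!xen_pv_domain())		/* paravirtualized guests only */
    		return -ENODEV;
    	if (xen_initial_domain())	/* dom0 gets extra privileges */
    		printk(KERN_INFO "running as the initial domain\n");
    	return 0;
    }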
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
new file mode 100644
index 00000000000..e8506c1f0c5
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface.h
@@ -0,0 +1,175 @@
1/******************************************************************************
2 * interface.h (derived from Xen's arch-x86_32.h)
3 *
4 * Guest OS interface to x86 Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef _ASM_X86_XEN_INTERFACE_H
10#define _ASM_X86_XEN_INTERFACE_H
11
12#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \
14 typedef struct { type *p; } __guest_handle_ ## name
15#else
16#define __DEFINE_GUEST_HANDLE(name, type) \
17 typedef type * __guest_handle_ ## name
18#endif
19
20#define DEFINE_GUEST_HANDLE_STRUCT(name) \
21 __DEFINE_GUEST_HANDLE(name, struct name)
22#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
23#define GUEST_HANDLE(name) __guest_handle_ ## name
24
25#ifdef __XEN__
26#if defined(__i386__)
27#define set_xen_guest_handle(hnd, val) \
28 do { \
29 if (sizeof(hnd) == 8) \
30 *(uint64_t *)&(hnd) = 0; \
31 (hnd).p = val; \
32 } while (0)
33#elif defined(__x86_64__)
34#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
35#endif
36#else
37#if defined(__i386__)
38#define set_xen_guest_handle(hnd, val) \
39 do { \
40 if (sizeof(hnd) == 8) \
41 *(uint64_t *)&(hnd) = 0; \
42 (hnd) = val; \
43 } while (0)
44#elif defined(__x86_64__)
45#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
46#endif
47#endif
48
49#ifndef __ASSEMBLY__
50/* Guest handles for primitive C types. */
51__DEFINE_GUEST_HANDLE(uchar, unsigned char);
52__DEFINE_GUEST_HANDLE(uint, unsigned int);
53__DEFINE_GUEST_HANDLE(ulong, unsigned long);
54DEFINE_GUEST_HANDLE(char);
55DEFINE_GUEST_HANDLE(int);
56DEFINE_GUEST_HANDLE(long);
57DEFINE_GUEST_HANDLE(void);
58#endif
59
60#ifndef HYPERVISOR_VIRT_START
61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
62#endif
63
64#ifndef machine_to_phys_mapping
65#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
66#endif
67
68/* Maximum number of virtual CPUs in multi-processor guests. */
69#define MAX_VIRT_CPUS 32
70
71/*
72 * SEGMENT DESCRIPTOR TABLES
73 */
74/*
75 * A number of GDT entries are reserved by Xen. These are not situated at the
76 * start of the GDT because some stupid OSes export hard-coded selector values
77 * in their ABI. These hard-coded values are always near the start of the GDT,
78 * so Xen places itself out of the way, at the far end of the GDT.
79 */
80#define FIRST_RESERVED_GDT_PAGE 14
81#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
82#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
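Worked out: with 4096-byte pages and 8-byte descriptors, the reserved region starts at byte 14 * 4096 = 57344, i.e. at GDT entry 57344 / 8 = 7168, so everything from entry 7168 upward belongs to Xen.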
83
84/*
85 * Send an array of these to HYPERVISOR_set_trap_table()
86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows:
89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter
93 */
94#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
95#define TI_GET_IF(_ti) ((_ti)->flags & 4)
96#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
97#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
98
99#ifndef __ASSEMBLY__
100struct trap_info {
101 uint8_t vector; /* exception vector */
102 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
103 uint16_t cs; /* code selector */
104 unsigned long address; /* code offset */
105};
106DEFINE_GUEST_HANDLE_STRUCT(trap_info);
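For illustration, installing a single breakpoint handler through HYPERVISOR_set_trap_table (declared in hypercall.h above) might look like the sketch below; my_int3_handler is hypothetical, FLAT_KERNEL_CS comes from the 32/64-bit interface headers included further down, and the table is terminated by a zeroed entry:

    struct trap_info traps[2] = {
    	{
    		.vector  = 3,			/* #BP */
    		.flags   = 3,			/* DPL 3: userspace may trigger it */
    		.cs      = FLAT_KERNEL_CS,
    		.address = (unsigned long)my_int3_handler,
    	},
    	{ }	/* zeroed sentinel ends the table */
    };
    int ret = HYPERVISOR_set_trap_table(traps);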
107
108struct arch_shared_info {
109 unsigned long max_pfn; /* max pfn that appears in table */
110 /* Frame holding the list of MFNs of frames that in turn hold the p2m-table MFNs. */
111 unsigned long pfn_to_mfn_frame_list_list;
112 unsigned long nmi_reason;
113};
114#endif /* !__ASSEMBLY__ */
115
116#ifdef CONFIG_X86_32
117#include "interface_32.h"
118#else
119#include "interface_64.h"
120#endif
121
122#ifndef __ASSEMBLY__
123/*
124 * The following is all CPU context. Note that the fpu_ctxt block is filled
125 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
126 */
127struct vcpu_guest_context {
128 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
129 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
130#define VGCF_I387_VALID (1<<0)
131#define VGCF_HVM_GUEST (1<<1)
132#define VGCF_IN_KERNEL (1<<2)
133 unsigned long flags; /* VGCF_* flags */
134 struct cpu_user_regs user_regs; /* User-level CPU registers */
135 struct trap_info trap_ctxt[256]; /* Virtual IDT */
136 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
137 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
138 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
139 /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
140 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
141 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
142#ifdef __i386__
143 unsigned long event_callback_cs; /* CS:EIP of event callback */
144 unsigned long event_callback_eip;
145 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
146 unsigned long failsafe_callback_eip;
147#else
148 unsigned long event_callback_eip;
149 unsigned long failsafe_callback_eip;
150 unsigned long syscall_callback_eip;
151#endif
152 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
153#ifdef __x86_64__
154 /* Segment base addresses. */
155 uint64_t fs_base;
156 uint64_t gs_base_kernel;
157 uint64_t gs_base_user;
158#endif
159};
160DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
161#endif /* !__ASSEMBLY__ */
162
163/*
164 * Prefix forces emulation of some non-trapping instructions.
165 * Currently only CPUID.
166 */
167#ifdef __ASSEMBLY__
168#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
169#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
170#else
171#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
172#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
173#endif
174
175#endif /* _ASM_X86_XEN_INTERFACE_H */
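Because CPUID does not trap in a paravirtualized guest, the emulate prefix is how the kernel forces the hypervisor to filter the feature bits. A wrapper in the style of the kernel's pv-ops cpuid hook, sketched under the assumption of gcc inline asm:

    static void example_xen_cpuid(unsigned int *ax, unsigned int *bx,
    			      unsigned int *cx, unsigned int *dx)
    {
    	/* The prefixed CPUID traps to Xen's emulator instead of
    	 * executing natively. */
    	asm(XEN_CPUID
    	    : "=a" (*ax), "=b" (*bx), "=c" (*cx), "=d" (*dx)
    	    : "0" (*ax), "2" (*cx));
    }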
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
new file mode 100644
index 00000000000..42a7e004ae5
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -0,0 +1,97 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 32-bit Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef _ASM_X86_XEN_INTERFACE_32_H
10#define _ASM_X86_XEN_INTERFACE_32_H
11
12
13/*
14 * These flat segments are in the Xen-private section of every GDT. Since these
15 * are also present in the initial GDT, many OSes will be able to avoid
16 * installing their own GDT.
17 */
18#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
19#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
20#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
21#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
22#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
23#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
24
25#define FLAT_KERNEL_CS FLAT_RING1_CS
26#define FLAT_KERNEL_DS FLAT_RING1_DS
27#define FLAT_KERNEL_SS FLAT_RING1_SS
28#define FLAT_USER_CS FLAT_RING3_CS
29#define FLAT_USER_DS FLAT_RING3_DS
30#define FLAT_USER_SS FLAT_RING3_SS
31
32/* And the trap vector is... */
33#define TRAP_INSTR "int $0x82"
34
35/*
36 * Virtual addresses beyond this are not modifiable by guest OSes. The
37 * machine->physical mapping table starts at this address, read-only.
38 */
39#define __HYPERVISOR_VIRT_START 0xF5800000
40
41#ifndef __ASSEMBLY__
42
43struct cpu_user_regs {
44 uint32_t ebx;
45 uint32_t ecx;
46 uint32_t edx;
47 uint32_t esi;
48 uint32_t edi;
49 uint32_t ebp;
50 uint32_t eax;
51 uint16_t error_code; /* private */
52 uint16_t entry_vector; /* private */
53 uint32_t eip;
54 uint16_t cs;
55 uint8_t saved_upcall_mask;
56 uint8_t _pad0;
57 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
58 uint32_t esp;
59 uint16_t ss, _pad1;
60 uint16_t es, _pad2;
61 uint16_t ds, _pad3;
62 uint16_t fs, _pad4;
63 uint16_t gs, _pad5;
64};
65DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
66
67typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
68
69struct arch_vcpu_info {
70 unsigned long cr2;
71 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
72};
73
74struct xen_callback {
75 unsigned long cs;
76 unsigned long eip;
77};
78typedef struct xen_callback xen_callback_t;
79
80#define XEN_CALLBACK(__cs, __eip) \
81 ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
82#endif /* !__ASSEMBLY__ */
83
84
85/*
86 * Page-directory addresses above 4GB do not fit into architectural %cr3.
87 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
88 * must use the following accessor macros to pack/unpack valid MFNs.
89 *
90 * Note that Xen is using the fact that the pagetable base is always
91 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
92 * of cr3.
93 */
94#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
95#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
96
97#endif /* _ASM_X86_XEN_INTERFACE_32_H */
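A quick check of the cr3 packing macros above: for mfn 0x123456 (a page-table base above 4GB under PAE), xen_pfn_to_cr3 yields (0x123456 << 12) | (0x123456 >> 20) = 0x23456000 | 0x1 = 0x23456001 in 32 bits, and xen_cr3_to_pfn reverses it, (0x23456001 >> 12) | (0x23456001 << 20) = 0x23456 | 0x100000 = 0x123456.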
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
new file mode 100644
index 00000000000..100d2662b97
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -0,0 +1,159 @@
1#ifndef _ASM_X86_XEN_INTERFACE_64_H
2#define _ASM_X86_XEN_INTERFACE_64_H
3
4/*
5 * 64-bit segment selectors
6 * These flat segments are in the Xen-private section of every GDT. Since these
7 * are also present in the initial GDT, many OSes will be able to avoid
8 * installing their own GDT.
9 */
10
11#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
12#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
13#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
14#define FLAT_RING3_DS64 0x0000 /* NULL selector */
15#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
16#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
17
18#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
19#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
20#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
21#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
22#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
23#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
24#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
25#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
26#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
27
28#define FLAT_USER_DS64 FLAT_RING3_DS64
29#define FLAT_USER_DS32 FLAT_RING3_DS32
30#define FLAT_USER_DS FLAT_USER_DS64
31#define FLAT_USER_CS64 FLAT_RING3_CS64
32#define FLAT_USER_CS32 FLAT_RING3_CS32
33#define FLAT_USER_CS FLAT_USER_CS64
34#define FLAT_USER_SS64 FLAT_RING3_SS64
35#define FLAT_USER_SS32 FLAT_RING3_SS32
36#define FLAT_USER_SS FLAT_USER_SS64
37
38#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
42
43#ifndef HYPERVISOR_VIRT_START
44#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
45#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
46#endif
47
48#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
49#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
50#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
51#ifndef machine_to_phys_mapping
52#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
53#endif
54
55/*
56 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
57 * @which == SEGBASE_* ; @base == 64-bit base address
58 * Returns 0 on success.
59 */
60#define SEGBASE_FS 0
61#define SEGBASE_GS_USER 1
62#define SEGBASE_GS_KERNEL 2
63#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
64
65/*
66 * int HYPERVISOR_iret(void)
67 * All arguments are on the kernel stack, in the following format.
68 * Never returns if successful. Current kernel context is lost.
69 * The saved CS is mapped as follows:
70 * RING0 -> RING3 kernel mode.
71 * RING1 -> RING3 kernel mode.
72 * RING2 -> RING3 kernel mode.
73 * RING3 -> RING3 user mode.
74 * However RING0 indicates that the guest kernel should return to itself
75 * directly with
76 * orb $3,1*8(%rsp)
77 * iretq
78 * If flags contains VGCF_in_syscall:
79 * Restore RAX, RIP, RFLAGS, RSP.
80 * Discard R11, RCX, CS, SS.
81 * Otherwise:
82 * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
83 * All other registers are saved on hypercall entry and restored to user.
84 */
85/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
86#define _VGCF_in_syscall 8
87#define VGCF_in_syscall (1<<_VGCF_in_syscall)
88#define VGCF_IN_SYSCALL VGCF_in_syscall
89
90#ifndef __ASSEMBLY__
91
92struct iret_context {
93 /* Top of stack (%rsp at point of hypercall). */
94 uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
95 /* Bottom of iret stack frame. */
96};
97
98#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
99/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
100#define __DECL_REG(name) union { \
101 uint64_t r ## name, e ## name; \
102 uint32_t _e ## name; \
103}
104#else
105/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
106#define __DECL_REG(name) uint64_t r ## name
107#endif
108
109struct cpu_user_regs {
110 uint64_t r15;
111 uint64_t r14;
112 uint64_t r13;
113 uint64_t r12;
114 __DECL_REG(bp);
115 __DECL_REG(bx);
116 uint64_t r11;
117 uint64_t r10;
118 uint64_t r9;
119 uint64_t r8;
120 __DECL_REG(ax);
121 __DECL_REG(cx);
122 __DECL_REG(dx);
123 __DECL_REG(si);
124 __DECL_REG(di);
125 uint32_t error_code; /* private */
126 uint32_t entry_vector; /* private */
127 __DECL_REG(ip);
128 uint16_t cs, _pad0[1];
129 uint8_t saved_upcall_mask;
130 uint8_t _pad1[3];
131 __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
132 __DECL_REG(sp);
133 uint16_t ss, _pad2[3];
134 uint16_t es, _pad3[3];
135 uint16_t ds, _pad4[3];
136 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
137 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
138};
139DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
140
141#undef __DECL_REG
142
143#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
144#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
145
146struct arch_vcpu_info {
147 unsigned long cr2;
148 unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
149};
150
151typedef unsigned long xen_callback_t;
152
153#define XEN_CALLBACK(__cs, __rip) \
154 ((unsigned long)(__rip))
155
156#endif /* !__ASSEMBLY__ */
157
158
159#endif /* _ASM_X86_XEN_INTERFACE_64_H */
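Under gcc the __DECL_REG union gives every slot two 64-bit names and one 32-bit name, so code can use whichever register spelling reads best. A trivial sketch:

    static unsigned long example_ip(const struct cpu_user_regs *regs)
    {
    	/* rip and eip name the same 64-bit slot under gcc; the 32-bit
    	 * view is _eip. Non-gcc sources must spell it rip. */
    	return regs->rip;
    }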
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
new file mode 100644
index 00000000000..bc628998a1b
--- /dev/null
+++ b/arch/x86/include/asm/xen/page.h
@@ -0,0 +1,165 @@
1#ifndef _ASM_X86_XEN_PAGE_H
2#define _ASM_X86_XEN_PAGE_H
3
4#include <linux/pfn.h>
5
6#include <asm/uaccess.h>
7#include <asm/pgtable.h>
8
9#include <xen/features.h>
10
11/* Xen machine address */
12typedef struct xmaddr {
13 phys_addr_t maddr;
14} xmaddr_t;
15
16/* Xen pseudo-physical address */
17typedef struct xpaddr {
18 phys_addr_t paddr;
19} xpaddr_t;
20
21#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
22#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
23
24/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
25#define INVALID_P2M_ENTRY (~0UL)
26#define FOREIGN_FRAME_BIT (1UL<<31)
27#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
28
29/* Maximum amount of memory we can handle in a domain in pages */
30#define MAX_DOMAIN_PAGES \
31 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
32
33
34extern unsigned long get_phys_to_machine(unsigned long pfn);
35extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
36
37static inline unsigned long pfn_to_mfn(unsigned long pfn)
38{
39 if (xen_feature(XENFEAT_auto_translated_physmap))
40 return pfn;
41
42 return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
43}
44
45static inline int phys_to_machine_mapping_valid(unsigned long pfn)
46{
47 if (xen_feature(XENFEAT_auto_translated_physmap))
48 return 1;
49
50 return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
51}
52
53static inline unsigned long mfn_to_pfn(unsigned long mfn)
54{
55 unsigned long pfn;
56
57 if (xen_feature(XENFEAT_auto_translated_physmap))
58 return mfn;
59
60#if 0
61 if (unlikely((mfn >> machine_to_phys_order) != 0))
62 return max_mapnr;
63#endif
64
65 pfn = 0;
66 /*
67 * The array access can fail (e.g., device space beyond end of RAM).
68 * In such cases it doesn't matter what we return (we return garbage),
69 * but we must handle the fault without crashing!
70 */
71 __get_user(pfn, &machine_to_phys_mapping[mfn]);
72
73 return pfn;
74}
75
76static inline xmaddr_t phys_to_machine(xpaddr_t phys)
77{
78 unsigned offset = phys.paddr & ~PAGE_MASK;
79 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
80}
81
82static inline xpaddr_t machine_to_phys(xmaddr_t machine)
83{
84 unsigned offset = machine.maddr & ~PAGE_MASK;
85 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
86}
87
88/*
89 * We detect special mappings in one of two ways:
90 * 1. If the MFN is an I/O page then Xen will set the m2p entry
91 * to be outside our maximum possible pseudophys range.
92 * 2. If the MFN belongs to a different domain then we will certainly
93 * not have MFN in our p2m table. Conversely, if the page is ours,
94 * then we'll have p2m(m2p(MFN))==MFN.
95 * If we detect a special mapping then it doesn't have a 'struct page'.
96 * We force !pfn_valid() by returning an out-of-range pointer.
97 *
98 * NB. These checks require that, for any MFN that is not in our reservation,
99 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
100 * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
101 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
102 *
103 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
104 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
105 * require. In all the cases we care about, the FOREIGN_FRAME bit is
106 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
107 */
108static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
109{
110 extern unsigned long max_mapnr;
111 unsigned long pfn = mfn_to_pfn(mfn);
112 if ((pfn < max_mapnr)
113 && !xen_feature(XENFEAT_auto_translated_physmap)
114 && (get_phys_to_machine(pfn) != mfn))
115 return max_mapnr; /* force !pfn_valid() */
116 /* XXX fixme; not true with sparsemem */
117 return pfn;
118}
119
120/* VIRT <-> MACHINE conversion */
121#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
122#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
123#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
124
125static inline unsigned long pte_mfn(pte_t pte)
126{
127 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
128}
129
130static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
131{
132 pte_t pte;
133
134 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
135 (pgprot_val(pgprot) & __supported_pte_mask);
136
137 return pte;
138}
139
140static inline pteval_t pte_val_ma(pte_t pte)
141{
142 return pte.pte;
143}
144
145static inline pte_t __pte_ma(pteval_t x)
146{
147 return (pte_t) { .pte = x };
148}
149
150#define pmd_val_ma(v) ((v).pmd)
151#ifdef __PAGETABLE_PUD_FOLDED
152#define pud_val_ma(v) ((v).pgd.pgd)
153#else
154#define pud_val_ma(v) ((v).pud)
155#endif
156#define __pmd_ma(x) ((pmd_t) { (x) } )
157
158#define pgd_val_ma(x) ((x).pgd)
159
160
161xmaddr_t arbitrary_virt_to_machine(void *address);
162void make_lowmem_page_readonly(void *vaddr);
163void make_lowmem_page_readwrite(void *vaddr);
164
165#endif /* _ASM_X86_XEN_PAGE_H */
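Putting the conversion helpers together: obtaining the machine view of a kernel buffer and sanity-checking the p2m/m2p round trip might look like the sketch below (assumes a PV domain whose translation tables are live; the function is hypothetical):

    static void example_conversions(void *buf)
    {
    	unsigned long pfn = PFN_DOWN(__pa(buf));
    	unsigned long mfn = pfn_to_mfn(pfn);
    	xmaddr_t maddr = virt_to_machine(buf);	/* frame plus page offset */

    	/* For a page we own, the round trip must close. */
    	BUG_ON(mfn_to_pfn(mfn) != pfn);
    	(void)maddr;
    }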
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
new file mode 100644
index 00000000000..11b3bb86e17
--- /dev/null
+++ b/arch/x86/include/asm/xor.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "xor_32.h"
3#else
4# include "xor_64.h"
5#endif
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
new file mode 100644
index 00000000000..133b40a0f49
--- /dev/null
+++ b/arch/x86/include/asm/xor_32.h
@@ -0,0 +1,888 @@
1#ifndef _ASM_X86_XOR_32_H
2#define _ASM_X86_XOR_32_H
3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * High-speed RAID5 checksumming functions utilizing MMX instructions.
19 * Copyright (C) 1998 Ingo Molnar.
20 */
21
22#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
23#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
24#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
25#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
26#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
27#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
28
29#include <asm/i387.h>
30
31static void
32xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
33{
34 unsigned long lines = bytes >> 7;
35
36 kernel_fpu_begin();
37
38 asm volatile(
39#undef BLOCK
40#define BLOCK(i) \
41 LD(i, 0) \
42 LD(i + 1, 1) \
43 LD(i + 2, 2) \
44 LD(i + 3, 3) \
45 XO1(i, 0) \
46 ST(i, 0) \
47 XO1(i+1, 1) \
48 ST(i+1, 1) \
49 XO1(i + 2, 2) \
50 ST(i + 2, 2) \
51 XO1(i + 3, 3) \
52 ST(i + 3, 3)
53
54 " .align 32 ;\n"
55 " 1: ;\n"
56
57 BLOCK(0)
58 BLOCK(4)
59 BLOCK(8)
60 BLOCK(12)
61
62 " addl $128, %1 ;\n"
63 " addl $128, %2 ;\n"
64 " decl %0 ;\n"
65 " jnz 1b ;\n"
66 : "+r" (lines),
67 "+r" (p1), "+r" (p2)
68 :
69 : "memory");
70
71 kernel_fpu_end();
72}
73
74static void
75xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76 unsigned long *p3)
77{
78 unsigned long lines = bytes >> 7;
79
80 kernel_fpu_begin();
81
82 asm volatile(
83#undef BLOCK
84#define BLOCK(i) \
85 LD(i, 0) \
86 LD(i + 1, 1) \
87 LD(i + 2, 2) \
88 LD(i + 3, 3) \
89 XO1(i, 0) \
90 XO1(i + 1, 1) \
91 XO1(i + 2, 2) \
92 XO1(i + 3, 3) \
93 XO2(i, 0) \
94 ST(i, 0) \
95 XO2(i + 1, 1) \
96 ST(i + 1, 1) \
97 XO2(i + 2, 2) \
98 ST(i + 2, 2) \
99 XO2(i + 3, 3) \
100 ST(i + 3, 3)
101
102 " .align 32 ;\n"
103 " 1: ;\n"
104
105 BLOCK(0)
106 BLOCK(4)
107 BLOCK(8)
108 BLOCK(12)
109
110 " addl $128, %1 ;\n"
111 " addl $128, %2 ;\n"
112 " addl $128, %3 ;\n"
113 " decl %0 ;\n"
114 " jnz 1b ;\n"
115 : "+r" (lines),
116 "+r" (p1), "+r" (p2), "+r" (p3)
117 :
118 : "memory");
119
120 kernel_fpu_end();
121}
122
123static void
124xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
125 unsigned long *p3, unsigned long *p4)
126{
127 unsigned long lines = bytes >> 7;
128
129 kernel_fpu_begin();
130
131 asm volatile(
132#undef BLOCK
133#define BLOCK(i) \
134 LD(i, 0) \
135 LD(i + 1, 1) \
136 LD(i + 2, 2) \
137 LD(i + 3, 3) \
138 XO1(i, 0) \
139 XO1(i + 1, 1) \
140 XO1(i + 2, 2) \
141 XO1(i + 3, 3) \
142 XO2(i, 0) \
143 XO2(i + 1, 1) \
144 XO2(i + 2, 2) \
145 XO2(i + 3, 3) \
146 XO3(i, 0) \
147 ST(i, 0) \
148 XO3(i + 1, 1) \
149 ST(i + 1, 1) \
150 XO3(i + 2, 2) \
151 ST(i + 2, 2) \
152 XO3(i + 3, 3) \
153 ST(i + 3, 3)
154
155 " .align 32 ;\n"
156 " 1: ;\n"
157
158 BLOCK(0)
159 BLOCK(4)
160 BLOCK(8)
161 BLOCK(12)
162
163 " addl $128, %1 ;\n"
164 " addl $128, %2 ;\n"
165 " addl $128, %3 ;\n"
166 " addl $128, %4 ;\n"
167 " decl %0 ;\n"
168 " jnz 1b ;\n"
169 : "+r" (lines),
170 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
171 :
172 : "memory");
173
174 kernel_fpu_end();
175}
176
177
178static void
179xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180 unsigned long *p3, unsigned long *p4, unsigned long *p5)
181{
182 unsigned long lines = bytes >> 7;
183
184 kernel_fpu_begin();
185
186 /* Make sure GCC forgets anything it knows about p4 or p5,
187 such that it won't pass to the asm volatile below a
188 register that is shared with any other variable. That's
189 because we modify p4 and p5 there, but we can't mark them
190 as read/write, otherwise we'd overflow the 10-asm-operands
191 limit of GCC < 3.1. */
192 asm("" : "+r" (p4), "+r" (p5));
193
194 asm volatile(
195#undef BLOCK
196#define BLOCK(i) \
197 LD(i, 0) \
198 LD(i + 1, 1) \
199 LD(i + 2, 2) \
200 LD(i + 3, 3) \
201 XO1(i, 0) \
202 XO1(i + 1, 1) \
203 XO1(i + 2, 2) \
204 XO1(i + 3, 3) \
205 XO2(i, 0) \
206 XO2(i + 1, 1) \
207 XO2(i + 2, 2) \
208 XO2(i + 3, 3) \
209 XO3(i, 0) \
210 XO3(i + 1, 1) \
211 XO3(i + 2, 2) \
212 XO3(i + 3, 3) \
213 XO4(i, 0) \
214 ST(i, 0) \
215 XO4(i + 1, 1) \
216 ST(i + 1, 1) \
217 XO4(i + 2, 2) \
218 ST(i + 2, 2) \
219 XO4(i + 3, 3) \
220 ST(i + 3, 3)
221
222 " .align 32 ;\n"
223 " 1: ;\n"
224
225 BLOCK(0)
226 BLOCK(4)
227 BLOCK(8)
228 BLOCK(12)
229
230 " addl $128, %1 ;\n"
231 " addl $128, %2 ;\n"
232 " addl $128, %3 ;\n"
233 " addl $128, %4 ;\n"
234 " addl $128, %5 ;\n"
235 " decl %0 ;\n"
236 " jnz 1b ;\n"
237 : "+r" (lines),
238 "+r" (p1), "+r" (p2), "+r" (p3)
239 : "r" (p4), "r" (p5)
240 : "memory");
241
242 /* p4 and p5 were modified, and now the variables are dead.
243 Clobber them just to be sure nobody does something stupid
244 like assuming they have some legal value. */
245 asm("" : "=r" (p4), "=r" (p5));
246
247 kernel_fpu_end();
248}
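The empty asm used above is worth seeing in isolation: with a "+r" operand and no instruction template, gcc must materialize each value in its own register and assume it was rewritten, all without spending one of the scarce operand slots in the big asm. A minimal sketch of the idiom:

    static void barrier_demo(unsigned long *p4, unsigned long *p5)
    {
    	/* Emits no instructions; gcc merely forgets everything it
    	 * knew about p4 and p5 and keeps each in a private register. */
    	asm("" : "+r" (p4), "+r" (p5));
    }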
249
250#undef LD
251#undef XO1
252#undef XO2
253#undef XO3
254#undef XO4
255#undef ST
256#undef BLOCK
257
258static void
259xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
260{
261 unsigned long lines = bytes >> 6;
262
263 kernel_fpu_begin();
264
265 asm volatile(
266 " .align 32 ;\n"
267 " 1: ;\n"
268 " movq (%1), %%mm0 ;\n"
269 " movq 8(%1), %%mm1 ;\n"
270 " pxor (%2), %%mm0 ;\n"
271 " movq 16(%1), %%mm2 ;\n"
272 " movq %%mm0, (%1) ;\n"
273 " pxor 8(%2), %%mm1 ;\n"
274 " movq 24(%1), %%mm3 ;\n"
275 " movq %%mm1, 8(%1) ;\n"
276 " pxor 16(%2), %%mm2 ;\n"
277 " movq 32(%1), %%mm4 ;\n"
278 " movq %%mm2, 16(%1) ;\n"
279 " pxor 24(%2), %%mm3 ;\n"
280 " movq 40(%1), %%mm5 ;\n"
281 " movq %%mm3, 24(%1) ;\n"
282 " pxor 32(%2), %%mm4 ;\n"
283 " movq 48(%1), %%mm6 ;\n"
284 " movq %%mm4, 32(%1) ;\n"
285 " pxor 40(%2), %%mm5 ;\n"
286 " movq 56(%1), %%mm7 ;\n"
287 " movq %%mm5, 40(%1) ;\n"
288 " pxor 48(%2), %%mm6 ;\n"
289 " pxor 56(%2), %%mm7 ;\n"
290 " movq %%mm6, 48(%1) ;\n"
291 " movq %%mm7, 56(%1) ;\n"
292
293 " addl $64, %1 ;\n"
294 " addl $64, %2 ;\n"
295 " decl %0 ;\n"
296 " jnz 1b ;\n"
297 : "+r" (lines),
298 "+r" (p1), "+r" (p2)
299 :
300 : "memory");
301
302 kernel_fpu_end();
303}
304
305static void
306xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
307 unsigned long *p3)
308{
309 unsigned long lines = bytes >> 6;
310
311 kernel_fpu_begin();
312
313 asm volatile(
314 " .align 32,0x90 ;\n"
315 " 1: ;\n"
316 " movq (%1), %%mm0 ;\n"
317 " movq 8(%1), %%mm1 ;\n"
318 " pxor (%2), %%mm0 ;\n"
319 " movq 16(%1), %%mm2 ;\n"
320 " pxor 8(%2), %%mm1 ;\n"
321 " pxor (%3), %%mm0 ;\n"
322 " pxor 16(%2), %%mm2 ;\n"
323 " movq %%mm0, (%1) ;\n"
324 " pxor 8(%3), %%mm1 ;\n"
325 " pxor 16(%3), %%mm2 ;\n"
326 " movq 24(%1), %%mm3 ;\n"
327 " movq %%mm1, 8(%1) ;\n"
328 " movq 32(%1), %%mm4 ;\n"
329 " movq 40(%1), %%mm5 ;\n"
330 " pxor 24(%2), %%mm3 ;\n"
331 " movq %%mm2, 16(%1) ;\n"
332 " pxor 32(%2), %%mm4 ;\n"
333 " pxor 24(%3), %%mm3 ;\n"
334 " pxor 40(%2), %%mm5 ;\n"
335 " movq %%mm3, 24(%1) ;\n"
336 " pxor 32(%3), %%mm4 ;\n"
337 " pxor 40(%3), %%mm5 ;\n"
338 " movq 48(%1), %%mm6 ;\n"
339 " movq %%mm4, 32(%1) ;\n"
340 " movq 56(%1), %%mm7 ;\n"
341 " pxor 48(%2), %%mm6 ;\n"
342 " movq %%mm5, 40(%1) ;\n"
343 " pxor 56(%2), %%mm7 ;\n"
344 " pxor 48(%3), %%mm6 ;\n"
345 " pxor 56(%3), %%mm7 ;\n"
346 " movq %%mm6, 48(%1) ;\n"
347 " movq %%mm7, 56(%1) ;\n"
348
349 " addl $64, %1 ;\n"
350 " addl $64, %2 ;\n"
351 " addl $64, %3 ;\n"
352 " decl %0 ;\n"
353 " jnz 1b ;\n"
354 : "+r" (lines),
355 "+r" (p1), "+r" (p2), "+r" (p3)
356 :
357 : "memory" );
358
359 kernel_fpu_end();
360}
361
362static void
363xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
364 unsigned long *p3, unsigned long *p4)
365{
366 unsigned long lines = bytes >> 6;
367
368 kernel_fpu_begin();
369
370 asm volatile(
371 " .align 32,0x90 ;\n"
372 " 1: ;\n"
373 " movq (%1), %%mm0 ;\n"
374 " movq 8(%1), %%mm1 ;\n"
375 " pxor (%2), %%mm0 ;\n"
376 " movq 16(%1), %%mm2 ;\n"
377 " pxor 8(%2), %%mm1 ;\n"
378 " pxor (%3), %%mm0 ;\n"
379 " pxor 16(%2), %%mm2 ;\n"
380 " pxor 8(%3), %%mm1 ;\n"
381 " pxor (%4), %%mm0 ;\n"
382 " movq 24(%1), %%mm3 ;\n"
383 " pxor 16(%3), %%mm2 ;\n"
384 " pxor 8(%4), %%mm1 ;\n"
385 " movq %%mm0, (%1) ;\n"
386 " movq 32(%1), %%mm4 ;\n"
387 " pxor 24(%2), %%mm3 ;\n"
388 " pxor 16(%4), %%mm2 ;\n"
389 " movq %%mm1, 8(%1) ;\n"
390 " movq 40(%1), %%mm5 ;\n"
391 " pxor 32(%2), %%mm4 ;\n"
392 " pxor 24(%3), %%mm3 ;\n"
393 " movq %%mm2, 16(%1) ;\n"
394 " pxor 40(%2), %%mm5 ;\n"
395 " pxor 32(%3), %%mm4 ;\n"
396 " pxor 24(%4), %%mm3 ;\n"
397 " movq %%mm3, 24(%1) ;\n"
398 " movq 56(%1), %%mm7 ;\n"
399 " movq 48(%1), %%mm6 ;\n"
400 " pxor 40(%3), %%mm5 ;\n"
401 " pxor 32(%4), %%mm4 ;\n"
402 " pxor 48(%2), %%mm6 ;\n"
403 " movq %%mm4, 32(%1) ;\n"
404 " pxor 56(%2), %%mm7 ;\n"
405 " pxor 40(%4), %%mm5 ;\n"
406 " pxor 48(%3), %%mm6 ;\n"
407 " pxor 56(%3), %%mm7 ;\n"
408 " movq %%mm5, 40(%1) ;\n"
409 " pxor 48(%4), %%mm6 ;\n"
410 " pxor 56(%4), %%mm7 ;\n"
411 " movq %%mm6, 48(%1) ;\n"
412 " movq %%mm7, 56(%1) ;\n"
413
414 " addl $64, %1 ;\n"
415 " addl $64, %2 ;\n"
416 " addl $64, %3 ;\n"
417 " addl $64, %4 ;\n"
418 " decl %0 ;\n"
419 " jnz 1b ;\n"
420 : "+r" (lines),
421 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
422 :
423 : "memory");
424
425 kernel_fpu_end();
426}
427
428static void
429xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
430 unsigned long *p3, unsigned long *p4, unsigned long *p5)
431{
432 unsigned long lines = bytes >> 6;
433
434 kernel_fpu_begin();
435
436 /* Make sure GCC forgets anything it knows about p4 or p5,
437 such that it won't pass to the asm volatile below a
438 register that is shared with any other variable. That's
439 because we modify p4 and p5 there, but we can't mark them
440 as read/write, otherwise we'd overflow the 10-asm-operands
441 limit of GCC < 3.1. */
442 asm("" : "+r" (p4), "+r" (p5));
443
444 asm volatile(
445 " .align 32,0x90 ;\n"
446 " 1: ;\n"
447 " movq (%1), %%mm0 ;\n"
448 " movq 8(%1), %%mm1 ;\n"
449 " pxor (%2), %%mm0 ;\n"
450 " pxor 8(%2), %%mm1 ;\n"
451 " movq 16(%1), %%mm2 ;\n"
452 " pxor (%3), %%mm0 ;\n"
453 " pxor 8(%3), %%mm1 ;\n"
454 " pxor 16(%2), %%mm2 ;\n"
455 " pxor (%4), %%mm0 ;\n"
456 " pxor 8(%4), %%mm1 ;\n"
457 " pxor 16(%3), %%mm2 ;\n"
458 " movq 24(%1), %%mm3 ;\n"
459 " pxor (%5), %%mm0 ;\n"
460 " pxor 8(%5), %%mm1 ;\n"
461 " movq %%mm0, (%1) ;\n"
462 " pxor 16(%4), %%mm2 ;\n"
463 " pxor 24(%2), %%mm3 ;\n"
464 " movq %%mm1, 8(%1) ;\n"
465 " pxor 16(%5), %%mm2 ;\n"
466 " pxor 24(%3), %%mm3 ;\n"
467 " movq 32(%1), %%mm4 ;\n"
468 " movq %%mm2, 16(%1) ;\n"
469 " pxor 24(%4), %%mm3 ;\n"
470 " pxor 32(%2), %%mm4 ;\n"
471 " movq 40(%1), %%mm5 ;\n"
472 " pxor 24(%5), %%mm3 ;\n"
473 " pxor 32(%3), %%mm4 ;\n"
474 " pxor 40(%2), %%mm5 ;\n"
475 " movq %%mm3, 24(%1) ;\n"
476 " pxor 32(%4), %%mm4 ;\n"
477 " pxor 40(%3), %%mm5 ;\n"
478 " movq 48(%1), %%mm6 ;\n"
479 " movq 56(%1), %%mm7 ;\n"
480 " pxor 32(%5), %%mm4 ;\n"
481 " pxor 40(%4), %%mm5 ;\n"
482 " pxor 48(%2), %%mm6 ;\n"
483 " pxor 56(%2), %%mm7 ;\n"
484 " movq %%mm4, 32(%1) ;\n"
485 " pxor 48(%3), %%mm6 ;\n"
486 " pxor 56(%3), %%mm7 ;\n"
487 " pxor 40(%5), %%mm5 ;\n"
488 " pxor 48(%4), %%mm6 ;\n"
489 " pxor 56(%4), %%mm7 ;\n"
490 " movq %%mm5, 40(%1) ;\n"
491 " pxor 48(%5), %%mm6 ;\n"
492 " pxor 56(%5), %%mm7 ;\n"
493 " movq %%mm6, 48(%1) ;\n"
494 " movq %%mm7, 56(%1) ;\n"
495
496 " addl $64, %1 ;\n"
497 " addl $64, %2 ;\n"
498 " addl $64, %3 ;\n"
499 " addl $64, %4 ;\n"
500 " addl $64, %5 ;\n"
501 " decl %0 ;\n"
502 " jnz 1b ;\n"
503 : "+r" (lines),
504 "+r" (p1), "+r" (p2), "+r" (p3)
505 : "r" (p4), "r" (p5)
506 : "memory");
507
508 /* p4 and p5 were modified, and now the variables are dead.
509 Clobber them just to be sure nobody does something stupid
510 like assuming they have some legal value. */
511 asm("" : "=r" (p4), "=r" (p5));
512
513 kernel_fpu_end();
514}
515
516static struct xor_block_template xor_block_pII_mmx = {
517 .name = "pII_mmx",
518 .do_2 = xor_pII_mmx_2,
519 .do_3 = xor_pII_mmx_3,
520 .do_4 = xor_pII_mmx_4,
521 .do_5 = xor_pII_mmx_5,
522};
523
524static struct xor_block_template xor_block_p5_mmx = {
525 .name = "p5_mmx",
526 .do_2 = xor_p5_mmx_2,
527 .do_3 = xor_p5_mmx_3,
528 .do_4 = xor_p5_mmx_4,
529 .do_5 = xor_p5_mmx_5,
530};
531
532/*
533 * Cache avoiding checksumming functions utilizing KNI instructions
534 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
535 */
536
537#define XMMS_SAVE \
538do { \
539 preempt_disable(); \
540 cr0 = read_cr0(); \
541 clts(); \
542 asm volatile( \
543 "movups %%xmm0,(%0) ;\n\t" \
544 "movups %%xmm1,0x10(%0) ;\n\t" \
545 "movups %%xmm2,0x20(%0) ;\n\t" \
546 "movups %%xmm3,0x30(%0) ;\n\t" \
547 : \
548 : "r" (xmm_save) \
549 : "memory"); \
550} while (0)
551
552#define XMMS_RESTORE \
553do { \
554 asm volatile( \
555 "sfence ;\n\t" \
556 "movups (%0),%%xmm0 ;\n\t" \
557 "movups 0x10(%0),%%xmm1 ;\n\t" \
558 "movups 0x20(%0),%%xmm2 ;\n\t" \
559 "movups 0x30(%0),%%xmm3 ;\n\t" \
560 : \
561 : "r" (xmm_save) \
562 : "memory"); \
563 write_cr0(cr0); \
564 preempt_enable(); \
565} while (0)
566
567#define ALIGN16 __attribute__((aligned(16)))
568
569#define OFFS(x) "16*("#x")"
570#define PF_OFFS(x) "256+16*("#x")"
571#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
572#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
573#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
574#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
575#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
576#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
577#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
578#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
579#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
580#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
581#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
582#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
583#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
584
585
586static void
587xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
588{
589 unsigned long lines = bytes >> 8;
590 char xmm_save[16*4] ALIGN16;
591 int cr0;
592
593 XMMS_SAVE;
594
595 asm volatile(
596#undef BLOCK
597#define BLOCK(i) \
598 LD(i, 0) \
599 LD(i + 1, 1) \
600 PF1(i) \
601 PF1(i + 2) \
602 LD(i + 2, 2) \
603 LD(i + 3, 3) \
604 PF0(i + 4) \
605 PF0(i + 6) \
606 XO1(i, 0) \
607 XO1(i + 1, 1) \
608 XO1(i + 2, 2) \
609 XO1(i + 3, 3) \
610 ST(i, 0) \
611 ST(i + 1, 1) \
612 ST(i + 2, 2) \
613 ST(i + 3, 3) \
614
615
616 PF0(0)
617 PF0(2)
618
619 " .align 32 ;\n"
620 " 1: ;\n"
621
622 BLOCK(0)
623 BLOCK(4)
624 BLOCK(8)
625 BLOCK(12)
626
627 " addl $256, %1 ;\n"
628 " addl $256, %2 ;\n"
629 " decl %0 ;\n"
630 " jnz 1b ;\n"
631 : "+r" (lines),
632 "+r" (p1), "+r" (p2)
633 :
634 : "memory");
635
636 XMMS_RESTORE;
637}
638
639static void
640xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
641 unsigned long *p3)
642{
643 unsigned long lines = bytes >> 8;
644 char xmm_save[16*4] ALIGN16;
645 int cr0;
646
647 XMMS_SAVE;
648
649 asm volatile(
650#undef BLOCK
651#define BLOCK(i) \
652 PF1(i) \
653 PF1(i + 2) \
654 LD(i,0) \
655 LD(i + 1, 1) \
656 LD(i + 2, 2) \
657 LD(i + 3, 3) \
658 PF2(i) \
659 PF2(i + 2) \
660 PF0(i + 4) \
661 PF0(i + 6) \
662 XO1(i,0) \
663 XO1(i + 1, 1) \
664 XO1(i + 2, 2) \
665 XO1(i + 3, 3) \
666 XO2(i,0) \
667 XO2(i + 1, 1) \
668 XO2(i + 2, 2) \
669 XO2(i + 3, 3) \
670 ST(i,0) \
671 ST(i + 1, 1) \
672 ST(i + 2, 2) \
673 ST(i + 3, 3) \
674
675
676 PF0(0)
677 PF0(2)
678
679 " .align 32 ;\n"
680 " 1: ;\n"
681
682 BLOCK(0)
683 BLOCK(4)
684 BLOCK(8)
685 BLOCK(12)
686
687 " addl $256, %1 ;\n"
688 " addl $256, %2 ;\n"
689 " addl $256, %3 ;\n"
690 " decl %0 ;\n"
691 " jnz 1b ;\n"
692 : "+r" (lines),
693 "+r" (p1), "+r"(p2), "+r"(p3)
694 :
695 : "memory" );
696
697 XMMS_RESTORE;
698}
699
700static void
701xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
702 unsigned long *p3, unsigned long *p4)
703{
704 unsigned long lines = bytes >> 8;
705 char xmm_save[16*4] ALIGN16;
706 int cr0;
707
708 XMMS_SAVE;
709
710 asm volatile(
711#undef BLOCK
712#define BLOCK(i) \
713 PF1(i) \
714 PF1(i + 2) \
715 LD(i,0) \
716 LD(i + 1, 1) \
717 LD(i + 2, 2) \
718 LD(i + 3, 3) \
719 PF2(i) \
720 PF2(i + 2) \
721 XO1(i,0) \
722 XO1(i + 1, 1) \
723 XO1(i + 2, 2) \
724 XO1(i + 3, 3) \
725 PF3(i) \
726 PF3(i + 2) \
727 PF0(i + 4) \
728 PF0(i + 6) \
729 XO2(i,0) \
730 XO2(i + 1, 1) \
731 XO2(i + 2, 2) \
732 XO2(i + 3, 3) \
733 XO3(i,0) \
734 XO3(i + 1, 1) \
735 XO3(i + 2, 2) \
736 XO3(i + 3, 3) \
737 ST(i,0) \
738 ST(i + 1, 1) \
739 ST(i + 2, 2) \
740 ST(i + 3, 3) \
741
742
743 PF0(0)
744 PF0(2)
745
746 " .align 32 ;\n"
747 " 1: ;\n"
748
749 BLOCK(0)
750 BLOCK(4)
751 BLOCK(8)
752 BLOCK(12)
753
754 " addl $256, %1 ;\n"
755 " addl $256, %2 ;\n"
756 " addl $256, %3 ;\n"
757 " addl $256, %4 ;\n"
758 " decl %0 ;\n"
759 " jnz 1b ;\n"
760 : "+r" (lines),
761 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
762 :
763 : "memory" );
764
765 XMMS_RESTORE;
766}
767
768static void
769xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
770 unsigned long *p3, unsigned long *p4, unsigned long *p5)
771{
772 unsigned long lines = bytes >> 8;
773 char xmm_save[16*4] ALIGN16;
774 int cr0;
775
776 XMMS_SAVE;
777
778 /* Make sure GCC forgets anything it knows about p4 or p5,
779 such that it won't pass to the asm volatile below a
780 register that is shared with any other variable. That's
781 because we modify p4 and p5 there, but we can't mark them
782 as read/write, otherwise we'd overflow the 10-asm-operands
783 limit of GCC < 3.1. */
784 asm("" : "+r" (p4), "+r" (p5));
785
786 asm volatile(
787#undef BLOCK
788#define BLOCK(i) \
789 PF1(i) \
790 PF1(i + 2) \
791 LD(i,0) \
792 LD(i + 1, 1) \
793 LD(i + 2, 2) \
794 LD(i + 3, 3) \
795 PF2(i) \
796 PF2(i + 2) \
797 XO1(i,0) \
798 XO1(i + 1, 1) \
799 XO1(i + 2, 2) \
800 XO1(i + 3, 3) \
801 PF3(i) \
802 PF3(i + 2) \
803 XO2(i,0) \
804 XO2(i + 1, 1) \
805 XO2(i + 2, 2) \
806 XO2(i + 3, 3) \
807 PF4(i) \
808 PF4(i + 2) \
809 PF0(i + 4) \
810 PF0(i + 6) \
811 XO3(i,0) \
812 XO3(i + 1, 1) \
813 XO3(i + 2, 2) \
814 XO3(i + 3, 3) \
815 XO4(i,0) \
816 XO4(i + 1, 1) \
817 XO4(i + 2, 2) \
818 XO4(i + 3, 3) \
819 ST(i,0) \
820 ST(i + 1, 1) \
821 ST(i + 2, 2) \
822 ST(i + 3, 3) \
823
824
825 PF0(0)
826 PF0(2)
827
828 " .align 32 ;\n"
829 " 1: ;\n"
830
831 BLOCK(0)
832 BLOCK(4)
833 BLOCK(8)
834 BLOCK(12)
835
836 " addl $256, %1 ;\n"
837 " addl $256, %2 ;\n"
838 " addl $256, %3 ;\n"
839 " addl $256, %4 ;\n"
840 " addl $256, %5 ;\n"
841 " decl %0 ;\n"
842 " jnz 1b ;\n"
843 : "+r" (lines),
844 "+r" (p1), "+r" (p2), "+r" (p3)
845 : "r" (p4), "r" (p5)
846 : "memory");
847
848 /* p4 and p5 were modified, and now the variables are dead.
849 Clobber them just to be sure nobody does something stupid
850 like assuming they have some legal value. */
851 asm("" : "=r" (p4), "=r" (p5));
852
853 XMMS_RESTORE;
854}
855
856static struct xor_block_template xor_block_pIII_sse = {
857 .name = "pIII_sse",
858 .do_2 = xor_sse_2,
859 .do_3 = xor_sse_3,
860 .do_4 = xor_sse_4,
861 .do_5 = xor_sse_5,
862};
863
864/* Also try the generic routines. */
865#include <asm-generic/xor.h>
866
867#undef XOR_TRY_TEMPLATES
868#define XOR_TRY_TEMPLATES \
869do { \
870 xor_speed(&xor_block_8regs); \
871 xor_speed(&xor_block_8regs_p); \
872 xor_speed(&xor_block_32regs); \
873 xor_speed(&xor_block_32regs_p); \
874 if (cpu_has_xmm) \
875 xor_speed(&xor_block_pIII_sse); \
876 if (cpu_has_mmx) { \
877 xor_speed(&xor_block_pII_mmx); \
878 xor_speed(&xor_block_p5_mmx); \
879 } \
880} while (0)
881
882/* We force the use of the SSE xor block because it can write around L2.
883 We may also be able to load into the L1 only depending on how the cpu
884 deals with a load to a line that is being prefetched. */
885#define XOR_SELECT_TEMPLATE(FASTEST) \
886 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
887
888#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
new file mode 100644
index 00000000000..1549b5e261f
--- /dev/null
+++ b/arch/x86/include/asm/xor_64.h
@@ -0,0 +1,361 @@
1#ifndef _ASM_X86_XOR_64_H
2#define _ASM_X86_XOR_64_H
3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17
18/*
19 * Cache avoiding checksumming functions utilizing KNI instructions
20 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21 */
22
23/*
24 * Based on
25 * High-speed RAID5 checksumming functions utilizing SSE instructions.
26 * Copyright (C) 1998 Ingo Molnar.
27 */
28
29/*
30 * x86-64 changes / gcc fixes from Andi Kleen.
31 * Copyright 2002 Andi Kleen, SuSE Labs.
32 *
33 * This hasn't been optimized for the hammer yet, but there are likely
34 * no advantages to be had from x86-64 here anyway.
35 */
36
37typedef struct {
38 unsigned long a, b;
39} __attribute__((aligned(16))) xmm_store_t;
40
41/* Doesn't use gcc to save the XMM registers, because there is no easy way to
42 tell it to do a clts before the register saving. */
43#define XMMS_SAVE \
44do { \
45 preempt_disable(); \
46 asm volatile( \
47 "movq %%cr0,%0 ;\n\t" \
48 "clts ;\n\t" \
49 "movups %%xmm0,(%1) ;\n\t" \
50 "movups %%xmm1,0x10(%1) ;\n\t" \
51 "movups %%xmm2,0x20(%1) ;\n\t" \
52 "movups %%xmm3,0x30(%1) ;\n\t" \
53 : "=&r" (cr0) \
54 : "r" (xmm_save) \
55 : "memory"); \
56} while (0)
57
58#define XMMS_RESTORE \
59do { \
60 asm volatile( \
61 "sfence ;\n\t" \
62 "movups (%1),%%xmm0 ;\n\t" \
63 "movups 0x10(%1),%%xmm1 ;\n\t" \
64 "movups 0x20(%1),%%xmm2 ;\n\t" \
65 "movups 0x30(%1),%%xmm3 ;\n\t" \
66 "movq %0,%%cr0 ;\n\t" \
67 : \
68 : "r" (cr0), "r" (xmm_save) \
69 : "memory"); \
70 preempt_enable(); \
71} while (0)
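/*
 * Usage sketch (an annotation mirroring the locals every function below
 * declares; names are theirs):
 *
 *	unsigned long cr0;		// scratch slot for the saved %cr0
 *	xmm_store_t xmm_save[4];	// 16-byte aligned spill area
 *
 *	XMMS_SAVE;	// preempt off, save %cr0, clts, spill xmm0-xmm3
 *	...use xmm0-xmm3 freely...
 *	XMMS_RESTORE;	// sfence, reload xmm0-xmm3, restore %cr0, preempt on
 */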
72
73#define OFFS(x) "16*("#x")"
74#define PF_OFFS(x) "256+16*("#x")"
75#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
76#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
77#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
78#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
79#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
80#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
81#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
82#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
83#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
84#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
85#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
86#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
87#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
88
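/*
 * One possible expansion of these string-pasting macros, for clarity
 * (offsets count 16-byte xmm words; prefetches run 256 bytes ahead):
 *
 *	LD(0, 0)  =>  " movaps 16*(0)(%[p1]), %%xmm0 ;\n"
 *	XO1(0, 0) =>  " xorps 16*(0)(%[p2]), %%xmm0 ;\n"
 *	ST(0, 0)  =>  " movaps %%xmm0, 16*(0)(%[p1]) ;\n"
 *	PF0(4)    =>  " prefetchnta 256+16*(4)(%[p1]) ;\n"
 */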
89
90static void
91xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92{
93 unsigned int lines = bytes >> 8;
94 unsigned long cr0;
95 xmm_store_t xmm_save[4];
96
97 XMMS_SAVE;
98
99 asm volatile(
100#undef BLOCK
101#define BLOCK(i) \
102 LD(i, 0) \
103 LD(i + 1, 1) \
104 PF1(i) \
105 PF1(i + 2) \
106 LD(i + 2, 2) \
107 LD(i + 3, 3) \
108 PF0(i + 4) \
109 PF0(i + 6) \
110 XO1(i, 0) \
111 XO1(i + 1, 1) \
112 XO1(i + 2, 2) \
113 XO1(i + 3, 3) \
114 ST(i, 0) \
115 ST(i + 1, 1) \
116 ST(i + 2, 2) \
117 ST(i + 3, 3) \
118
119
120 PF0(0)
121 PF0(2)
122
123 " .align 32 ;\n"
124 " 1: ;\n"
125
126 BLOCK(0)
127 BLOCK(4)
128 BLOCK(8)
129 BLOCK(12)
130
131 " addq %[inc], %[p1] ;\n"
132 " addq %[inc], %[p2] ;\n"
133 " decl %[cnt] ; jnz 1b"
134 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135 : [inc] "r" (256UL)
136 : "memory");
137
138 XMMS_RESTORE;
139}
140
141static void
142xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 unsigned long *p3)
144{
145 unsigned int lines = bytes >> 8;
146 xmm_store_t xmm_save[4];
147 unsigned long cr0;
148
149 XMMS_SAVE;
150
151 asm volatile(
152#undef BLOCK
153#define BLOCK(i) \
154 PF1(i) \
155 PF1(i + 2) \
156 LD(i, 0) \
157 LD(i + 1, 1) \
158 LD(i + 2, 2) \
159 LD(i + 3, 3) \
160 PF2(i) \
161 PF2(i + 2) \
162 PF0(i + 4) \
163 PF0(i + 6) \
164 XO1(i, 0) \
165 XO1(i + 1, 1) \
166 XO1(i + 2, 2) \
167 XO1(i + 3, 3) \
168 XO2(i, 0) \
169 XO2(i + 1, 1) \
170 XO2(i + 2, 2) \
171 XO2(i + 3, 3) \
172 ST(i, 0) \
173 ST(i + 1, 1) \
174 ST(i + 2, 2) \
175 ST(i + 3, 3) \
176
177
178 PF0(0)
179 PF0(2)
180
181 " .align 32 ;\n"
182 " 1: ;\n"
183
184 BLOCK(0)
185 BLOCK(4)
186 BLOCK(8)
187 BLOCK(12)
188
189 " addq %[inc], %[p1] ;\n"
190 " addq %[inc], %[p2] ;\n"
191 " addq %[inc], %[p3] ;\n"
192 " decl %[cnt] ; jnz 1b"
193 : [cnt] "+r" (lines),
194 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 : [inc] "r" (256UL)
196 : "memory");
197 XMMS_RESTORE;
198}
199
200static void
201xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 unsigned long *p3, unsigned long *p4)
203{
204 unsigned int lines = bytes >> 8;
205 xmm_store_t xmm_save[4];
206 unsigned long cr0;
207
208 XMMS_SAVE;
209
210 asm volatile(
211#undef BLOCK
212#define BLOCK(i) \
213 PF1(i) \
214 PF1(i + 2) \
215 LD(i, 0) \
216 LD(i + 1, 1) \
217 LD(i + 2, 2) \
218 LD(i + 3, 3) \
219 PF2(i) \
220 PF2(i + 2) \
221 XO1(i, 0) \
222 XO1(i + 1, 1) \
223 XO1(i + 2, 2) \
224 XO1(i + 3, 3) \
225 PF3(i) \
226 PF3(i + 2) \
227 PF0(i + 4) \
228 PF0(i + 6) \
229 XO2(i, 0) \
230 XO2(i + 1, 1) \
231 XO2(i + 2, 2) \
232 XO2(i + 3, 3) \
233 XO3(i, 0) \
234 XO3(i + 1, 1) \
235 XO3(i + 2, 2) \
236 XO3(i + 3, 3) \
237 ST(i, 0) \
238 ST(i + 1, 1) \
239 ST(i + 2, 2) \
240 ST(i + 3, 3) \
241
242
243 PF0(0)
244 PF0(2)
245
246 " .align 32 ;\n"
247 " 1: ;\n"
248
249 BLOCK(0)
250 BLOCK(4)
251 BLOCK(8)
252 BLOCK(12)
253
254 " addq %[inc], %[p1] ;\n"
255 " addq %[inc], %[p2] ;\n"
256 " addq %[inc], %[p3] ;\n"
257 " addq %[inc], %[p4] ;\n"
258 " decl %[cnt] ; jnz 1b"
259 : [cnt] "+c" (lines),
260 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261 : [inc] "r" (256UL)
262 : "memory" );
263
264 XMMS_RESTORE;
265}
266
267static void
268xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 unsigned long *p3, unsigned long *p4, unsigned long *p5)
270{
271 unsigned int lines = bytes >> 8;
272 xmm_store_t xmm_save[4];
273 unsigned long cr0;
274
275 XMMS_SAVE;
276
277 asm volatile(
278#undef BLOCK
279#define BLOCK(i) \
280 PF1(i) \
281 PF1(i + 2) \
282 LD(i, 0) \
283 LD(i + 1, 1) \
284 LD(i + 2, 2) \
285 LD(i + 3, 3) \
286 PF2(i) \
287 PF2(i + 2) \
288 XO1(i, 0) \
289 XO1(i + 1, 1) \
290 XO1(i + 2, 2) \
291 XO1(i + 3, 3) \
292 PF3(i) \
293 PF3(i + 2) \
294 XO2(i, 0) \
295 XO2(i + 1, 1) \
296 XO2(i + 2, 2) \
297 XO2(i + 3, 3) \
298 PF4(i) \
299 PF4(i + 2) \
300 PF0(i + 4) \
301 PF0(i + 6) \
302 XO3(i, 0) \
303 XO3(i + 1, 1) \
304 XO3(i + 2, 2) \
305 XO3(i + 3, 3) \
306 XO4(i, 0) \
307 XO4(i + 1, 1) \
308 XO4(i + 2, 2) \
309 XO4(i + 3, 3) \
310 ST(i, 0) \
311 ST(i + 1, 1) \
312 ST(i + 2, 2) \
313 ST(i + 3, 3) \
314
315
316 PF0(0)
317 PF0(2)
318
319 " .align 32 ;\n"
320 " 1: ;\n"
321
322 BLOCK(0)
323 BLOCK(4)
324 BLOCK(8)
325 BLOCK(12)
326
327 " addq %[inc], %[p1] ;\n"
328 " addq %[inc], %[p2] ;\n"
329 " addq %[inc], %[p3] ;\n"
330 " addq %[inc], %[p4] ;\n"
331 " addq %[inc], %[p5] ;\n"
332 " decl %[cnt] ; jnz 1b"
333 : [cnt] "+c" (lines),
334 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335 [p5] "+r" (p5)
336 : [inc] "r" (256UL)
337 : "memory");
338
339 XMMS_RESTORE;
340}
341
342static struct xor_block_template xor_block_sse = {
343 .name = "generic_sse",
344 .do_2 = xor_sse_2,
345 .do_3 = xor_sse_3,
346 .do_4 = xor_sse_4,
347 .do_5 = xor_sse_5,
348};
349
350#undef XOR_TRY_TEMPLATES
351#define XOR_TRY_TEMPLATES \
352do { \
353 xor_speed(&xor_block_sse); \
354} while (0)
355
356/* We force the use of the SSE xor block because it can write around L2.
357 We may also be able to load into the L1 only depending on how the cpu
358 deals with a load to a line that is being prefetched. */
359#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360
361#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
new file mode 100644
index 00000000000..08e9a1ac07a
--- /dev/null
+++ b/arch/x86/include/asm/xsave.h
@@ -0,0 +1,118 @@
1#ifndef __ASM_X86_XSAVE_H
2#define __ASM_X86_XSAVE_H
3
4#include <linux/types.h>
5#include <asm/processor.h>
6#include <asm/i387.h>
7
8#define XSTATE_FP 0x1
9#define XSTATE_SSE 0x2
10
11#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
12
13#define FXSAVE_SIZE 512
14
15/*
16 * These are the features that the OS can handle currently.
17 */
18#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE)
19
20#ifdef CONFIG_X86_64
21#define REX_PREFIX "0x48, "
22#else
23#define REX_PREFIX
24#endif
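/*
 * Note on the .byte sequences below (an annotation; encoding details
 * stated from the instruction format): 0x0f,0xae is the xsave-family
 * opcode and the modrm byte selects the member -- 0x27 is /4 = xsave,
 * 0x2f is /5 = xrstor, both based on (%rdi)/(%edi), which is why the
 * asms pin the pointer with the "D" constraint.  On 64-bit, REX_PREFIX
 * prepends 0x48 (REX.W) for the 64-bit forms; they are hand-encoded,
 * presumably because contemporary assemblers lacked the mnemonics.
 */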
25
26extern unsigned int xstate_size;
27extern u64 pcntxt_mask;
28extern struct xsave_struct *init_xstate_buf;
29
30extern void xsave_cntxt_init(void);
31extern void xsave_init(void);
32extern int init_fpu(struct task_struct *child);
33extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
34 void __user *fpstate,
35 struct _fpx_sw_bytes *sw);
36
37static inline int xrstor_checking(struct xsave_struct *fx)
38{
39 int err;
40
41 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
42 "2:\n"
43 ".section .fixup,\"ax\"\n"
44 "3: movl $-1,%[err]\n"
45 " jmp 2b\n"
46 ".previous\n"
47 _ASM_EXTABLE(1b, 3b)
48 : [err] "=r" (err)
49 : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
50 : "memory");
51
52 return err;
53}
54
55static inline int xsave_user(struct xsave_struct __user *buf)
56{
57 int err;
58 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
59 "2:\n"
60 ".section .fixup,\"ax\"\n"
61 "3: movl $-1,%[err]\n"
62 " jmp 2b\n"
63 ".previous\n"
64 ".section __ex_table,\"a\"\n"
65 _ASM_ALIGN "\n"
66 _ASM_PTR "1b,3b\n"
67 ".previous"
68 : [err] "=r" (err)
69 : "D" (buf), "a" (-1), "d" (-1), "0" (0)
70 : "memory");
71 if (unlikely(err) && __clear_user(buf, xstate_size))
72 err = -EFAULT;
73 /* No need to clear here because the caller clears USED_MATH */
74 return err;
75}
76
77static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
78{
79 int err;
80 struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
81 u32 lmask = mask;
82 u32 hmask = mask >> 32;
83
84 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
85 "2:\n"
86 ".section .fixup,\"ax\"\n"
87 "3: movl $-1,%[err]\n"
88 " jmp 2b\n"
89 ".previous\n"
90 ".section __ex_table,\"a\"\n"
91 _ASM_ALIGN "\n"
92 _ASM_PTR "1b,3b\n"
93 ".previous"
94 : [err] "=r" (err)
95 : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
96 : "memory"); /* memory required? */
97 return err;
98}
99
100static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
101{
102 u32 lmask = mask;
103 u32 hmask = mask >> 32;
104
105 asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
106 : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
107 : "memory");
108}
109
110static inline void xsave(struct task_struct *tsk)
111{
112 /* The hand-assembled .byte sequence can't encode extended registers,
113 so force the compiler to pick an addressing mode without them. */
114 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
115 : : "D" (&(tsk->thread.xstate->xsave)),
116 "a" (-1), "d"(-1) : "memory");
117}
118#endif
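The lmask/hmask pairs above follow the XSAVE convention of passing the 64-bit feature mask in edx:eax. A self-contained sketch of the split (hypothetical helper; plain C99 types standing in for the kernel's u32/u64):

#include <stdint.h>

static inline void xstate_mask_split(uint64_t mask,
				     uint32_t *lmask, uint32_t *hmask)
{
	*lmask = (uint32_t)mask;		/* passed in %eax */
	*hmask = (uint32_t)(mask >> 32);	/* passed in %edx */
}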
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 3db651fc8ec..d7e5a58ee22 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE
 # Do not profile debug and lowlevel utilities
 CFLAGS_REMOVE_tsc.o = -pg
 CFLAGS_REMOVE_rtc.o = -pg
-CFLAGS_REMOVE_paravirt.o = -pg
+CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
 endif
 
 #
@@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
 CFLAGS_tsc.o := $(nostackp)
 
 obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
-obj-y += traps_$(BITS).o irq_$(BITS).o
+obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y += time_$(BITS).o ioport.o ldt.o
 obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
 obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
 
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
 obj-y += process.o
-obj-y += i387.o
+obj-y += i387.o xsave.o
 obj-y += ptrace.o
 obj-y += ds.o
 obj-$(CONFIG_X86_32) += tls.o
@@ -51,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
 obj-$(CONFIG_MCA) += mca_32.o
 obj-$(CONFIG_X86_MSR) += msr.o
 obj-$(CONFIG_X86_CPUID) += cpuid.o
-obj-$(CONFIG_MICROCODE) += microcode.o
 obj-$(CONFIG_PCI) += early-quirks.o
 apm-y := apm_32.o
 obj-$(CONFIG_APM) += apm.o
@@ -61,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
 obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
 obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
 obj-$(CONFIG_X86_MPPARSE) += mpparse.o
-obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o
-obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
+obj-$(CONFIG_X86_IO_APIC) += io_apic.o
 obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
 obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
 obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
 obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
 obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
+obj-$(CONFIG_X86_ES7000) += es7000_32.o
 obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
 obj-y += vsmp_64.o
 obj-$(CONFIG_KPROBES) += kprobes.o
@@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
 obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
 obj-$(CONFIG_KVM_GUEST) += kvm.o
 obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
 obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
 
 obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -99,11 +99,18 @@ scx200-y += scx200_32.o
 
 obj-$(CONFIG_OLPC) += olpc.o
 
+microcode-y := microcode_core.o
+microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
+microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
+obj-$(CONFIG_MICROCODE) += microcode.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
 	obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
-	obj-y += bios_uv.o
+	obj-y += bios_uv.o uv_irq.o uv_sysfs.o
+	obj-y += genx2apic_cluster.o
+	obj-y += genx2apic_phys.o
 	obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
 	obj-$(CONFIG_AUDIT) += audit_64.o
 
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index bfd10fd211c..8c1f76abae9 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
 #ifdef CONFIG_X86_64
 
 #include <asm/proto.h>
-#include <asm/genapic.h>
 
 #else /* X86 */
 
@@ -97,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 #warning ACPI uses CMPXCHG, i486 and later hardware
 #endif
 
-static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
-
 /* --------------------------------------------------------------------------
                               Boot-time Configuration
    -------------------------------------------------------------------------- */
@@ -156,6 +153,9 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
 }
 
 #ifdef CONFIG_PCI_MMCONFIG
+
+static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
+
 /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
 struct acpi_mcfg_allocation *pci_mmcfg_config;
 int pci_mmcfg_config_num;
@@ -253,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
 		return;
 	}
 
-#ifdef CONFIG_X86_32
 	if (boot_cpu_physical_apicid != -1U)
 		ver = apic_version[boot_cpu_physical_apicid];
-#endif
 
 	generic_processor_info(id, ver);
 }
@@ -775,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
 
 	set_fixmap_nocache(FIX_APIC_BASE, address);
 	if (boot_cpu_physical_apicid == -1U) {
-		boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
+		boot_cpu_physical_apicid = read_apic_id();
-#ifdef CONFIG_X86_32
 		apic_version[boot_cpu_physical_apicid] =
 			 GET_APIC_VERSION(apic_read(APIC_LVR));
-#endif
 	}
 }
 
@@ -1141,7 +1137,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
 		return gsi;
 	}
 	if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-		pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n",
+		pr_debug("Pin %d-%d already programmed\n",
 			 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
 #ifdef CONFIG_X86_32
 		return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
@@ -1261,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX
 		       "Error parsing interrupt source overrides entry\n");
@@ -1281,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
 
 	count =
 	    acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
-				  NR_IRQ_VECTORS);
+				  nr_irqs);
 	if (count < 0) {
 		printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
 		/* TBD: Cleanup to allow fallback to MPS */
@@ -1351,7 +1347,9 @@ static void __init acpi_process_madt(void)
 			acpi_ioapic = 1;
 
 			smp_found_config = 1;
+#ifdef CONFIG_X86_32
 			setup_apic_routing();
+#endif
 		}
 	}
 	if (error == -EINVAL) {
@@ -1421,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
  */
 static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
 {
-	pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
-	acpi_skip_timer_override = 1;
+	/*
+	 * The ati_ixp4x0_rev() early PCI quirk should have set
+	 * the acpi_skip_timer_override flag already:
+	 */
+	if (!acpi_skip_timer_override) {
+		WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
+		pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
+			d->ident);
+		acpi_skip_timer_override = 1;
+	}
 	return 0;
 }
 
@@ -1593,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
 		     },
 	 },
+	{}
+};
+
+/* second table for DMI checks that should run after early-quirks */
+static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
 	/*
 	 * HP laptops which use a DSDT reporting as HP/SB400/10000,
 	 * which includes some code which overrides all temperature
@@ -1605,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 	 */
 	{
 	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP nx6115 laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
+		     },
+	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
 	 .ident = "HP NX6125 laptop",
 	 .matches = {
 		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1619,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
 		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
 		     },
 	 },
+	{
+	 .callback = dmi_ignore_irq0_timer_override,
+	 .ident = "HP 6715b laptop",
+	 .matches = {
+		     DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
+		     DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
+		     },
+	 },
 	{}
 };
 
@@ -1705,6 +1732,9 @@ int __init early_acpi_boot_init(void)
 
 int __init acpi_boot_init(void)
 {
+	/* those are executed after early-quirks are executed */
+	dmi_check_system(acpi_dmi_table_late);
+
 	/*
 	 * If acpi_disabled, bail out
 	 * One exception: acpi=ht continues far enough to enumerate LAPICs
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 426e5d91b63..806b4e9051b 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -10,6 +10,7 @@
 #include <linux/dmi.h>
 #include <linux/cpumask.h>
 #include <asm/segment.h>
+#include <asm/desc.h>
 
 #include "realmode/wakeup.h"
 #include "sleep.h"
@@ -21,7 +22,7 @@ unsigned long acpi_realmode_flags;
 static unsigned long acpi_realmode;
 
 #if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
-static char temp_stack[10240];
+static char temp_stack[4096];
 #endif
 
 /**
@@ -97,7 +98,9 @@ int acpi_save_state_mem(void)
 #else /* CONFIG_64BIT */
 	header->trampoline_segment = setup_trampoline() >> 4;
 #ifdef CONFIG_SMP
-	stack_start.sp = temp_stack + 4096;
+	stack_start.sp = temp_stack + sizeof(temp_stack);
+	early_gdt_descr.address =
+			(unsigned long)get_cpu_gdt_table(smp_processor_id());
 #endif
 	initial_code = (unsigned long)wakeup_long64;
 	saved_magic = 0x123456789abcdef0;
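A side note on the temp_stack change above: for a downward-growing stack the initial stack pointer is one past the end of the backing array, and sizeof() keeps that true across resizes, whereas the old hard-coded "+ 4096" pointed into the middle of the previous 10240-byte array. The pattern, sketched:

static char temp_stack[4096];

/* top of a downward-growing stack: one past the array's end */
#define TEMP_STACK_TOP ((unsigned long)(temp_stack + sizeof(temp_stack)))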
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 65a0c1b4869..a84ac7b570e 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -231,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 			continue;
 		if (*ptr > text_end)
 			continue;
-		text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */
+		/* turn DS segment override prefix into lock prefix */
+		text_poke(*ptr, ((unsigned char []){0xf0}), 1);
 	};
 }
 
 static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
 {
 	u8 **ptr;
-	char insn[1];
 
 	if (noreplace_smp)
 		return;
 
-	add_nops(insn, 1);
 	for (ptr = start; ptr < end; ptr++) {
 		if (*ptr < text)
 			continue;
 		if (*ptr > text_end)
 			continue;
-		text_poke(*ptr, insn, 1);
+		/* turn lock prefix into DS segment override prefix */
+		text_poke(*ptr, ((unsigned char []){0x3E}), 1);
 	};
 }
 
@@ -444,7 +444,7 @@ void __init alternative_instructions(void)
 					    _text, _etext);
 
 	/* Only switch to UP mode if we don't immediately boot others */
-	if (num_possible_cpus() == 1 || setup_max_cpus <= 1)
+	if (num_present_cpus() == 1 || setup_max_cpus <= 1)
 		alternatives_smp_switch(0);
 	}
 #endif
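Both byte values patched above are single-byte instruction prefixes: 0xF0 (lock) makes the following read-modify-write atomic, while 0x3E (DS segment override) is architecturally a no-op for these instructions, so the two can be swapped in place without changing instruction length. A hedged sketch of the idea (the kernel itself funnels the write through text_poke()):

#include <stdint.h>

static void set_lock_site(uint8_t *site, int smp_active)
{
	/* same length either way: one-byte prefix in, one-byte prefix out */
	*site = smp_active ? 0xF0 /* lock */ : 0x3E /* ds override, no-op */;
}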
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 69b4d060b21..a8fd9ebdc8e 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -33,6 +33,10 @@
 
 static DEFINE_RWLOCK(amd_iommu_devtable_lock);
 
+/* A list of preallocated protection domains */
+static LIST_HEAD(iommu_pd_list);
+static DEFINE_SPINLOCK(iommu_pd_list_lock);
+
 /*
  * general struct to manage commands sent to an IOMMU
  */
@@ -51,6 +55,102 @@ static int iommu_has_npcache(struct amd_iommu *iommu)
 
 /****************************************************************************
  *
+ * Interrupt handling functions
+ *
+ ****************************************************************************/
+
+static void iommu_print_event(void *__evt)
+{
+	u32 *event = __evt;
+	int type  = (event[1] >> EVENT_TYPE_SHIFT)  & EVENT_TYPE_MASK;
+	int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
+	int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
+	int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
+	u64 address = (u64)(((u64)event[3]) << 32) | event[2];
+
+	printk(KERN_ERR "AMD IOMMU: Event logged [");
+
+	switch (type) {
+	case EVENT_TYPE_ILL_DEV:
+		printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_IO_FAULT:
+		printk("IO_PAGE_FAULT device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       domid, address, flags);
+		break;
+	case EVENT_TYPE_DEV_TAB_ERR:
+		printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	case EVENT_TYPE_PAGE_TAB_ERR:
+		printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
+		       "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       domid, address, flags);
+		break;
+	case EVENT_TYPE_ILL_CMD:
+		printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
+		break;
+	case EVENT_TYPE_CMD_HARD_ERR:
+		printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
+		       "flags=0x%04x]\n", address, flags);
+		break;
+	case EVENT_TYPE_IOTLB_INV_TO:
+		printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
+		       "address=0x%016llx]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address);
+		break;
+	case EVENT_TYPE_INV_DEV_REQ:
+		printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
+		       "address=0x%016llx flags=0x%04x]\n",
+		       PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
+		       address, flags);
+		break;
+	default:
+		printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
+	}
+}
+
+static void iommu_poll_events(struct amd_iommu *iommu)
+{
+	u32 head, tail;
+	unsigned long flags;
+
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+	tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
+
+	while (head != tail) {
+		iommu_print_event(iommu->evt_buf + head);
+		head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
+	}
+
+	writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
+
+	spin_unlock_irqrestore(&iommu->lock, flags);
+}
+
+irqreturn_t amd_iommu_int_handler(int irq, void *data)
+{
+	struct amd_iommu *iommu;
+
+	list_for_each_entry(iommu, &amd_iommu_list, list)
+		iommu_poll_events(iommu);
+
+	return IRQ_HANDLED;
+}
+
+/****************************************************************************
+ *
  * IOMMU command queuing functions
  *
  ****************************************************************************/
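iommu_poll_events() above is a standard hardware ring-buffer drain: the head pointer chases the tail, wrapping at the buffer size, and is written back to the device register when done. The same pattern in isolation (hypothetical names; the 16-byte entry size is an assumption matching a four-word event record):

#include <stdint.h>

#define EVT_ENTRY_SIZE 16	/* assumed: one event = four 32-bit words */

static uint32_t drain_event_ring(uint8_t *buf, uint32_t buf_size,
				 uint32_t head, uint32_t tail,
				 void (*handle)(void *entry))
{
	while (head != tail) {
		handle(buf + head);
		head = (head + EVT_ENTRY_SIZE) % buf_size;
	}
	return head;	/* caller writes this back to the head register */
}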
@@ -101,10 +201,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
  */
 static int iommu_completion_wait(struct amd_iommu *iommu)
 {
-	int ret, ready = 0;
+	int ret = 0, ready = 0;
 	unsigned status = 0;
 	struct iommu_cmd cmd;
-	unsigned long i = 0;
+	unsigned long flags, i = 0;
 
 	memset(&cmd, 0, sizeof(cmd));
 	cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
@@ -112,10 +212,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 
 	iommu->need_sync = 0;
 
-	ret = iommu_queue_command(iommu, &cmd);
+	spin_lock_irqsave(&iommu->lock, flags);
+
+	ret = __iommu_queue_command(iommu, &cmd);
 
 	if (ret)
-		return ret;
+		goto out;
 
 	while (!ready && (i < EXIT_LOOP_COUNT)) {
 		++i;
@@ -130,6 +232,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 
 	if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
 		printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
+out:
+	spin_unlock_irqrestore(&iommu->lock, flags);
 
 	return 0;
 }
@@ -140,6 +244,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
 static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 {
 	struct iommu_cmd cmd;
+	int ret;
 
 	BUG_ON(iommu == NULL);
 
@@ -147,9 +252,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
 	CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
 	cmd.data[0] = devid;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
 /*
@@ -159,6 +266,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 			  u64 address, u16 domid, int pde, int s)
 {
 	struct iommu_cmd cmd;
+	int ret;
 
 	memset(&cmd, 0, sizeof(cmd));
 	address &= PAGE_MASK;
@@ -171,9 +279,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
 	if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
 		cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
 
+	ret = iommu_queue_command(iommu, &cmd);
+
 	iommu->need_sync = 1;
 
-	return iommu_queue_command(iommu, &cmd);
+	return ret;
 }
 
 /*
@@ -185,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 			     u64 address, size_t size)
 {
 	int s = 0;
-	unsigned pages = iommu_num_pages(address, size);
+	unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
 
 	address &= PAGE_MASK;
 
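For reference, iommu_num_pages() (which grows an explicit page-size parameter in this series) computes how many pages the byte range [addr, addr + size) touches; the usual formula, sketched with a hypothetical name:

/* pages spanned by [addr, addr + len): align start down, end up */
static inline unsigned long num_pages(unsigned long addr, unsigned long len,
				      unsigned long page_size)
{
	return ((addr & (page_size - 1)) + len + page_size - 1) / page_size;
}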
@@ -203,6 +313,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
 	return 0;
 }
 
+/* Flush the whole IO/TLB for a given protection domain */
+static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
+{
+	u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
+
+	iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
+}
+
 /****************************************************************************
  *
  * The functions below are used to create the page table mappings for
@@ -362,11 +480,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  * efficient allocator.
  *
  ****************************************************************************/
-static unsigned long dma_mask_to_pages(unsigned long mask)
-{
-	return (mask >> PAGE_SHIFT) +
-		(PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT);
-}
 
 /*
  * The address allocator core function.
@@ -375,25 +488,31 @@ static unsigned long dma_mask_to_pages(unsigned long mask)
  */
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
 					     struct dma_ops_domain *dom,
-					     unsigned int pages)
+					     unsigned int pages,
+					     unsigned long align_mask,
+					     u64 dma_mask)
 {
-	unsigned long limit = dma_mask_to_pages(*dev->dma_mask);
+	unsigned long limit;
 	unsigned long address;
-	unsigned long size = dom->aperture_size >> PAGE_SHIFT;
 	unsigned long boundary_size;
 
 	boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
 			PAGE_SIZE) >> PAGE_SHIFT;
-	limit = limit < size ? limit : size;
+	limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
+				       dma_mask >> PAGE_SHIFT);
 
-	if (dom->next_bit >= limit)
+	if (dom->next_bit >= limit) {
 		dom->next_bit = 0;
+		dom->need_flush = true;
+	}
 
 	address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-				   0 , boundary_size, 0);
-	if (address == -1)
+				   0 , boundary_size, align_mask);
+	if (address == -1) {
 		address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-				0, boundary_size, 0);
+				0, boundary_size, align_mask);
+		dom->need_flush = true;
+	}
 
 	if (likely(address != -1)) {
 		dom->next_bit = address + pages;
@@ -459,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
 	if (start_page + pages > last_page)
 		pages = last_page - start_page;
 
-	set_bit_string(dom->bitmap, start_page, pages);
+	iommu_area_reserve(dom->bitmap, start_page, pages);
 }
 
 static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -553,12 +672,16 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
 	dma_dom->bitmap[0] = 1;
 	dma_dom->next_bit = 0;
 
+	dma_dom->need_flush = false;
+	dma_dom->target_dev = 0xffff;
+
 	/* Initialize the exclusion range if necessary */
 	if (iommu->exclusion_start &&
 	    iommu->exclusion_start < dma_dom->aperture_size) {
 		unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
 		int pages = iommu_num_pages(iommu->exclusion_start,
-					    iommu->exclusion_length);
+					    iommu->exclusion_length,
+					    PAGE_SIZE);
 		dma_ops_reserve_addresses(dma_dom, startpage, pages);
 	}
 
@@ -623,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu,
 
 	u64 pte_root = virt_to_phys(domain->pt_root);
 
-	pte_root |= (domain->mode & 0x07) << 9;
-	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2;
+	pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
+		    << DEV_ENTRY_MODE_SHIFT;
+	pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
 
 	write_lock_irqsave(&amd_iommu_devtable_lock, flags);
-	amd_iommu_dev_table[devid].data[0] = pte_root;
-	amd_iommu_dev_table[devid].data[1] = pte_root >> 32;
+	amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
+	amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
 	amd_iommu_dev_table[devid].data[2] = domain->id;
 
 	amd_iommu_pd_table[devid] = domain;
@@ -646,6 +770,45 @@ static void set_device_domain(struct amd_iommu *iommu,
  ****************************************************************************/
 
 /*
+ * This function checks if the driver got a valid device from the caller to
+ * avoid dereferencing invalid pointers.
+ */
+static bool check_device(struct device *dev)
+{
+	if (!dev || !dev->dma_mask)
+		return false;
+
+	return true;
+}
+
+/*
+ * In this function the list of preallocated protection domains is traversed to
+ * find the domain for a specific device
+ */
+static struct dma_ops_domain *find_protection_domain(u16 devid)
+{
+	struct dma_ops_domain *entry, *ret = NULL;
+	unsigned long flags;
+
+	if (list_empty(&iommu_pd_list))
+		return NULL;
+
+	spin_lock_irqsave(&iommu_pd_list_lock, flags);
+
+	list_for_each_entry(entry, &iommu_pd_list, list) {
+		if (entry->target_dev == devid) {
+			ret = entry;
+			list_del(&ret->list);
+			break;
+		}
+	}
+
+	spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
+
+	return ret;
+}
+
+/*
  * In the dma_ops path we only have the struct device. This function
  * finds the corresponding IOMMU, the protection domain and the
  * requestor id for a given device.
@@ -661,27 +824,30 @@ static int get_device_resources(struct device *dev,
 	struct pci_dev *pcidev;
 	u16 _bdf;
 
-	BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask);
+	*iommu = NULL;
+	*domain = NULL;
+	*bdf = 0xffff;
+
+	if (dev->bus != &pci_bus_type)
+		return 0;
 
 	pcidev = to_pci_dev(dev);
 	_bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
 
 	/* device not translated by any IOMMU in the system? */
-	if (_bdf > amd_iommu_last_bdf) {
-		*iommu = NULL;
-		*domain = NULL;
-		*bdf = 0xffff;
+	if (_bdf > amd_iommu_last_bdf)
 		return 0;
-	}
 
 	*bdf = amd_iommu_alias_table[_bdf];
 
 	*iommu = amd_iommu_rlookup_table[*bdf];
 	if (*iommu == NULL)
 		return 0;
-	dma_dom = (*iommu)->default_dom;
 	*domain = domain_for_device(*bdf);
 	if (*domain == NULL) {
+		dma_dom = find_protection_domain(*bdf);
+		if (!dma_dom)
+			dma_dom = (*iommu)->default_dom;
 		*domain = &dma_dom->domain;
 		set_device_domain(*iommu, *domain, *bdf);
 		printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -760,17 +926,24 @@ static dma_addr_t __map_single(struct device *dev,
 			       struct dma_ops_domain *dma_dom,
 			       phys_addr_t paddr,
 			       size_t size,
-			       int dir)
+			       int dir,
+			       bool align,
+			       u64 dma_mask)
 {
 	dma_addr_t offset = paddr & ~PAGE_MASK;
 	dma_addr_t address, start;
 	unsigned int pages;
+	unsigned long align_mask = 0;
 	int i;
 
-	pages = iommu_num_pages(paddr, size);
+	pages = iommu_num_pages(paddr, size, PAGE_SIZE);
 	paddr &= PAGE_MASK;
 
-	address = dma_ops_alloc_addresses(dev, dma_dom, pages);
+	if (align)
+		align_mask = (1UL << get_order(size)) - 1;
+
+	address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
+					  dma_mask);
 	if (unlikely(address == bad_dma_address))
 		goto out;
 
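On the new align path above, get_order(size) yields the order of the smallest power-of-two page count covering size, and the derived mask asks iommu_area_alloc() for a naturally aligned range. A worked sketch (hypothetical stand-in for the kernel's get_order(), assuming 4 KiB pages):

#include <stdint.h>

/* smallest n with (4096u << n) >= size */
static unsigned int order_for(uint32_t size)
{
	unsigned int n = 0;

	while ((4096u << n) < size)
		n++;
	return n;
}

/* e.g. size = 20480 (20 KiB): order_for() = 3, mask = 7, so the
   allocation begins on an 8-page boundary */
static unsigned long align_mask_for(uint32_t size)
{
	return (1UL << order_for(size)) - 1;
}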
@@ -782,6 +955,12 @@ static dma_addr_t __map_single(struct device *dev,
 	}
 	address += offset;
 
+	if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
+		iommu_flush_tlb(iommu, dma_dom->domain.id);
+		dma_dom->need_flush = false;
+	} else if (unlikely(iommu_has_npcache(iommu)))
+		iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
+
 out:
 	return address;
 }
@@ -802,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
 	if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
 		return;
 
-	pages = iommu_num_pages(dma_addr, size);
+	pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
 	dma_addr &= PAGE_MASK;
 	start = dma_addr;
 
@@ -812,6 +991,9 @@ static void __unmap_single(struct amd_iommu *iommu,
 	}
 
 	dma_ops_free_addresses(dma_dom, dma_addr, pages);
+
+	if (amd_iommu_unmap_flush)
+		iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
 }
 
 /*
@@ -825,6 +1007,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 	struct protection_domain *domain;
 	u16 devid;
 	dma_addr_t addr;
+	u64 dma_mask;
+
+	if (!check_device(dev))
+		return bad_dma_address;
+
+	dma_mask = *dev->dma_mask;
 
 	get_device_resources(dev, &iommu, &domain, &devid);
 
@@ -833,14 +1021,12 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
 		return (dma_addr_t)paddr;
 
 	spin_lock_irqsave(&domain->lock, flags);
-	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir);
+	addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
+			    dma_mask);
 	if (addr == bad_dma_address)
 		goto out;
 
-	if (iommu_has_npcache(iommu))
-		iommu_flush_pages(iommu, domain->id, addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -860,7 +1046,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 	struct protection_domain *domain;
 	u16 devid;
 
-	if (!get_device_resources(dev, &iommu, &domain, &devid))
+	if (!check_device(dev) ||
+	    !get_device_resources(dev, &iommu, &domain, &devid))
 		/* device not handled by any AMD IOMMU */
 		return;
 
@@ -868,9 +1055,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, dir);
 
-	iommu_flush_pages(iommu, domain->id, dma_addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -909,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 	struct scatterlist *s;
 	phys_addr_t paddr;
 	int mapped_elems = 0;
+	u64 dma_mask;
+
+	if (!check_device(dev))
+		return 0;
+
+	dma_mask = *dev->dma_mask;
 
 	get_device_resources(dev, &iommu, &domain, &devid);
 
@@ -921,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
 		paddr = sg_phys(s);
 
 		s->dma_address = __map_single(dev, iommu, domain->priv,
-					      paddr, s->length, dir);
+					      paddr, s->length, dir, false,
+					      dma_mask);
 
 		if (s->dma_address) {
 			s->dma_length = s->length;
 			mapped_elems++;
 		} else
 			goto unmap;
-		if (iommu_has_npcache(iommu))
-			iommu_flush_pages(iommu, domain->id, s->dma_address,
-					  s->dma_length);
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -967,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	u16 devid;
 	int i;
 
-	if (!get_device_resources(dev, &iommu, &domain, &devid))
+	if (!check_device(dev) ||
+	    !get_device_resources(dev, &iommu, &domain, &devid))
 		return;
 
 	spin_lock_irqsave(&domain->lock, flags);
@@ -975,12 +1165,10 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
 	for_each_sg(sglist, s, nelems, i) {
 		__unmap_single(iommu, domain->priv, s->dma_address,
 			       s->dma_length, dir);
-		iommu_flush_pages(iommu, domain->id, s->dma_address,
-				  s->dma_length);
 		s->dma_address = s->dma_length = 0;
 	}
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -998,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
 	struct protection_domain *domain;
 	u16 devid;
 	phys_addr_t paddr;
+	u64 dma_mask = dev->coherent_dma_mask;
+
+	if (!check_device(dev))
+		return NULL;
 
+	if (!get_device_resources(dev, &iommu, &domain, &devid))
+		flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
+
+	flag |= __GFP_ZERO;
 	virt_addr = (void *)__get_free_pages(flag, get_order(size));
 	if (!virt_addr)
 		return 0;
 
-	memset(virt_addr, 0, size);
 	paddr = virt_to_phys(virt_addr);
 
-	get_device_resources(dev, &iommu, &domain, &devid);
-
 	if (!iommu || !domain) {
 		*dma_addr = (dma_addr_t)paddr;
 		return virt_addr;
 	}
 
+	if (!dma_mask)
+		dma_mask = *dev->dma_mask;
+
 	spin_lock_irqsave(&domain->lock, flags);
 
 	*dma_addr = __map_single(dev, iommu, domain->priv, paddr,
-				 size, DMA_BIDIRECTIONAL);
+				 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
 	if (*dma_addr == bad_dma_address) {
 		free_pages((unsigned long)virt_addr, get_order(size));
@@ -1024,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
 		goto out;
 	}
 
-	if (iommu_has_npcache(iommu))
-		iommu_flush_pages(iommu, domain->id, *dma_addr, size);
-
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 out:
@@ -1038,8 +1231,6 @@ out:
 
 /*
  * The exported free_coherent function for dma_ops.
- * FIXME: fix the generic x86 DMA layer so that it actually calls that
- * function.
  */
 static void free_coherent(struct device *dev, size_t size,
 			  void *virt_addr, dma_addr_t dma_addr)
@@ -1049,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size,
 	struct protection_domain *domain;
 	u16 devid;
 
+	if (!check_device(dev))
+		return;
+
 	get_device_resources(dev, &iommu, &domain, &devid);
 
 	if (!iommu || !domain)
@@ -1057,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size,
 	spin_lock_irqsave(&domain->lock, flags);
 
 	__unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
-	iommu_flush_pages(iommu, domain->id, dma_addr, size);
 
-	if (iommu->need_sync)
+	if (unlikely(iommu->need_sync))
 		iommu_completion_wait(iommu);
 
 	spin_unlock_irqrestore(&domain->lock, flags);
@@ -1069,6 +1262,30 @@ free_mem:
 }
 
 /*
+ * This function is called by the DMA layer to find out if we can handle a
+ * particular device. It is part of the dma_ops.
+ */
+static int amd_iommu_dma_supported(struct device *dev, u64 mask)
+{
+	u16 bdf;
+	struct pci_dev *pcidev;
+
+	/* No device or no PCI device */
+	if (!dev || dev->bus != &pci_bus_type)
+		return 0;
+
+	pcidev = to_pci_dev(dev);
+
+	bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
+
+	/* Out of our scope? */
+	if (bdf > amd_iommu_last_bdf)
+		return 0;
+
+	return 1;
+}
+
+/*
  * The function for pre-allocating protection domains.
  *
  * If the driver core informs the DMA layer if a driver grabs a device
@@ -1097,10 +1314,9 @@ void prealloc_protection_domains(void)
 		if (!dma_dom)
 			continue;
 		init_unity_mappings_for_device(dma_dom, devid);
-		set_device_domain(iommu, &dma_dom->domain, devid);
-		printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ",
-		       dma_dom->domain.id);
-		print_devid(devid, 1);
+		dma_dom->target_dev = devid;
+
+		list_add_tail(&dma_dom->list, &iommu_pd_list);
 	}
 }
 
@@ -1111,6 +1327,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
 	.unmap_single = unmap_single,
 	.map_sg = map_sg,
 	.unmap_sg = unmap_sg,
+	.dma_supported = amd_iommu_dma_supported,
 };
 
 /*
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index a69cc0f5204..0cdcda35a05 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,6 +22,8 @@
 #include <linux/gfp.h>
 #include <linux/list.h>
 #include <linux/sysdev.h>
+#include <linux/interrupt.h>
+#include <linux/msi.h>
 #include <asm/pci-direct.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
@@ -30,7 +32,6 @@
 /*
  * definitions for the ACPI scanning code
  */
-#define PCI_BUS(x) (((x) >> 8) & 0xff)
 #define IVRS_HEADER_LENGTH 48
 
 #define ACPI_IVHD_TYPE 0x10
@@ -121,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
121 we find in ACPI */ 122 we find in ACPI */
122unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ 123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
123int amd_iommu_isolate; /* if 1, device isolation is enabled */ 124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
124 126
125LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
126 system */ 128 system */
@@ -210,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
210/* Programs the physical address of the device table into the IOMMU hardware */ 212/* Programs the physical address of the device table into the IOMMU hardware */
211static void __init iommu_set_device_table(struct amd_iommu *iommu) 213static void __init iommu_set_device_table(struct amd_iommu *iommu)
212{ 214{
213 u32 entry; 215 u64 entry;
214 216
215 BUG_ON(iommu->mmio_base == NULL); 217 BUG_ON(iommu->mmio_base == NULL);
216 218
@@ -234,7 +236,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
234{ 236{
235 u32 ctrl; 237 u32 ctrl;
236 238
237 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
238 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
239 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
240} 242}
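
Note: iommu_feature_disable() above is a plain read-modify-write that clears one bit in the IOMMU control register; iommu_feature_enable() is the counterpart that sets it. A standalone sketch of the pattern (illustrative bit numbers; in this driver CONTROL_IOMMU_EN is bit 0):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t ctrl = 0x5;    /* bits 0 and 2 set */

        ctrl &= ~(1u << 2);     /* disable: clear bit 2 */
        assert(ctrl == 0x1);

        ctrl |= (1u << 2);      /* enable: set bit 2 */
        assert(ctrl == 0x5);
        return 0;
}
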
@@ -242,13 +244,23 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
242/* Function to enable the hardware */ 244/* Function to enable the hardware */
243void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
244{ 246{
245 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
246 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
247 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
248 253
249 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
250} 255}
251 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
252/* 264/*
253 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in 265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
254 * the system has one. 266 * the system has one.
@@ -286,6 +298,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
286 ****************************************************************************/ 298 ****************************************************************************/
287 299
288/* 300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
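
Note: the helper above decodes an IVHD entry's size from the two most significant bits of its type byte: values 0 through 3 select 4-, 8-, 16- and 32-byte entries. A standalone check of that mapping (illustration only, not driver code):

#include <assert.h>

int main(void)
{
        assert((0x04 << (0x00 >> 6)) == 4);     /* types 0x00-0x3f */
        assert((0x04 << (0x40 >> 6)) == 8);     /* types 0x40-0x7f */
        assert((0x04 << (0x80 >> 6)) == 16);    /* types 0x80-0xbf */
        assert((0x04 << (0xc0 >> 6)) == 32);    /* types 0xc0-0xff */
        return 0;
}
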
307
308/*
289 * This function reads the last device id the IOMMU has to handle from the PCI 309 * This function reads the last device id the IOMMU has to handle from the PCI
290 * capability header for this IOMMU 310 * capability header for this IOMMU
291 */ 311 */
@@ -329,7 +349,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
329 default: 349 default:
330 break; 350 break;
331 } 351 }
332 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
333 } 353 }
334 354
335 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -414,7 +434,32 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
414 434
415static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
416{ 436{
417 free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); 437 free_pages((unsigned long)iommu->cmd_buf,
438 get_order(iommu->cmd_buf_size));
439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
418} 463}
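
Note: both buffers round their size up to a power-of-two number of pages via get_order(); the free_command_buffer() change above frees with the order the buffer was actually allocated with (cmd_buf_size) instead of a hard-coded constant. A user-space sketch of get_order() semantics, assuming 4 KiB pages:

#include <assert.h>

/* Smallest order n such that (4096 << n) >= size. */
static int example_get_order(unsigned long size)
{
        unsigned long pages = 4096;
        int order = 0;

        while (pages < size) {
                pages <<= 1;
                order++;
        }
        return order;
}

int main(void)
{
        assert(example_get_order(4096) == 0);
        assert(example_get_order(8192) == 1);
        assert(example_get_order(12288) == 2);  /* rounds up to 16 KiB */
        return 0;
}
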
419 464
420/* sets a specific bit in the device table entry. */ 465/* sets a specific bit in the device table entry. */
@@ -487,19 +532,21 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
487 */ 532 */
488static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
489{ 534{
490 int bus = PCI_BUS(iommu->devid);
491 int dev = PCI_SLOT(iommu->devid);
492 int fn = PCI_FUNC(iommu->devid);
493 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
494 u32 range; 536 u32 range, misc;
495 537
496 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
539 &iommu->cap);
540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
541 &range);
542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
497 544
498 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
499 iommu->first_device = calc_devid(MMIO_GET_BUS(range), 545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
500 MMIO_GET_FD(range)); 546 MMIO_GET_FD(range));
501 iommu->last_device = calc_devid(MMIO_GET_BUS(range), 547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
502 MMIO_GET_LD(range)); 548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
503} 550}
504 551
505/* 552/*
@@ -604,7 +651,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
604 break; 651 break;
605 } 652 }
606 653
607 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
608 } 655 }
609} 656}
610 657
@@ -622,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
622static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
623{ 670{
624 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
625 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
626} 674}
627 675
@@ -649,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
649 /* 697 /*
650 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
651 */ 699 */
652 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
653 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
654 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
655 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
656 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -661,11 +713,17 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
661 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
662 return -ENOMEM; 714 return -ENOMEM;
663 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
664 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
665 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
666 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
667 725
668 return 0; 726 return pci_enable_device(iommu->dev);
669} 727}
670 728
671/* 729/*
@@ -706,6 +764,95 @@ static int __init init_iommu_all(struct acpi_table_header *table)
706 764
707/**************************************************************************** 765/****************************************************************************
708 * 766 *
767 * The following functions initialize the MSI interrupts for all IOMMUs
 768 * in the system. It's a bit challenging because there could be multiple
 769 * IOMMUs per PCI BDF, but we can call pci_enable_msi(x) only once per
770 * pci_dev.
771 *
772 ****************************************************************************/
773
774static int __init iommu_setup_msix(struct amd_iommu *iommu)
775{
776 struct amd_iommu *curr;
777 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
778 int nvec = 0, i;
779
780 list_for_each_entry(curr, &amd_iommu_list, list) {
781 if (curr->dev == iommu->dev) {
782 entries[nvec].entry = curr->evt_msi_num;
783 entries[nvec].vector = 0;
784 curr->int_enabled = true;
785 nvec++;
786 }
787 }
788
789 if (pci_enable_msix(iommu->dev, entries, nvec)) {
790 pci_disable_msix(iommu->dev);
791 return 1;
792 }
793
794 for (i = 0; i < nvec; ++i) {
 795 int r = request_irq(entries[i].vector, amd_iommu_int_handler,
796 IRQF_SAMPLE_RANDOM,
797 "AMD IOMMU",
798 NULL);
799 if (r)
800 goto out_free;
801 }
802
803 return 0;
804
805out_free:
806 for (i -= 1; i >= 0; --i)
 807 free_irq(entries[i].vector, NULL);
808
809 pci_disable_msix(iommu->dev);
810
811 return 1;
812}
813
814static int __init iommu_setup_msi(struct amd_iommu *iommu)
815{
816 int r;
817 struct amd_iommu *curr;
818
819 list_for_each_entry(curr, &amd_iommu_list, list) {
820 if (curr->dev == iommu->dev)
821 curr->int_enabled = true;
822 }
823
824
825 if (pci_enable_msi(iommu->dev))
826 return 1;
827
828 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
829 IRQF_SAMPLE_RANDOM,
830 "AMD IOMMU",
831 NULL);
832
833 if (r) {
834 pci_disable_msi(iommu->dev);
835 return 1;
836 }
837
838 return 0;
839}
840
841static int __init iommu_init_msi(struct amd_iommu *iommu)
842{
843 if (iommu->int_enabled)
844 return 0;
845
846 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
847 return iommu_setup_msix(iommu);
848 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msi(iommu);
850
851 return 1;
852}
853
854/****************************************************************************
855 *
709 * The next functions belong to the third pass of parsing the ACPI 856 * The next functions belong to the third pass of parsing the ACPI
710 * table. In this last pass the memory mapping requirements are 857 * table. In this last pass the memory mapping requirements are
 711 * gathered (like exclusion and unity mapping ranges). 858
@@ -811,7 +958,6 @@ static void init_device_table(void)
811 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 958 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
812 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 959 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
813 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); 960 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
814 set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT);
815 } 961 }
816} 962}
817 963
@@ -825,6 +971,8 @@ static void __init enable_iommus(void)
825 971
826 list_for_each_entry(iommu, &amd_iommu_list, list) { 972 list_for_each_entry(iommu, &amd_iommu_list, list) {
827 iommu_set_exclusion_range(iommu); 973 iommu_set_exclusion_range(iommu);
974 iommu_init_msi(iommu);
975 iommu_enable_event_logging(iommu);
828 iommu_enable(iommu); 976 iommu_enable(iommu);
829 } 977 }
830} 978}
@@ -995,11 +1143,17 @@ int __init amd_iommu_init(void)
995 else 1143 else
996 printk("disabled\n"); 1144 printk("disabled\n");
997 1145
1146 if (amd_iommu_unmap_flush)
1147 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1148 else
1149 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1150
998out: 1151out:
999 return ret; 1152 return ret;
1000 1153
1001free: 1154free:
1002 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1155 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1156 get_order(MAX_DOMAIN_ID/8));
1003 1157
1004 free_pages((unsigned long)amd_iommu_pd_table, 1158 free_pages((unsigned long)amd_iommu_pd_table,
1005 get_order(rlookup_table_size)); 1159 get_order(rlookup_table_size));
@@ -1057,8 +1211,10 @@ void __init amd_iommu_detect(void)
1057static int __init parse_amd_iommu_options(char *str) 1211static int __init parse_amd_iommu_options(char *str)
1058{ 1212{
1059 for (; *str; ++str) { 1213 for (; *str; ++str) {
1060 if (strcmp(str, "isolate") == 0) 1214 if (strncmp(str, "isolate", 7) == 0)
1061 amd_iommu_isolate = 1; 1215 amd_iommu_isolate = 1;
 1216 if (strncmp(str, "fullflush", 9) == 0)
1217 amd_iommu_unmap_flush = true;
1062 } 1218 }
1063 1219
1064 return 1; 1220 return 1;
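
Note: strncmp() with the literal's own length (9 for "fullflush", 7 for "isolate") performs a prefix match at every offset of the option string scanned by the loop above. A standalone check of that matching behavior:

#include <assert.h>
#include <string.h>

int main(void)
{
        const char *str = "fullflush";

        assert(strncmp(str, "isolate", 7) != 0);
        assert(strncmp(str, "fullflush", 9) == 0);
        return 0;
}
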
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 44e21826db1..9a32b37ee2e 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -455,11 +455,11 @@ out:
455 force_iommu || 455 force_iommu ||
456 valid_agp || 456 valid_agp ||
457 fallback_aper_force) { 457 fallback_aper_force) {
458 printk(KERN_ERR 458 printk(KERN_INFO
459 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
460 printk(KERN_ERR 460 printk(KERN_INFO
461 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
462 printk(KERN_ERR 462 printk(KERN_INFO
463 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
464 32 << fallback_aper_order); 464 32 << fallback_aper_order);
465 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index f88bd0d982b..04a7f960bbc 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/ioport.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
27#include <linux/clockchips.h> 28#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 29#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h>
31 33
32#include <asm/atomic.h> 34#include <asm/atomic.h>
33#include <asm/smp.h> 35#include <asm/smp.h>
@@ -36,8 +38,14 @@
36#include <asm/desc.h> 38#include <asm/desc.h>
37#include <asm/arch_hooks.h> 39#include <asm/arch_hooks.h>
38#include <asm/hpet.h> 40#include <asm/hpet.h>
41#include <asm/pgalloc.h>
39#include <asm/i8253.h> 42#include <asm/i8253.h>
40#include <asm/nmi.h> 43#include <asm/nmi.h>
44#include <asm/idle.h>
45#include <asm/proto.h>
46#include <asm/timex.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
41 49
42#include <mach_apic.h> 50#include <mach_apic.h>
43#include <mach_apicdef.h> 51#include <mach_apicdef.h>
@@ -50,20 +58,60 @@
50# error SPURIOUS_APIC_VECTOR definition error 58# error SPURIOUS_APIC_VECTOR definition error
51#endif 59#endif
52 60
53unsigned long mp_lapic_addr; 61#ifdef CONFIG_X86_32
54
55/* 62/*
56 * Knob to control our willingness to enable the local APIC. 63 * Knob to control our willingness to enable the local APIC.
57 * 64 *
58 * +1=force-enable 65 * +1=force-enable
59 */ 66 */
60static int force_enable_local_apic; 67static int force_enable_local_apic;
61int disable_apic; 68/*
69 * APIC command line parameters
70 */
71static int __init parse_lapic(char *arg)
72{
73 force_enable_local_apic = 1;
74 return 0;
75}
76early_param("lapic", parse_lapic);
77/* Local APIC was disabled by the BIOS and enabled by the kernel */
78static int enabled_via_apicbase;
79
80#endif
81
82#ifdef CONFIG_X86_64
83static int apic_calibrate_pmtmr __initdata;
84static __init int setup_apicpmtimer(char *s)
85{
86 apic_calibrate_pmtmr = 1;
87 notsc_setup(NULL);
88 return 0;
89}
90__setup("apicpmtimer", setup_apicpmtimer);
91#endif
92
93#ifdef CONFIG_X86_64
94#define HAVE_X2APIC
95#endif
96
97#ifdef HAVE_X2APIC
98int x2apic;
99/* x2apic enabled before OS handover */
100int x2apic_preenabled;
101int disable_x2apic;
102static __init int setup_nox2apic(char *str)
103{
104 disable_x2apic = 1;
105 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
106 return 0;
107}
108early_param("nox2apic", setup_nox2apic);
109#endif
62 110
63/* Local APIC timer verification ok */ 111unsigned long mp_lapic_addr;
64static int local_apic_timer_verify_ok; 112int disable_apic;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 113/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 114static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 115/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 116int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 117EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
112}; 160};
113static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 161static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
114 162
115/* Local APIC was disabled by the BIOS and enabled by the kernel */
116static int enabled_via_apicbase;
117
118static unsigned long apic_phys; 163static unsigned long apic_phys;
119 164
120/* 165/*
@@ -130,7 +175,11 @@ static inline int lapic_get_version(void)
130 */ 175 */
131static inline int lapic_is_integrated(void) 176static inline int lapic_is_integrated(void)
132{ 177{
178#ifdef CONFIG_X86_64
179 return 1;
180#else
133 return APIC_INTEGRATED(lapic_get_version()); 181 return APIC_INTEGRATED(lapic_get_version());
182#endif
134} 183}
135 184
136/* 185/*
@@ -145,13 +194,18 @@ static int modern_apic(void)
145 return lapic_get_version() >= 0x14; 194 return lapic_get_version() >= 0x14;
146} 195}
147 196
148void apic_wait_icr_idle(void) 197/*
198 * Paravirt kernels also might be using these below ops. So we still
199 * use generic apic_read()/apic_write(), which might be pointing to different
200 * ops in PARAVIRT case.
201 */
202void xapic_wait_icr_idle(void)
149{ 203{
150 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 204 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
151 cpu_relax(); 205 cpu_relax();
152} 206}
153 207
154u32 safe_apic_wait_icr_idle(void) 208u32 safe_xapic_wait_icr_idle(void)
155{ 209{
156 u32 send_status; 210 u32 send_status;
157 int timeout; 211 int timeout;
@@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void)
167 return send_status; 221 return send_status;
168} 222}
169 223
224void xapic_icr_write(u32 low, u32 id)
225{
226 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
227 apic_write(APIC_ICR, low);
228}
229
230u64 xapic_icr_read(void)
231{
232 u32 icr1, icr2;
233
234 icr2 = apic_read(APIC_ICR2);
235 icr1 = apic_read(APIC_ICR);
236
237 return icr1 | ((u64)icr2 << 32);
238}
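
Note: the xAPIC ICR is split across two 32-bit MMIO registers; the read path above recombines them as low | (high << 32), and the write path stores the destination dword first, since writing the low dword triggers the IPI. A standalone check of the recombination:

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint32_t icr1 = 0x000c4500u;    /* low dword: delivery mode etc. */
        uint32_t icr2 = 0x0f000000u;    /* high dword: destination field */
        uint64_t icr = icr1 | ((uint64_t)icr2 << 32);

        assert(icr == 0x0f000000000c4500ull);
        return 0;
}
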
239
240static struct apic_ops xapic_ops = {
241 .read = native_apic_mem_read,
242 .write = native_apic_mem_write,
243 .icr_read = xapic_icr_read,
244 .icr_write = xapic_icr_write,
245 .wait_icr_idle = xapic_wait_icr_idle,
246 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
247};
248
249struct apic_ops __read_mostly *apic_ops = &xapic_ops;
250EXPORT_SYMBOL_GPL(apic_ops);
251
252#ifdef HAVE_X2APIC
253static void x2apic_wait_icr_idle(void)
254{
255 /* no need to wait for icr idle in x2apic */
256 return;
257}
258
259static u32 safe_x2apic_wait_icr_idle(void)
260{
261 /* no need to wait for icr idle in x2apic */
262 return 0;
263}
264
265void x2apic_icr_write(u32 low, u32 id)
266{
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268}
269
270u64 x2apic_icr_read(void)
271{
272 unsigned long val;
273
274 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
275 return val;
276}
277
278static struct apic_ops x2apic_ops = {
279 .read = native_apic_msr_read,
280 .write = native_apic_msr_write,
281 .icr_read = x2apic_icr_read,
282 .icr_write = x2apic_icr_write,
283 .wait_icr_idle = x2apic_wait_icr_idle,
284 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
285};
286#endif
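
Note: x2APIC mode replaces the MMIO window with MSRs: each xAPIC register at offset R becomes MSR 0x800 + (R >> 4), which is the index the rdmsrl/wrmsrl calls above use. For the ICR (offset 0x300) that works out to MSR 0x830, and the ICR becomes a single 64-bit register, which is why no separate ICR2 write is needed. A quick check of the index math, using the kernel's APIC_BASE_MSR and APIC_ICR values:

#include <assert.h>

#define APIC_BASE_MSR   0x800
#define APIC_ICR        0x300

int main(void)
{
        assert(APIC_BASE_MSR + (APIC_ICR >> 4) == 0x830);
        return 0;
}
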
287
170/** 288/**
171 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 289 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
172 */ 290 */
173void __cpuinit enable_NMI_through_LVT0(void) 291void __cpuinit enable_NMI_through_LVT0(void)
174{ 292{
175 unsigned int v = APIC_DM_NMI; 293 unsigned int v;
294
295 /* unmask and set to NMI */
296 v = APIC_DM_NMI;
176 297
177 /* Level triggered for 82489DX */ 298 /* Level triggered for 82489DX (32bit mode) */
178 if (!lapic_is_integrated()) 299 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 300 v |= APIC_LVT_LEVEL_TRIGGER;
301
180 apic_write(APIC_LVT0, v); 302 apic_write(APIC_LVT0, v);
181} 303}
182 304
305#ifdef CONFIG_X86_32
183/** 306/**
184 * get_physical_broadcast - Get number of physical broadcast IDs 307 * get_physical_broadcast - Get number of physical broadcast IDs
185 */ 308 */
@@ -187,15 +310,20 @@ int get_physical_broadcast(void)
187{ 310{
188 return modern_apic() ? 0xff : 0xf; 311 return modern_apic() ? 0xff : 0xf;
189} 312}
313#endif
190 314
191/** 315/**
192 * lapic_get_maxlvt - get the maximum number of local vector table entries 316 * lapic_get_maxlvt - get the maximum number of local vector table entries
193 */ 317 */
194int lapic_get_maxlvt(void) 318int lapic_get_maxlvt(void)
195{ 319{
196 unsigned int v = apic_read(APIC_LVR); 320 unsigned int v;
197 321
198 /* 82489DXs do not report # of LVT entries. */ 322 v = apic_read(APIC_LVR);
323 /*
324 * - we always have APIC integrated on 64bit mode
325 * - 82489DXs do not report # of LVT entries
326 */
199 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 327 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
200} 328}
201 329
@@ -203,7 +331,7 @@ int lapic_get_maxlvt(void)
203 * Local APIC timer 331 * Local APIC timer
204 */ 332 */
205 333
206/* Clock divisor is set to 16 */ 334/* Clock divisor */
207#define APIC_DIVISOR 16 335#define APIC_DIVISOR 16
208 336
209/* 337/*
@@ -212,6 +340,9 @@ int lapic_get_maxlvt(void)
212 * this function twice on the boot CPU, once with a bogus timeout 340 * this function twice on the boot CPU, once with a bogus timeout
213 * value, second time for real. The other (noncalibrating) CPUs 341 * value, second time for real. The other (noncalibrating) CPUs
214 * call this function only once, with the real, calibrated value. 342 * call this function only once, with the real, calibrated value.
343 *
344 * We do reads before writes even if unnecessary, to get around the
345 * P5 APIC double write bug.
215 */ 346 */
216static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) 347static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
217{ 348{
@@ -233,14 +364,48 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
233 */ 364 */
234 tmp_value = apic_read(APIC_TDCR); 365 tmp_value = apic_read(APIC_TDCR);
235 apic_write(APIC_TDCR, 366 apic_write(APIC_TDCR,
236 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | 367 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
237 APIC_TDR_DIV_16); 368 APIC_TDR_DIV_16);
238 369
239 if (!oneshot) 370 if (!oneshot)
240 apic_write(APIC_TMICT, clocks / APIC_DIVISOR); 371 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
241} 372}
242 373
243/* 374/*
375 * Setup extended LVT, AMD specific (K8, family 10h)
376 *
377 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
378 * MCE interrupts are supported. Thus MCE offset must be set to 0.
379 *
 380 * If mask=1, the LVT entry does not generate interrupts, while mask=0
381 * enables the vector. See also the BKDGs.
382 */
383
384#define APIC_EILVT_LVTOFF_MCE 0
385#define APIC_EILVT_LVTOFF_IBS 1
386
387static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
388{
389 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
390 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
391
392 apic_write(reg, v);
393}
394
395u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
396{
397 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
398 return APIC_EILVT_LVTOFF_MCE;
399}
400
401u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
402{
403 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
404 return APIC_EILVT_LVTOFF_IBS;
405}
406EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
407
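
Note: setup_APIC_eilvt() above packs the extended-LVT entry as (mask << 16) | (msg_type << 8) | vector and writes it at APIC_EILVT0 + 16 * offset. A standalone check of the packing (the concrete values are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void)
{
        uint8_t vector = 0xf9, msg_type = 0x0, mask = 1;
        uint32_t v = ((uint32_t)mask << 16) | ((uint32_t)msg_type << 8) | vector;

        assert(v == 0x000100f9u);       /* masked entry, vector 0xf9 */
        return 0;
}
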
408/*
244 * Program the next event, relative to now 409 * Program the next event, relative to now
245 */ 410 */
246static int lapic_next_event(unsigned long delta, 411static int lapic_next_event(unsigned long delta,
@@ -259,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
259 unsigned long flags; 424 unsigned long flags;
260 unsigned int v; 425 unsigned int v;
261 426
262 /* Lapic used for broadcast ? */ 427 /* Lapic used as dummy for broadcast ? */
263 if (!local_apic_timer_verify_ok) 428 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
264 return; 429 return;
265 430
266 local_irq_save(flags); 431 local_irq_save(flags);
@@ -299,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
 299 * Setup the local APIC timer for this CPU. Copy the initialized values 464
300 * of the boot CPU and register the clock event in the framework. 465 * of the boot CPU and register the clock event in the framework.
301 */ 466 */
302static void __devinit setup_APIC_timer(void) 467static void __cpuinit setup_APIC_timer(void)
303{ 468{
304 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
305 470
@@ -369,14 +534,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
369 } 534 }
370} 535}
371 536
537static int __init calibrate_by_pmtimer(long deltapm, long *delta)
538{
539 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
540 const long pm_thresh = pm_100ms / 100;
541 unsigned long mult;
542 u64 res;
543
544#ifndef CONFIG_X86_PM_TIMER
545 return -1;
546#endif
547
548 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
549
550 /* Check, if the PM timer is available */
551 if (!deltapm)
552 return -1;
553
554 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
555
556 if (deltapm > (pm_100ms - pm_thresh) &&
557 deltapm < (pm_100ms + pm_thresh)) {
558 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
559 } else {
560 res = (((u64)deltapm) * mult) >> 22;
561 do_div(res, 1000000);
562 printk(KERN_WARNING "APIC calibration not consistent "
563 "with PM Timer: %ldms instead of 100ms\n",
564 (long)res);
565 /* Correct the lapic counter value */
566 res = (((u64)(*delta)) * pm_100ms);
567 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
569 "%lu (%ld)\n", (unsigned long)res, *delta);
570 *delta = (long)res;
571 }
572
573 return 0;
574}
575
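
Note: the acceptance window in calibrate_by_pmtimer() above is 1% around the expected 100 ms worth of PM-timer ticks. With the ACPI PM timer's fixed 3.579545 MHz clock that is 357954 ticks +/- 3579, as this standalone check shows:

#include <assert.h>

#define PMTMR_TICKS_PER_SEC     3579545

int main(void)
{
        const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
        const long pm_thresh = pm_100ms / 100;

        assert(pm_100ms == 357954);
        assert(pm_thresh == 3579);
        return 0;
}
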
372static int __init calibrate_APIC_clock(void) 576static int __init calibrate_APIC_clock(void)
373{ 577{
374 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 578 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
375 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
376 const long pm_thresh = pm_100ms/100;
377 void (*real_handler)(struct clock_event_device *dev); 579 void (*real_handler)(struct clock_event_device *dev);
378 unsigned long deltaj; 580 unsigned long deltaj;
379 long delta, deltapm; 581 long delta;
380 int pm_referenced = 0; 582 int pm_referenced = 0;
381 583
382 local_irq_disable(); 584 local_irq_disable();
@@ -386,10 +588,10 @@ static int __init calibrate_APIC_clock(void)
386 global_clock_event->event_handler = lapic_cal_handler; 588 global_clock_event->event_handler = lapic_cal_handler;
387 589
388 /* 590 /*
389 * Setup the APIC counter to 1e9. There is no way the lapic 591 * Setup the APIC counter to maximum. There is no way the lapic
390 * can underflow in the 100ms detection time frame 592 * can underflow in the 100ms detection time frame
391 */ 593 */
392 __setup_APIC_LVTT(1000000000, 0, 0); 594 __setup_APIC_LVTT(0xffffffff, 0, 0);
393 595
394 /* Let the interrupts run */ 596 /* Let the interrupts run */
395 local_irq_enable(); 597 local_irq_enable();
@@ -406,34 +608,9 @@ static int __init calibrate_APIC_clock(void)
406 delta = lapic_cal_t1 - lapic_cal_t2; 608 delta = lapic_cal_t1 - lapic_cal_t2;
407 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 609 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
408 610
409 /* Check, if the PM timer is available */ 611 /* we trust the PM based calibration if possible */
410 deltapm = lapic_cal_pm2 - lapic_cal_pm1; 612 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
411 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 613 &delta);
412
413 if (deltapm) {
414 unsigned long mult;
415 u64 res;
416
417 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
418
419 if (deltapm > (pm_100ms - pm_thresh) &&
420 deltapm < (pm_100ms + pm_thresh)) {
421 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
422 } else {
423 res = (((u64) deltapm) * mult) >> 22;
424 do_div(res, 1000000);
425 printk(KERN_WARNING "APIC calibration not consistent "
426 "with PM Timer: %ldms instead of 100ms\n",
427 (long)res);
428 /* Correct the lapic counter value */
429 res = (((u64) delta) * pm_100ms);
430 do_div(res, deltapm);
431 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
432 "%lu (%ld)\n", (unsigned long) res, delta);
433 delta = (long) res;
434 }
435 pm_referenced = 1;
436 }
437 614
438 /* Calculate the scaled math multiplication factor */ 615 /* Calculate the scaled math multiplication factor */
439 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 616 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
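
Note: div_sc() builds the clockevent mult factor as (ticks << shift) / nsec, so later conversions can compute ticks from nanoseconds as (ns * mult) >> shift using integer math only. A minimal sketch of that scaled-math idea (a local re-implementation for illustration; the kernel helper lives in linux/clockchips.h):

#include <assert.h>
#include <stdint.h>

static uint64_t div_sc_example(uint64_t ticks, uint64_t nsec, int shift)
{
        return (ticks << shift) / nsec;
}

int main(void)
{
        /* 1,000,000 ticks per 1,000,000 ns => exactly 1 tick/ns at shift 32 */
        assert(div_sc_example(1000000, 1000000, 32) == (1ull << 32));
        return 0;
}
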
@@ -473,9 +650,12 @@ static int __init calibrate_APIC_clock(void)
473 return -1; 650 return -1;
474 } 651 }
475 652
476 local_apic_timer_verify_ok = 1; 653 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
477 654
478 /* We trust the pm timer based calibration */ 655 /*
656 * PM timer calibration failed or not turned on
657 * so lets try APIC timer based calibration
658 */
479 if (!pm_referenced) { 659 if (!pm_referenced) {
480 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 660 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
481 661
@@ -507,11 +687,11 @@ static int __init calibrate_APIC_clock(void)
507 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 687 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
508 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 688 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
509 else 689 else
510 local_apic_timer_verify_ok = 0; 690 levt->features |= CLOCK_EVT_FEAT_DUMMY;
511 } else 691 } else
512 local_irq_enable(); 692 local_irq_enable();
513 693
514 if (!local_apic_timer_verify_ok) { 694 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
515 printk(KERN_WARNING 695 printk(KERN_WARNING
516 "APIC timer disabled due to verification failure.\n"); 696 "APIC timer disabled due to verification failure.\n");
517 return -1; 697 return -1;
@@ -533,7 +713,8 @@ void __init setup_boot_APIC_clock(void)
533 * timer as a dummy clock event source on SMP systems, so the 713 * timer as a dummy clock event source on SMP systems, so the
534 * broadcast mechanism is used. On UP systems simply ignore it. 714 * broadcast mechanism is used. On UP systems simply ignore it.
535 */ 715 */
536 if (local_apic_timer_disabled) { 716 if (disable_apic_timer) {
717 printk(KERN_INFO "Disabling APIC timer\n");
537 /* No broadcast on UP ! */ 718 /* No broadcast on UP ! */
538 if (num_possible_cpus() > 1) { 719 if (num_possible_cpus() > 1) {
539 lapic_clockevent.mult = 1; 720 lapic_clockevent.mult = 1;
@@ -567,7 +748,7 @@ void __init setup_boot_APIC_clock(void)
567 setup_APIC_timer(); 748 setup_APIC_timer();
568} 749}
569 750
570void __devinit setup_secondary_APIC_clock(void) 751void __cpuinit setup_secondary_APIC_clock(void)
571{ 752{
572 setup_APIC_timer(); 753 setup_APIC_timer();
573} 754}
@@ -602,7 +783,11 @@ static void local_apic_timer_interrupt(void)
602 /* 783 /*
603 * the NMI deadlock-detector uses this. 784 * the NMI deadlock-detector uses this.
604 */ 785 */
786#ifdef CONFIG_X86_64
787 add_pda(apic_timer_irqs, 1);
788#else
605 per_cpu(irq_stat, cpu).apic_timer_irqs++; 789 per_cpu(irq_stat, cpu).apic_timer_irqs++;
790#endif
606 791
607 evt->event_handler(evt); 792 evt->event_handler(evt);
608} 793}
@@ -629,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
 629 * Besides, if we don't, timer interrupts ignore the global 814
630 * interrupt lock, which is the WrongThing (tm) to do. 815 * interrupt lock, which is the WrongThing (tm) to do.
631 */ 816 */
817#ifdef CONFIG_X86_64
818 exit_idle();
819#endif
632 irq_enter(); 820 irq_enter();
633 local_apic_timer_interrupt(); 821 local_apic_timer_interrupt();
634 irq_exit(); 822 irq_exit();
@@ -642,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier)
642} 830}
643 831
644/* 832/*
645 * Setup extended LVT, AMD specific (K8, family 10h)
646 *
647 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
648 * MCE interrupts are supported. Thus MCE offset must be set to 0.
649 */
650
651#define APIC_EILVT_LVTOFF_MCE 0
652#define APIC_EILVT_LVTOFF_IBS 1
653
654static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
655{
656 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
657 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
658 apic_write(reg, v);
659}
660
661u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
662{
663 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
664 return APIC_EILVT_LVTOFF_MCE;
665}
666
667u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
668{
669 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
670 return APIC_EILVT_LVTOFF_IBS;
671}
672
673/*
674 * Local APIC start and shutdown 833 * Local APIC start and shutdown
675 */ 834 */
676 835
@@ -715,7 +874,7 @@ void clear_local_APIC(void)
715 } 874 }
716 875
717 /* lets not touch this if we didn't frob it */ 876 /* lets not touch this if we didn't frob it */
 718#ifdef CONFIG_X86_MCE_P4THERMAL 877#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
719 if (maxlvt >= 5) { 878 if (maxlvt >= 5) {
720 v = apic_read(APIC_LVTTHMR); 879 v = apic_read(APIC_LVTTHMR);
721 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); 880 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
@@ -732,10 +891,6 @@ void clear_local_APIC(void)
732 if (maxlvt >= 4) 891 if (maxlvt >= 4)
733 apic_write(APIC_LVTPC, APIC_LVT_MASKED); 892 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
734 893
735#ifdef CONFIG_X86_MCE_P4THERMAL
736 if (maxlvt >= 5)
737 apic_write(APIC_LVTTHMR, APIC_LVT_MASKED);
738#endif
739 /* Integrated APIC (!82489DX) ? */ 894 /* Integrated APIC (!82489DX) ? */
740 if (lapic_is_integrated()) { 895 if (lapic_is_integrated()) {
741 if (maxlvt > 3) 896 if (maxlvt > 3)
@@ -750,7 +905,7 @@ void clear_local_APIC(void)
750 */ 905 */
751void disable_local_APIC(void) 906void disable_local_APIC(void)
752{ 907{
753 unsigned long value; 908 unsigned int value;
754 909
755 clear_local_APIC(); 910 clear_local_APIC();
756 911
@@ -762,6 +917,7 @@ void disable_local_APIC(void)
762 value &= ~APIC_SPIV_APIC_ENABLED; 917 value &= ~APIC_SPIV_APIC_ENABLED;
763 apic_write(APIC_SPIV, value); 918 apic_write(APIC_SPIV, value);
764 919
920#ifdef CONFIG_X86_32
765 /* 921 /*
766 * When LAPIC was disabled by the BIOS and enabled by the kernel, 922 * When LAPIC was disabled by the BIOS and enabled by the kernel,
767 * restore the disabled state. 923 * restore the disabled state.
@@ -773,6 +929,7 @@ void disable_local_APIC(void)
773 l &= ~MSR_IA32_APICBASE_ENABLE; 929 l &= ~MSR_IA32_APICBASE_ENABLE;
774 wrmsr(MSR_IA32_APICBASE, l, h); 930 wrmsr(MSR_IA32_APICBASE, l, h);
775 } 931 }
932#endif
776} 933}
777 934
778/* 935/*
@@ -789,11 +946,15 @@ void lapic_shutdown(void)
789 return; 946 return;
790 947
791 local_irq_save(flags); 948 local_irq_save(flags);
792 clear_local_APIC();
793 949
794 if (enabled_via_apicbase) 950#ifdef CONFIG_X86_32
951 if (!enabled_via_apicbase)
952 clear_local_APIC();
953 else
954#endif
795 disable_local_APIC(); 955 disable_local_APIC();
796 956
957
797 local_irq_restore(flags); 958 local_irq_restore(flags);
798} 959}
799 960
@@ -838,6 +999,12 @@ int __init verify_local_APIC(void)
838 */ 999 */
839 reg0 = apic_read(APIC_ID); 1000 reg0 = apic_read(APIC_ID);
840 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 1001 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1002 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
1003 reg1 = apic_read(APIC_ID);
1004 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1005 apic_write(APIC_ID, reg0);
1006 if (reg1 != (reg0 ^ APIC_ID_MASK))
1007 return 0;
841 1008
842 /* 1009 /*
843 * The next two are just to see if we have sane values. 1010 * The next two are just to see if we have sane values.
@@ -863,14 +1030,15 @@ void __init sync_Arb_IDs(void)
863 */ 1030 */
864 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1031 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
865 return; 1032 return;
1033
866 /* 1034 /*
867 * Wait for idle. 1035 * Wait for idle.
868 */ 1036 */
869 apic_wait_icr_idle(); 1037 apic_wait_icr_idle();
870 1038
871 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 1039 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
872 apic_write(APIC_ICR, 1040 apic_write(APIC_ICR, APIC_DEST_ALLINC |
873 APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); 1041 APIC_INT_LEVELTRIG | APIC_DM_INIT);
874} 1042}
875 1043
876/* 1044/*
@@ -878,7 +1046,7 @@ void __init sync_Arb_IDs(void)
878 */ 1046 */
879void __init init_bsp_APIC(void) 1047void __init init_bsp_APIC(void)
880{ 1048{
881 unsigned long value; 1049 unsigned int value;
882 1050
883 /* 1051 /*
884 * Don't do the setup now if we have a SMP BIOS as the 1052 * Don't do the setup now if we have a SMP BIOS as the
@@ -899,11 +1067,13 @@ void __init init_bsp_APIC(void)
899 value &= ~APIC_VECTOR_MASK; 1067 value &= ~APIC_VECTOR_MASK;
900 value |= APIC_SPIV_APIC_ENABLED; 1068 value |= APIC_SPIV_APIC_ENABLED;
901 1069
1070#ifdef CONFIG_X86_32
902 /* This bit is reserved on P4/Xeon and should be cleared */ 1071 /* This bit is reserved on P4/Xeon and should be cleared */
903 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 1072 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
904 (boot_cpu_data.x86 == 15)) 1073 (boot_cpu_data.x86 == 15))
905 value &= ~APIC_SPIV_FOCUS_DISABLED; 1074 value &= ~APIC_SPIV_FOCUS_DISABLED;
906 else 1075 else
1076#endif
907 value |= APIC_SPIV_FOCUS_DISABLED; 1077 value |= APIC_SPIV_FOCUS_DISABLED;
908 value |= SPURIOUS_APIC_VECTOR; 1078 value |= SPURIOUS_APIC_VECTOR;
909 apic_write(APIC_SPIV, value); 1079 apic_write(APIC_SPIV, value);
@@ -920,39 +1090,43 @@ void __init init_bsp_APIC(void)
920 1090
921static void __cpuinit lapic_setup_esr(void) 1091static void __cpuinit lapic_setup_esr(void)
922{ 1092{
923 unsigned long oldvalue, value, maxlvt; 1093 unsigned int oldvalue, value, maxlvt;
924 if (lapic_is_integrated() && !esr_disable) { 1094
925 /* !82489DX */ 1095 if (!lapic_is_integrated()) {
926 maxlvt = lapic_get_maxlvt(); 1096 printk(KERN_INFO "No ESR for 82489DX.\n");
927 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ 1097 return;
928 apic_write(APIC_ESR, 0); 1098 }
929 oldvalue = apic_read(APIC_ESR);
930 1099
931 /* enables sending errors */ 1100 if (esr_disable) {
932 value = ERROR_APIC_VECTOR;
933 apic_write(APIC_LVTERR, value);
934 /* 1101 /*
935 * spec says clear errors after enabling vector. 1102 * Something untraceable is creating bad interrupts on
1103 * secondary quads ... for the moment, just leave the
1104 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh
936 */ 1106 */
937 if (maxlvt > 3) 1107 printk(KERN_INFO "Leaving ESR disabled.\n");
938 apic_write(APIC_ESR, 0); 1108 return;
939 value = apic_read(APIC_ESR);
940 if (value != oldvalue)
941 apic_printk(APIC_VERBOSE, "ESR value before enabling "
942 "vector: 0x%08lx after: 0x%08lx\n",
943 oldvalue, value);
944 } else {
945 if (esr_disable)
946 /*
947 * Something untraceable is creating bad interrupts on
948 * secondary quads ... for the moment, just leave the
949 * ESR disabled - we can't do anything useful with the
950 * errors anyway - mbligh
951 */
952 printk(KERN_INFO "Leaving ESR disabled.\n");
953 else
954 printk(KERN_INFO "No ESR for 82489DX.\n");
955 } 1109 }
1110
1111 maxlvt = lapic_get_maxlvt();
1112 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1113 apic_write(APIC_ESR, 0);
1114 oldvalue = apic_read(APIC_ESR);
1115
1116 /* enables sending errors */
1117 value = ERROR_APIC_VECTOR;
1118 apic_write(APIC_LVTERR, value);
1119
1120 /*
1121 * spec says clear errors after enabling vector.
1122 */
1123 if (maxlvt > 3)
1124 apic_write(APIC_ESR, 0);
1125 value = apic_read(APIC_ESR);
1126 if (value != oldvalue)
1127 apic_printk(APIC_VERBOSE, "ESR value before enabling "
1128 "vector: 0x%08x after: 0x%08x\n",
1129 oldvalue, value);
956} 1130}
957 1131
958 1132
@@ -961,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
961 */ 1135 */
962void __cpuinit setup_local_APIC(void) 1136void __cpuinit setup_local_APIC(void)
963{ 1137{
964 unsigned long value, integrated; 1138 unsigned int value;
965 int i, j; 1139 int i, j;
966 1140
1141#ifdef CONFIG_X86_32
967 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1142 /* Pound the ESR really hard over the head with a big hammer - mbligh */
968 if (esr_disable) { 1143 if (lapic_is_integrated() && esr_disable) {
969 apic_write(APIC_ESR, 0); 1144 apic_write(APIC_ESR, 0);
970 apic_write(APIC_ESR, 0); 1145 apic_write(APIC_ESR, 0);
971 apic_write(APIC_ESR, 0); 1146 apic_write(APIC_ESR, 0);
972 apic_write(APIC_ESR, 0); 1147 apic_write(APIC_ESR, 0);
973 } 1148 }
1149#endif
974 1150
975 integrated = lapic_is_integrated(); 1151 preempt_disable();
976 1152
977 /* 1153 /*
978 * Double-check whether this APIC is really registered. 1154 * Double-check whether this APIC is really registered.
1155 * This is meaningless in clustered apic mode, so we skip it.
979 */ 1156 */
980 if (!apic_id_registered()) 1157 if (!apic_id_registered())
981 WARN_ON_ONCE(1); 1158 BUG();
982 1159
983 /* 1160 /*
984 * Intel recommends to set DFR, LDR and TPR before enabling 1161 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -1024,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
1024 */ 1201 */
1025 value |= APIC_SPIV_APIC_ENABLED; 1202 value |= APIC_SPIV_APIC_ENABLED;
1026 1203
1204#ifdef CONFIG_X86_32
1027 /* 1205 /*
1028 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 1206 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
1029 * certain networking cards. If high frequency interrupts are 1207 * certain networking cards. If high frequency interrupts are
@@ -1044,8 +1222,13 @@ void __cpuinit setup_local_APIC(void)
1044 * See also the comment in end_level_ioapic_irq(). --macro 1222 * See also the comment in end_level_ioapic_irq(). --macro
1045 */ 1223 */
1046 1224
1047 /* Enable focus processor (bit==0) */ 1225 /*
1226 * - enable focus processor (bit==0)
1227 * - 64bit mode always use processor focus
1228 * so no need to set it
1229 */
1048 value &= ~APIC_SPIV_FOCUS_DISABLED; 1230 value &= ~APIC_SPIV_FOCUS_DISABLED;
1231#endif
1049 1232
1050 /* 1233 /*
1051 * Set spurious IRQ vector 1234 * Set spurious IRQ vector
@@ -1082,25 +1265,178 @@ void __cpuinit setup_local_APIC(void)
1082 value = APIC_DM_NMI; 1265 value = APIC_DM_NMI;
1083 else 1266 else
1084 value = APIC_DM_NMI | APIC_LVT_MASKED; 1267 value = APIC_DM_NMI | APIC_LVT_MASKED;
1085 if (!integrated) /* 82489DX */ 1268 if (!lapic_is_integrated()) /* 82489DX */
1086 value |= APIC_LVT_LEVEL_TRIGGER; 1269 value |= APIC_LVT_LEVEL_TRIGGER;
1087 apic_write(APIC_LVT1, value); 1270 apic_write(APIC_LVT1, value);
1271
1272 preempt_enable();
1088} 1273}
1089 1274
1090void __cpuinit end_local_APIC_setup(void) 1275void __cpuinit end_local_APIC_setup(void)
1091{ 1276{
1092 unsigned long value;
1093
1094 lapic_setup_esr(); 1277 lapic_setup_esr();
1095 /* Disable the local apic timer */ 1278
1096 value = apic_read(APIC_LVTT); 1279#ifdef CONFIG_X86_32
1097 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1280 {
1098 apic_write(APIC_LVTT, value); 1281 unsigned int value;
1282 /* Disable the local apic timer */
1283 value = apic_read(APIC_LVTT);
1284 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1285 apic_write(APIC_LVTT, value);
1286 }
1287#endif
1099 1288
1100 setup_apic_nmi_watchdog(NULL); 1289 setup_apic_nmi_watchdog(NULL);
1101 apic_pm_activate(); 1290 apic_pm_activate();
1102} 1291}
1103 1292
1293#ifdef HAVE_X2APIC
1294void check_x2apic(void)
1295{
1296 int msr, msr2;
1297
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299
1300 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops;
1304 }
1305}
1306
1307void enable_x2apic(void)
1308{
1309 int msr, msr2;
1310
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 }
1316}
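
Note: check_x2apic()/enable_x2apic() above test and set the x2APIC enable bit in IA32_APIC_BASE (MSR 0x1b): bit 11 is the global APIC enable, bit 10 (X2APIC_ENABLE) selects x2APIC mode. A standalone sketch of the bit manipulation on a fake MSR value:

#include <assert.h>

#define X2APIC_ENABLE   (1u << 10)

int main(void)
{
        unsigned int msr = 0xfee00000u | (1u << 11);    /* base + xAPIC enable */

        assert(!(msr & X2APIC_ENABLE));
        msr |= X2APIC_ENABLE;   /* what enable_x2apic() writes back */
        assert(msr & X2APIC_ENABLE);
        return 0;
}
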
1317
1318void enable_IR_x2apic(void)
1319{
1320#ifdef CONFIG_INTR_REMAP
1321 int ret;
1322 unsigned long flags;
1323
1324 if (!cpu_has_x2apic)
1325 return;
1326
1327 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO
1329 "Skipped enabling x2apic and Interrupt-remapping "
1330 "because of nox2apic\n");
1331 return;
1332 }
1333
1334 if (x2apic_preenabled && disable_x2apic)
 1335 panic("BIOS already enabled x2apic, can't enforce nox2apic");
1336
1337 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO
1339 "Skipped enabling x2apic and Interrupt-remapping "
1340 "because of skipping io-apic setup\n");
1341 return;
1342 }
1343
1344 ret = dmar_table_init();
1345 if (ret) {
1346 printk(KERN_INFO
1347 "dmar_table_init() failed with %d:\n", ret);
1348
1349 if (x2apic_preenabled)
 1350 panic("x2apic enabled by BIOS, but IR enabling failed");
1351 else
1352 printk(KERN_INFO
 1353 "Not enabling x2apic, Intr-remapping\n");
1354 return;
1355 }
1356
1357 local_irq_save(flags);
1358 mask_8259A();
1359
1360 ret = save_mask_IO_APIC_setup();
1361 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
1363 goto end;
1364 }
1365
1366 ret = enable_intr_remapping(1);
1367
1368 if (ret && x2apic_preenabled) {
1369 local_irq_restore(flags);
 1370 panic("x2apic enabled by BIOS, but IR enabling failed");
1371 }
1372
1373 if (ret)
1374 goto end_restore;
1375
1376 if (!x2apic) {
1377 x2apic = 1;
1378 apic_ops = &x2apic_ops;
1379 enable_x2apic();
1380 }
1381
1382end_restore:
1383 if (ret)
1384 /*
1385 * IR enabling failed
1386 */
1387 restore_IO_APIC_setup();
1388 else
1389 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1390
1391end:
1392 unmask_8259A();
1393 local_irq_restore(flags);
1394
1395 if (!ret) {
1396 if (!x2apic_preenabled)
1397 printk(KERN_INFO
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else
1400 printk(KERN_INFO
1401 "Enabled Interrupt-remapping\n");
1402 } else
1403 printk(KERN_ERR
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else
1406 if (!cpu_has_x2apic)
1407 return;
1408
1409 if (x2apic_preenabled)
 1410 panic("x2apic enabled prior to OS handover,"
1411 " enable CONFIG_INTR_REMAP");
1412
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n");
1415#endif
1416
1417 return;
1418}
1419#endif /* HAVE_X2APIC */
1420
1421#ifdef CONFIG_X86_64
1422/*
1423 * Detect and enable local APICs on non-SMP boards.
1424 * Original code written by Keir Fraser.
1425 * On AMD64 we trust the BIOS - if it says no APIC it is likely
1426 * not correctly set up (usually the APIC timer won't work etc.)
1427 */
1428static int __init detect_init_APIC(void)
1429{
1430 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n");
1432 return -1;
1433 }
1434
1435 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1436 boot_cpu_physical_apicid = 0;
1437 return 0;
1438}
1439#else
1104/* 1440/*
1105 * Detect and initialize APIC 1441 * Detect and initialize APIC
1106 */ 1442 */
@@ -1179,12 +1515,46 @@ no_apic:
1179 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1515 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1180 return -1; 1516 return -1;
1181} 1517}
1518#endif
1519
1520#ifdef CONFIG_X86_64
1521void __init early_init_lapic_mapping(void)
1522{
1523 unsigned long phys_addr;
1524
1525 /*
 1526 * If no local APIC can be found then bail out:
 1527 * it means there is no MP table and no MADT
1528 */
1529 if (!smp_found_config)
1530 return;
1531
1532 phys_addr = mp_lapic_addr;
1533
1534 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1535 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1536 APIC_BASE, phys_addr);
1537
1538 /*
1539 * Fetch the APIC ID of the BSP in case we have a
1540 * default configuration (or the MP table is broken).
1541 */
1542 boot_cpu_physical_apicid = read_apic_id();
1543}
1544#endif
1182 1545
1183/** 1546/**
1184 * init_apic_mappings - initialize APIC mappings 1547 * init_apic_mappings - initialize APIC mappings
1185 */ 1548 */
1186void __init init_apic_mappings(void) 1549void __init init_apic_mappings(void)
1187{ 1550{
1551#ifdef HAVE_X2APIC
1552 if (x2apic) {
1553 boot_cpu_physical_apicid = read_apic_id();
1554 return;
1555 }
1556#endif
1557
1188 /* 1558 /*
1189 * If no local APIC can be found then set up a fake all 1559 * If no local APIC can be found then set up a fake all
1190 * zeroes page to simulate the local APIC and another 1560 * zeroes page to simulate the local APIC and another
@@ -1197,27 +1567,36 @@ void __init init_apic_mappings(void)
1197 apic_phys = mp_lapic_addr; 1567 apic_phys = mp_lapic_addr;
1198 1568
1199 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1569 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1200 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, 1570 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1201 apic_phys); 1571 APIC_BASE, apic_phys);
1202 1572
1203 /* 1573 /*
1204 * Fetch the APIC ID of the BSP in case we have a 1574 * Fetch the APIC ID of the BSP in case we have a
1205 * default configuration (or the MP table is broken). 1575 * default configuration (or the MP table is broken).
1206 */ 1576 */
1207 if (boot_cpu_physical_apicid == -1U) 1577 if (boot_cpu_physical_apicid == -1U)
1208 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1578 boot_cpu_physical_apicid = read_apic_id();
1209
1210} 1579}
1211 1580
1212/* 1581/*
1213 * This initializes the IO-APIC and APIC hardware if this is 1582 * This initializes the IO-APIC and APIC hardware if this is
1214 * a UP kernel. 1583 * a UP kernel.
1215 */ 1584 */
1216
1217int apic_version[MAX_APICS]; 1585int apic_version[MAX_APICS];
1218 1586
1219int __init APIC_init_uniprocessor(void) 1587int __init APIC_init_uniprocessor(void)
1220{ 1588{
1589#ifdef CONFIG_X86_64
1590 if (disable_apic) {
1591 printk(KERN_INFO "Apic disabled\n");
1592 return -1;
1593 }
1594 if (!cpu_has_apic) {
1595 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n");
1597 return -1;
1598 }
1599#else
1221 if (!smp_found_config && !cpu_has_apic) 1600 if (!smp_found_config && !cpu_has_apic)
1222 return -1; 1601 return -1;
1223 1602
@@ -1226,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
1226 */ 1605 */
1227 if (!cpu_has_apic && 1606 if (!cpu_has_apic &&
1228 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1229 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
1230 boot_cpu_physical_apicid); 1609 boot_cpu_physical_apicid);
1231 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1232 return -1; 1611 return -1;
1233 } 1612 }
1613#endif
1234 1614
1235 verify_local_APIC(); 1615#ifdef HAVE_X2APIC
1616 enable_IR_x2apic();
1617#endif
1618#ifdef CONFIG_X86_64
1619 setup_apic_routing();
1620#endif
1236 1621
1622 verify_local_APIC();
1237 connect_bsp_APIC(); 1623 connect_bsp_APIC();
1238 1624
1625#ifdef CONFIG_X86_64
1626 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1627#else
1239 /* 1628 /*
1240 * Hack: In case of kdump, after a crash, kernel might be booting 1629 * Hack: In case of kdump, after a crash, kernel might be booting
1241 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1630 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1242 * might be zero if read from MP tables. Get it from LAPIC. 1631 * might be zero if read from MP tables. Get it from LAPIC.
1243 */ 1632 */
1244#ifdef CONFIG_CRASH_DUMP 1633# ifdef CONFIG_CRASH_DUMP
1245 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1634 boot_cpu_physical_apicid = read_apic_id();
1635# endif
1246#endif 1636#endif
1247 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1637 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1248
1249 setup_local_APIC(); 1638 setup_local_APIC();
1250 1639
1640#ifdef CONFIG_X86_64
1641 /*
1642 * Now enable IO-APICs, actually call clear_IO_APIC
1643 * We need clear_IO_APIC before enabling vector on BP
1644 */
1645 if (!skip_ioapic_setup && nr_ioapics)
1646 enable_IO_APIC();
1647#endif
1648
1251#ifdef CONFIG_X86_IO_APIC 1649#ifdef CONFIG_X86_IO_APIC
1252 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) 1650 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1253#endif 1651#endif
1254 localise_nmi_watchdog(); 1652 localise_nmi_watchdog();
1255 end_local_APIC_setup(); 1653 end_local_APIC_setup();
1654
1256#ifdef CONFIG_X86_IO_APIC 1655#ifdef CONFIG_X86_IO_APIC
1257 if (smp_found_config) 1656 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1258 if (!skip_ioapic_setup && nr_ioapics) 1657 setup_IO_APIC();
1259 setup_IO_APIC(); 1658# ifdef CONFIG_X86_64
1659 else
1660 nr_ioapics = 0;
1661# endif
1260#endif 1662#endif
1663
1664#ifdef CONFIG_X86_64
1665 setup_boot_APIC_clock();
1666 check_nmi_watchdog();
1667#else
1261 setup_boot_clock(); 1668 setup_boot_clock();
1669#endif
1262 1670
1263 return 0; 1671 return 0;
1264} 1672}
@@ -1272,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
1272 */ 1680 */
1273void smp_spurious_interrupt(struct pt_regs *regs) 1681void smp_spurious_interrupt(struct pt_regs *regs)
1274{ 1682{
1275 unsigned long v; 1683 u32 v;
1276 1684
1685#ifdef CONFIG_X86_64
1686 exit_idle();
1687#endif
1277 irq_enter(); 1688 irq_enter();
1278 /* 1689 /*
1279 * Check if this really is a spurious interrupt and ACK it 1690 * Check if this really is a spurious interrupt and ACK it
@@ -1284,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1284 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1285 ack_APIC_irq(); 1696 ack_APIC_irq();
1286 1697
1698#ifdef CONFIG_X86_64
1699 add_pda(irq_spurious_count, 1);
1700#else
1287 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1288 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1289 "should never happen.\n", smp_processor_id()); 1703 "should never happen.\n", smp_processor_id());
1290 __get_cpu_var(irq_stat).irq_spurious_count++; 1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1291 irq_exit(); 1706 irq_exit();
1292} 1707}
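
Note: the spurious check above tests bit (SPURIOUS_APIC_VECTOR & 0x1f) because the APIC exposes its 256-bit ISR bitmap as eight 32-bit registers spaced 16 bytes apart; the register offset for a vector is ((vector & ~0x1f) >> 1). A standalone check for SPURIOUS_APIC_VECTOR, which is 0xff in the kernel:

#include <assert.h>

#define SPURIOUS_APIC_VECTOR    0xff

int main(void)
{
        assert(((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1) == 0x70);  /* 8th ISR reg */
        assert((SPURIOUS_APIC_VECTOR & 0x1f) == 31);            /* top bit */
        return 0;
}
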
1293 1708
@@ -1296,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1296 */ 1711 */
1297void smp_error_interrupt(struct pt_regs *regs) 1712void smp_error_interrupt(struct pt_regs *regs)
1298{ 1713{
1299 unsigned long v, v1; 1714 u32 v, v1;
1300 1715
1716#ifdef CONFIG_X86_64
1717 exit_idle();
1718#endif
1301 irq_enter(); 1719 irq_enter();
1302 /* First tickle the hardware, only then report what went on. -- REW */ 1720 /* First tickle the hardware, only then report what went on. -- REW */
1303 v = apic_read(APIC_ESR); 1721 v = apic_read(APIC_ESR);
@@ -1316,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs)
1316 6: Received illegal vector 1734 6: Received illegal vector
1317 7: Illegal register address 1735 7: Illegal register address
1318 */ 1736 */
1319 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1320 smp_processor_id(), v , v1); 1738 smp_processor_id(), v , v1);
1321 irq_exit(); 1739 irq_exit();
1322} 1740}
1323 1741
1324#ifdef CONFIG_SMP
1325void __init smp_intr_init(void)
1326{
1327 /*
1328 * IRQ0 must be given a fixed assignment and initialized,
1329 * because it's used before the IO-APIC is set up.
1330 */
1331 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1332
1333 /*
1334 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1335 * IPI, driven by wakeup.
1336 */
1337 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1338
1339 /* IPI for invalidation */
1340 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1341
1342 /* IPI for generic function call */
1343 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1344
1345 /* IPI for single call function */
1346 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1347 call_function_single_interrupt);
1348}
1349#endif
1350
1351/*
1352 * Initialize APIC interrupts
1353 */
1354void __init apic_intr_init(void)
1355{
1356#ifdef CONFIG_SMP
1357 smp_intr_init();
1358#endif
1359 /* self generated IPI for local APIC timer */
1360 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1361
1362 /* IPI vectors for APIC spurious and error interrupts */
1363 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1364 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1365
1366 /* thermal monitor LVT interrupt */
1367#ifdef CONFIG_X86_MCE_P4THERMAL
1368 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1369#endif
1370}
1371
1372/** 1742/**
1373 * connect_bsp_APIC - attach the APIC to the interrupt system 1743 * connect_bsp_APIC - attach the APIC to the interrupt system
1374 */ 1744 */
1375void __init connect_bsp_APIC(void) 1745void __init connect_bsp_APIC(void)
1376{ 1746{
1747#ifdef CONFIG_X86_32
1377 if (pic_mode) { 1748 if (pic_mode) {
1378 /* 1749 /*
1379 * Do not trust the local APIC being empty at bootup. 1750 * Do not trust the local APIC being empty at bootup.
@@ -1388,6 +1759,7 @@ void __init connect_bsp_APIC(void)
1388 outb(0x70, 0x22); 1759 outb(0x70, 0x22);
1389 outb(0x01, 0x23); 1760 outb(0x01, 0x23);
1390 } 1761 }
1762#endif
1391 enable_apic_mode(); 1763 enable_apic_mode();
1392} 1764}
1393 1765
@@ -1400,6 +1772,9 @@ void __init connect_bsp_APIC(void)
1400 */ 1772 */
1401void disconnect_bsp_APIC(int virt_wire_setup) 1773void disconnect_bsp_APIC(int virt_wire_setup)
1402{ 1774{
1775 unsigned int value;
1776
1777#ifdef CONFIG_X86_32
1403 if (pic_mode) { 1778 if (pic_mode) {
1404 /* 1779 /*
1405 * Put the board back into PIC mode (has an effect only on 1780 * Put the board back into PIC mode (has an effect only on
@@ -1411,54 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1411 "entering PIC mode.\n"); 1786 "entering PIC mode.\n");
1412 outb(0x70, 0x22); 1787 outb(0x70, 0x22);
1413 outb(0x00, 0x23); 1788 outb(0x00, 0x23);
1414 } else { 1789 return;
1415 /* Go back to Virtual Wire compatibility mode */ 1790 }
1416 unsigned long value; 1791#endif
1417 1792
1418 /* For the spurious interrupt use vector F, and enable it */ 1793 /* Go back to Virtual Wire compatibility mode */
1419 value = apic_read(APIC_SPIV); 1794
1420 value &= ~APIC_VECTOR_MASK; 1795 /* For the spurious interrupt use vector F, and enable it */
1421 value |= APIC_SPIV_APIC_ENABLED; 1796 value = apic_read(APIC_SPIV);
1422 value |= 0xf; 1797 value &= ~APIC_VECTOR_MASK;
1423 apic_write(APIC_SPIV, value); 1798 value |= APIC_SPIV_APIC_ENABLED;
1424 1799 value |= 0xf;
1425 if (!virt_wire_setup) { 1800 apic_write(APIC_SPIV, value);
1426 /*
1427 * For LVT0 make it edge triggered, active high,
1428 * external and enabled
1429 */
1430 value = apic_read(APIC_LVT0);
1431 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1432 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1433 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1434 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1435 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1436 apic_write(APIC_LVT0, value);
1437 } else {
1438 /* Disable LVT0 */
1439 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1440 }
1441 1801
1802 if (!virt_wire_setup) {
1442 /* 1803 /*
1443 * For LVT1 make it edge triggered, active high, nmi and 1804 * For LVT0 make it edge triggered, active high,
1444 * enabled 1805 * external and enabled
1445 */ 1806 */
1446 value = apic_read(APIC_LVT1); 1807 value = apic_read(APIC_LVT0);
1447 value &= ~( 1808 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1448 APIC_MODE_MASK | APIC_SEND_PENDING |
1449 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1809 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1450 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1810 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1451 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1811 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1452 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1812 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1453 apic_write(APIC_LVT1, value); 1813 apic_write(APIC_LVT0, value);
1814 } else {
1815 /* Disable LVT0 */
1816 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1454 } 1817 }
1818
1819 /*
1820 * For LVT1 make it edge triggered, active high,
1821 * nmi and enabled
1822 */
1823 value = apic_read(APIC_LVT1);
1824 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1825 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1826 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1827 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1828 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1829 apic_write(APIC_LVT1, value);
1455} 1830}
1456 1831
1457void __cpuinit generic_processor_info(int apicid, int version) 1832void __cpuinit generic_processor_info(int apicid, int version)
1458{ 1833{
1459 int cpu; 1834 int cpu;
1460 cpumask_t tmp_map; 1835 cpumask_t tmp_map;
1461 physid_mask_t phys_cpu;
1462 1836
1463 /* 1837 /*
1464 * Validate version 1838 * Validate version
@@ -1471,9 +1845,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1471 } 1845 }
1472 apic_version[apicid] = version; 1846 apic_version[apicid] = version;
1473 1847
1474 phys_cpu = apicid_to_cpu_present(apicid);
1475 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1476
1477 if (num_processors >= NR_CPUS) { 1848 if (num_processors >= NR_CPUS) {
1478 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1849 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1479 " Processor ignored.\n", NR_CPUS); 1850 " Processor ignored.\n", NR_CPUS);
@@ -1484,17 +1855,19 @@ void __cpuinit generic_processor_info(int apicid, int version)
1484 cpus_complement(tmp_map, cpu_present_map); 1855 cpus_complement(tmp_map, cpu_present_map);
1485 cpu = first_cpu(tmp_map); 1856 cpu = first_cpu(tmp_map);
1486 1857
1487 if (apicid == boot_cpu_physical_apicid) 1858 physid_set(apicid, phys_cpu_present_map);
1859 if (apicid == boot_cpu_physical_apicid) {
1488 /* 1860 /*
1489 * x86_bios_cpu_apicid is required to have processors listed 1861 * x86_bios_cpu_apicid is required to have processors listed
1490 * in same order as logical cpu numbers. Hence the first 1862 * in same order as logical cpu numbers. Hence the first
1491 * entry is BSP, and so on. 1863 * entry is BSP, and so on.
1492 */ 1864 */
1493 cpu = 0; 1865 cpu = 0;
1494 1866 }
1495 if (apicid > max_physical_apicid) 1867 if (apicid > max_physical_apicid)
1496 max_physical_apicid = apicid; 1868 max_physical_apicid = apicid;
1497 1869
1870#ifdef CONFIG_X86_32
1498 /* 1871 /*
1499 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1872 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1500 * but we need to work other dependencies like SMP_SUSPEND etc 1873 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1514,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1514 def_to_bigsmp = 1; 1887 def_to_bigsmp = 1;
1515 } 1888 }
1516 } 1889 }
1517#ifdef CONFIG_SMP 1890#endif
1891
1892#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1518 /* are we being called early in kernel startup? */ 1893 /* are we being called early in kernel startup? */
1519 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1894 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1520 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1895 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1527,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1527 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1902 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1528 } 1903 }
1529#endif 1904#endif
1905
1530 cpu_set(cpu, cpu_possible_map); 1906 cpu_set(cpu, cpu_possible_map);
1531 cpu_set(cpu, cpu_present_map); 1907 cpu_set(cpu, cpu_present_map);
1532} 1908}
1533 1909
1910#ifdef CONFIG_X86_64
1911int hard_smp_processor_id(void)
1912{
1913 return read_apic_id();
1914}
1915#endif
1916
1534/* 1917/*
1535 * Power management 1918 * Power management
1536 */ 1919 */
1537#ifdef CONFIG_PM 1920#ifdef CONFIG_PM
1538 1921
1539static struct { 1922static struct {
1923 /*
1924 * 'active' is true if the local APIC was enabled by us and
1925 * not the BIOS; this signifies that we are also responsible
1926 * for disabling it before entering apm/acpi suspend
1927 */
1540 int active; 1928 int active;
1541 /* r/w apic fields */ 1929 /* r/w apic fields */
1542 unsigned int apic_id; 1930 unsigned int apic_id;
@@ -1577,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1577 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1965 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1578 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1966 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1579 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1967 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1580#ifdef CONFIG_X86_MCE_P4THERMAL 1968#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1581 if (maxlvt >= 5) 1969 if (maxlvt >= 5)
1582 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1970 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1583#endif 1971#endif
@@ -1601,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev)
1601 1989
1602 local_irq_save(flags); 1990 local_irq_save(flags);
1603 1991
1604 /* 1992#ifdef HAVE_X2APIC
1605 * Make sure the APICBASE points to the right address 1993 if (x2apic)
1606 * 1994 enable_x2apic();
1607 * FIXME! This will be wrong if we ever support suspend on 1995 else
1608 * SMP! We'll need to do this as part of the CPU restore! 1996#endif
1609 */ 1997 {
1610 rdmsr(MSR_IA32_APICBASE, l, h); 1998 /*
1611 l &= ~MSR_IA32_APICBASE_BASE; 1999 * Make sure the APICBASE points to the right address
1612 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 2000 *
1613 wrmsr(MSR_IA32_APICBASE, l, h); 2001 * FIXME! This will be wrong if we ever support suspend on
2002 * SMP! We'll need to do this as part of the CPU restore!
2003 */
2004 rdmsr(MSR_IA32_APICBASE, l, h);
2005 l &= ~MSR_IA32_APICBASE_BASE;
2006 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
2007 wrmsr(MSR_IA32_APICBASE, l, h);
2008 }
1614 2009
1615 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 2010 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1616 apic_write(APIC_ID, apic_pm_state.apic_id); 2011 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1620,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev)
1620 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 2015 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1621 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 2016 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1622 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 2017 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1623#ifdef CONFIG_X86_MCE_P4THERMAL 2018#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1624 if (maxlvt >= 5) 2019 if (maxlvt >= 5)
1625 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 2020 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1626#endif 2021#endif
@@ -1634,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev)
1634 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 2029 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1635 apic_write(APIC_ESR, 0); 2030 apic_write(APIC_ESR, 0);
1636 apic_read(APIC_ESR); 2031 apic_read(APIC_ESR);
2032
1637 local_irq_restore(flags); 2033 local_irq_restore(flags);
2034
1638 return 0; 2035 return 0;
1639} 2036}
1640 2037
@@ -1654,7 +2051,7 @@ static struct sys_device device_lapic = {
1654 .cls = &lapic_sysclass, 2051 .cls = &lapic_sysclass,
1655}; 2052};
1656 2053
1657static void __devinit apic_pm_activate(void) 2054static void __cpuinit apic_pm_activate(void)
1658{ 2055{
1659 apic_pm_state.active = 1; 2056 apic_pm_state.active = 1;
1660} 2057}
@@ -1680,30 +2077,101 @@ static void apic_pm_activate(void) { }
1680 2077
1681#endif /* CONFIG_PM */ 2078#endif /* CONFIG_PM */
1682 2079
2080#ifdef CONFIG_X86_64
1683/* 2081/*
1684 * APIC command line parameters 2082 * apic_is_clustered_box() -- Check if we can expect good TSC
2083 *
2084 * Thus far, the major user of this is IBM's Summit2 series:
2085 *
2086 * Clustered boxes may have unsynced TSC problems if they are
2087 * multi-chassis. Use available data to take a good guess.
2088 * If in doubt, go HPET.
1685 */ 2089 */
1686static int __init parse_lapic(char *arg) 2090__cpuinit int apic_is_clustered_box(void)
1687{ 2091{
1688 force_enable_local_apic = 1; 2092 int i, clusters, zeros;
1689 return 0; 2093 unsigned id;
2094 u16 *bios_cpu_apicid;
2095 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2096
2097 /*
2098 * there is not this kind of box with AMD CPU yet.
2099 * Some AMD box with quadcore cpu and 8 sockets apicid
2100 * will be [4, 0x23] or [8, 0x27] could be thought to
2101 * vsmp box still need checking...
2102 */
2103 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2104 return 0;
2105
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108
2109 for (i = 0; i < NR_CPUS; i++) {
2110 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i];
2113 }
2114 else if (i < nr_cpu_ids) {
2115 if (cpu_present(i))
2116 id = per_cpu(x86_bios_cpu_apicid, i);
2117 else
2118 continue;
2119 }
2120 else
2121 break;
2122
2123 if (id != BAD_APICID)
2124 __set_bit(APIC_CLUSTERID(id), clustermap);
2125 }
2126
2127 /* Problem: Partially populated chassis may not have CPUs in some of
2128 * the APIC clusters they have been allocated. Only present CPUs have
2129 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
2130 * Since clusters are allocated sequentially, count zeros only if
2131 * they are bounded by ones.
2132 */
2133 clusters = 0;
2134 zeros = 0;
2135 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
2136 if (test_bit(i, clustermap)) {
2137 clusters += 1 + zeros;
2138 zeros = 0;
2139 } else
2140 ++zeros;
2141 }
2142
2143 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2144 * not guaranteed to be synced between boards
2145 */
2146 if (is_vsmp_box() && clusters > 1)
2147 return 1;
2148
2149 /*
2150 * If clusters > 2, then the box should be multi-chassis.
2151 * May have to revisit this when multi-core + hyperthreaded CPUs come
2152 * out, but AFAIK this will work even for them.
2153 */
2154 return (clusters > 2);
1690} 2155}
1691early_param("lapic", parse_lapic); 2156#endif
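
The zero-counting rule in the loop above is easiest to see with concrete numbers. A minimal userspace sketch, assuming a hypothetical five-entry cluster map with clusters 0 and 3 populated (none of these values come from real hardware): the two empty clusters between them are counted, the trailing empty cluster is not.

#include <stdio.h>

int main(void)
{
	/* hypothetical map: clusters 0 and 3 populated */
	int clustermap[5] = { 1, 0, 0, 1, 0 };
	int clusters = 0, zeros = 0, i;

	for (i = 0; i < 5; i++) {
		if (clustermap[i]) {
			/* a populated cluster also claims the gap before it */
			clusters += 1 + zeros;
			zeros = 0;
		} else
			++zeros;
	}
	/* prints "clusters = 4"; the trailing zero at i == 4 is ignored */
	printf("clusters = %d\n", clusters);
	return 0;
}
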
1692 2157
1693static int __init parse_nolapic(char *arg) 2158/*
2159 * APIC command line parameters
2160 */
2161static int __init setup_disableapic(char *arg)
1694{ 2162{
1695 disable_apic = 1; 2163 disable_apic = 1;
1696 setup_clear_cpu_cap(X86_FEATURE_APIC); 2164 setup_clear_cpu_cap(X86_FEATURE_APIC);
1697 return 0; 2165 return 0;
1698} 2166}
1699early_param("nolapic", parse_nolapic); 2167early_param("disableapic", setup_disableapic);
1700 2168
1701static int __init parse_disable_lapic_timer(char *arg) 2169/* same as disableapic, for compatibility */
2170static int __init setup_nolapic(char *arg)
1702{ 2171{
1703 local_apic_timer_disabled = 1; 2172 return setup_disableapic(arg);
1704 return 0;
1705} 2173}
1706early_param("nolapic_timer", parse_disable_lapic_timer); 2174early_param("nolapic", setup_nolapic);
1707 2175
1708static int __init parse_lapic_timer_c2_ok(char *arg) 2176static int __init parse_lapic_timer_c2_ok(char *arg)
1709{ 2177{
@@ -1712,15 +2180,39 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1712} 2180}
1713early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 2181early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1714 2182
2183static int __init parse_disable_apic_timer(char *arg)
2184{
2185 disable_apic_timer = 1;
2186 return 0;
2187}
2188early_param("noapictimer", parse_disable_apic_timer);
2189
2190static int __init parse_nolapic_timer(char *arg)
2191{
2192 disable_apic_timer = 1;
2193 return 0;
2194}
2195early_param("nolapic_timer", parse_nolapic_timer);
2196
1715static int __init apic_set_verbosity(char *arg) 2197static int __init apic_set_verbosity(char *arg)
1716{ 2198{
1717 if (!arg) 2199 if (!arg) {
2200#ifdef CONFIG_X86_64
2201 skip_ioapic_setup = 0;
2202 return 0;
2203#endif
1718 return -EINVAL; 2204 return -EINVAL;
2205 }
1719 2206
1720 if (strcmp(arg, "debug") == 0) 2207 if (strcmp("debug", arg) == 0)
1721 apic_verbosity = APIC_DEBUG; 2208 apic_verbosity = APIC_DEBUG;
1722 else if (strcmp(arg, "verbose") == 0) 2209 else if (strcmp("verbose", arg) == 0)
1723 apic_verbosity = APIC_VERBOSE; 2210 apic_verbosity = APIC_VERBOSE;
2211 else {
2212 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
2213 " use apic=verbose or apic=debug\n", arg);
2214 return -EINVAL;
2215 }
1724 2216
1725 return 0; 2217 return 0;
1726} 2218}
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 446c062e831..00000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1390 +0,0 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/ioport.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30
31#include <asm/atomic.h>
32#include <asm/smp.h>
33#include <asm/mtrr.h>
34#include <asm/mpspec.h>
35#include <asm/hpet.h>
36#include <asm/pgalloc.h>
37#include <asm/nmi.h>
38#include <asm/idle.h>
39#include <asm/proto.h>
40#include <asm/timex.h>
41#include <asm/apic.h>
42
43#include <mach_ipi.h>
44#include <mach_apic.h>
45
46static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata;
48int disable_apic;
49
50/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok;
52EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
53
54/*
55 * Debug level, exported for io_apic.c
56 */
57unsigned int apic_verbosity;
58
59/* Have we found an MP table */
60int smp_found_config;
61
62static struct resource lapic_resource = {
63 .name = "Local APIC",
64 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
65};
66
67static unsigned int calibration_result;
68
69static int lapic_next_event(unsigned long delta,
70 struct clock_event_device *evt);
71static void lapic_timer_setup(enum clock_event_mode mode,
72 struct clock_event_device *evt);
73static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void);
75
76static struct clock_event_device lapic_clockevent = {
77 .name = "lapic",
78 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
79 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
80 .shift = 32,
81 .set_mode = lapic_timer_setup,
82 .set_next_event = lapic_next_event,
83 .broadcast = lapic_timer_broadcast,
84 .rating = 100,
85 .irq = -1,
86};
87static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
88
89static unsigned long apic_phys;
90
91unsigned long mp_lapic_addr;
92
93/*
94 * Get the LAPIC version
95 */
96static inline int lapic_get_version(void)
97{
98 return GET_APIC_VERSION(apic_read(APIC_LVR));
99}
100
101/*
102 * Check if the APIC is integrated or a separate chip
103 */
104static inline int lapic_is_integrated(void)
105{
106 return 1;
107}
108
109/*
110 * Check whether this is a modern or a first-generation APIC
111 */
112static int modern_apic(void)
113{
114 /* AMD systems use old APIC versions, so check the CPU */
115 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
116 boot_cpu_data.x86 >= 0xf)
117 return 1;
118 return lapic_get_version() >= 0x14;
119}
120
121void apic_wait_icr_idle(void)
122{
123 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
124 cpu_relax();
125}
126
127u32 safe_apic_wait_icr_idle(void)
128{
129 u32 send_status;
130 int timeout;
131
132 timeout = 0;
133 do {
134 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
135 if (!send_status)
136 break;
137 udelay(100);
138 } while (timeout++ < 1000);
139
140 return send_status;
141}
142
143/**
144 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
145 */
146void __cpuinit enable_NMI_through_LVT0(void)
147{
148 unsigned int v;
149
150 /* unmask and set to NMI */
151 v = APIC_DM_NMI;
152 apic_write(APIC_LVT0, v);
153}
154
155/**
156 * lapic_get_maxlvt - get the maximum number of local vector table entries
157 */
158int lapic_get_maxlvt(void)
159{
160 unsigned int v, maxlvt;
161
162 v = apic_read(APIC_LVR);
163 maxlvt = GET_APIC_MAXLVT(v);
164 return maxlvt;
165}
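
GET_APIC_MAXLVT() pulls the "Max LVT Entry" field out of the version register; in the SDM layout that field occupies bits 23:16 of LVR. A sketch with the mask and shift written out (the sample LVR value is made up):

#include <stdio.h>

/* bits 23:16 of the Local APIC version register: highest LVT entry */
static unsigned int get_maxlvt(unsigned int lvr)
{
	return (lvr >> 16) & 0xff;
}

int main(void)
{
	unsigned int lvr = 0x00050014;	/* hypothetical: version 0x14, maxlvt 5 */

	printf("maxlvt = %u\n", get_maxlvt(lvr));	/* prints 5 */
	return 0;
}
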
166
167/*
168 * This function sets up the local APIC timer, with a timeout of
169 * 'clocks' APIC bus clock. During calibration we actually call
170 * this function twice on the boot CPU, once with a bogus timeout
171 * value, second time for real. The other (noncalibrating) CPUs
172 * call this function only once, with the real, calibrated value.
173 *
174 * We do reads before writes even if unnecessary, to get around the
175 * P5 APIC double write bug.
176 */
177
178static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
179{
180 unsigned int lvtt_value, tmp_value;
181
182 lvtt_value = LOCAL_TIMER_VECTOR;
183 if (!oneshot)
184 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
185 if (!irqen)
186 lvtt_value |= APIC_LVT_MASKED;
187
188 apic_write(APIC_LVTT, lvtt_value);
189
190 /*
191 * Divide PICLK by 16
192 */
193 tmp_value = apic_read(APIC_TDCR);
194 apic_write(APIC_TDCR, (tmp_value
195 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
196 | APIC_TDR_DIV_16);
197
198 if (!oneshot)
199 apic_write(APIC_TMICT, clocks);
200}
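
The LVTT value built above is the timer vector in the low byte, plus a periodic-mode bit and a mask bit in the LVT entry. A standalone sketch of that composition; the bit positions follow the SDM LVT layout (mask = bit 16, timer mode = bit 17) and the 0xef vector is only an example:

#include <stdio.h>

#define EXAMPLE_VECTOR		0xefu		/* stand-in for LOCAL_TIMER_VECTOR */
#define LVT_MASKED		(1u << 16)	/* delivery masked */
#define LVT_TIMER_PERIODIC	(1u << 17)	/* periodic timer mode */

static unsigned int make_lvtt(int oneshot, int irqen)
{
	unsigned int lvtt_value = EXAMPLE_VECTOR;

	if (!oneshot)
		lvtt_value |= LVT_TIMER_PERIODIC;
	if (!irqen)
		lvtt_value |= LVT_MASKED;
	return lvtt_value;
}

int main(void)
{
	/* the calibration call: periodic mode, interrupt masked */
	printf("%#x\n", make_lvtt(0, 0));	/* 0x300ef */
	/* normal oneshot operation with the interrupt enabled */
	printf("%#x\n", make_lvtt(1, 1));	/* 0xef */
	return 0;
}
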
201
202/*
203 * Setup extended LVT, AMD specific (K8, family 10h)
204 *
205 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
206 * MCE interrupts are supported. Thus MCE offset must be set to 0.
207 */
208
209#define APIC_EILVT_LVTOFF_MCE 0
210#define APIC_EILVT_LVTOFF_IBS 1
211
212static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
213{
214 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
215 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
216
217 apic_write(reg, v);
218}
219
220u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
221{
222 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
223 return APIC_EILVT_LVTOFF_MCE;
224}
225
226u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
227{
228 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
229 return APIC_EILVT_LVTOFF_IBS;
230}
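
The two helpers above share the same register math: extended-LVT entry N sits 16 bytes after entry N-1, and the value packs mask, message type and vector into one word. A sketch with 0x500 as the base (the APIC500 offset named in the comment) and otherwise made-up field values:

#include <stdio.h>

#define EILVT0_OFFSET 0x500	/* APIC500, per the comment above */

int main(void)
{
	unsigned int lvt_off = 1;	/* the IBS entry */
	unsigned int vector = 0xf8, msg_type = 0, mask = 0;	/* hypothetical */
	unsigned long reg = (lvt_off << 4) + EILVT0_OFFSET;
	unsigned int v = (mask << 16) | (msg_type << 8) | vector;

	/* entry 1 lands 16 bytes past entry 0: reg == 0x510 */
	printf("reg = %#lx, value = %#x\n", reg, v);
	return 0;
}
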
231
232/*
233 * Program the next event, relative to now
234 */
235static int lapic_next_event(unsigned long delta,
236 struct clock_event_device *evt)
237{
238 apic_write(APIC_TMICT, delta);
239 return 0;
240}
241
242/*
243 * Setup the lapic timer in periodic or oneshot mode
244 */
245static void lapic_timer_setup(enum clock_event_mode mode,
246 struct clock_event_device *evt)
247{
248 unsigned long flags;
249 unsigned int v;
250
251 /* Lapic used as dummy for broadcast ? */
252 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
253 return;
254
255 local_irq_save(flags);
256
257 switch (mode) {
258 case CLOCK_EVT_MODE_PERIODIC:
259 case CLOCK_EVT_MODE_ONESHOT:
260 __setup_APIC_LVTT(calibration_result,
261 mode != CLOCK_EVT_MODE_PERIODIC, 1);
262 break;
263 case CLOCK_EVT_MODE_UNUSED:
264 case CLOCK_EVT_MODE_SHUTDOWN:
265 v = apic_read(APIC_LVTT);
266 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
267 apic_write(APIC_LVTT, v);
268 break;
269 case CLOCK_EVT_MODE_RESUME:
270 /* Nothing to do here */
271 break;
272 }
273
274 local_irq_restore(flags);
275}
276
277/*
278 * Local APIC timer broadcast function
279 */
280static void lapic_timer_broadcast(cpumask_t mask)
281{
282#ifdef CONFIG_SMP
283 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
284#endif
285}
286
287/*
288 * Set up the local APIC timer for this CPU. Copy the initialized values
289 * of the boot CPU and register the clock event in the framework.
290 */
291static void setup_APIC_timer(void)
292{
293 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
294
295 memcpy(levt, &lapic_clockevent, sizeof(*levt));
296 levt->cpumask = cpumask_of_cpu(smp_processor_id());
297
298 clockevents_register_device(levt);
299}
300
301/*
302 * In this function we calibrate APIC bus clocks to the external
303 * timer. Unfortunately we cannot use jiffies and the timer irq
304 * to calibrate, since some later bootup code depends on getting
305 * the first irq. Ugh.
306 *
307 * We want to do the calibration only once since we
308 * want to have local timer irqs synchronized. CPUs connected
309 * by the same APIC bus have the very same bus frequency.
310 * And we want to have irqs off anyway, no accidental
311 * APIC irq that way.
312 */
313
314#define TICK_COUNT 100000000
315
316static int __init calibrate_APIC_clock(void)
317{
318 unsigned apic, apic_start;
319 unsigned long tsc, tsc_start;
320 int result;
321
322 local_irq_disable();
323
324 /*
325 * Put whatever arbitrary (but long enough) timeout
326 * value into the APIC clock, we just want to get the
327 * counter running for calibration.
328 *
329 * No interrupt enable !
330 */
331 __setup_APIC_LVTT(250000000, 0, 0);
332
333 apic_start = apic_read(APIC_TMCCT);
334#ifdef CONFIG_X86_PM_TIMER
335 if (apic_calibrate_pmtmr && pmtmr_ioport) {
336 pmtimer_wait(5000); /* 5ms wait */
337 apic = apic_read(APIC_TMCCT);
338 result = (apic_start - apic) * 1000L / 5;
339 } else
340#endif
341 {
342 rdtscll(tsc_start);
343
344 do {
345 apic = apic_read(APIC_TMCCT);
346 rdtscll(tsc);
347 } while ((tsc - tsc_start) < TICK_COUNT &&
348 (apic_start - apic) < TICK_COUNT);
349
350 result = (apic_start - apic) * 1000L * tsc_khz /
351 (tsc - tsc_start);
352 }
353
354 local_irq_enable();
355
356 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
357
358 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
359 result / 1000 / 1000, result / 1000 % 1000);
360
361 /* Calculate the scaled math multiplication factor */
362 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
363 lapic_clockevent.shift);
364 lapic_clockevent.max_delta_ns =
365 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
366 lapic_clockevent.min_delta_ns =
367 clockevent_delta2ns(0xF, &lapic_clockevent);
368
369 calibration_result = result / HZ;
370
371 /*
372 * Do a sanity check on the APIC calibration result
373 */
374 if (calibration_result < (1000000 / HZ)) {
375 printk(KERN_WARNING
376 "APIC frequency too slow, disabling apic timer\n");
377 return -1;
378 }
379
380 return 0;
381}
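
Worked numbers for the TSC branch above, assuming a hypothetical 2 GHz TSC (tsc_khz = 2,000,000) that measured 12,500,000 APIC-timer ticks over 100,000,000 TSC cycles, with HZ = 1000. The formula scales the tick ratio by the TSC frequency in Hz:

#include <stdio.h>

int main(void)
{
	/* hypothetical measurements, not from real hardware */
	long long apic_delta = 12500000;	/* APIC timer ticks elapsed */
	long long tsc_delta = 100000000;	/* TSC cycles over the same span */
	long long tsc_khz = 2000000;		/* 2 GHz TSC */
	long long hz = 1000;			/* assumed jiffy rate */

	/* (ticks / cycles) * cycles-per-second = timer ticks per second */
	long long result = apic_delta * 1000 * tsc_khz / tsc_delta;

	/* 250000000 Hz, i.e. a 250 MHz timer; 250000 ticks per jiffy */
	printf("%lld Hz, %lld ticks/jiffy\n", result, result / hz);
	return 0;
}
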
382
383/*
384 * Setup the boot APIC
385 *
386 * Calibrate and verify the result.
387 */
388void __init setup_boot_APIC_clock(void)
389{
390 /*
391 * The local apic timer can be disabled via the kernel commandline.
392 * Register the lapic timer as a dummy clock event source on SMP
393 * systems, so the broadcast mechanism is used. On UP systems simply
394 * ignore it.
395 */
396 if (disable_apic_timer) {
397 printk(KERN_INFO "Disabling APIC timer\n");
398 /* No broadcast on UP ! */
399 if (num_possible_cpus() > 1) {
400 lapic_clockevent.mult = 1;
401 setup_APIC_timer();
402 }
403 return;
404 }
405
406 printk(KERN_INFO "Using local APIC timer interrupts.\n");
407 if (calibrate_APIC_clock()) {
408 /* No broadcast on UP ! */
409 if (num_possible_cpus() > 1)
410 setup_APIC_timer();
411 return;
412 }
413
414 /*
415 * If nmi_watchdog is set to IO_APIC, we need the
416 * PIT/HPET going. Otherwise register lapic as a dummy
417 * device.
418 */
419 if (nmi_watchdog != NMI_IO_APIC)
420 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
421 else
422 printk(KERN_WARNING "APIC timer registered as dummy,"
423 " due to nmi_watchdog=%d!\n", nmi_watchdog);
424
425 setup_APIC_timer();
426}
427
428void __cpuinit setup_secondary_APIC_clock(void)
429{
430 setup_APIC_timer();
431}
432
433/*
434 * The guts of the apic timer interrupt
435 */
436static void local_apic_timer_interrupt(void)
437{
438 int cpu = smp_processor_id();
439 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
440
441 /*
442 * Normally we should not be here until the LAPIC has been initialized,
443 * but in some cases, like kdump, it's possible that a pending LAPIC
444 * timer interrupt from the previous kernel's context is delivered in
445 * the new kernel the moment interrupts are enabled.
446 *
447 * Interrupts are enabled early and the LAPIC is set up much later,
448 * hence it's possible that evt->event_handler is NULL when we get here.
449 * Check for event_handler being NULL and discard the interrupt as
450 * spurious.
451 */
452 if (!evt->event_handler) {
453 printk(KERN_WARNING
454 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
455 /* Switch it off */
456 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
457 return;
458 }
459
460 /*
461 * the NMI deadlock-detector uses this.
462 */
463 add_pda(apic_timer_irqs, 1);
464
465 evt->event_handler(evt);
466}
467
468/*
469 * Local APIC timer interrupt. This is the most natural way for doing
470 * local interrupts, but local timer interrupts can be emulated by
471 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
472 *
473 * [ if a single-CPU system runs an SMP kernel then we call the local
474 * interrupt as well. Thus we cannot inline the local irq ... ]
475 */
476void smp_apic_timer_interrupt(struct pt_regs *regs)
477{
478 struct pt_regs *old_regs = set_irq_regs(regs);
479
480 /*
481 * NOTE! We'd better ACK the irq immediately,
482 * because timer handling can be slow.
483 */
484 ack_APIC_irq();
485 /*
486 * update_process_times() expects us to have done irq_enter().
487 * Besides, if we don't, timer interrupts ignore the global
488 * interrupt lock, which is the WrongThing (tm) to do.
489 */
490 exit_idle();
491 irq_enter();
492 local_apic_timer_interrupt();
493 irq_exit();
494 set_irq_regs(old_regs);
495}
496
497int setup_profiling_timer(unsigned int multiplier)
498{
499 return -EINVAL;
500}
501
502
503/*
504 * Local APIC start and shutdown
505 */
506
507/**
508 * clear_local_APIC - shutdown the local APIC
509 *
510 * This is called, when a CPU is disabled and before rebooting, so the state of
511 * the local APIC has no dangling leftovers. Also used to clean out any BIOS
512 * leftovers during boot.
513 */
514void clear_local_APIC(void)
515{
516 int maxlvt;
517 u32 v;
518
519 /* APIC hasn't been mapped yet */
520 if (!apic_phys)
521 return;
522
523 maxlvt = lapic_get_maxlvt();
524 /*
525 * Masking an LVT entry can trigger a local APIC error
526 * if the vector is zero. Mask LVTERR first to prevent this.
527 */
528 if (maxlvt >= 3) {
529 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
530 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
531 }
532 /*
533 * Careful: we have to set masks only first to deassert
534 * any level-triggered sources.
535 */
536 v = apic_read(APIC_LVTT);
537 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
538 v = apic_read(APIC_LVT0);
539 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
540 v = apic_read(APIC_LVT1);
541 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
542 if (maxlvt >= 4) {
543 v = apic_read(APIC_LVTPC);
544 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
545 }
546
547 /*
548 * Clean APIC state for other OSs:
549 */
550 apic_write(APIC_LVTT, APIC_LVT_MASKED);
551 apic_write(APIC_LVT0, APIC_LVT_MASKED);
552 apic_write(APIC_LVT1, APIC_LVT_MASKED);
553 if (maxlvt >= 3)
554 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
555 if (maxlvt >= 4)
556 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
557 apic_write(APIC_ESR, 0);
558 apic_read(APIC_ESR);
559}
560
561/**
562 * disable_local_APIC - clear and disable the local APIC
563 */
564void disable_local_APIC(void)
565{
566 unsigned int value;
567
568 clear_local_APIC();
569
570 /*
571 * Disable APIC (implies clearing of registers
572 * for 82489DX!).
573 */
574 value = apic_read(APIC_SPIV);
575 value &= ~APIC_SPIV_APIC_ENABLED;
576 apic_write(APIC_SPIV, value);
577}
578
579void lapic_shutdown(void)
580{
581 unsigned long flags;
582
583 if (!cpu_has_apic)
584 return;
585
586 local_irq_save(flags);
587
588 disable_local_APIC();
589
590 local_irq_restore(flags);
591}
592
593/*
594 * This is to verify that we're looking at a real local APIC.
595 * Check these against your board if the CPUs fail to start
596 * for no apparent reason.
597 */
598int __init verify_local_APIC(void)
599{
600 unsigned int reg0, reg1;
601
602 /*
603 * The version register is read-only in a real APIC.
604 */
605 reg0 = apic_read(APIC_LVR);
606 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
607 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
608 reg1 = apic_read(APIC_LVR);
609 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
610
611 /*
612 * The two version reads above should print the same
613 * numbers. If the second one is different, then we
614 * poke at a non-APIC.
615 */
616 if (reg1 != reg0)
617 return 0;
618
619 /*
620 * Check if the version looks reasonable.
621 */
622 reg1 = GET_APIC_VERSION(reg0);
623 if (reg1 == 0x00 || reg1 == 0xff)
624 return 0;
625 reg1 = lapic_get_maxlvt();
626 if (reg1 < 0x02 || reg1 == 0xff)
627 return 0;
628
629 /*
630 * The ID register is read/write in a real APIC.
631 */
632 reg0 = read_apic_id();
633 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
634 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
635 reg1 = read_apic_id();
636 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
637 apic_write(APIC_ID, reg0);
638 if (reg1 != (reg0 ^ APIC_ID_MASK))
639 return 0;
640
641 /*
642 * The next two are just to see if we have sane values.
643 * They're only really relevant if we're in Virtual Wire
644 * compatibility mode, but most boxes aren't anymore.
645 */
646 reg0 = apic_read(APIC_LVT0);
647 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
648 reg1 = apic_read(APIC_LVT1);
649 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
650
651 return 1;
652}
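
The ID test above is a generic probe-by-XOR: write back the register's value with its writable bits flipped, re-read, then restore; a real read/write register reflects the flip, a read-only or absent one does not. A self-contained sketch of the pattern over a simulated register (all values illustrative):

#include <stdio.h>

static unsigned int fake_reg;	/* simulated APIC register */

static unsigned int reg_read(void) { return fake_reg; }
static void reg_write(unsigned int v) { fake_reg = v; }

/* returns 1 if the bits under 'mask' really are read/write */
static int probe_rw(unsigned int mask)
{
	unsigned int reg0 = reg_read(), reg1;

	reg_write(reg0 ^ mask);	/* flip the writable bits */
	reg1 = reg_read();
	reg_write(reg0);	/* restore the original value */
	return reg1 == (reg0 ^ mask);
}

int main(void)
{
	fake_reg = 0x05000000;	/* hypothetical ID field in bits 31:24 */
	printf("read/write: %d\n", probe_rw(0xff000000));
	return 0;
}
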
653
654/**
655 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
656 */
657void __init sync_Arb_IDs(void)
658{
659 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
660 if (modern_apic())
661 return;
662
663 /*
664 * Wait for idle.
665 */
666 apic_wait_icr_idle();
667
668 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
669 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
670 | APIC_DM_INIT);
671}
672
673/*
674 * An initial setup of the virtual wire mode.
675 */
676void __init init_bsp_APIC(void)
677{
678 unsigned int value;
679
680 /*
681 * Don't do the setup now if we have an SMP BIOS, as the
682 * through-I/O-APIC virtual wire mode might be active.
683 */
684 if (smp_found_config || !cpu_has_apic)
685 return;
686
687 value = apic_read(APIC_LVR);
688
689 /*
690 * Do not trust the local APIC being empty at bootup.
691 */
692 clear_local_APIC();
693
694 /*
695 * Enable APIC.
696 */
697 value = apic_read(APIC_SPIV);
698 value &= ~APIC_VECTOR_MASK;
699 value |= APIC_SPIV_APIC_ENABLED;
700 value |= APIC_SPIV_FOCUS_DISABLED;
701 value |= SPURIOUS_APIC_VECTOR;
702 apic_write(APIC_SPIV, value);
703
704 /*
705 * Set up the virtual wire mode.
706 */
707 apic_write(APIC_LVT0, APIC_DM_EXTINT);
708 value = APIC_DM_NMI;
709 apic_write(APIC_LVT1, value);
710}
711
712/**
713 * setup_local_APIC - setup the local APIC
714 */
715void __cpuinit setup_local_APIC(void)
716{
717 unsigned int value;
718 int i, j;
719
720 preempt_disable();
721 value = apic_read(APIC_LVR);
722
723 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
724
725 /*
726 * Double-check whether this APIC is really registered.
727 * This is meaningless in clustered apic mode, so we skip it.
728 */
729 if (!apic_id_registered())
730 BUG();
731
732 /*
733 * Intel recommends setting DFR, LDR and TPR before enabling
734 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
735 * document number 292116). So here it goes...
736 */
737 init_apic_ldr();
738
739 /*
740 * Set Task Priority to 'accept all'. We never change this
741 * later on.
742 */
743 value = apic_read(APIC_TASKPRI);
744 value &= ~APIC_TPRI_MASK;
745 apic_write(APIC_TASKPRI, value);
746
747 /*
748 * After a crash, we no longer service the interrupts and a pending
749 * interrupt from previous kernel might still have ISR bit set.
750 *
751 * Most probably by now the CPU has serviced that pending interrupt,
752 * but it might not have done the ack_APIC_irq() because it thought
753 * the interrupt came from the i8259 as ExtINT. The LAPIC did not get
754 * an EOI, so it does not clear the ISR bit and the CPU thinks it has
755 * already serviced the interrupt. Hence a vector might get locked. It was noticed
756 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
757 */
758 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
759 value = apic_read(APIC_ISR + i*0x10);
760 for (j = 31; j >= 0; j--) {
761 if (value & (1<<j))
762 ack_APIC_irq();
763 }
764 }
765
766 /*
767 * Now that we are all set up, enable the APIC
768 */
769 value = apic_read(APIC_SPIV);
770 value &= ~APIC_VECTOR_MASK;
771 /*
772 * Enable APIC
773 */
774 value |= APIC_SPIV_APIC_ENABLED;
775
776 /* We always use processor focus */
777
778 /*
779 * Set spurious IRQ vector
780 */
781 value |= SPURIOUS_APIC_VECTOR;
782 apic_write(APIC_SPIV, value);
783
784 /*
785 * Set up LVT0, LVT1:
786 *
787 * set up through-local-APIC on the BP's LINT0. This is not
788 * strictly necessary in pure symmetric-IO mode, but sometimes
789 * we delegate interrupts to the 8259A.
790 */
791 /*
792 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
793 */
794 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
795 if (!smp_processor_id() && !value) {
796 value = APIC_DM_EXTINT;
797 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
798 smp_processor_id());
799 } else {
800 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
801 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
802 smp_processor_id());
803 }
804 apic_write(APIC_LVT0, value);
805
806 /*
807 * only the BP should see the LINT1 NMI signal, obviously.
808 */
809 if (!smp_processor_id())
810 value = APIC_DM_NMI;
811 else
812 value = APIC_DM_NMI | APIC_LVT_MASKED;
813 apic_write(APIC_LVT1, value);
814 preempt_enable();
815}
816
817static void __cpuinit lapic_setup_esr(void)
818{
819 unsigned maxlvt = lapic_get_maxlvt();
820
821 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
822 /*
823 * spec says clear errors after enabling vector.
824 */
825 if (maxlvt > 3)
826 apic_write(APIC_ESR, 0);
827}
828
829void __cpuinit end_local_APIC_setup(void)
830{
831 lapic_setup_esr();
832 setup_apic_nmi_watchdog(NULL);
833 apic_pm_activate();
834}
835
836/*
837 * Detect and enable local APICs on non-SMP boards.
838 * Original code written by Keir Fraser.
839 * On AMD64 we trust the BIOS - if it says no APIC it is likely
840 * not correctly set up (usually the APIC timer won't work etc.)
841 */
842static int __init detect_init_APIC(void)
843{
844 if (!cpu_has_apic) {
845 printk(KERN_INFO "No local APIC present\n");
846 return -1;
847 }
848
849 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
850 boot_cpu_physical_apicid = 0;
851 return 0;
852}
853
854void __init early_init_lapic_mapping(void)
855{
856 unsigned long phys_addr;
857
858 /*
859 * If no local APIC can be found then bail out:
860 * it means there is no MP table and no MADT
861 */
862 if (!smp_found_config)
863 return;
864
865 phys_addr = mp_lapic_addr;
866
867 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
868 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
869 APIC_BASE, phys_addr);
870
871 /*
872 * Fetch the APIC ID of the BSP in case we have a
873 * default configuration (or the MP table is broken).
874 */
875 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
876}
877
878/**
879 * init_apic_mappings - initialize APIC mappings
880 */
881void __init init_apic_mappings(void)
882{
883 /*
884 * If no local APIC can be found then set up a fake all
885 * zeroes page to simulate the local APIC and another
886 * one for the IO-APIC.
887 */
888 if (!smp_found_config && detect_init_APIC()) {
889 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
890 apic_phys = __pa(apic_phys);
891 } else
892 apic_phys = mp_lapic_addr;
893
894 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
895 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
896 APIC_BASE, apic_phys);
897
898 /*
899 * Fetch the APIC ID of the BSP in case we have a
900 * default configuration (or the MP table is broken).
901 */
902 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
903}
904
905/*
906 * This initializes the IO-APIC and APIC hardware if this is
907 * a UP kernel.
908 */
909int __init APIC_init_uniprocessor(void)
910{
911 if (disable_apic) {
912 printk(KERN_INFO "Apic disabled\n");
913 return -1;
914 }
915 if (!cpu_has_apic) {
916 disable_apic = 1;
917 printk(KERN_INFO "Apic disabled by BIOS\n");
918 return -1;
919 }
920
921 verify_local_APIC();
922
923 connect_bsp_APIC();
924
925 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
926 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
927
928 setup_local_APIC();
929
930 /*
931 * Now enable IO-APICs, actually call clear_IO_APIC
932 * We need clear_IO_APIC before enabling vector on BP
933 */
934 if (!skip_ioapic_setup && nr_ioapics)
935 enable_IO_APIC();
936
937 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
938 localise_nmi_watchdog();
939 end_local_APIC_setup();
940
941 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
942 setup_IO_APIC();
943 else
944 nr_ioapics = 0;
945 setup_boot_APIC_clock();
946 check_nmi_watchdog();
947 return 0;
948}
949
950/*
951 * Local APIC interrupts
952 */
953
954/*
955 * This interrupt should _never_ happen with our APIC/SMP architecture
956 */
957asmlinkage void smp_spurious_interrupt(void)
958{
959 unsigned int v;
960 exit_idle();
961 irq_enter();
962 /*
963 * Check if this really is a spurious interrupt and ACK it
964 * if it is a vectored one. Just in case...
965 * Spurious interrupts should not be ACKed.
966 */
967 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
968 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
969 ack_APIC_irq();
970
971 add_pda(irq_spurious_count, 1);
972 irq_exit();
973}
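
The ISR read above hides some address math: the in-service bits for 256 vectors span eight 32-bit registers placed 16 bytes apart, so vector v lives in bank v >> 5 at bit v & 0x1f. The expression (v & ~0x1f) >> 1 is (v >> 5) * 0x10 written compactly. A small check of that identity (vector 0xff is used only as an example):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned int v;

	/* the compact form equals bank-index * 0x10 for every vector */
	for (v = 0; v < 256; v++)
		assert(((v & ~0x1fu) >> 1) == (v >> 5) * 0x10);

	/* e.g. vector 0xff: bank 7 at byte offset 0x70, bit 31 */
	v = 0xff;
	printf("offset = %#x, bit = %u\n", (v & ~0x1fu) >> 1, v & 0x1f);
	return 0;
}
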
974
975/*
976 * This interrupt should never happen with our APIC/SMP architecture
977 */
978asmlinkage void smp_error_interrupt(void)
979{
980 unsigned int v, v1;
981
982 exit_idle();
983 irq_enter();
984 /* First tickle the hardware, only then report what went on. -- REW */
985 v = apic_read(APIC_ESR);
986 apic_write(APIC_ESR, 0);
987 v1 = apic_read(APIC_ESR);
988 ack_APIC_irq();
989 atomic_inc(&irq_err_count);
990
991 /* Here is what the APIC error bits mean:
992 0: Send CS error
993 1: Receive CS error
994 2: Send accept error
995 3: Receive accept error
996 4: Reserved
997 5: Send illegal vector
998 6: Received illegal vector
999 7: Illegal register address
1000 */
1001 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1002 smp_processor_id(), v , v1);
1003 irq_exit();
1004}
1005
1006/**
1007 * connect_bsp_APIC - attach the APIC to the interrupt system
1008 */
1009void __init connect_bsp_APIC(void)
1010{
1011 enable_apic_mode();
1012}
1013
1014void disconnect_bsp_APIC(int virt_wire_setup)
1015{
1016 /* Go back to Virtual Wire compatibility mode */
1017 unsigned long value;
1018
1019 /* For the spurious interrupt use vector F, and enable it */
1020 value = apic_read(APIC_SPIV);
1021 value &= ~APIC_VECTOR_MASK;
1022 value |= APIC_SPIV_APIC_ENABLED;
1023 value |= 0xf;
1024 apic_write(APIC_SPIV, value);
1025
1026 if (!virt_wire_setup) {
1027 /*
1028 * For LVT0 make it edge triggered, active high,
1029 * external and enabled
1030 */
1031 value = apic_read(APIC_LVT0);
1032 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1033 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1034 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1035 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1036 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1037 apic_write(APIC_LVT0, value);
1038 } else {
1039 /* Disable LVT0 */
1040 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1041 }
1042
1043 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1044 value = apic_read(APIC_LVT1);
1045 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1046 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1047 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1048 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1049 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1050 apic_write(APIC_LVT1, value);
1051}
1052
1053void __cpuinit generic_processor_info(int apicid, int version)
1054{
1055 int cpu;
1056 cpumask_t tmp_map;
1057
1058 if (num_processors >= NR_CPUS) {
1059 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1060 " Processor ignored.\n", NR_CPUS);
1061 return;
1062 }
1063
1064 num_processors++;
1065 cpus_complement(tmp_map, cpu_present_map);
1066 cpu = first_cpu(tmp_map);
1067
1068 physid_set(apicid, phys_cpu_present_map);
1069 if (apicid == boot_cpu_physical_apicid) {
1070 /*
1071 * x86_bios_cpu_apicid is required to have processors listed
1072 * in same order as logical cpu numbers. Hence the first
1073 * entry is BSP, and so on.
1074 */
1075 cpu = 0;
1076 }
1077 if (apicid > max_physical_apicid)
1078 max_physical_apicid = apicid;
1079
1080 /* are we being called early in kernel startup? */
1081 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1082 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1083 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1084
1085 cpu_to_apicid[cpu] = apicid;
1086 bios_cpu_apicid[cpu] = apicid;
1087 } else {
1088 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1089 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1090 }
1091
1092 cpu_set(cpu, cpu_possible_map);
1093 cpu_set(cpu, cpu_present_map);
1094}
1095
1096/*
1097 * Power management
1098 */
1099#ifdef CONFIG_PM
1100
1101static struct {
1102 /* 'active' is true if the local APIC was enabled by us and
1103 not the BIOS; this signifies that we are also responsible
1104 for disabling it before entering apm/acpi suspend */
1105 int active;
1106 /* r/w apic fields */
1107 unsigned int apic_id;
1108 unsigned int apic_taskpri;
1109 unsigned int apic_ldr;
1110 unsigned int apic_dfr;
1111 unsigned int apic_spiv;
1112 unsigned int apic_lvtt;
1113 unsigned int apic_lvtpc;
1114 unsigned int apic_lvt0;
1115 unsigned int apic_lvt1;
1116 unsigned int apic_lvterr;
1117 unsigned int apic_tmict;
1118 unsigned int apic_tdcr;
1119 unsigned int apic_thmr;
1120} apic_pm_state;
1121
1122static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1123{
1124 unsigned long flags;
1125 int maxlvt;
1126
1127 if (!apic_pm_state.active)
1128 return 0;
1129
1130 maxlvt = lapic_get_maxlvt();
1131
1132 apic_pm_state.apic_id = read_apic_id();
1133 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1134 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1135 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1136 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1137 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1138 if (maxlvt >= 4)
1139 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1140 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1141 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1142 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1143 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1144 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1145#ifdef CONFIG_X86_MCE_INTEL
1146 if (maxlvt >= 5)
1147 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1148#endif
1149 local_irq_save(flags);
1150 disable_local_APIC();
1151 local_irq_restore(flags);
1152 return 0;
1153}
1154
1155static int lapic_resume(struct sys_device *dev)
1156{
1157 unsigned int l, h;
1158 unsigned long flags;
1159 int maxlvt;
1160
1161 if (!apic_pm_state.active)
1162 return 0;
1163
1164 maxlvt = lapic_get_maxlvt();
1165
1166 local_irq_save(flags);
1167 rdmsr(MSR_IA32_APICBASE, l, h);
1168 l &= ~MSR_IA32_APICBASE_BASE;
1169 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1170 wrmsr(MSR_IA32_APICBASE, l, h);
1171 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1172 apic_write(APIC_ID, apic_pm_state.apic_id);
1173 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1174 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1175 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1176 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1177 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1178 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1179#ifdef CONFIG_X86_MCE_INTEL
1180 if (maxlvt >= 5)
1181 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1182#endif
1183 if (maxlvt >= 4)
1184 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1185 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1186 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1187 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1188 apic_write(APIC_ESR, 0);
1189 apic_read(APIC_ESR);
1190 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1191 apic_write(APIC_ESR, 0);
1192 apic_read(APIC_ESR);
1193 local_irq_restore(flags);
1194 return 0;
1195}
1196
1197static struct sysdev_class lapic_sysclass = {
1198 .name = "lapic",
1199 .resume = lapic_resume,
1200 .suspend = lapic_suspend,
1201};
1202
1203static struct sys_device device_lapic = {
1204 .id = 0,
1205 .cls = &lapic_sysclass,
1206};
1207
1208static void __cpuinit apic_pm_activate(void)
1209{
1210 apic_pm_state.active = 1;
1211}
1212
1213static int __init init_lapic_sysfs(void)
1214{
1215 int error;
1216
1217 if (!cpu_has_apic)
1218 return 0;
1219 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1220
1221 error = sysdev_class_register(&lapic_sysclass);
1222 if (!error)
1223 error = sysdev_register(&device_lapic);
1224 return error;
1225}
1226device_initcall(init_lapic_sysfs);
1227
1228#else /* CONFIG_PM */
1229
1230static void apic_pm_activate(void) { }
1231
1232#endif /* CONFIG_PM */
1233
1234/*
1235 * apic_is_clustered_box() -- Check if we can expect good TSC
1236 *
1237 * Thus far, the major user of this is IBM's Summit2 series:
1238 *
1239 * Clustered boxes may have unsynced TSC problems if they are
1240 * multi-chassis. Use available data to take a good guess.
1241 * If in doubt, go HPET.
1242 */
1243__cpuinit int apic_is_clustered_box(void)
1244{
1245 int i, clusters, zeros;
1246 unsigned id;
1247 u16 *bios_cpu_apicid;
1248 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1249
1250 /*
1251 * There is no such box with an AMD CPU yet.
1252 * Some AMD boxes with quad-core CPUs and 8 sockets have APIC IDs
1253 * of [4, 0x23] or [8, 0x27] and could be mistaken for vSMP boxes;
1254 * this still needs checking...
1255 */
1256 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1257 return 0;
1258
1259 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1260 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1261
1262 for (i = 0; i < NR_CPUS; i++) {
1263 /* are we being called early in kernel startup? */
1264 if (bios_cpu_apicid) {
1265 id = bios_cpu_apicid[i];
1266 }
1267 else if (i < nr_cpu_ids) {
1268 if (cpu_present(i))
1269 id = per_cpu(x86_bios_cpu_apicid, i);
1270 else
1271 continue;
1272 }
1273 else
1274 break;
1275
1276 if (id != BAD_APICID)
1277 __set_bit(APIC_CLUSTERID(id), clustermap);
1278 }
1279
1280 /* Problem: Partially populated chassis may not have CPUs in some of
1281 * the APIC clusters they have been allocated. Only present CPUs have
1282 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1283 * Since clusters are allocated sequentially, count zeros only if
1284 * they are bounded by ones.
1285 */
1286 clusters = 0;
1287 zeros = 0;
1288 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1289 if (test_bit(i, clustermap)) {
1290 clusters += 1 + zeros;
1291 zeros = 0;
1292 } else
1293 ++zeros;
1294 }
1295
1296 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
1297 * not guaranteed to be synced between boards
1298 */
1299 if (is_vsmp_box() && clusters > 1)
1300 return 1;
1301
1302 /*
1303 * If clusters > 2, then the box should be multi-chassis.
1304 * May have to revisit this when multi-core + hyperthreaded CPUs come
1305 * out, but AFAIK this will work even for them.
1306 */
1307 return (clusters > 2);
1308}
1309
1310/*
1311 * APIC command line parameters
1312 */
1313static int __init apic_set_verbosity(char *str)
1314{
1315 if (str == NULL) {
1316 skip_ioapic_setup = 0;
1317 ioapic_force = 1;
1318 return 0;
1319 }
1320 if (strcmp("debug", str) == 0)
1321 apic_verbosity = APIC_DEBUG;
1322 else if (strcmp("verbose", str) == 0)
1323 apic_verbosity = APIC_VERBOSE;
1324 else {
1325 printk(KERN_WARNING "APIC verbosity level %s not recognised;"
1326 " use apic=verbose or apic=debug\n", str);
1327 return -EINVAL;
1328 }
1329
1330 return 0;
1331}
1332early_param("apic", apic_set_verbosity);
1333
1334static __init int setup_disableapic(char *str)
1335{
1336 disable_apic = 1;
1337 setup_clear_cpu_cap(X86_FEATURE_APIC);
1338 return 0;
1339}
1340early_param("disableapic", setup_disableapic);
1341
1342/* same as disableapic, for compatibility */
1343static __init int setup_nolapic(char *str)
1344{
1345 return setup_disableapic(str);
1346}
1347early_param("nolapic", setup_nolapic);
1348
1349static int __init parse_lapic_timer_c2_ok(char *arg)
1350{
1351 local_apic_timer_c2_ok = 1;
1352 return 0;
1353}
1354early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1355
1356static __init int setup_noapictimer(char *str)
1357{
1358 if (str[0] != ' ' && str[0] != 0)
1359 return 0;
1360 disable_apic_timer = 1;
1361 return 1;
1362}
1363__setup("noapictimer", setup_noapictimer);
1364
1365static __init int setup_apicpmtimer(char *s)
1366{
1367 apic_calibrate_pmtmr = 1;
1368 notsc_setup(NULL);
1369 return 0;
1370}
1371__setup("apicpmtimer", setup_apicpmtimer);
1372
1373static int __init lapic_insert_resource(void)
1374{
1375 if (!apic_phys)
1376 return -1;
1377
1378 /* Put local APIC into the resource map. */
1379 lapic_resource.start = apic_phys;
1380 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1381 insert_resource(&iomem_resource, &lapic_resource);
1382
1383 return 0;
1384}
1385
1386/*
1387 * We need to call this after e820_reserve_resources(),
1388 * which uses request_resource().
1389 */
1390late_initcall(lapic_insert_resource);
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 9ee24e6bc4b..5145a6e72bb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -228,12 +228,12 @@
228#include <linux/suspend.h> 228#include <linux/suspend.h>
229#include <linux/kthread.h> 229#include <linux/kthread.h>
230#include <linux/jiffies.h> 230#include <linux/jiffies.h>
231#include <linux/smp_lock.h>
232 231
233#include <asm/system.h> 232#include <asm/system.h>
234#include <asm/uaccess.h> 233#include <asm/uaccess.h>
235#include <asm/desc.h> 234#include <asm/desc.h>
236#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
237#include <asm/paravirt.h> 237#include <asm/paravirt.h>
238#include <asm/reboot.h> 238#include <asm/reboot.h>
239 239
@@ -2217,7 +2217,7 @@ static int __init apm_init(void)
2217 2217
2218 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2219 2219
2220 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2221 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2222 return -ENODEV; 2222 return -ENODEV;
2223 } 2223 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index aa89387006f..7fcf63d22f8 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -22,7 +22,7 @@
22 22
23#define __NO_STUBS 1 23#define __NO_STUBS 1
24#undef __SYSCALL 24#undef __SYSCALL
25#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_UNISTD_64_H
26#define __SYSCALL(nr, sym) [nr] = 1, 26#define __SYSCALL(nr, sym) [nr] = 1,
27static char syscalls[] = { 27static char syscalls[] = {
28#include <asm/unistd.h> 28#include <asm/unistd.h>
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index c639bd55391..f0dfe6f17e7 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -1,8 +1,6 @@
1/* 1/*
2 * BIOS run time interface routines. 2 * BIOS run time interface routines.
3 * 3 *
4 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or 6 * the Free Software Foundation; either version 2 of the License, or
@@ -16,33 +14,128 @@
16 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
19 */ 20 */
20 21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
21#include <asm/uv/bios.h> 25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28struct uv_systab uv_systab;
22 29
23const char * 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
24x86_bios_strerror(long status)
25{ 31{
26 const char *str; 32 struct uv_systab *tab = &uv_systab;
27 switch (status) { 33
28 case 0: str = "Call completed without error"; break; 34 if (!tab->function)
29 case -1: str = "Not implemented"; break; 35 /*
30 case -2: str = "Invalid argument"; break; 36 * BIOS does not support UV systab
31 case -3: str = "Call completed with error"; break; 37 */
32 default: str = "Unknown BIOS status code"; break; 38 return BIOS_STATUS_UNIMPLEMENTED;
33 } 39
34 return str; 40 return efi_call6((void *)__va(tab->function),
41 (u64)which, a1, a2, a3, a4, a5);
35} 42}
36 43
37long 44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
38x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, 45 u64 a4, u64 a5)
39 unsigned long *drift_info)
40{ 46{
41 struct uv_bios_retval isrv; 47 unsigned long bios_flags;
48 s64 ret;
42 49
43 BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); 50 local_irq_save(bios_flags);
44 *ticks_per_second = isrv.v0; 51 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
45 *drift_info = isrv.v1; 52 local_irq_restore(bios_flags);
46 return isrv.status; 53
54 return ret;
47} 55}
48EXPORT_SYMBOL_GPL(x86_bios_freq_base); 56
57s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
58 u64 a4, u64 a5)
59{
60 s64 ret;
61
62 preempt_disable();
63 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
64 preempt_enable();
65
66 return ret;
67}
68
69
70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id);
74long uv_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size);
76int uv_type;
77
78
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region)
81{
82 s64 ret;
83 u64 v0, v1;
84 union partition_info_u part;
85
86 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
87 (u64)(&v0), (u64)(&v1), 0, 0);
88 if (ret != BIOS_STATUS_SUCCESS)
89 return ret;
90
91 part.val = v0;
92 if (uvtype)
93 *uvtype = part.hub_version;
94 if (partid)
95 *partid = part.partition_id;
96 if (coher)
97 *coher = part.coherence_id;
98 if (region)
99 *region = part.region_size;
100 return ret;
101}
102
103
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{
106 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
107 (u64)ticks_per_second, 0, 0, 0);
108}
109EXPORT_SYMBOL_GPL(uv_bios_freq_base);
110
111
112#ifdef CONFIG_EFI
113void uv_bios_init(void)
114{
115 struct uv_systab *tab;
116
117 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
118 (efi.uv_systab == (unsigned long)NULL)) {
119 printk(KERN_CRIT "No EFI UV System Table.\n");
120 uv_systab.function = (unsigned long)NULL;
121 return;
122 }
123
124 tab = (struct uv_systab *)ioremap(efi.uv_systab,
125 sizeof(struct uv_systab));
126 if (strncmp(tab->signature, "UVST", 4) != 0)
127 printk(KERN_ERR "bad signature in UV system table!");
128
129 /*
130 * Copy table to permanent spot for later use.
131 */
132 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
133 iounmap(tab);
134
135 printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
136}
137#else /* !CONFIG_EFI */
138
139void uv_bios_init(void) { }
140#endif
141
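
The rewritten bios_uv.c layers three entry points over one EFI call: uv_bios_call() does the raw efi_call6(), uv_bios_call_irqsave() wraps it in local_irq_save(), and uv_bios_call_reentrant() wraps it in preempt_disable(). As a hedged sketch of how a client might use the exported frequency helper -- the clock-type constant is assumed to come from <asm/uv/bios.h> and is not visible in this hunk:

/* Illustrative caller of uv_bios_freq_base() above. The constant
 * BIOS_FREQ_BASE_REALTIME_CLOCK is an assumption from <asm/uv/bios.h>,
 * not shown in this patch. */
static void example_report_rtc_freq(void)
{
	u64 ticks_per_sec;
	s64 status;

	status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
				   &ticks_per_sec);
	if (status == BIOS_STATUS_SUCCESS)
		printk(KERN_INFO "uv: RTC at %llu ticks/s\n",
		       (unsigned long long)ticks_per_sec);
}
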
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 00000000000..667df55a439
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad300..82ec6075c05 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += amd.o 10
11obj-$(CONFIG_X86_64) += amd_64.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_64) += centaur_64.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_X86_64) += intel_64.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
18obj-$(CONFIG_X86_32) += umc.o
19 18
20obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
21obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
22obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
23 22
24obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../include/asm/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
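
The new make rule regenerates capflags.c from cpufeature.h with mkcapflags.pl, re-running only when either input changes (if_changed); this replaces the hand-maintained feature_names.o dropped from obj-y above. The generated file plausibly has the shape sketched below -- an assumption about the script's output, since mkcapflags.pl itself is not part of this hunk:

/* Assumed shape of the generated capflags.c; the real contents are
 * produced by mkcapflags.pl and are not shown in this patch. */
const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU] = "fpu",
	[X86_FEATURE_VME] = "vme",
	/* ... one string per X86_FEATURE_* bit in cpufeature.h ... */
};
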
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index a6ef672adbb..0d9c993aa93 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
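
detect_extended_topology() above walks CPUID leaf 0xb: each sub-leaf reports its level type in ECX[15:8], the APIC-ID shift to reach the next level in EAX[4:0], and the logical-CPU count at that level in EBX[15:0]. A worked sketch of the decode, with register values invented for a 2-thread-per-core part:

/* Worked example of the leaf 0xb macros above; the register values
 * are invented for illustration, not read from real hardware. */
static void example_decode_leafb(void)
{
	u32 eax = 0x1;			/* shift APIC ID right by 1 */
	u32 ebx = 0x2;			/* 2 logical CPUs at this level */
	u32 ecx = SMT_TYPE << 8;	/* sub-leaf describes the SMT level */

	if (LEAFB_SUBTYPE(ecx) == SMT_TYPE)
		printk(KERN_DEBUG "threads/core %u, next shift %u\n",
		       LEVEL_MAX_SIBLINGS(ebx),		/* -> 2 */
		       BITS_SHIFT_NEXT_LEVEL(eax));	/* -> 1 */
}
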
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 18514ed2610..8f1e31db2ad 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,26 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28{ 37{
29 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
30 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
31 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
32 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
33 } 50 }
34
35 /* Set MTRR capability flag if appropriate */
36 if (c->x86_model == 13 || c->x86_model == 9 ||
37 (c->x86_model == 8 && c->x86_mask >= 8))
38 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
39} 51}
40 52
41static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
42{ 55{
43 u32 l, h; 56 u32 l, h;
44 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
45 int r;
46 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15, needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPU's.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
47#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
48 unsigned long long value; 304 unsigned long long value;
49 305
@@ -54,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
54 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
55 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
56 */ 312 */
57 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
58 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
59 value |= 1 << 6; 315 value |= 1 << 6;
60 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -64,209 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
64 early_init_amd(c); 320 early_init_amd(c);
65 321
66 /* 322 /*
67 * FIXME: We should handle the K5 here. Set up the write
68 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
69 * no bus pipeline)
70 */
71
72 /*
73 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
74 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
75 */ 325 */
76 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
77 327
78 r = get_model_name(c); 328#ifdef CONFIG_X86_64
329 /* On C+ stepping K8 rep microcode works well for copy/memset */
330 if (c->x86 == 0xf) {
331 u32 level;
79 332
80 switch (c->x86) { 333 level = cpuid_eax(1);
81 case 4: 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
82 /* 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
83 * General Systems BIOSen alias the cpu frequency registers
84 * of the Elan at 0x000df000. Unfortunately, one of the Linux
85 * drivers subsequently pokes it, and changes the CPU speed.
86 * Workaround : Remove the unneeded alias.
87 */
88#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
89#define CBAR_ENB (0x80000000)
90#define CBAR_KEY (0X000000CB)
91 if (c->x86_model == 9 || c->x86_model == 10) {
92 if (inl (CBAR) & CBAR_ENB)
93 outl (0 | CBAR_KEY, CBAR);
94 }
95 break;
96 case 5:
97 if (c->x86_model < 6) {
98 /* Based on AMD doc 20734R - June 2000 */
99 if (c->x86_model == 0) {
100 clear_cpu_cap(c, X86_FEATURE_APIC);
101 set_cpu_cap(c, X86_FEATURE_PGE);
102 }
103 break;
104 }
105
106 if (c->x86_model == 6 && c->x86_mask == 1) {
107 const int K6_BUG_LOOP = 1000000;
108 int n;
109 void (*f_vide)(void);
110 unsigned long d, d2;
111
112 printk(KERN_INFO "AMD K6 stepping B detected - ");
113
114 /*
115 * It looks like AMD fixed the 2.6.2 bug and improved indirect
116 * calls at the same time.
117 */
118
119 n = K6_BUG_LOOP;
120 f_vide = vide;
121 rdtscl(d);
122 while (n--)
123 f_vide();
124 rdtscl(d2);
125 d = d2-d;
126
127 if (d > 20*K6_BUG_LOOP)
128 printk("system stability may be impaired when more than 32 MB are used.\n");
129 else
130 printk("probably OK (after B9730xxxx).\n");
131 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
132 }
133
134 /* K6 with old style WHCR */
135 if (c->x86_model < 8 ||
136 (c->x86_model == 8 && c->x86_mask < 8)) {
137 /* We can only write allocate on the low 508Mb */
138 if (mbytes > 508)
139 mbytes = 508;
140
141 rdmsr(MSR_K6_WHCR, l, h);
142 if ((l&0x0000FFFF) == 0) {
143 unsigned long flags;
144 l = (1<<0)|((mbytes/4)<<1);
145 local_irq_save(flags);
146 wbinvd();
147 wrmsr(MSR_K6_WHCR, l, h);
148 local_irq_restore(flags);
149 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
150 mbytes);
151 }
152 break;
153 }
154
155 if ((c->x86_model == 8 && c->x86_mask > 7) ||
156 c->x86_model == 9 || c->x86_model == 13) {
157 /* The more serious chips .. */
158
159 if (mbytes > 4092)
160 mbytes = 4092;
161
162 rdmsr(MSR_K6_WHCR, l, h);
163 if ((l&0xFFFF0000) == 0) {
164 unsigned long flags;
165 l = ((mbytes>>2)<<22)|(1<<16);
166 local_irq_save(flags);
167 wbinvd();
168 wrmsr(MSR_K6_WHCR, l, h);
169 local_irq_restore(flags);
170 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
171 mbytes);
172 }
173
174 break;
175 }
176
177 if (c->x86_model == 10) {
178 /* AMD Geode LX is model 10 */
179 /* placeholder for any needed mods */
180 break;
181 }
182 break;
183 case 6: /* An Athlon/Duron */
184
185 /*
186 * Bit 15 of Athlon specific MSR 15, needs to be 0
187 * to enable SSE on Palomino/Morgan/Barton CPU's.
188 * If the BIOS didn't enable it already, enable it here.
189 */
190 if (c->x86_model >= 6 && c->x86_model <= 10) {
191 if (!cpu_has(c, X86_FEATURE_XMM)) {
192 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
193 rdmsr(MSR_K7_HWCR, l, h);
194 l &= ~0x00008000;
195 wrmsr(MSR_K7_HWCR, l, h);
196 set_cpu_cap(c, X86_FEATURE_XMM);
197 }
198 }
199
200 /*
201 * It's been determined by AMD that Athlons since model 8 stepping 1
202 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
203 * As per AMD technical note 27212 0.2
204 */
205 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
206 rdmsr(MSR_K7_CLK_CTL, l, h);
207 if ((l & 0xfff00000) != 0x20000000) {
208 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
209 ((l & 0x000fffff)|0x20000000));
210 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
211 }
212 }
213 break;
214 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
215 346
216 switch (c->x86) { 347 switch (c->x86) {
217 case 15: 348 case 4:
218 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
219 case 0x10:
220 case 0x11:
221 set_cpu_cap(c, X86_FEATURE_K8);
222 break; 350 break;
223 case 6: 351 case 5:
224 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
225 break; 356 break;
226 } 357 }
358
359 /* K6s report MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
227 if (c->x86 >= 6) 365 if (c->x86 >= 6)
228 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
229 367
230 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
231 369 switch (c->x86) {
232 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
233 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyways. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
234 377
235#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
236 /*
237 * On a AMD multi core setup the lower bits of the APIC id
238 * distinguish the cores.
239 */
240 if (c->x86_max_cores > 1) {
241 int cpu = smp_processor_id();
242 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
243 379
244 if (bits == 0) { 380 /* Multi core CPU? */
245 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
246 bits++; 382 amd_detect_cmp(c);
247 } 383 srat_detect_node(c);
248 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
249 c->phys_proc_id >>= bits;
250 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
251 cpu, c->x86_max_cores, c->cpu_core_id);
252 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
253#endif 388#endif
254 389
255 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
256 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
257 num_cache_leaves = 4; 392 num_cache_leaves = 4;
258 else 393 else
259 num_cache_leaves = 3; 394 num_cache_leaves = 3;
260 } 395 }
261 396
262 /* K6s report MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
263 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
264 clear_cpu_cap(c, X86_FEATURE_MCE);
265 399
266 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
267 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
268} 433}
269 434
435#ifdef CONFIG_X86_32
270static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
271{ 437{
272 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -279,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
279 } 445 }
280 return size; 446 return size;
281} 447}
448#endif
282 449
283static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
284 .c_vendor = "AMD", 451 .c_vendor = "AMD",
285 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
286 .c_models = { 454 .c_models = {
287 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
288 { 456 {
@@ -295,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
295 } 463 }
296 }, 464 },
297 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
298 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
299 .c_init = init_amd, 469 .c_init = init_amd,
300 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
301}; 471};
302 472
303cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
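
amd_detect_cmp() above splits the initial APIC ID using the x86_coreid_bits value computed in early_init_amd_mc(): the low bits select the core, the remaining bits select the socket. A worked example with invented numbers -- a 2-core part (one core-id bit) and APIC ID 5:

/* Worked example of the amd_detect_cmp() arithmetic above; the
 * values are illustrative only. */
static void example_cmp_decode(void)
{
	unsigned bits = 1;	/* 2 cores per socket -> 1 core-id bit */
	unsigned apicid = 5;	/* binary 101 */
	unsigned core = apicid & ((1 << bits) - 1);	/* 101 & 1 -> 1 */
	unsigned socket = apicid >> bits;		/* 101 >> 1 -> 2 */

	printk(KERN_DEBUG "core %u in socket %u\n", core, socket);
}
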
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index d1692b2a41f..00000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,224 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118
119 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
120}
121
122static void __cpuinit init_amd(struct cpuinfo_x86 *c)
123{
124 unsigned level;
125
126#ifdef CONFIG_SMP
127 unsigned long value;
128
129 /*
130 * Disable TLB flush filter by setting HWCR.FFDIS on K8
131 * bit 6 of msr C001_0015
132 *
133 * Errata 63 for SH-B3 steppings
134 * Errata 122 for all steppings (F+ have it disabled by default)
135 */
136 if (c->x86 == 0xf) {
137 rdmsrl(MSR_K8_HWCR, value);
138 value |= 1 << 6;
139 wrmsrl(MSR_K8_HWCR, value);
140 }
141#endif
142
143 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
144 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
145 clear_cpu_cap(c, 0*32+31);
146
147 /* On C+ stepping K8 rep microcode works well for copy/memset */
148 if (c->x86 == 0xf) {
149 level = cpuid_eax(1);
150 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
151 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
152 }
153 if (c->x86 == 0x10 || c->x86 == 0x11)
154 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
155
156 /* Enable workaround for FXSAVE leak */
157 if (c->x86 >= 6)
158 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
159
160 level = get_model_name(c);
161 if (!level) {
162 switch (c->x86) {
163 case 0xf:
164 /* Should distinguish Models here, but this is only
165 a fallback anyway. */
166 strcpy(c->x86_model_id, "Hammer");
167 break;
168 }
169 }
170 display_cacheinfo(c);
171
172 /* Multi core CPU? */
173 if (c->extended_cpuid_level >= 0x80000008)
174 amd_detect_cmp(c);
175
176 if (c->extended_cpuid_level >= 0x80000006 &&
177 (cpuid_edx(0x80000006) & 0xf000))
178 num_cache_leaves = 4;
179 else
180 num_cache_leaves = 3;
181
182 if (c->x86 >= 0xf && c->x86 <= 0x11)
183 set_cpu_cap(c, X86_FEATURE_K8);
184
185 /* MFENCE stops RDTSC speculation */
186 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
187
188 if (c->x86 == 0x10) {
189 /* do this for boot cpu */
190 if (c == &boot_cpu_data)
191 check_enable_amd_mmconf_dmi();
192
193 fam10h_check_enable_mmcfg();
194 }
195
196 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
197 unsigned long long tseg;
198
199 /*
200 * Split up direct mapping around the TSEG SMM area.
201 * Don't do it for gbpages because there seems very little
202 * benefit in doing so.
203 */
204 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
205 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
206 if ((tseg>>PMD_SHIFT) <
207 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
208 ((tseg>>PMD_SHIFT) <
209 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
210 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
211 set_memory_4k((unsigned long)__va(tseg), 1);
212 }
213 }
214}
215
216static struct cpu_dev amd_cpu_dev __cpuinitdata = {
217 .c_vendor = "AMD",
218 .c_ident = { "AuthenticAMD" },
219 .c_early_init = early_init_amd,
220 .c_init = init_amd,
221};
222
223cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
224
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index a0534c04d38..89bfdd9cacc 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -475,6 +474,7 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
475 .c_early_init = early_init_centaur, 474 .c_early_init = early_init_centaur,
476 .c_init = init_centaur, 475 .c_init = init_centaur,
477 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
478}; 478};
479 479
480cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e..a1625f5a1e7 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
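
Both centaur files show the conversion this series applies to every vendor: the cpu_dev now carries its vendor id in .c_x86_vendor and is registered with cpu_dev_register(), which in this series collects the descriptors at link time rather than indexing cpu_devs[] by vendor number. In sketch form -- the vendor name and callback below are placeholders, not a real vendor:

/* Sketch of the converted registration pattern; "Example" and
 * example_init() are placeholders, not part of this patch. */
static void __cpuinit example_init(struct cpuinfo_x86 *c)
{
	/* per-vendor setup would go here */
}

static struct cpu_dev example_cpu_dev __cpuinitdata = {
	.c_vendor	= "Example",
	.c_ident	= { "GenuineExample" },	/* CPUID vendor string */
	.c_init		= example_init,
	.c_x86_vendor	= X86_VENDOR_UNKNOWN,	/* stand-in id */
};

cpu_dev_register(example_cpu_dev);
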
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 00000000000..2056ccf572c
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPUs that do not support these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
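
These helpers back the generic cmpxchg() macro on kernels built for CPUs without the native instruction (!CONFIG_X86_CMPXCHG, i.e. 386 support): disabling interrupts around the read-modify-write makes the emulation atomic on UP, which is why each comment flags it as unsuitable for SMP. A hedged usage sketch; example_lock and example_trylock() are invented names:

/* Usage sketch: on a !CONFIG_X86_CMPXCHG build, cmpxchg() falls back
 * to the _386_ helpers above. Names here are illustrative only. */
static u32 example_lock;

static int example_trylock(void)
{
	/* cmpxchg() returns the old value; 0 means we took the lock */
	return cmpxchg(&example_lock, 0, 1) == 0;
}
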
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4e456bd955b..25581dcb280 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,28 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
16#include <asm/asm.h> 21#include <asm/asm.h>
22#include <asm/numa.h>
17#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
18#include <asm/mpspec.h> 24#include <asm/mpspec.h>
19#include <asm/apic.h> 25#include <asm/apic.h>
20#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
21#endif 28#endif
22 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
23#include "cpu.h" 39#include "cpu.h"
24 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
25DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
26 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
27 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
28 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -56,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
56 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
57 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
58} }; 92} };
93#endif
59EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
60 95
61__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
62
63static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
64static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
65 99
66struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 /*
128 * Cyrix and IDT cpus allow disabling of CPUID
129 * so the code below may return different results
130 * when it is executed before and after enabling
131 * the CPUID. Add "volatile" to not allow gcc to
132 * optimize the subsequent calls to this function.
133 */
134 asm volatile ("pushfl\n\t"
135 "pushfl\n\t"
136 "popl %0\n\t"
137 "movl %0,%1\n\t"
138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
146
147 return ((f1^f2) & flag) != 0;
148}
149
150/* Probe for the CPUID instruction */
151static int __cpuinit have_cpuid_p(void)
152{
153 return flag_is_changeable_p(X86_EFLAGS_ID);
154}
155
156static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
157{
158 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
159 /* Disable processor serial number */
160 unsigned long lo, hi;
161 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
162 lo |= 0x200000;
163 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
164 printk(KERN_NOTICE "CPU serial number disabled.\n");
165 clear_cpu_cap(c, X86_FEATURE_PN);
166
167 /* Disabling the serial number may affect the cpuid level */
168 c->cpuid_level = cpuid_eax(0);
169 }
170}
171
172static int __init x86_serial_nr_setup(char *s)
173{
174 disable_x86_serial_nr = 0;
175 return 1;
176}
177__setup("serialnumber", x86_serial_nr_setup);
178#else
179static inline int flag_is_changeable_p(u32 flag)
180{
181 return 1;
182}
183/* Probe for the CPUID instruction */
184static inline int have_cpuid_p(void)
185{
186 return 1;
187}
188static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
189{
190}
191#endif
192
193/*
194 * Naming convention should be: <Name> [(<Codename>)]
195 * This table only is used unless init_<vendor>() below doesn't set it;
196 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
197 *
198 */
199
200/* Look up CPU names by table lookup. */
201static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
202{
203 struct cpu_model_info *info;
204
205 if (c->x86_model >= 16)
206 return NULL; /* Range check */
207
208 if (!this_cpu)
209 return NULL;
210
211 info = this_cpu->c_models;
212
213 while (info && info->family) {
214 if (info->family == c->x86)
215 return info->model_names[c->x86_model];
216 info++;
217 }
218 return NULL; /* Not found */
219}
220
221__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
222
223/* Current gdt points %fs at the "master" per-cpu area: after this,
224 * it's on the real one. */
225void switch_to_new_gdt(void)
226{
227 struct desc_ptr gdt_descr;
228
229 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
230 gdt_descr.size = GDT_SIZE - 1;
231 load_gdt(&gdt_descr);
232#ifdef CONFIG_X86_32
233 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
234#endif
235}
236
237static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
67 238
68static void __cpuinit default_init(struct cpuinfo_x86 *c) 239static void __cpuinit default_init(struct cpuinfo_x86 *c)
69{ 240{
241#ifdef CONFIG_X86_64
242 display_cacheinfo(c);
243#else
70 /* Not much we can do here... */ 244 /* Not much we can do here... */
71 /* Check if at least it has cpuid */ 245 /* Check if at least it has cpuid */
72 if (c->cpuid_level == -1) { 246 if (c->cpuid_level == -1) {
@@ -76,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
76 else if (c->x86 == 3) 250 else if (c->x86 == 3)
77 strcpy(c->x86_model_id, "386"); 251 strcpy(c->x86_model_id, "386");
78 } 252 }
253#endif
79} 254}
80 255
81static struct cpu_dev __cpuinitdata default_cpu = { 256static struct cpu_dev __cpuinitdata default_cpu = {
82 .c_init = default_init, 257 .c_init = default_init,
83 .c_vendor = "Unknown", 258 .c_vendor = "Unknown",
259 .c_x86_vendor = X86_VENDOR_UNKNOWN,
84}; 260};
85static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
86
87static int __init cachesize_setup(char *str)
88{
89 get_option(&str, &cachesize_override);
90 return 1;
91}
92__setup("cachesize=", cachesize_setup);
93 261
94int __cpuinit get_model_name(struct cpuinfo_x86 *c) 262static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
95{ 263{
96 unsigned int *v; 264 unsigned int *v;
97 char *p, *q; 265 char *p, *q;
98 266
99 if (cpuid_eax(0x80000000) < 0x80000004) 267 if (c->extended_cpuid_level < 0x80000004)
100 return 0; 268 return;
101 269
102 v = (unsigned int *) c->x86_model_id; 270 v = (unsigned int *) c->x86_model_id;
103 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 271 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -116,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
116 while (q <= &c->x86_model_id[48]) 284 while (q <= &c->x86_model_id[48])
117 *q++ = '\0'; /* Zero-pad the rest */ 285 *q++ = '\0'; /* Zero-pad the rest */
118 } 286 }
119
120 return 1;
121} 287}
122 288
123
124void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 289void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
125{ 290{
126 unsigned int n, dummy, ecx, edx, l2size; 291 unsigned int n, dummy, ebx, ecx, edx, l2size;
127 292
128 n = cpuid_eax(0x80000000); 293 n = c->extended_cpuid_level;
129 294
130 if (n >= 0x80000005) { 295 if (n >= 0x80000005) {
131 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 296 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
132 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 297 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
133 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 298 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
134 c->x86_cache_size = (ecx>>24)+(edx>>24); 299 c->x86_cache_size = (ecx>>24) + (edx>>24);
300#ifdef CONFIG_X86_64
301 /* On K8 L1 TLB is inclusive, so don't count it */
302 c->x86_tlbsize = 0;
303#endif
135 } 304 }
136 305
137 if (n < 0x80000006) /* Some chips just have a large L1. */ 306 return;
138 return; 307 return;
139 308
140 ecx = cpuid_ecx(0x80000006); 309 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
141 l2size = ecx >> 16; 310 l2size = ecx >> 16;
142 311
312#ifdef CONFIG_X86_64
313 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
314#else
143 /* do processor-specific cache resizing */ 315 /* do processor-specific cache resizing */
144 if (this_cpu->c_size_cache) 316 if (this_cpu->c_size_cache)
145 l2size = this_cpu->c_size_cache(c, l2size); 317 l2size = this_cpu->c_size_cache(c, l2size);
@@ -150,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
150 322
151 if (l2size == 0) 323 if (l2size == 0)
152 return; /* Again, no L2 cache is possible */ 324 return; /* Again, no L2 cache is possible */
325#endif
153 326
154 c->x86_cache_size = l2size; 327 c->x86_cache_size = l2size;
155 328
156 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 329 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
157 l2size, ecx & 0xFF); 330 l2size, ecx & 0xFF);
158} 331}
159 332
160/* 333void __cpuinit detect_ht(struct cpuinfo_x86 *c)
161 * Naming convention should be: <Name> [(<Codename>)]
162 * This table is only used if init_<vendor>() below doesn't set it;
163 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
164 *
165 */
166
167/* Look up CPU names by table lookup. */
168static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
169{ 334{
170 struct cpu_model_info *info; 335#ifdef CONFIG_X86_HT
336 u32 eax, ebx, ecx, edx;
337 int index_msb, core_bits;
171 338
172 if (c->x86_model >= 16) 339 if (!cpu_has(c, X86_FEATURE_HT))
173 return NULL; /* Range check */ 340 return;
174 341
175 if (!this_cpu) 342 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
176 return NULL; 343 goto out;
177 344
178 info = this_cpu->c_models; 345 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
346 return;
179 347
180 while (info && info->family) { 348 cpuid(1, &eax, &ebx, &ecx, &edx);
181 if (info->family == c->x86) 349
182 return info->model_names[c->x86_model]; 350 smp_num_siblings = (ebx & 0xff0000) >> 16;
183 info++; 351
352 if (smp_num_siblings == 1) {
353 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
354 } else if (smp_num_siblings > 1) {
355
356 if (smp_num_siblings > NR_CPUS) {
357 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
358 smp_num_siblings);
359 smp_num_siblings = 1;
360 return;
361 }
362
363 index_msb = get_count_order(smp_num_siblings);
364#ifdef CONFIG_X86_64
365 c->phys_proc_id = phys_pkg_id(index_msb);
366#else
367 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
368#endif
369
370 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
371
372 index_msb = get_count_order(smp_num_siblings);
373
374 core_bits = get_count_order(c->x86_max_cores);
375
376#ifdef CONFIG_X86_64
377 c->cpu_core_id = phys_pkg_id(index_msb) &
378 ((1 << core_bits) - 1);
379#else
380 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
381 ((1 << core_bits) - 1);
382#endif
184 } 383 }
185 return NULL; /* Not found */
186}
187 384
385out:
386 if ((c->x86_max_cores * smp_num_siblings) > 1) {
387 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
388 c->phys_proc_id);
389 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
390 c->cpu_core_id);
391 }
392#endif
393}
188 394
189static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 395static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
190{ 396{
191 char *v = c->x86_vendor_id; 397 char *v = c->x86_vendor_id;
192 int i; 398 int i;
193 static int printed; 399 static int printed;
194 400
195 for (i = 0; i < X86_VENDOR_NUM; i++) { 401 for (i = 0; i < X86_VENDOR_NUM; i++) {
196 if (cpu_devs[i]) { 402 if (!cpu_devs[i])
197 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 403 break;
198 (cpu_devs[i]->c_ident[1] && 404
199 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 405 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
200 c->x86_vendor = i; 406 (cpu_devs[i]->c_ident[1] &&
201 if (!early) 407 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
202 this_cpu = cpu_devs[i]; 408 this_cpu = cpu_devs[i];
203 return; 409 c->x86_vendor = this_cpu->c_x86_vendor;
204 } 410 return;
205 } 411 }
206 } 412 }
413
207 if (!printed) { 414 if (!printed) {
208 printed++; 415 printed++;
209 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 416 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
210 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 417 printk(KERN_ERR "CPU: Your system may be unstable.\n");
211 } 418 }
419
212 c->x86_vendor = X86_VENDOR_UNKNOWN; 420 c->x86_vendor = X86_VENDOR_UNKNOWN;
213 this_cpu = &default_cpu; 421 this_cpu = &default_cpu;
214} 422}
215 423
216 424void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
217static int __init x86_fxsr_setup(char *s)
218{
219 setup_clear_cpu_cap(X86_FEATURE_FXSR);
220 setup_clear_cpu_cap(X86_FEATURE_XMM);
221 return 1;
222}
223__setup("nofxsr", x86_fxsr_setup);
224
225
226static int __init x86_sep_setup(char *s)
227{
228 setup_clear_cpu_cap(X86_FEATURE_SEP);
229 return 1;
230}
231__setup("nosep", x86_sep_setup);
232
233
234/* Standard macro to see if a specific flag is changeable */
235static inline int flag_is_changeable_p(u32 flag)
236{
237 u32 f1, f2;
238
239 asm("pushfl\n\t"
240 "pushfl\n\t"
241 "popl %0\n\t"
242 "movl %0,%1\n\t"
243 "xorl %2,%0\n\t"
244 "pushl %0\n\t"
245 "popfl\n\t"
246 "pushfl\n\t"
247 "popl %0\n\t"
248 "popfl\n\t"
249 : "=&r" (f1), "=&r" (f2)
250 : "ir" (flag));
251
252 return ((f1^f2) & flag) != 0;
253}
254
255
256/* Probe for the CPUID instruction */
257static int __cpuinit have_cpuid_p(void)
258{
259 return flag_is_changeable_p(X86_EFLAGS_ID);
260}
261
262void __init cpu_detect(struct cpuinfo_x86 *c)
263{ 425{
264 /* Get vendor name */ 426 /* Get vendor name */
265 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 427 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -268,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
268 (unsigned int *)&c->x86_vendor_id[4]); 430 (unsigned int *)&c->x86_vendor_id[4]);
269 431
270 c->x86 = 4; 432 c->x86 = 4;
433 /* Intel-defined flags: level 0x00000001 */
271 if (c->cpuid_level >= 0x00000001) { 434 if (c->cpuid_level >= 0x00000001) {
272 u32 junk, tfms, cap0, misc; 435 u32 junk, tfms, cap0, misc;
273 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 436 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
274 c->x86 = (tfms >> 8) & 15; 437 c->x86 = (tfms >> 8) & 0xf;
275 c->x86_model = (tfms >> 4) & 15; 438 c->x86_model = (tfms >> 4) & 0xf;
439 c->x86_mask = tfms & 0xf;
276 if (c->x86 == 0xf) 440 if (c->x86 == 0xf)
277 c->x86 += (tfms >> 20) & 0xff; 441 c->x86 += (tfms >> 20) & 0xff;
278 if (c->x86 >= 0x6) 442 if (c->x86 >= 0x6)
279 c->x86_model += ((tfms >> 16) & 0xF) << 4; 443 c->x86_model += ((tfms >> 16) & 0xf) << 4;
280 c->x86_mask = tfms & 15;
281 if (cap0 & (1<<19)) { 444 if (cap0 & (1<<19)) {
282 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
283 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 445 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
446 c->x86_cache_alignment = c->x86_clflush_size;
284 } 447 }
285 } 448 }
286} 449}
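
The tfms word picked apart above is CPUID.1:EAX: stepping in bits 3:0, model in 7:4, family in 11:8, with the extended family (bits 27:20) added only when the base family is 0xf and the extended model (bits 19:16) folded in from family 6 upward. A worked decode with a sample value (0x6fb, a Core 2: family 6, model 15, stepping 11):

#include <stdio.h>

int main(void)
{
	unsigned int eax = 0x000006fb;	/* sample CPUID.1:EAX */
	unsigned int family   = (eax >> 8) & 0xf;
	unsigned int model    = (eax >> 4) & 0xf;
	unsigned int stepping = eax & 0xf;

	if (family == 0xf)
		family += (eax >> 20) & 0xff;		/* extended family */
	if (family >= 0x6)
		model += ((eax >> 16) & 0xf) << 4;	/* extended model */

	printf("family %u, model %u, stepping %u\n", family, model, stepping);
	return 0;
}
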
287static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 450
451static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
288{ 452{
289 u32 tfms, xlvl; 453 u32 tfms, xlvl;
290 unsigned int ebx; 454 u32 ebx;
291 455
292 memset(&c->x86_capability, 0, sizeof c->x86_capability); 456 /* Intel-defined flags: level 0x00000001 */
293 if (have_cpuid_p()) { 457 if (c->cpuid_level >= 0x00000001) {
294 /* Intel-defined flags: level 0x00000001 */ 458 u32 capability, excap;
295 if (c->cpuid_level >= 0x00000001) { 459 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
296 u32 capability, excap; 460 c->x86_capability[0] = capability;
297 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 461 c->x86_capability[4] = excap;
298 c->x86_capability[0] = capability; 462 }
299 c->x86_capability[4] = excap;
300 }
301 463
302 /* AMD-defined flags: level 0x80000001 */ 464 /* AMD-defined flags: level 0x80000001 */
303 xlvl = cpuid_eax(0x80000000); 465 xlvl = cpuid_eax(0x80000000);
304 if ((xlvl & 0xffff0000) == 0x80000000) { 466 c->extended_cpuid_level = xlvl;
305 if (xlvl >= 0x80000001) { 467 if ((xlvl & 0xffff0000) == 0x80000000) {
306 c->x86_capability[1] = cpuid_edx(0x80000001); 468 if (xlvl >= 0x80000001) {
307 c->x86_capability[6] = cpuid_ecx(0x80000001); 469 c->x86_capability[1] = cpuid_edx(0x80000001);
308 } 470 c->x86_capability[6] = cpuid_ecx(0x80000001);
309 } 471 }
472 }
473
474#ifdef CONFIG_X86_64
475 if (c->extended_cpuid_level >= 0x80000008) {
476 u32 eax = cpuid_eax(0x80000008);
310 477
478 c->x86_virt_bits = (eax >> 8) & 0xff;
479 c->x86_phys_bits = eax & 0xff;
311 } 480 }
481#endif
482
483 if (c->extended_cpuid_level >= 0x80000007)
484 c->x86_power = cpuid_edx(0x80000007);
312 485
313} 486}
314 487
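
get_cpu_cap() trusts leaf 0x80000001 and up only after checking that cpuid_eax(0x80000000) came back in the 0x8000xxxx range; CPUs without extended leaves echo something else there. The shape of the guard, as a kernel-context sketch reusing the kernel's cpuid_eax() helper:

/* Guarding the extended CPUID leaves (sketch, kernel context): */
u32 xlvl = cpuid_eax(0x80000000);

if ((xlvl & 0xffff0000) == 0x80000000 && xlvl >= 0x80000008) {
	u32 eax = cpuid_eax(0x80000008);
	u8 virt_bits = (eax >> 8) & 0xff;	/* e.g. 48 */
	u8 phys_bits = eax & 0xff;		/* e.g. 40 */
}
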
488static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
489{
490#ifdef CONFIG_X86_32
491 int i;
492
493 /*
494 * First of all, decide if this is a 486 or higher
495 * It's a 486 if we can modify the AC flag
496 */
497 if (flag_is_changeable_p(X86_EFLAGS_AC))
498 c->x86 = 4;
499 else
500 c->x86 = 3;
501
502 for (i = 0; i < X86_VENDOR_NUM; i++)
503 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
504 c->x86_vendor_id[0] = 0;
505 cpu_devs[i]->c_identify(c);
506 if (c->x86_vendor_id[0]) {
507 get_cpu_vendor(c);
508 break;
509 }
510 }
511#endif
512}
513
315/* 514/*
316 * Do minimum CPU detection early. 515 * Do minimum CPU detection early.
317 * Fields really needed: vendor, cpuid_level, family, model, mask, 516 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -321,25 +520,61 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
321 * WARNING: this function is only called on the BP. Don't add code here 520 * WARNING: this function is only called on the BP. Don't add code here
322 * that is supposed to run on all CPUs. 521 * that is supposed to run on all CPUs.
323 */ 522 */
324static void __init early_cpu_detect(void) 523static void __init early_identify_cpu(struct cpuinfo_x86 *c)
325{ 524{
326 struct cpuinfo_x86 *c = &boot_cpu_data; 525#ifdef CONFIG_X86_64
327 526 c->x86_clflush_size = 64;
328 c->x86_cache_alignment = 32; 527#else
329 c->x86_clflush_size = 32; 528 c->x86_clflush_size = 32;
529#endif
530 c->x86_cache_alignment = c->x86_clflush_size;
531
532 memset(&c->x86_capability, 0, sizeof c->x86_capability);
533 c->extended_cpuid_level = 0;
534
535 if (!have_cpuid_p())
536 identify_cpu_without_cpuid(c);
330 537
538 /* Cyrix could have cpuid enabled via c_identify() */
331 if (!have_cpuid_p()) 539 if (!have_cpuid_p())
332 return; 540 return;
333 541
334 cpu_detect(c); 542 cpu_detect(c);
335 543
336 get_cpu_vendor(c, 1); 544 get_cpu_vendor(c);
545
546 get_cpu_cap(c);
337 547
338 early_get_cap(c); 548 if (this_cpu->c_early_init)
549 this_cpu->c_early_init(c);
339 550
340 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 551 validate_pat_support(c);
341 cpu_devs[c->x86_vendor]->c_early_init) 552}
342 cpu_devs[c->x86_vendor]->c_early_init(c); 553
554void __init early_cpu_init(void)
555{
556 struct cpu_dev **cdev;
557 int count = 0;
558
559 printk("KERNEL supported cpus:\n");
560 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
561 struct cpu_dev *cpudev = *cdev;
562 unsigned int j;
563
564 if (count >= X86_VENDOR_NUM)
565 break;
566 cpu_devs[count] = cpudev;
567 count++;
568
569 for (j = 0; j < 2; j++) {
570 if (!cpudev->c_ident[j])
571 continue;
572 printk(" %s %s\n", cpudev->c_vendor,
573 cpudev->c_ident[j]);
574 }
575 }
576
577 early_identify_cpu(&boot_cpu_data);
343} 578}
344 579
345/* 580/*
@@ -357,86 +592,41 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
357 592
358static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 593static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
359{ 594{
360 u32 tfms, xlvl; 595 c->extended_cpuid_level = 0;
361 unsigned int ebx;
362
363 if (have_cpuid_p()) {
364 /* Get vendor name */
365 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
366 (unsigned int *)&c->x86_vendor_id[0],
367 (unsigned int *)&c->x86_vendor_id[8],
368 (unsigned int *)&c->x86_vendor_id[4]);
369
370 get_cpu_vendor(c, 0);
371 /* Initialize the standard set of capabilities */
372 /* Note that the vendor-specific code below might override */
373 /* Intel-defined flags: level 0x00000001 */
374 if (c->cpuid_level >= 0x00000001) {
375 u32 capability, excap;
376 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
377 c->x86_capability[0] = capability;
378 c->x86_capability[4] = excap;
379 c->x86 = (tfms >> 8) & 15;
380 c->x86_model = (tfms >> 4) & 15;
381 if (c->x86 == 0xf)
382 c->x86 += (tfms >> 20) & 0xff;
383 if (c->x86 >= 0x6)
384 c->x86_model += ((tfms >> 16) & 0xF) << 4;
385 c->x86_mask = tfms & 15;
386 c->initial_apicid = (ebx >> 24) & 0xFF;
387#ifdef CONFIG_X86_HT
388 c->apicid = phys_pkg_id(c->initial_apicid, 0);
389 c->phys_proc_id = c->initial_apicid;
390#else
391 c->apicid = c->initial_apicid;
392#endif
393 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
394 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
395 } else {
396 /* Have CPUID level 0 only - unheard of */
397 c->x86 = 4;
398 }
399 596
400 /* AMD-defined flags: level 0x80000001 */ 597 if (!have_cpuid_p())
401 xlvl = cpuid_eax(0x80000000); 598 identify_cpu_without_cpuid(c);
402 if ((xlvl & 0xffff0000) == 0x80000000) {
403 if (xlvl >= 0x80000001) {
404 c->x86_capability[1] = cpuid_edx(0x80000001);
405 c->x86_capability[6] = cpuid_ecx(0x80000001);
406 }
407 if (xlvl >= 0x80000004)
408 get_model_name(c); /* Default name */
409 }
410 599
411 init_scattered_cpuid_features(c); 600 /* Cyrix could have cpuid enabled via c_identify() */
412 detect_nopl(c); 601 if (!have_cpuid_p())
413 } 602 return;
414}
415 603
416static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 604 cpu_detect(c);
417{
418 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
419 /* Disable processor serial number */
420 unsigned long lo, hi;
421 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
422 lo |= 0x200000;
423 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
424 printk(KERN_NOTICE "CPU serial number disabled.\n");
425 clear_cpu_cap(c, X86_FEATURE_PN);
426 605
427 /* Disabling the serial number may affect the cpuid level */ 606 get_cpu_vendor(c);
428 c->cpuid_level = cpuid_eax(0);
429 }
430}
431 607
432static int __init x86_serial_nr_setup(char *s) 608 get_cpu_cap(c);
433{ 609
434 disable_x86_serial_nr = 0; 610 if (c->cpuid_level >= 0x00000001) {
435 return 1; 611 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
436} 612#ifdef CONFIG_X86_32
437__setup("serialnumber", x86_serial_nr_setup); 613# ifdef CONFIG_X86_HT
614 c->apicid = phys_pkg_id(c->initial_apicid, 0);
615# else
616 c->apicid = c->initial_apicid;
617# endif
618#endif
438 619
620#ifdef CONFIG_X86_HT
621 c->phys_proc_id = c->initial_apicid;
622#endif
623 }
439 624
625 get_model_name(c); /* Default name */
626
627 init_scattered_cpuid_features(c);
628 detect_nopl(c);
629}
440 630
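
generic_identify() reads the initial APIC ID out of CPUID.1:EBX bits 31:24; the same word carries the CLFLUSH line size (bits 15:8, in units of 8 bytes) that cpu_detect() used earlier. As a kernel-context sketch with the real cpuid_ebx() helper:

/* CPUID.1:EBX fields used above (sketch, kernel context): */
u32 ebx = cpuid_ebx(1);
u8  initial_apicid = (ebx >> 24) & 0xff;	/* bits 31:24 */
u16 clflush_bytes  = ((ebx >> 8) & 0xff) * 8;	/* bits 15:8, x8 */
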
441/* 631/*
442 * This does the hard work of actually picking apart the CPU stuff... 632 * This does the hard work of actually picking apart the CPU stuff...
@@ -448,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
448 c->loops_per_jiffy = loops_per_jiffy; 638 c->loops_per_jiffy = loops_per_jiffy;
449 c->x86_cache_size = -1; 639 c->x86_cache_size = -1;
450 c->x86_vendor = X86_VENDOR_UNKNOWN; 640 c->x86_vendor = X86_VENDOR_UNKNOWN;
451 c->cpuid_level = -1; /* CPUID not detected */
452 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 641 c->x86_model = c->x86_mask = 0; /* So far unknown... */
453 c->x86_vendor_id[0] = '\0'; /* Unset */ 642 c->x86_vendor_id[0] = '\0'; /* Unset */
454 c->x86_model_id[0] = '\0'; /* Unset */ 643 c->x86_model_id[0] = '\0'; /* Unset */
455 c->x86_max_cores = 1; 644 c->x86_max_cores = 1;
645 c->x86_coreid_bits = 0;
646#ifdef CONFIG_X86_64
647 c->x86_clflush_size = 64;
648#else
649 c->cpuid_level = -1; /* CPUID not detected */
456 c->x86_clflush_size = 32; 650 c->x86_clflush_size = 32;
651#endif
652 c->x86_cache_alignment = c->x86_clflush_size;
457 memset(&c->x86_capability, 0, sizeof c->x86_capability); 653 memset(&c->x86_capability, 0, sizeof c->x86_capability);
458 654
459 if (!have_cpuid_p()) {
460 /*
461 * First of all, decide if this is a 486 or higher
462 * It's a 486 if we can modify the AC flag
463 */
464 if (flag_is_changeable_p(X86_EFLAGS_AC))
465 c->x86 = 4;
466 else
467 c->x86 = 3;
468 }
469
470 generic_identify(c); 655 generic_identify(c);
471 656
472 if (this_cpu->c_identify) 657 if (this_cpu->c_identify)
473 this_cpu->c_identify(c); 658 this_cpu->c_identify(c);
474 659
660#ifdef CONFIG_X86_64
661 c->apicid = phys_pkg_id(0);
662#endif
663
475 /* 664 /*
476 * Vendor-specific initialization. In this section we 665 * Vendor-specific initialization. In this section we
477 * canonicalize the feature flags, meaning if there are 666 * canonicalize the feature flags, meaning if there are
@@ -505,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
505 c->x86, c->x86_model); 694 c->x86, c->x86_model);
506 } 695 }
507 696
697#ifdef CONFIG_X86_64
698 detect_ht(c);
699#endif
700
508 /* 701 /*
509 * On SMP, boot_cpu_data holds the common feature set between 702 * On SMP, boot_cpu_data holds the common feature set between
510 * all CPUs; so make sure that we indicate which features are 703 * all CPUs; so make sure that we indicate which features are
@@ -513,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
513 */ 706 */
514 if (c != &boot_cpu_data) { 707 if (c != &boot_cpu_data) {
515 /* AND the already accumulated flags with these */ 708 /* AND the already accumulated flags with these */
516 for (i = 0 ; i < NCAPINTS ; i++) 709 for (i = 0; i < NCAPINTS; i++)
517 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 710 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
518 } 711 }
519 712
@@ -521,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
521 for (i = 0; i < NCAPINTS; i++) 714 for (i = 0; i < NCAPINTS; i++)
522 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 715 c->x86_capability[i] &= ~cleared_cpu_caps[i];
523 716
717#ifdef CONFIG_X86_MCE
524 /* Init Machine Check Exception if available. */ 718 /* Init Machine Check Exception if available. */
525 mcheck_init(c); 719 mcheck_init(c);
720#endif
526 721
527 select_idle_routine(c); 722 select_idle_routine(c);
723
724#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
725 numa_add_cpu(smp_processor_id());
726#endif
528} 727}
529 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
530void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
531{ 740{
532 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
742#ifdef CONFIG_X86_32
533 sysenter_setup(); 743 sysenter_setup();
534 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
747#endif
535} 748}
536 749
537void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 750void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
538{ 751{
539 BUG_ON(c == &boot_cpu_data); 752 BUG_ON(c == &boot_cpu_data);
540 identify_cpu(c); 753 identify_cpu(c);
754#ifdef CONFIG_X86_32
541 enable_sep_cpu(); 755 enable_sep_cpu();
756#endif
542 mtrr_ap_init(); 757 mtrr_ap_init();
543} 758}
544 759
545#ifdef CONFIG_X86_HT 760struct msr_range {
546void __cpuinit detect_ht(struct cpuinfo_x86 *c) 761 unsigned min;
547{ 762 unsigned max;
548 u32 eax, ebx, ecx, edx; 763};
549 int index_msb, core_bits;
550
551 cpuid(1, &eax, &ebx, &ecx, &edx);
552
553 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
554 return;
555
556 smp_num_siblings = (ebx & 0xff0000) >> 16;
557 764
558 if (smp_num_siblings == 1) { 765static struct msr_range msr_range_array[] __cpuinitdata = {
559 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 766 { 0x00000000, 0x00000418},
560 } else if (smp_num_siblings > 1) { 767 { 0xc0000000, 0xc000040b},
768 { 0xc0010000, 0xc0010142},
769 { 0xc0011000, 0xc001103b},
770};
561 771
562 if (smp_num_siblings > NR_CPUS) { 772static void __cpuinit print_cpu_msr(void)
563 printk(KERN_WARNING "CPU: Unsupported number of " 773{
564 "siblings %d", smp_num_siblings); 774 unsigned index;
565 smp_num_siblings = 1; 775 u64 val;
566 return; 776 int i;
777 unsigned index_min, index_max;
778
779 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
780 index_min = msr_range_array[i].min;
781 index_max = msr_range_array[i].max;
782 for (index = index_min; index < index_max; index++) {
783 if (rdmsrl_amd_safe(index, &val))
784 continue;
785 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
567 } 786 }
787 }
788}
568 789
569 index_msb = get_count_order(smp_num_siblings); 790static int show_msr __cpuinitdata;
570 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 791static __init int setup_show_msr(char *arg)
571 792{
572 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 793 int num;
573 c->phys_proc_id);
574
575 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
576
577 index_msb = get_count_order(smp_num_siblings) ;
578
579 core_bits = get_count_order(c->x86_max_cores);
580 794
581 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 795 get_option(&arg, &num);
582 ((1 << core_bits) - 1);
583 796
584 if (c->x86_max_cores > 1) 797 if (num > 0)
585 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 798 show_msr = num;
586 c->cpu_core_id); 799 return 1;
587 }
588} 800}
589#endif 801__setup("show_msr=", setup_show_msr);
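
In practice the new knob is a boot parameter: show_msr=2, for example, makes print_cpu_info() below dump the four MSR ranges for the first two CPUs on SMP (any nonzero value triggers the dump on UP), with rdmsrl_amd_safe() silently skipping MSRs that fault.
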
590 802
591static __init int setup_noclflush(char *arg) 803static __init int setup_noclflush(char *arg)
592{ 804{
@@ -604,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
604 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
605 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
606 818
607 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
608 printk("%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
609 821
610 if (!c->x86_model_id[0]) 822 if (c->x86_model_id[0])
611 printk("%d86", c->x86); 823 printk(KERN_CONT "%s", c->x86_model_id);
612 else 824 else
613 printk("%s", c->x86_model_id); 825 printk(KERN_CONT "%d86", c->x86);
614 826
615 if (c->x86_mask || c->cpuid_level >= 0) 827 if (c->x86_mask || c->cpuid_level >= 0)
616 printk(" stepping %02x\n", c->x86_mask); 828 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
617 else 829 else
618 printk("\n"); 830 printk(KERN_CONT "\n");
831
832#ifdef CONFIG_SMP
833 if (c->cpu_index < show_msr)
834 print_cpu_msr();
835#else
836 if (show_msr)
837 print_cpu_msr();
838#endif
619} 839}
620 840
621static __init int setup_disablecpuid(char *arg) 841static __init int setup_disablecpuid(char *arg)
@@ -631,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
631 851
632cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 852cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
633 853
634void __init early_cpu_init(void) 854#ifdef CONFIG_X86_64
855struct x8664_pda **_cpu_pda __read_mostly;
856EXPORT_SYMBOL(_cpu_pda);
857
858struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
859
860char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
861
862void __cpuinit pda_init(int cpu)
635{ 863{
636 struct cpu_vendor_dev *cvdev; 864 struct x8664_pda *pda = cpu_pda(cpu);
865
866 /* Set up data that may be needed in __get_free_pages early */
867 loadsegment(fs, 0);
868 loadsegment(gs, 0);
869 /* Memory clobbers used to order PDA accesses */
870 mb();
871 wrmsrl(MSR_GS_BASE, pda);
872 mb();
873
874 pda->cpunumber = cpu;
875 pda->irqcount = -1;
876 pda->kernelstack = (unsigned long)stack_thread_info() -
877 PDA_STACKOFFSET + THREAD_SIZE;
878 pda->active_mm = &init_mm;
879 pda->mmu_state = 0;
880
881 if (cpu == 0) {
882 /* others are initialized in smpboot.c */
883 pda->pcurrent = &init_task;
884 pda->irqstackptr = boot_cpu_stack;
885 pda->irqstackptr += IRQSTACKSIZE - 64;
886 } else {
887 if (!pda->irqstackptr) {
888 pda->irqstackptr = (char *)
889 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
890 if (!pda->irqstackptr)
891 panic("cannot allocate irqstack for cpu %d",
892 cpu);
893 pda->irqstackptr += IRQSTACKSIZE - 64;
894 }
895
896 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
897 pda->nodenumber = cpu_to_node(cpu);
898 }
899}
900
901char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
902 DEBUG_STKSZ] __page_aligned_bss;
637 903
638 for (cvdev = __x86cpuvendor_start ; 904extern asmlinkage void ignore_sysret(void);
639 cvdev < __x86cpuvendor_end ;
640 cvdev++)
641 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
642 905
643 early_cpu_detect(); 906/* May not be marked __init: used by software suspend */
644 validate_pat_support(&boot_cpu_data); 907void syscall_init(void)
908{
909 /*
910 * LSTAR and STAR live in a somewhat strange symbiosis:
911 * they both write to the same internal register. STAR allows
912 * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip.
913 */
914 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
915 wrmsrl(MSR_LSTAR, system_call);
916 wrmsrl(MSR_CSTAR, ignore_sysret);
917
918#ifdef CONFIG_IA32_EMULATION
919 syscall32_cpu_init();
920#endif
921
922 /* Flags to clear on syscall */
923 wrmsrl(MSR_SYSCALL_MASK,
924 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
645} 925}
646 926
927unsigned long kernel_eflags;
928
929/*
930 * Copies of the original ist values from the tss are only accessed during
931 * debugging, no special alignment required.
932 */
933DEFINE_PER_CPU(struct orig_ist, orig_ist);
934
935#else
936
647/* Make sure %fs is initialized properly in idle threads */ 937/* Make sure %fs is initialized properly in idle threads */
648struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 938struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
649{ 939{
@@ -651,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
651 regs->fs = __KERNEL_PERCPU; 941 regs->fs = __KERNEL_PERCPU;
652 return regs; 942 return regs;
653} 943}
654 944#endif
655/* Current gdt points %fs at the "master" per-cpu area: after this,
656 * it's on the real one. */
657void switch_to_new_gdt(void)
658{
659 struct desc_ptr gdt_descr;
660
661 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
662 gdt_descr.size = GDT_SIZE - 1;
663 load_gdt(&gdt_descr);
664 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
665}
666 945
667/* 946/*
668 * cpu_init() initializes state that is per-CPU. Some data is already 947 * cpu_init() initializes state that is per-CPU. Some data is already
669 * initialized (naturally) in the bootstrap process, such as the GDT 948 * initialized (naturally) in the bootstrap process, such as the GDT
670 * and IDT. We reload them nevertheless, this function acts as a 949 * and IDT. We reload them nevertheless, this function acts as a
671 * 'CPU state barrier', nothing should get across. 950 * 'CPU state barrier', nothing should get across.
951 * A lot of state is already set up in PDA init for 64-bit
672 */ 952 */
953#ifdef CONFIG_X86_64
954void __cpuinit cpu_init(void)
955{
956 int cpu = stack_smp_processor_id();
957 struct tss_struct *t = &per_cpu(init_tss, cpu);
958 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
959 unsigned long v;
960 char *estacks = NULL;
961 struct task_struct *me;
962 int i;
963
964 /* CPU 0 is initialised in head64.c */
965 if (cpu != 0)
966 pda_init(cpu);
967 else
968 estacks = boot_exception_stacks;
969
970 me = current;
971
972 if (cpu_test_and_set(cpu, cpu_initialized))
973 panic("CPU#%d already initialized!\n", cpu);
974
975 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
976
977 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
978
979 /*
980 * Initialize the per-CPU GDT with the boot GDT,
981 * and set up the GDT descriptor:
982 */
983
984 switch_to_new_gdt();
985 load_idt((const struct desc_ptr *)&idt_descr);
986
987 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
988 syscall_init();
989
990 wrmsrl(MSR_FS_BASE, 0);
991 wrmsrl(MSR_KERNEL_GS_BASE, 0);
992 barrier();
993
994 check_efer();
995 if (cpu != 0 && x2apic)
996 enable_x2apic();
997
998 /*
999 * set up and load the per-CPU TSS
1000 */
1001 if (!orig_ist->ist[0]) {
1002 static const unsigned int order[N_EXCEPTION_STACKS] = {
1003 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1004 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1005 };
1006 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1007 if (cpu) {
1008 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1009 if (!estacks)
1010 panic("Cannot allocate exception "
1011 "stack %ld %d\n", v, cpu);
1012 }
1013 estacks += PAGE_SIZE << order[v];
1014 orig_ist->ist[v] = t->x86_tss.ist[v] =
1015 (unsigned long)estacks;
1016 }
1017 }
1018
1019 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1020 /*
1021 * <= is required because the CPU will access up to
1022 * 8 bits beyond the end of the IO permission bitmap.
1023 */
1024 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1025 t->io_bitmap[i] = ~0UL;
1026
1027 atomic_inc(&init_mm.mm_count);
1028 me->active_mm = &init_mm;
1029 if (me->mm)
1030 BUG();
1031 enter_lazy_tlb(&init_mm, me);
1032
1033 load_sp0(t, &current->thread);
1034 set_tss_desc(cpu, t);
1035 load_TR_desc();
1036 load_LDT(&init_mm.context);
1037
1038#ifdef CONFIG_KGDB
1039 /*
1040 * If the kgdb is connected no debug regs should be altered. This
1041 * is only applicable when KGDB and a KGDB I/O module are built
1042 * into the kernel and you are using early debugging with
1043 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1044 */
1045 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1046 arch_kgdb_ops.correct_hw_break();
1047 else {
1048#endif
1049 /*
1050 * Clear all 6 debug registers:
1051 */
1052
1053 set_debugreg(0UL, 0);
1054 set_debugreg(0UL, 1);
1055 set_debugreg(0UL, 2);
1056 set_debugreg(0UL, 3);
1057 set_debugreg(0UL, 6);
1058 set_debugreg(0UL, 7);
1059#ifdef CONFIG_KGDB
1060 /* If the kgdb is connected no debug regs should be altered. */
1061 }
1062#endif
1063
1064 fpu_init();
1065
1066 raw_local_save_flags(kernel_eflags);
1067
1068 if (is_uv_system())
1069 uv_cpu_init();
1070}
1071
1072#else
1073
673void __cpuinit cpu_init(void) 1074void __cpuinit cpu_init(void)
674{ 1075{
675 int cpu = smp_processor_id(); 1076 int cpu = smp_processor_id();
@@ -723,19 +1124,21 @@ void __cpuinit cpu_init(void)
723 /* 1124 /*
724 * Force FPU initialization: 1125 * Force FPU initialization:
725 */ 1126 */
726 current_thread_info()->status = 0; 1127 if (cpu_has_xsave)
1128 current_thread_info()->status = TS_XSAVE;
1129 else
1130 current_thread_info()->status = 0;
727 clear_used_math(); 1131 clear_used_math();
728 mxcsr_feature_mask_init(); 1132 mxcsr_feature_mask_init();
729}
730 1133
731#ifdef CONFIG_HOTPLUG_CPU 1134 /*
732void __cpuinit cpu_uninit(void) 1135 * Boot processor to setup the FP and extended state context info.
733{ 1136 */
734 int cpu = raw_smp_processor_id(); 1137 if (!smp_processor_id())
735 cpu_clear(cpu, cpu_initialized); 1138 init_thread_xstate();
736 1139
737 /* lazy TLB state */ 1140 xsave_init();
738 per_cpu(cpu_tlbstate, cpu).state = 0;
739 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
740} 1141}
1142
1143
741#endif 1144#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index a11f5d4477c..00000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,712 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/delay.h>
11#include <linux/smp.h>
12#include <linux/percpu.h>
13#include <asm/i387.h>
14#include <asm/msr.h>
15#include <asm/io.h>
16#include <asm/linkage.h>
17#include <asm/mmu_context.h>
18#include <asm/mtrr.h>
19#include <asm/mce.h>
20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
23#ifdef CONFIG_X86_LOCAL_APIC
24#include <asm/mpspec.h>
25#include <asm/apic.h>
26#include <mach_apic.h>
27#endif
28#include <asm/pda.h>
29#include <asm/pgtable.h>
30#include <asm/processor.h>
31#include <asm/desc.h>
32#include <asm/atomic.h>
33#include <asm/proto.h>
34#include <asm/sections.h>
35#include <asm/setup.h>
36#include <asm/genapic.h>
37
38#include "cpu.h"
39
40/* We need valid kernel segments for data and code in long mode too
41 * IRET will check the segment types kkeil 2000/10/28
42 * Also sysret mandates a special GDT layout
43 */
44/* The TLS descriptors are currently at a different place compared to i386.
45 Hopefully nobody expects them at a fixed place (Wine?) */
46DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
47 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
48 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
49 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
50 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
51 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
52 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
53} };
54EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
55
56__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
57
58/* Current gdt points %fs at the "master" per-cpu area: after this,
59 * it's on the real one. */
60void switch_to_new_gdt(void)
61{
62 struct desc_ptr gdt_descr;
63
64 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
65 gdt_descr.size = GDT_SIZE - 1;
66 load_gdt(&gdt_descr);
67}
68
69struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
70
71static void __cpuinit default_init(struct cpuinfo_x86 *c)
72{
73 display_cacheinfo(c);
74}
75
76static struct cpu_dev __cpuinitdata default_cpu = {
77 .c_init = default_init,
78 .c_vendor = "Unknown",
79};
80static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
81
82int __cpuinit get_model_name(struct cpuinfo_x86 *c)
83{
84 unsigned int *v;
85
86 if (c->extended_cpuid_level < 0x80000004)
87 return 0;
88
89 v = (unsigned int *) c->x86_model_id;
90 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
91 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
92 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
93 c->x86_model_id[48] = 0;
94 return 1;
95}
96
97
98void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
99{
100 unsigned int n, dummy, ebx, ecx, edx;
101
102 n = c->extended_cpuid_level;
103
104 if (n >= 0x80000005) {
105 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
106 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
107 "D cache %dK (%d bytes/line)\n",
108 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
109 c->x86_cache_size = (ecx>>24) + (edx>>24);
110 /* On K8 L1 TLB is inclusive, so don't count it */
111 c->x86_tlbsize = 0;
112 }
113
114 if (n >= 0x80000006) {
115 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
116 ecx = cpuid_ecx(0x80000006);
117 c->x86_cache_size = ecx >> 16;
118 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
119
120 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
121 c->x86_cache_size, ecx & 0xFF);
122 }
123}
124
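
For leaf 0x80000006, ECX packs the L2 size in KB into bits 31:16, an associativity code into 15:12, and the line size in bytes into 7:0, which is what the L2 printk above reports. A worked decode with a plausible (not from this diff) Athlon 64 value:

/* Decoding CPUID.0x80000006:ECX (sketch; sample value): */
unsigned int ecx = 0x02008140;
unsigned int l2_kb   = ecx >> 16;		/* 0x0200 -> 512 KB */
unsigned int assoc   = (ecx >> 12) & 0xf;	/* 0x8 -> 16-way, AMD coding */
unsigned int line_sz = ecx & 0xff;		/* 0x40 -> 64 bytes */
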
125void __cpuinit detect_ht(struct cpuinfo_x86 *c)
126{
127#ifdef CONFIG_SMP
128 u32 eax, ebx, ecx, edx;
129 int index_msb, core_bits;
130
131 cpuid(1, &eax, &ebx, &ecx, &edx);
132
133
134 if (!cpu_has(c, X86_FEATURE_HT))
135 return;
136 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
137 goto out;
138
139 smp_num_siblings = (ebx & 0xff0000) >> 16;
140
141 if (smp_num_siblings == 1) {
142 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
143 } else if (smp_num_siblings > 1) {
144
145 if (smp_num_siblings > NR_CPUS) {
146 printk(KERN_WARNING "CPU: Unsupported number of "
147 "siblings %d", smp_num_siblings);
148 smp_num_siblings = 1;
149 return;
150 }
151
152 index_msb = get_count_order(smp_num_siblings);
153 c->phys_proc_id = phys_pkg_id(index_msb);
154
155 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
156
157 index_msb = get_count_order(smp_num_siblings);
158
159 core_bits = get_count_order(c->x86_max_cores);
160
161 c->cpu_core_id = phys_pkg_id(index_msb) &
162 ((1 << core_bits) - 1);
163 }
164out:
165 if ((c->x86_max_cores * smp_num_siblings) > 1) {
166 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
167 c->phys_proc_id);
168 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
169 c->cpu_core_id);
170 }
171
172#endif
173}
174
175static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
176{
177 char *v = c->x86_vendor_id;
178 int i;
179 static int printed;
180
181 for (i = 0; i < X86_VENDOR_NUM; i++) {
182 if (cpu_devs[i]) {
183 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
184 (cpu_devs[i]->c_ident[1] &&
185 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
186 c->x86_vendor = i;
187 this_cpu = cpu_devs[i];
188 return;
189 }
190 }
191 }
192 if (!printed) {
193 printed++;
194 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
195 printk(KERN_ERR "CPU: Your system may be unstable.\n");
196 }
197 c->x86_vendor = X86_VENDOR_UNKNOWN;
198}
199
200static void __init early_cpu_support_print(void)
201{
202 int i,j;
203 struct cpu_dev *cpu_devx;
204
205 printk("KERNEL supported cpus:\n");
206 for (i = 0; i < X86_VENDOR_NUM; i++) {
207 cpu_devx = cpu_devs[i];
208 if (!cpu_devx)
209 continue;
210 for (j = 0; j < 2; j++) {
211 if (!cpu_devx->c_ident[j])
212 continue;
213 printk(" %s %s\n", cpu_devx->c_vendor,
214 cpu_devx->c_ident[j]);
215 }
216 }
217}
218
219/*
220 * The NOPL instruction is supposed to exist on all CPUs with
221 * family >= 6, unfortunately, that's not true in practice because
222 * of early VIA chips and (more importantly) broken virtualizers that
223 * are not easy to detect. Hence, probe for it based on first
224 * principles.
225 *
226 * Note: no 64-bit chip is known to lack these, but put the code here
227 * for consistency with 32 bits, and to make it utterly trivial to
228 * diagnose the problem should it ever surface.
229 */
230static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
231{
232 const u32 nopl_signature = 0x888c53b1; /* Random number */
233 u32 has_nopl = nopl_signature;
234
235 clear_cpu_cap(c, X86_FEATURE_NOPL);
236 if (c->x86 >= 6) {
237 asm volatile("\n"
238 "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */
239 "2:\n"
240 " .section .fixup,\"ax\"\n"
241 "3: xor %0,%0\n"
242 " jmp 2b\n"
243 " .previous\n"
244 _ASM_EXTABLE(1b,3b)
245 : "+a" (has_nopl));
246
247 if (has_nopl == nopl_signature)
248 set_cpu_cap(c, X86_FEATURE_NOPL);
249 }
250}
251
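
detect_nopl() leans on the kernel's exception-fixup machinery: the NOPL opcode (0f 1f c0) either executes as a no-op, leaving the random signature in EAX intact, or raises #UD, in which case the .fixup stub reached via _ASM_EXTABLE zeroes the register and the comparison fails. The same execute-or-trap idea in a user-space analogue with a SIGILL handler (illustrative only, not the kernel mechanism):

#include <setjmp.h>
#include <signal.h>
#include <stdio.h>

static sigjmp_buf env;

static void on_sigill(int sig)
{
	(void)sig;
	siglongjmp(env, 1);	/* unwind out of the faulting instruction */
}

int main(void)
{
	signal(SIGILL, on_sigill);
	if (sigsetjmp(env, 1) == 0) {
		__asm__ volatile(".byte 0x0f, 0x1f, 0xc0");	/* nopl %eax */
		puts("NOPL executes");
	} else {
		puts("NOPL raises #UD");
	}
	return 0;
}
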
252static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
253
254void __init early_cpu_init(void)
255{
256 struct cpu_vendor_dev *cvdev;
257
258 for (cvdev = __x86cpuvendor_start ;
259 cvdev < __x86cpuvendor_end ;
260 cvdev++)
261 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
262 early_cpu_support_print();
263 early_identify_cpu(&boot_cpu_data);
264}
265
266/* Do some early cpuid on the boot CPU to get the parameters that are
267 needed before check_bugs. Everything advanced is in identify_cpu
268 below. */
269static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
270{
271 u32 tfms, xlvl;
272
273 c->loops_per_jiffy = loops_per_jiffy;
274 c->x86_cache_size = -1;
275 c->x86_vendor = X86_VENDOR_UNKNOWN;
276 c->x86_model = c->x86_mask = 0; /* So far unknown... */
277 c->x86_vendor_id[0] = '\0'; /* Unset */
278 c->x86_model_id[0] = '\0'; /* Unset */
279 c->x86_clflush_size = 64;
280 c->x86_cache_alignment = c->x86_clflush_size;
281 c->x86_max_cores = 1;
282 c->x86_coreid_bits = 0;
283 c->extended_cpuid_level = 0;
284 memset(&c->x86_capability, 0, sizeof c->x86_capability);
285
286 /* Get vendor name */
287 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
288 (unsigned int *)&c->x86_vendor_id[0],
289 (unsigned int *)&c->x86_vendor_id[8],
290 (unsigned int *)&c->x86_vendor_id[4]);
291
292 get_cpu_vendor(c);
293
294 /* Initialize the standard set of capabilities */
295 /* Note that the vendor-specific code below might override */
296
297 /* Intel-defined flags: level 0x00000001 */
298 if (c->cpuid_level >= 0x00000001) {
299 __u32 misc;
300 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
301 &c->x86_capability[0]);
302 c->x86 = (tfms >> 8) & 0xf;
303 c->x86_model = (tfms >> 4) & 0xf;
304 c->x86_mask = tfms & 0xf;
305 if (c->x86 == 0xf)
306 c->x86 += (tfms >> 20) & 0xff;
307 if (c->x86 >= 0x6)
308 c->x86_model += ((tfms >> 16) & 0xF) << 4;
309 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
310 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
311 } else {
312 /* Have CPUID level 0 only - unheard of */
313 c->x86 = 4;
314 }
315
316 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
317#ifdef CONFIG_SMP
318 c->phys_proc_id = c->initial_apicid;
319#endif
320 /* AMD-defined flags: level 0x80000001 */
321 xlvl = cpuid_eax(0x80000000);
322 c->extended_cpuid_level = xlvl;
323 if ((xlvl & 0xffff0000) == 0x80000000) {
324 if (xlvl >= 0x80000001) {
325 c->x86_capability[1] = cpuid_edx(0x80000001);
326 c->x86_capability[6] = cpuid_ecx(0x80000001);
327 }
328 if (xlvl >= 0x80000004)
329 get_model_name(c); /* Default name */
330 }
331
332 /* Transmeta-defined flags: level 0x80860001 */
333 xlvl = cpuid_eax(0x80860000);
334 if ((xlvl & 0xffff0000) == 0x80860000) {
335 /* Don't set x86_cpuid_level here for now to not confuse. */
336 if (xlvl >= 0x80860001)
337 c->x86_capability[2] = cpuid_edx(0x80860001);
338 }
339
340 if (c->extended_cpuid_level >= 0x80000007)
341 c->x86_power = cpuid_edx(0x80000007);
342
343 if (c->extended_cpuid_level >= 0x80000008) {
344 u32 eax = cpuid_eax(0x80000008);
345
346 c->x86_virt_bits = (eax >> 8) & 0xff;
347 c->x86_phys_bits = eax & 0xff;
348 }
349
350 detect_nopl(c);
351
352 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
353 cpu_devs[c->x86_vendor]->c_early_init)
354 cpu_devs[c->x86_vendor]->c_early_init(c);
355
356 validate_pat_support(c);
357}
358
359/*
360 * This does the hard work of actually picking apart the CPU stuff...
361 */
362static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
363{
364 int i;
365
366 early_identify_cpu(c);
367
368 init_scattered_cpuid_features(c);
369
370 c->apicid = phys_pkg_id(0);
371
372 /*
373 * Vendor-specific initialization. In this section we
374 * canonicalize the feature flags, meaning if there are
375 * features a certain CPU supports which CPUID doesn't
376 * tell us, CPUID claiming incorrect flags, or other bugs,
377 * we handle them here.
378 *
379 * At the end of this section, c->x86_capability better
380 * indicate the features this CPU genuinely supports!
381 */
382 if (this_cpu->c_init)
383 this_cpu->c_init(c);
384
385 detect_ht(c);
386
387 /*
388 * On SMP, boot_cpu_data holds the common feature set between
389 * all CPUs; so make sure that we indicate which features are
390 * common between the CPUs. The first time this routine gets
391 * executed, c == &boot_cpu_data.
392 */
393 if (c != &boot_cpu_data) {
394 /* AND the already accumulated flags with these */
395 for (i = 0; i < NCAPINTS; i++)
396 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
397 }
398
399 /* Clear all flags overridden by options */
400 for (i = 0; i < NCAPINTS; i++)
401 c->x86_capability[i] &= ~cleared_cpu_caps[i];
402
403#ifdef CONFIG_X86_MCE
404 mcheck_init(c);
405#endif
406 select_idle_routine(c);
407
408#ifdef CONFIG_NUMA
409 numa_add_cpu(smp_processor_id());
410#endif
411
412}
413
414void __cpuinit identify_boot_cpu(void)
415{
416 identify_cpu(&boot_cpu_data);
417}
418
419void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
420{
421 BUG_ON(c == &boot_cpu_data);
422 identify_cpu(c);
423 mtrr_ap_init();
424}
425
426static __init int setup_noclflush(char *arg)
427{
428 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
429 return 1;
430}
431__setup("noclflush", setup_noclflush);
432
433void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
434{
435 if (c->x86_model_id[0])
436 printk(KERN_CONT "%s", c->x86_model_id);
437
438 if (c->x86_mask || c->cpuid_level >= 0)
439 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
440 else
441 printk(KERN_CONT "\n");
442}
443
444static __init int setup_disablecpuid(char *arg)
445{
446 int bit;
447 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
448 setup_clear_cpu_cap(bit);
449 else
450 return 0;
451 return 1;
452}
453__setup("clearcpuid=", setup_disablecpuid);
454
455cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
456
457struct x8664_pda **_cpu_pda __read_mostly;
458EXPORT_SYMBOL(_cpu_pda);
459
460struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
461
462char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
463
464unsigned long __supported_pte_mask __read_mostly = ~0UL;
465EXPORT_SYMBOL_GPL(__supported_pte_mask);
466
467static int do_not_nx __cpuinitdata;
468
469/* noexec=on|off
470Control non-executable mappings for 64-bit processes.
471
472on	Enable (default)
473off Disable
474*/
475static int __init nonx_setup(char *str)
476{
477 if (!str)
478 return -EINVAL;
479 if (!strncmp(str, "on", 2)) {
480 __supported_pte_mask |= _PAGE_NX;
481 do_not_nx = 0;
482 } else if (!strncmp(str, "off", 3)) {
483 do_not_nx = 1;
484 __supported_pte_mask &= ~_PAGE_NX;
485 }
486 return 0;
487}
488early_param("noexec", nonx_setup);
489
490int force_personality32;
491
492/* noexec32=on|off
493Control non-executable heap for 32-bit processes.
494To control the stack too, use noexec=off.
495
496on	PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
497off PROT_READ implies PROT_EXEC
498*/
499static int __init nonx32_setup(char *str)
500{
501 if (!strcmp(str, "on"))
502 force_personality32 &= ~READ_IMPLIES_EXEC;
503 else if (!strcmp(str, "off"))
504 force_personality32 |= READ_IMPLIES_EXEC;
505 return 1;
506}
507__setup("noexec32=", nonx32_setup);
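
Both knobs are boot parameters: noexec=off drops _PAGE_NX from __supported_pte_mask, disabling NX outright, while noexec32=off merely sets the READ_IMPLIES_EXEC personality bit so that PROT_READ implies PROT_EXEC for 32-bit tasks, leaving 64-bit processes untouched.
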
508
509void pda_init(int cpu)
510{
511 struct x8664_pda *pda = cpu_pda(cpu);
512
513 /* Set up data that may be needed in __get_free_pages early */
514 loadsegment(fs, 0);
515 loadsegment(gs, 0);
516 /* Memory clobbers used to order PDA accesses */
517 mb();
518 wrmsrl(MSR_GS_BASE, pda);
519 mb();
520
521 pda->cpunumber = cpu;
522 pda->irqcount = -1;
523 pda->kernelstack = (unsigned long)stack_thread_info() -
524 PDA_STACKOFFSET + THREAD_SIZE;
525 pda->active_mm = &init_mm;
526 pda->mmu_state = 0;
527
528 if (cpu == 0) {
529 /* others are initialized in smpboot.c */
530 pda->pcurrent = &init_task;
531 pda->irqstackptr = boot_cpu_stack;
532 pda->irqstackptr += IRQSTACKSIZE - 64;
533 } else {
534 if (!pda->irqstackptr) {
535 pda->irqstackptr = (char *)
536 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
537 if (!pda->irqstackptr)
538 panic("cannot allocate irqstack for cpu %d",
539 cpu);
540 pda->irqstackptr += IRQSTACKSIZE - 64;
541 }
542
543 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
544 pda->nodenumber = cpu_to_node(cpu);
545 }
546}
547
548char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
549 DEBUG_STKSZ] __page_aligned_bss;
550
551extern asmlinkage void ignore_sysret(void);
552
553/* May not be marked __init: used by software suspend */
554void syscall_init(void)
555{
556 /*
557 * LSTAR and STAR live in a somewhat strange symbiosis:
558 * they both write to the same internal register. STAR allows
559 * setting CS/DS, but only a 32-bit target; LSTAR sets the 64-bit rip.
560 */
561 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
562 wrmsrl(MSR_LSTAR, system_call);
563 wrmsrl(MSR_CSTAR, ignore_sysret);
564
565#ifdef CONFIG_IA32_EMULATION
566 syscall32_cpu_init();
567#endif
568
569 /* Flags to clear on syscall */
570 wrmsrl(MSR_SYSCALL_MASK,
571 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
572}
573
574void __cpuinit check_efer(void)
575{
576 unsigned long efer;
577
578 rdmsrl(MSR_EFER, efer);
579 if (!(efer & EFER_NX) || do_not_nx)
580 __supported_pte_mask &= ~_PAGE_NX;
581}
582
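
check_efer() is the runtime half of the noexec handling: NX page-table bits are only legal once the NXE bit of the EFER MSR is set, so the kernel re-reads it and strips _PAGE_NX when it is clear. In miniature (bit positions from the architecture manuals, not this hunk):

/* EFER.NX gating of the NX PTE bit (sketch, kernel context): */
#define EFER_NX		(1UL << 11)	/* MSR_EFER (0xc0000080), bit 11 */
#define _PAGE_NX	(1UL << 63)	/* PTE bit 63 */

unsigned long efer;

rdmsrl(MSR_EFER, efer);
if (!(efer & EFER_NX) || do_not_nx)
	__supported_pte_mask &= ~_PAGE_NX;
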
583unsigned long kernel_eflags;
584
585/*
586 * Copies of the original ist values from the tss are only accessed during
587 * debugging, no special alignment required.
588 */
589DEFINE_PER_CPU(struct orig_ist, orig_ist);
590
591/*
592 * cpu_init() initializes state that is per-CPU. Some data is already
593 * initialized (naturally) in the bootstrap process, such as the GDT
594 * and IDT. We reload them nevertheless, this function acts as a
595 * 'CPU state barrier', nothing should get across.
596 * A lot of state is already set up in PDA init.
597 */
598void __cpuinit cpu_init(void)
599{
600 int cpu = stack_smp_processor_id();
601 struct tss_struct *t = &per_cpu(init_tss, cpu);
602 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
603 unsigned long v;
604 char *estacks = NULL;
605 struct task_struct *me;
606 int i;
607
608 /* CPU 0 is initialised in head64.c */
609 if (cpu != 0)
610 pda_init(cpu);
611 else
612 estacks = boot_exception_stacks;
613
614 me = current;
615
616 if (cpu_test_and_set(cpu, cpu_initialized))
617 panic("CPU#%d already initialized!\n", cpu);
618
619 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
620
621 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
622
623 /*
624 * Initialize the per-CPU GDT with the boot GDT,
625 * and set up the GDT descriptor:
626 */
627
628 switch_to_new_gdt();
629 load_idt((const struct desc_ptr *)&idt_descr);
630
631 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
632 syscall_init();
633
634 wrmsrl(MSR_FS_BASE, 0);
635 wrmsrl(MSR_KERNEL_GS_BASE, 0);
636 barrier();
637
638 check_efer();
639
640 /*
641 * set up and load the per-CPU TSS
642 */
643 if (!orig_ist->ist[0]) {
644 static const unsigned int order[N_EXCEPTION_STACKS] = {
645 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
646 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
647 };
648 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
649 if (cpu) {
650 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
651 if (!estacks)
652 panic("Cannot allocate exception "
653 "stack %ld %d\n", v, cpu);
654 }
655 estacks += PAGE_SIZE << order[v];
656 orig_ist->ist[v] = t->x86_tss.ist[v] =
657 (unsigned long)estacks;
658 }
659 }
660
661 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
662 /*
663 * <= is required because the CPU will access up to
664 * 8 bits beyond the end of the IO permission bitmap.
665 */
666 for (i = 0; i <= IO_BITMAP_LONGS; i++)
667 t->io_bitmap[i] = ~0UL;
668
669 atomic_inc(&init_mm.mm_count);
670 me->active_mm = &init_mm;
671 if (me->mm)
672 BUG();
673 enter_lazy_tlb(&init_mm, me);
674
675 load_sp0(t, &current->thread);
676 set_tss_desc(cpu, t);
677 load_TR_desc();
678 load_LDT(&init_mm.context);
679
680#ifdef CONFIG_KGDB
681 /*
682 * If the kgdb is connected no debug regs should be altered. This
683 * is only applicable when KGDB and a KGDB I/O module are built
684 * into the kernel and you are using early debugging with
685 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
686 */
687 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
688 arch_kgdb_ops.correct_hw_break();
689 else {
690#endif
691 /*
692 * Clear all 6 debug registers:
693 */
694
695 set_debugreg(0UL, 0);
696 set_debugreg(0UL, 1);
697 set_debugreg(0UL, 2);
698 set_debugreg(0UL, 3);
699 set_debugreg(0UL, 6);
700 set_debugreg(0UL, 7);
701#ifdef CONFIG_KGDB
702 /* If the kgdb is connected no debug regs should be altered. */
703 }
704#endif
705
706 fpu_init();
707
708 raw_local_save_flags(kernel_eflags);
709
710 if (is_uv_system())
711 uv_cpu_init();
712}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565f..de4094a3921 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
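
With the per-vendor table gone, a vendor driver now registers itself by planting a pointer in the .x86_cpu_dev.init section via cpu_dev_register(); the linker gathers those between __x86_cpu_dev_start and __x86_cpu_dev_end for early_cpu_init() to walk. Roughly what the tail of a vendor file looks like (sketch modeled on the in-tree drivers):

static struct cpu_dev intel_cpu_dev __cpuinitdata = {
	.c_vendor	= "Intel",
	.c_ident	= { "GenuineIntel" },
	.c_x86_vendor	= X86_VENDOR_INTEL,
	/* .c_early_init, .c_init, ... as needed */
};

cpu_dev_register(intel_cpu_dev);
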
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index dd097b83583..8e48c5d4467 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
@@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void)
779{ 780{
780 int ret; 781 int ret;
781 782
783 if (acpi_disabled)
784 return 0;
785
782 dprintk("acpi_cpufreq_init\n"); 786 dprintk("acpi_cpufreq_init\n");
783 787
784 ret = acpi_cpufreq_early_init(); 788 ret = acpi_cpufreq_early_init();
785 if (ret) 789 if (ret)
786 return ret; 790 return ret;
787 791
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 792 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
793 if (ret)
794 free_percpu(acpi_perf_data);
795
796 return ret;
789} 797}
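
The acpi-cpufreq fix above is the usual unwind-on-failure shape: whatever the early-init step allocated must be released if the later registration step fails, or a failed module load leaks the per-CPU data. The generic outline (hypothetical helper names):

/* Unwind-on-failure outline (hypothetical helpers): */
int ret = acquire_resources();
if (ret)
	return ret;

ret = register_driver();
if (ret)
	release_resources();	/* undo the first step on failure */
return ret;
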
790 798
791static void __exit acpi_cpufreq_exit(void) 799static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 803 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 804
797 free_percpu(acpi_perf_data); 805 free_percpu(acpi_perf_data);
798
799 return;
800} 806}
801 807
802module_param(acpi_pstate_strict, uint, 0644); 808module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index e4a4bf870e9..fe613c93b36 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d5..b0461856acf 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> 2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com> 3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 * 4 *
5 * Licensed under the terms of the GNU GPL License version 2. 5 * Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1019module_param(revid_errata, int, 0644); 1019module_param(revid_errata, int, 0644);
1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); 1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1021 1021
1022MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 1022MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1024MODULE_LICENSE ("GPL"); 1024MODULE_LICENSE ("GPL");
1025 1025
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index f1685fb91fb..b8e05ee4f73 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830..c1ac5790c63 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
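The three-step port dance above is the entire K6 multiplier read path: writing POWERNOW_IOPORT + 0x1 to MSR_K6_EPMR maps the PowerNow! port, the value is read with inl(), and writing POWERNOW_IOPORT + 0x0 unmaps it again. Condensed into one hypothetical helper, assuming the same headers as the patched includes (<asm/msr.h>, <linux/io.h>):

    /* Read the raw multiplier/voltage field behind the EPMR port. */
    static unsigned long powernow_k6_read_port(void)
    {
            unsigned long val;

            wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x1, 0);  /* map port */
            val = inl(POWERNOW_IOPORT + 0x8);
            wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x0, 0);  /* unmap */
            return val;
    }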
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b7..7c7d56b4313 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * AMD K7 Powernow driver. 2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. 3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com> 4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 * 5 *
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
692module_param(acpi_force, int, 0444); 692module_param(acpi_force, int, 0444);
693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
694 694
695MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 695MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
697MODULE_LICENSE ("GPL"); 697MODULE_LICENSE ("GPL");
698 698
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 84bb395038d..d3dcd58b87c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
7 * Support : mark.langsdorf@amd.com 7 * Support : mark.langsdorf@amd.com
8 * 8 *
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
45#endif 45#endif
46 46
47#define PFX "powernow-k8: " 47#define PFX "powernow-k8: "
48#define BFX PFX "BIOS error: "
49#define VERSION "version 2.20.00" 48#define VERSION "version 2.20.00"
50#include "powernow-k8.h" 49#include "powernow-k8.h"
51 50
@@ -536,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
536 535
537 for (j = 0; j < data->numps; j++) { 536 for (j = 0; j < data->numps; j++) {
538 if (pst[j].vid > LEAST_VID) { 537 if (pst[j].vid > LEAST_VID) {
539 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); 538 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
539 j, pst[j].vid);
540 return -EINVAL; 540 return -EINVAL;
541 } 541 }
542 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ 542 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
543 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); 543 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
544 " %d\n", j);
544 return -ENODEV; 545 return -ENODEV;
545 } 546 }
546 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ 547 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
547 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); 548 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
549 " %d\n", j);
548 return -ENODEV; 550 return -ENODEV;
549 } 551 }
550 if (pst[j].fid > MAX_FID) { 552 if (pst[j].fid > MAX_FID) {
551 printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); 553 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
554 " %d\n", j);
552 return -ENODEV; 555 return -ENODEV;
553 } 556 }
554 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { 557 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
555 /* Only first fid is allowed to be in "low" range */ 558 /* Only first fid is allowed to be in "low" range */
556 printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); 559 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
560 "0x%x\n", j, pst[j].fid);
557 return -EINVAL; 561 return -EINVAL;
558 } 562 }
559 if (pst[j].fid < lastfid) 563 if (pst[j].fid < lastfid)
560 lastfid = pst[j].fid; 564 lastfid = pst[j].fid;
561 } 565 }
562 if (lastfid & 1) { 566 if (lastfid & 1) {
563 printk(KERN_ERR BFX "lastfid invalid\n"); 567 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
564 return -EINVAL; 568 return -EINVAL;
565 } 569 }
566 if (lastfid > LO_FID_TABLE_TOP) 570 if (lastfid > LO_FID_TABLE_TOP)
567 printk(KERN_INFO BFX "first fid not from lo freq table\n"); 571 printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n");
568 572
569 return 0; 573 return 0;
570} 574}
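The local BFX prefix is dropped in favor of the FW_BUG marker from <linux/kernel.h>, which expands to "[Firmware Bug]: "; since adjacent string literals concatenate, the log line is assembled at compile time. For example:

    /* KERN_ERR FW_BUG PFX "lastfid invalid\n" becomes the single
     * literal "<3>[Firmware Bug]: powernow-k8: lastfid invalid\n",
     * so firmware problems can be grepped out of dmesg uniformly. */
    printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");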
@@ -672,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data)
672 676
673 dprintk("table vers: 0x%x\n", psb->tableversion); 677 dprintk("table vers: 0x%x\n", psb->tableversion);
674 if (psb->tableversion != PSB_VERSION_1_4) { 678 if (psb->tableversion != PSB_VERSION_1_4) {
675 printk(KERN_ERR BFX "PSB table is not v1.4\n"); 679 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
676 return -ENODEV; 680 return -ENODEV;
677 } 681 }
678 682
679 dprintk("flags: 0x%x\n", psb->flags1); 683 dprintk("flags: 0x%x\n", psb->flags1);
680 if (psb->flags1) { 684 if (psb->flags1) {
681 printk(KERN_ERR BFX "unknown flags\n"); 685 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
682 return -ENODEV; 686 return -ENODEV;
683 } 687 }
684 688
@@ -705,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data)
705 } 709 }
706 } 710 }
707 if (cpst != 1) { 711 if (cpst != 1) {
708 printk(KERN_ERR BFX "numpst must be 1\n"); 712 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
709 return -ENODEV; 713 return -ENODEV;
710 } 714 }
711 715
@@ -1130,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1130 "ACPI Processor module before starting this " 1134 "ACPI Processor module before starting this "
1131 "driver.\n"); 1135 "driver.\n");
1132#else 1136#else
1133 printk(KERN_ERR PFX "Your BIOS does not provide ACPI " 1137 printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
1134 "_PSS objects in a way that Linux understands. " 1138 " ACPI _PSS objects in a way that Linux "
1135 "Please report this to the Linux ACPI maintainers" 1139 "understands. Please report this to the Linux "
1136 " and complain to your BIOS vendor.\n"); 1140 "ACPI maintainers and complain to your BIOS "
1141 "vendor.\n");
1137#endif 1142#endif
1138 kfree(data); 1143 kfree(data);
1139 return -ENODEV; 1144 return -ENODEV;
1140 } 1145 }
1141 if (pol->cpu != 0) { 1146 if (pol->cpu != 0) {
1142 printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " 1147 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1143 "CPU0. Complain to your BIOS vendor.\n"); 1148 "CPU other than CPU0. Complain to your BIOS "
1149 "vendor.\n");
1144 kfree(data); 1150 kfree(data);
1145 return -ENODEV; 1151 return -ENODEV;
1146 } 1152 }
@@ -1193,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1193 1199
1194 /* min/max the cpu is capable of */ 1200 /* min/max the cpu is capable of */
1195 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { 1201 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1196 printk(KERN_ERR PFX "invalid powernow_table\n"); 1202 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1197 powernow_k8_cpu_exit_acpi(data); 1203 powernow_k8_cpu_exit_acpi(data);
1198 kfree(data->powernow_table); 1204 kfree(data->powernow_table);
1199 kfree(data); 1205 kfree(data);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 15e13c01cc3..3b5f06423e7 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,7 +26,7 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) \ 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 191f7263c61..04d0376b64b 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
431} 431}
432 432
433 433
434MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
436MODULE_LICENSE ("GPL"); 436MODULE_LICENSE ("GPL");
437 437
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 898a5a2002e..ffd0f5ed071 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -121,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
122 122
123 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
124 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
125 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
126 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
127 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -132,11 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
133 133
134 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
135 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
136 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
137 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
138 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
139 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
140} 140}
141 141
142/* 142/*
@@ -150,14 +150,14 @@ static void __cpuinit geode_configure(void)
150 local_irq_save(flags); 150 local_irq_save(flags);
151 151
152 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
153 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
154 154
155 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
157 157
158 158
159 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
160 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
162 162
163 set_cx86_memwb(); 163 set_cx86_memwb();
@@ -291,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
 291 /* GXm supports extended cpuid levels 'a la' AMD */ 291 /* GXm supports extended cpuid levels 'a la' AMD */
292 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
293 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
294 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
295 295
296 /* 296 /*
297 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
301 */ 301 */
302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
303 geode_configure(); 303 geode_configure();
304 get_model_name(c); /* get CPU marketing name */
305 return; 304 return;
306 } else { /* MediaGX */ 305 } else { /* MediaGX */
307 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -314,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
314 if (dir1 > 7) { 313 if (dir1 > 7) {
315 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
316 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
317 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
318 } else { 317 } else {
319 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
320 } 319 }
@@ -429,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
429 local_irq_save(flags); 428 local_irq_save(flags);
430 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
431 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
432 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
433 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
434 local_irq_restore(flags); 433 local_irq_restore(flags);
435 } 434 }
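All of the getCx86()/setCx86() calls in this file (and the new _old variants) speak the classic Cyrix index/data protocol on I/O ports 0x22/0x23. A from-memory sketch of the helpers, so treat the exact macro bodies as an assumption rather than the header's text:

    /* Write the CCR index to port 0x22, then move data via port 0x23. */
    #define getCx86(reg)        ({ outb((reg), 0x22); inb(0x23); })
    #define setCx86(reg, data)  do { outb((reg), 0x22); outb((data), 0x23); } while (0)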
@@ -442,14 +441,16 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
442 .c_early_init = early_init_cyrix, 441 .c_early_init = early_init_cyrix,
443 .c_init = init_cyrix, 442 .c_init = init_cyrix,
444 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
445}; 445};
446 446
447cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
448 448
449static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
450 .c_vendor = "NSC", 450 .c_vendor = "NSC",
451 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
452 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
453}; 454};
454 455
455cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
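cpu_dev_register() replaces the run-time, vendor-indexed cpu_vendor_dev_register(): each struct cpu_dev now carries its vendor in .c_x86_vendor, and the macro just plants a pointer in a linker section that the common CPU setup code scans at boot. Roughly, with the section name recalled from arch/x86/kernel/cpu/cpu.h in this series and therefore an assumption:

    #define cpu_dev_register(cpu_devX) \
            static struct cpu_dev *const __cpu_dev_##cpu_devX __used \
            __attribute__((__section__(".x86_cpu_dev.init"))) = &cpu_devX;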
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index c9017799497..00000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,84 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL,
43 "nopl", NULL, NULL, NULL,
44 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
45
46 /* Intel-defined (#2) */
47 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
48 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
49 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
50 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
51
52 /* VIA/Cyrix/Centaur-defined */
53 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
54 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
57
58 /* AMD-defined (#2) */
59 "lahf_lm", "cmp_legacy", "svm", "extapic",
60 "cr8_legacy", "abm", "sse4a", "misalignsse",
61 "3dnowprefetch", "osvw", "ibs", "sse5",
62 "skinit", "wdt", NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
65
66 /* Auxiliary (Linux-defined) */
67 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
71};
72
73const char *const x86_power_flags[32] = {
74 "ts", /* temperature sensor */
75 "fid", /* frequency id control */
76 "vid", /* voltage id control */
77 "ttp", /* thermal trip */
78 "tm",
79 "stc",
80 "100mhzsteps",
81 "hwpstate",
82 "", /* tsc invariant mapped to constant_tsc */
83 /* nothing */
84};
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b75f2569b8f..cce0b6118d5 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
 88 * Note that the workaround should only be initialized once...

89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
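The relocated SEP gate packs family/model/stepping into one comparable integer: a Pentium Pro with family 6, model 3, stepping 3 packs to (6 << 8) | (3 << 4) | 3 = 0x633, the first stepping whose SYSENTER actually works even though earlier parts advertise the CPUID bit. Worked out:

    /* family 6, model 2, stepping 9 -> 0x629 < 0x633: clear SEP */
    /* family 6, model 3, stepping 3 -> 0x633         : keep SEP  */
    unsigned int sig = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;

    if (sig < 0x633)
            clear_cpu_cap(c, X86_FEATURE_SEP);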
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
80} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
 126 * Note that the workaround should only be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
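The new detect_vmx_virtcap() leans on the VMX capability-MSR convention: for MSR_IA32_VMX_PROCBASED_CTLS and _CTLS2, the high dword reports controls that may be 1 and the low dword controls that must be 1, so OR-ing the two dwords answers "is this control available at all", which is exactly what the feature flags need. As a hedged sketch:

    /* A control is usable if it may be set (high dword) or is
     * forced set (low dword) by the VMX capability MSR. */
    static int vmx_ctl_supported(u32 msr, u32 bit)
    {
            u32 lo, hi;

            rdmsr(msr, lo, hi);
            return !!((hi | lo) & bit);
    }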
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,70 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182
183 if (p)
184 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189 301
190 /* Work around errata */ 302 if (p)
191 Intel_errata_workarounds(c); 303 strcpy(c->x86_model_id, p);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226 310
227 if (cpu_has_bts) 311 if (cpu_has_bts)
228 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
229 313
230 /* 314#endif
231 * See if we have a good local APIC by checking for buggy Pentia,
232 * i.e. all B steppings and the C2 stepping of P54C when using their
233 * integrated APIC (see 11AP erratum in "Pentium Processor
234 * Specification Update").
235 */
236 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
237 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
238 set_cpu_cap(c, X86_FEATURE_11AP);
239 315
240#ifdef CONFIG_X86_NUMAQ 316 detect_extended_topology(c);
241 numaq_tsc_disable(); 317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
242#endif 325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
243} 333}
244 334
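The reordered init_intel() now prefers CPUID leaf 0xB: detect_extended_topology() sets X86_FEATURE_XTOPOLOGY when the extended leaf exists, and only in its absence does the legacy path run - intel_num_cpu_cores() pulling cores-per-package out of leaf 4 and, on 32-bit, detect_ht() deriving siblings from leaf 1. The leaf-4 fallback, mirroring the helper visible in the intel_64.c removal below, reduces to:

    static int intel_num_cpu_cores(struct cpuinfo_x86 *c)
    {
            unsigned int eax, ebx, ecx, edx;

            if (c->cpuid_level < 4)
                    return 1;

            /* EAX[31:26] = cores per package - 1, valid only when
             * leaf 4 reports a cache type in EAX[4:0]. */
            cpuid_count(4, 0, &eax, &ebx, &ecx, &edx);
            if (eax & 0x1f)
                    return (eax >> 26) + 1;
            return 1;
    }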
335#ifdef CONFIG_X86_32
245static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
246{ 337{
247 /* 338 /*
@@ -254,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
254 size = 256; 345 size = 256;
255 return size; 346 return size;
256} 347}
348#endif
257 349
258static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
259 .c_vendor = "Intel", 351 .c_vendor = "Intel",
260 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
261 .c_models = { 354 .c_models = {
262 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
263 { 356 {
@@ -307,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
307 } 400 }
308 }, 401 },
309 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
310 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
311 .c_init = init_intel, 406 .c_init = init_intel,
312 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
313}; 408};
314 409
315cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
316
317#ifndef CONFIG_X86_CMPXCHG
318unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
319{
320 u8 prev;
321 unsigned long flags;
322
323 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
324 local_irq_save(flags);
325 prev = *(u8 *)ptr;
326 if (prev == old)
327 *(u8 *)ptr = new;
328 local_irq_restore(flags);
329 return prev;
330}
331EXPORT_SYMBOL(cmpxchg_386_u8);
332
333unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
334{
335 u16 prev;
336 unsigned long flags;
337
338 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
339 local_irq_save(flags);
340 prev = *(u16 *)ptr;
341 if (prev == old)
342 *(u16 *)ptr = new;
343 local_irq_restore(flags);
344 return prev;
345}
346EXPORT_SYMBOL(cmpxchg_386_u16);
347
348unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
349{
350 u32 prev;
351 unsigned long flags;
352
353 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
354 local_irq_save(flags);
355 prev = *(u32 *)ptr;
356 if (prev == old)
357 *(u32 *)ptr = new;
358 local_irq_restore(flags);
359 return prev;
360}
361EXPORT_SYMBOL(cmpxchg_386_u32);
362#endif
363
364#ifndef CONFIG_X86_CMPXCHG64
365unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
366{
367 u64 prev;
368 unsigned long flags;
369
370 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
371 local_irq_save(flags);
372 prev = *(u64 *)ptr;
373 if (prev == old)
374 *(u64 *)ptr = new;
375 local_irq_restore(flags);
376 return prev;
377}
378EXPORT_SYMBOL(cmpxchg_486_u64);
379#endif
380
381/* arch_initcall(intel_cpu_init); */
382 411
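Deleting the cmpxchg_386_*/cmpxchg_486_u64 fallbacks from intel.c does not lose the functionality; they were consolidated into a common file elsewhere in this series (arch/x86/kernel/cpu/cmpxchg.c, if memory serves). The trick itself is worth spelling out: a CPU too old for the CMPXCHG instruction cannot run SMP, so masking interrupts is enough to make the read-compare-write sequence atomic, as in the removed u32 variant:

    /* UP-only emulation: with IRQs off nothing can interleave with
     * the sequence below, so it is atomic on a 386. */
    unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
    {
            u32 prev;
            unsigned long flags;

            local_irq_save(flags);
            prev = *(u32 *)ptr;
            if (prev == old)
                    *(u32 *)ptr = new;
            local_irq_restore(flags);
            return prev;
    }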
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f..00000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6b0a10b002f..3f46afbb1cf 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
 2 * Routines to identify caches on Intel CPU. 2 * Routines to identify caches on Intel CPU.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
130 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
131 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
132 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
133 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
134}; 136};
135 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
136unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
137 147
138/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
182static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
183static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
184 194
185static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
186 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
187 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
188{ 199{
189 unsigned dummy; 200 unsigned dummy;
190 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
251 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
252} 263}
253 264
254static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
255{ 275{
256 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
257 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
258 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
259 unsigned edx; 279 unsigned edx;
260 280
261 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
262 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
263 else 283 if (boot_cpu_data.x86 >= 0x10)
264 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
265 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
266 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
267 291
268 this_leaf->eax = eax; 292 this_leaf->eax = eax;
269 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
270 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
271 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
272 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
273 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
274 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
275 return 0; 299 return 0;
276} 300}
277 301
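amd_check_l3_disable() keys off index >= 3 because, in the leaf-4 ordering this file emulates for AMD, indices 0-2 are the L1d, L1i and L2 leaves and index 3 is the shared L3 - the only cache for which family 0x10 exposes index disable. The two call-site conditions collapse to:

    /* Only the L3 (leaf index >= 3) on family 0x10 and newer AMD
     * parts can disable cache indices via the northbridge. */
    if (boot_cpu_data.x86 >= 0x10 && index >= 3)
            this_leaf->can_disable = 1;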
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
453 477
454/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
455static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
456#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
457 481
458#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
459static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -490,7 +514,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
490 514
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 519 }
496} 520}
@@ -572,7 +596,7 @@ struct _index_kobject {
572 596
573/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
574static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
575#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
576 600
577#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
578static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
637 } 661 }
638} 662}
639 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
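store_cache_disable() expects "<entry> <value>" in hex (sscanf "%x %x"), forces both top control bits with val |= 0xc0000000, and sequences the northbridge write carefully: first the value with the 0x40000000 bit cleared, then wbinvd() to flush anything cached in the targeted indices, then the final value. Assuming the sysfs layout this file builds (cpuX/cache/indexY), usage would look roughly like writing "0 7ff" to /sys/devices/system/cpu/cpu0/cache/index3/cache_disable, with both the entry number and the mask purely illustrative.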
640struct _cache_attr { 757struct _cache_attr {
641 struct attribute attr; 758 struct attribute attr;
642 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
657define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
658define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
659 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
660static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
661 &type.attr, 780 &type.attr,
662 &level.attr, 781 &level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
667 &size.attr, 786 &size.attr,
668 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
669 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
670 NULL 790 NULL
671}; 791};
672 792
673#define to_object(k) container_of(k, struct _index_kobject, kobj)
674#define to_attr(a) container_of(a, struct _cache_attr, attr)
675
676static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
677{ 794{
678 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
682 ret = fattr->show ? 799 ret = fattr->show ?
683 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
684 buf) : 801 buf) :
685 0; 802 0;
686 return ret; 803 return ret;
687} 804}
688 805
689static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
690 const char * buf, size_t count) 807 const char * buf, size_t count)
691{ 808{
692 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
693} 818}
694 819
695static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f6635..dd3af6e7b39 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8c..0ebf3fc6a61 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * mce.c - x86 Machine Check Exception Reporting 2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> 3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index 726a5fcdf34..4b031a4ac85 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -860,7 +860,7 @@ error:
860 return err; 860 return err;
861} 861}
862 862
863static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
864{ 864{
865 int i; 865 int i;
866 866
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e..a74af128efc 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Non Fatal Machine Check Exception Reporting 2 * Non Fatal Machine Check Exception Reporting
3 * 3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> 4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 * 5 *
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 00000000000..dfea390e160
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
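The script turns cpufeature.h into a C lookup table: the macro name indexes the array, and the flag string is either the quoted name from a trailing /* "..." */ comment or the macro suffix, lowercased by the \L escape in the printf argument. For a hypothetical input line such as

	#define X86_FEATURE_XMM (0*32+25) /* "sse" */

the generated file would look roughly like:

	#include <asm/cpufeature.h>

	const char * const x86_cap_flags[NCAPINTS*32] = {
		[X86_FEATURE_XMM]                = "sse",
		/* ...one designated initializer per feature macro... */
	};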
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index cb7d3b6a80e..4e8d77f01ee 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
401 tmp |= ~((1<<(hi - 1)) - 1); 401 tmp |= ~((1<<(hi - 1)) - 1);
402 402
403 if (tmp != mask_lo) { 403 if (tmp != mask_lo) {
404 static int once = 1; 404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405
406 if (once) {
407 printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
408 once = 0;
409 }
410 mask_lo = tmp; 405 mask_lo = tmp;
411 } 406 }
412 } 407 }
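The generic.c hunk swaps an open-coded print-once flag for WARN_ONCE(), trading four lines of bookkeeping for one call; note that WARN_ONCE() also emits a stack trace, so the one-time report becomes more visible in the log. Side by side (a sketch, not the kernel source verbatim):

	/* before: manual once-only message */
	static int once = 1;
	if (once) {
		printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
		once = 0;
	}

	/* after: prints once and includes a backtrace */
	WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");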
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb371..4c4214690dd 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
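Under the new format string, a hypothetical 2 GiB write-back region at base 0 used once would print along the lines of

	reg00: base=0x000000000 (    0MB), size= 2048MB, count=1: write-back

i.e. the base field widens to six hex digits, the size fields get five columns, and the usage count now comes before the type string.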
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index b117d7f8a56..c78c04821ea 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
729 mtrr_type type; 729 mtrr_type type;
730}; 730};
731 731
732struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print; 733static int __initdata debug_print;
734 734
735static int __init 735static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
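to_size_factor() picks the largest unit that divides the size (given in KiB) evenly, so the debug output below can print 64K rather than 0M. Sample conversions:

	to_size_factor(1536, &f)    -> 1536, f = 'K'   (not MiB-aligned)
	to_size_factor(2048, &f)    -> 2,    f = 'M'
	to_size_factor(1 << 20, &f) -> 1,    f = 'G'   (1048576 KiB = 1 GiB)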
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
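The net effect of the rewritten hole handling, on a worked example with hypothetical numbers (chunk size 256M, last range so there is no overlap with a following one): covering 1008M from base 0 with power-of-two pieces alone would take six registers (512+256+128+64+32+16); the new code instead rounds range0 up to ALIGN(1008M, 256M) = 1024M, leaving a 16M hole, and since 16M is less than half of 1024M it keeps the rounding, spending one 1024M WB register plus one 16M UC register. Only when the hole would waste at least half of range0 does the second_try path shrink range0 by one chunk and recompute.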
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
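The new bounds check out: gran_size takes the 16 power-of-two values from 64K (2^16) to 2G (2^31), and the k-th largest gran_size admits k chunk sizes (gran_size up to 2G), so the table needs

	1 + 2 + ... + 16 = 16 * 17 / 2 = 136

entries, matching NUM_RESULT and the (1+16)*8 shorthand in the comment.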
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 05cc22dbd4f..9abd48b2267 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h>
21
20#include <asm/apic.h> 22#include <asm/apic.h>
21#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
22 24
@@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 297 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 298 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 300
301 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 303 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 304 wd->cccr_msr = 0; /* unused */
305
306 /* ok, everything is initialized, announce that we're set */
307 cpu_nmi_set_wd_enabled();
308
309 apic_write(APIC_LVTPC, APIC_DM_NMI);
310 evntsel |= K7_EVNTSEL_ENABLE;
311 wrmsr(evntsel_msr, evntsel, 0);
312
305 return 1; 313 return 1;
306} 314}
307 315
@@ -330,7 +338,8 @@ static void single_msr_unreserve(void)
330 release_perfctr_nmi(wd_ops->perfctr); 338 release_perfctr_nmi(wd_ops->perfctr);
331} 339}
332 340
333static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 341static void __kprobes
342single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
334{ 343{
335 /* start the cycle over again */ 344 /* start the cycle over again */
336 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); 345 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 388 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 389 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 391
392 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 393 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 394 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 395 wd->cccr_msr = 0; /* unused */
396
397 /* ok, everything is initialized, announce that we're set */
398 cpu_nmi_set_wd_enabled();
399
400 apic_write(APIC_LVTPC, APIC_DM_NMI);
401 evntsel |= P6_EVNTSEL0_ENABLE;
402 wrmsr(evntsel_msr, evntsel, 0);
403
389 return 1; 404 return 1;
390} 405}
391 406
392static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 407static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
393{ 408{
394 /* 409 /*
395 * P6 based Pentium M need to re-unmask 410 * P6 based Pentium M need to re-unmask
@@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 447#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 448#define P4_CCCR_OVF (1 << 31)
434 449
450#define P4_CONTROLS 18
451static unsigned int p4_controls[18] = {
452 MSR_P4_BPU_CCCR0,
453 MSR_P4_BPU_CCCR1,
454 MSR_P4_BPU_CCCR2,
455 MSR_P4_BPU_CCCR3,
456 MSR_P4_MS_CCCR0,
457 MSR_P4_MS_CCCR1,
458 MSR_P4_MS_CCCR2,
459 MSR_P4_MS_CCCR3,
460 MSR_P4_FLAME_CCCR0,
461 MSR_P4_FLAME_CCCR1,
462 MSR_P4_FLAME_CCCR2,
463 MSR_P4_FLAME_CCCR3,
464 MSR_P4_IQ_CCCR0,
465 MSR_P4_IQ_CCCR1,
466 MSR_P4_IQ_CCCR2,
467 MSR_P4_IQ_CCCR3,
468 MSR_P4_IQ_CCCR4,
469 MSR_P4_IQ_CCCR5,
470};
435/* 471/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 472 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 473 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,6 +509,26 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 509 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 510 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 511 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
512
513 /*
514 * If we're on the kdump kernel or other situation, we may
515 * still have other performance counter registers set to
516 * interrupt and they'll keep interrupting forever because
517 * of the P4_CCCR_OVF quirk. So we need to ACK all the
518 * pending interrupts and disable all the registers here,
519 * before reenabling the NMI delivery. Refer to p4_rearm()
520 * about the P4_CCCR_OVF quirk.
521 */
522 if (reset_devices) {
523 unsigned int low, high;
524 int i;
525
526 for (i = 0; i < P4_CONTROLS; i++) {
527 rdmsr(p4_controls[i], low, high);
528 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
529 wrmsr(p4_controls[i], low, high);
530 }
531 }
476 } else { 532 } else {
477 /* logical cpu 1 */ 533 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 534 perfctr_msr = MSR_P4_IQ_PERFCTR1;
@@ -499,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
499 wrmsr(evntsel_msr, evntsel, 0); 555 wrmsr(evntsel_msr, evntsel, 0);
500 wrmsr(cccr_msr, cccr_val, 0); 556 wrmsr(cccr_msr, cccr_val, 0);
501 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 557 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
502 apic_write(APIC_LVTPC, APIC_DM_NMI); 558
503 cccr_val |= P4_CCCR_ENABLE;
504 wrmsr(cccr_msr, cccr_val, 0);
505 wd->perfctr_msr = perfctr_msr; 559 wd->perfctr_msr = perfctr_msr;
506 wd->evntsel_msr = evntsel_msr; 560 wd->evntsel_msr = evntsel_msr;
507 wd->cccr_msr = cccr_msr; 561 wd->cccr_msr = cccr_msr;
562
563 /* ok, everything is initialized, announce that we're set */
564 cpu_nmi_set_wd_enabled();
565
566 apic_write(APIC_LVTPC, APIC_DM_NMI);
567 cccr_val |= P4_CCCR_ENABLE;
568 wrmsr(cccr_msr, cccr_val, 0);
508 return 1; 569 return 1;
509} 570}
510 571
@@ -547,7 +608,7 @@ static void p4_unreserve(void)
547 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); 608 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
548} 609}
549 610
550static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 611static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
551{ 612{
552 unsigned dummy; 613 unsigned dummy;
553 /* 614 /*
@@ -620,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
620 wrmsr(evntsel_msr, evntsel, 0); 681 wrmsr(evntsel_msr, evntsel, 0);
621 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 682 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
622 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 683 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
623 apic_write(APIC_LVTPC, APIC_DM_NMI);
624 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
625 wrmsr(evntsel_msr, evntsel, 0);
626 684
627 wd->perfctr_msr = perfctr_msr; 685 wd->perfctr_msr = perfctr_msr;
628 wd->evntsel_msr = evntsel_msr; 686 wd->evntsel_msr = evntsel_msr;
629 wd->cccr_msr = 0; /* unused */ 687 wd->cccr_msr = 0; /* unused */
688
689 /* ok, everything is initialized, announce that we're set */
690 cpu_nmi_set_wd_enabled();
691
692 apic_write(APIC_LVTPC, APIC_DM_NMI);
693 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
694 wrmsr(evntsel_msr, evntsel, 0);
630 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 695 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
631 return 1; 696 return 1;
632} 697}
@@ -722,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
722 return hz; 787 return hz;
723} 788}
724 789
725int lapic_wd_event(unsigned nmi_hz) 790int __kprobes lapic_wd_event(unsigned nmi_hz)
726{ 791{
727 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 792 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
728 u64 ctr; 793 u64 ctr;
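Every setup_*_watchdog() path in this file gets the same reordering: the wd state is fully written and cpu_nmi_set_wd_enabled() is called before the LVTPC entry is pointed at NMI and the counter is enabled, so an NMI arriving immediately can never see a half-initialized control block; the functions on the NMI path (the *_rearm handlers and lapic_wd_event) are additionally marked __kprobes to keep probes off that path. Schematically (ENABLE_BIT stands in for the per-family flag such as K7_EVNTSEL_ENABLE; a sketch, not any one function verbatim):

	/* 1. program the counter, leaving it disabled */
	wrmsr(evntsel_msr, evntsel, 0);
	write_watchdog_counter(perfctr_msr, "PERFCTR0", nmi_hz);

	/* 2. publish everything the NMI handler will read */
	wd->perfctr_msr = perfctr_msr;
	wd->evntsel_msr = evntsel_msr;
	wd->cccr_msr = 0;
	cpu_nmi_set_wd_enabled();

	/* 3. only now route PMU overflow to NMI and enable the counter */
	apic_write(APIC_LVTPC, APIC_DM_NMI);
	evntsel |= ENABLE_BIT;
	wrmsr(evntsel_msr, evntsel, 0);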
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 00000000000..5abbea297e0
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index a26c480b949..01b1244ef1c 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < nr_cpu_ids && cpu_online(*pos)) 163 else
164 *pos = next_cpu_nr(*pos - 1, cpu_online_map);
165 if ((*pos) < nr_cpu_ids)
164 return &cpu_data(*pos); 166 return &cpu_data(*pos);
165 return NULL; 167 return NULL;
166} 168}
167 169
168static void *c_next(struct seq_file *m, void *v, loff_t *pos) 170static void *c_next(struct seq_file *m, void *v, loff_t *pos)
169{ 171{
170 *pos = next_cpu(*pos, cpu_online_map); 172 (*pos)++;
171 return c_start(m, pos); 173 return c_start(m, pos);
172} 174}
173 175
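The proc.c change moves all position validation into c_start(): c_next() now only bumps *pos and re-enters c_start(), which maps the position onto the next online CPU (or NULL at the end). That is the usual seq_file iterator contract. A runnable userspace analogue of the pattern over a plain array (illustrative only, not the seq_file API itself):

#include <stdio.h>

static int data[] = { 10, 20, 30 };
#define N ((long)(sizeof(data) / sizeof(data[0])))

/* c_start: map a position to an element, or NULL past the end */
static int *c_start(long *pos)
{
	return (*pos < N) ? &data[*pos] : NULL;
}

/* c_next: only advance the cursor; c_start revalidates */
static int *c_next(long *pos)
{
	++*pos;
	return c_start(pos);
}

int main(void)
{
	long pos = 0;
	for (int *v = c_start(&pos); v; v = c_next(&pos))
		printf("%d\n", *v);
	return 0;
}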
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8..52b3fefbd5a 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d7..e777f79e096 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8e9cd6a8ec1..72cefd1e649 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -148,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu)
148{ 147{
149 struct device *dev; 148 struct device *dev;
150 149
151 dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 150 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
152 NULL, "cpu%d", cpu); 151 "cpu%d", cpu);
153 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 152 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
154} 153}
155 154
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b4..f7cdb3b457a 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
13 13
14static void *kdump_buf_page; 14static void *kdump_buf_page;
15 15
16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18
16/** 19/**
17 * copy_oldmem_page - copy one page from "oldmem" 20 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 21 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a4..045b36cada6 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,11 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10#include <linux/uaccess.h>
11#include <linux/io.h>
10 12
11#include <asm/uaccess.h> 13/* Stores the physical address of elf header of crash image. */
12#include <asm/io.h> 14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
13 15
14/** 16/**
15 * copy_oldmem_page - copy one page from "oldmem" 17 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +27,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 28 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 29ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 30 size_t csize, unsigned long offset, int userbuf)
29{ 31{
30 void *vaddr; 32 void *vaddr;
31 33
@@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 35 return 0;
34 36
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr)
39 return -ENOMEM;
36 40
37 if (userbuf) { 41 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 42 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 43 iounmap(vaddr);
40 return -EFAULT; 44 return -EFAULT;
41 } 45 }
42 } else 46 } else
43 memcpy(buf, (vaddr + offset), csize); 47 memcpy(buf, vaddr + offset, csize);
44 48
45 iounmap(vaddr); 49 iounmap(vaddr);
46 return csize; 50 return csize;
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f0..b4f14c6c09d 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48..2b69994fd3a 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
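ds_get()/ds_set() address the DS save area as two groups of four pointer-sized fields, BTS first, then PEBS, so a field's byte offset is sizeof_field * (field + 4 * qual). For example, assuming sizeof_field = 8 (the 64-bit layout; an assumption for the arithmetic), the PEBS index field sits at 8 * (ds_index + 4 * ds_pebs) = 8 * (1 + 4) = 40 bytes into the area, while the BTS buffer base is at offset 0.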
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
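The tracers count encodes the allocation type in its sign: get_tracer() adds +1 for a per-thread tracer (task != NULL) and -1 for a per-cpu one, and check_tracer() admits a new per-thread tracer only while tracers >= 0 and a new per-cpu tracer only while tracers <= 0. So after two per-thread requests tracers == 2 and a per-cpu request is refused until both are released; mixing the two kinds is thereby excluded without tracking them separately.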
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
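The alignment fixup in ds_request() explains the size check at the top: at most alignment - 1 bytes are lost at the front and sizeof_rec - 1 at the back, so any buffer of at least alignment + sizeof_rec bytes still holds one record. With hypothetical numbers base = 0x1003, size = 100, alignment = 8 and sizeof_rec = 24: adj = ALIGN(0x1003, 8) - 0x1003 = 5, giving buffer = 0x1008 and size = 95, which then rounds down to 95 / 24 * 24 = 72 bytes, i.e. room for three records. The interrupt threshold is parked at buffer + size + 1, past the end, so no overflow interrupt can fire while the callback path is still unimplemented.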
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
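
/*
 * A minimal user-space sketch of the index arithmetic used by
 * ds_get_index() above; the names and example values are illustrative
 * only.  The DS index always sits on a record boundary, so the byte
 * offset divides evenly:
 */
#include <assert.h>

static unsigned long record_index(unsigned long base, unsigned long index,
				  unsigned long sizeof_rec)
{
	return (index - base) / sizeof_rec;
}

static void record_index_example(void)
{
	/* base 0x1000, index 0x1030, 24-byte 64-bit BTS records -> 2 */
	assert(record_index(0x1000, 0x1030, 24) == 2);
}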
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
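
/*
 * A stand-alone sketch of the circular, record-aligned write that
 * ds_write() performs above, assuming a buffer whose length is a
 * multiple of the record size (ds_request() truncates the buffer to a
 * record multiple, so that holds for the real DS area).  Partial
 * trailing records are zero-padded and the index wraps to the base:
 */
#include <string.h>

struct ring {
	char *base, *index, *end;	/* end == absolute maximum */
	size_t sizeof_rec;
};

static size_t ring_write(struct ring *r, const void *rec, size_t size)
{
	size_t room = r->end - r->index;
	size_t n = size < room ? size : room;
	/* pad a partial record up to a full one */
	size_t adj = (n + r->sizeof_rec - 1) / r->sizeof_rec * r->sizeof_rec;

	memcpy(r->index, rec, n);
	memset(r->index + n, 0, adj - n);	/* zero out trailing bytes */

	r->index += adj;
	if (r->index >= r->end)
		r->index = r->base;		/* wrap around */
	return n;
}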
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out); 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* __i386__ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
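
/*
 * For orientation: with the configurations above a BTS record is three
 * pointer-sized fields and a PEBS record is ten.  A sketch of the BTS
 * record layout (the field names are descriptive, not taken from this
 * patch):
 */
struct bts_record_sketch {
	unsigned long from_ip;	/* branch source */
	unsigned long to_ip;	/* branch destination */
	unsigned long flags;	/* extra info, e.g. the predicted bit */
};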
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* __i386__ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* __i386__ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any users of that context left
859 * to disturb us anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 00000000000..b3614752197
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 8
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
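
/*
 * A minimal sketch of the frame-pointer walk that print_context_stack()
 * performs above.  Each frame begins with the saved previous frame
 * pointer followed by the return address, which is why a word found at
 * bp + sizeof(long) is reported as reliable.  The sketch assumes a
 * well-formed, NULL-terminated frame chain:
 */
static void walk_frames(struct stack_frame *frame,
			void (*emit)(unsigned long addr, int reliable))
{
	while (frame && frame->return_address) {
		emit(frame->return_address, 1);	/* from a real frame */
		frame = frame->next_frame;
	}
}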
78
79void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data)
82{
83 if (!task)
84 task = current;
85
86 if (!stack) {
87 unsigned long dummy;
88 stack = &dummy;
89 if (task && task != current)
90 stack = (unsigned long *)task->thread.sp;
91 }
92
93#ifdef CONFIG_FRAME_POINTER
94 if (!bp) {
95 if (task == current) {
96 /* Grab bp right from our regs */
97 get_bp(bp);
98 } else {
99 /* bp is the last reg pushed by switch_to */
100 bp = *(unsigned long *) task->thread.sp;
101 }
102 }
103#endif
104
105 for (;;) {
106 struct thread_info *context;
107
108 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL);
111
112 stack = (unsigned long *)context->previous_esp;
113 if (!stack)
114 break;
115 if (ops->stack(data, "IRQ") < 0)
116 break;
117 touch_nmi_watchdog();
118 }
119}
120EXPORT_SYMBOL(dump_trace);
121
122static void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entry per line.
143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl)
175{
176 unsigned long *stack;
177 int i;
178
179 if (sp == NULL) {
180 if (task)
181 sp = (unsigned long *)task->thread.sp;
182 else
183 sp = (unsigned long *)&sp;
184 }
185
186 stack = sp;
187 for (i = 0; i < kstack_depth_to_print; i++) {
188 if (kstack_end(stack))
189 break;
190 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
191 printk("\n%s", log_lvl);
192 printk(" %08lx", *stack++);
193 touch_nmi_watchdog();
194 }
195 printk("\n");
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197}
198
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226
227void show_registers(struct pt_regs *regs)
228{
229 int i;
230
231 print_modules();
232 __show_regs(regs, 0);
233
234 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
235 TASK_COMM_LEN, current->comm, task_pid_nr(current),
236 current_thread_info(), current, task_thread_info(current));
237 /*
238 * When in-kernel, we also print out the stack and code at the
239 * time of the fault.
240 */
241 if (!user_mode_vm(regs)) {
242 unsigned int code_prologue = code_bytes * 43 / 64;
243 unsigned int code_len = code_bytes;
244 unsigned char c;
245 u8 *ip;
246
247 printk(KERN_EMERG "Stack:\n");
248 show_stack_log_lvl(NULL, regs, &regs->sp,
249 0, KERN_EMERG);
250
251 printk(KERN_EMERG "Code: ");
252
253 ip = (u8 *)regs->ip - code_prologue;
254 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
255 /* try starting at IP */
256 ip = (u8 *)regs->ip;
257 code_len = code_len - code_prologue + 1;
258 }
259 for (i = 0; i < code_len; i++, ip++) {
260 if (ip < (u8 *)PAGE_OFFSET ||
261 probe_kernel_address(ip, c)) {
262 printk(" Bad EIP value.");
263 break;
264 }
265 if (ip == (u8 *)regs->ip)
266 printk("<%02x> ", c);
267 else
268 printk("%02x ", c);
269 }
270 }
271 printk("\n");
272}
273
274int is_valid_bugaddr(unsigned long ip)
275{
276 unsigned short ud2;
277
278 if (ip < PAGE_OFFSET)
279 return 0;
280 if (probe_kernel_address((unsigned short *)ip, ud2))
281 return 0;
282
283 return ud2 == 0x0b0f;
284}
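
/*
 * The 0x0b0f constant above is the two-byte UD2 opcode (0x0f 0x0b)
 * read as a little-endian 16-bit value.  A stand-alone check of the
 * same comparison:
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void ud2_example(void)
{
	const unsigned char insn[2] = { 0x0f, 0x0b };	/* UD2 */
	uint16_t v;

	memcpy(&v, insn, sizeof(v));
	assert(v == 0x0b0f);	/* holds on little-endian x86 */
}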
285
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, let's at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in the kernel we are probably nested up pretty badly
411 * and might as well get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 00000000000..96a5db7da8a
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 4
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp)
36{
37 static char ids[][8] = {
38 [DEBUG_STACK - 1] = "#DB",
39 [NMI_STACK - 1] = "NMI",
40 [DOUBLEFAULT_STACK - 1] = "#DF",
41 [STACKFAULT_STACK - 1] = "#SS",
42 [MCE_STACK - 1] = "#MC",
43#if DEBUG_STKSZ > EXCEPTION_STKSZ
44 [N_EXCEPTION_STACKS ...
45 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
46#endif
47 };
48 unsigned k;
49
50 /*
51 * Iterate over all exception stacks, and figure out whether
52 * 'stack' is in one of them:
53 */
54 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
55 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
56 /*
57 * Is 'stack' above this exception frame's end?
58 * If yes then skip to the next frame.
59 */
60 if (stack >= end)
61 continue;
62 /*
63 * Is 'stack' above this exception frame's start address?
64 * If yes then we found the right frame.
65 */
66 if (stack >= end - EXCEPTION_STKSZ) {
67 /*
68 * Make sure we only iterate through an exception
69 * stack once. If it comes up for the second time
70 * then there's something wrong going on - just
71 * break out and return NULL:
72 */
73 if (*usedp & (1U << k))
74 break;
75 *usedp |= 1U << k;
76 *idp = ids[k];
77 return (unsigned long *)end;
78 }
79 /*
80 * If this is a debug stack, and if it has a larger size than
81 * the usual exception stacks, then 'stack' might still
82 * be within the lower portion of the debug stack:
83 */
84#if DEBUG_STKSZ > EXCEPTION_STKSZ
85 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
86 unsigned j = N_EXCEPTION_STACKS - 1;
87
88 /*
89 * Black magic. A large debug stack is composed of
90 * multiple exception stack entries, which we
91 * iterate through now. Don't look:
92 */
93 do {
94 ++j;
95 end -= EXCEPTION_STKSZ;
96 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
97 } while (stack < end - EXCEPTION_STKSZ);
98 if (*usedp & (1U << j))
99 break;
100 *usedp |= 1U << j;
101 *idp = ids[j];
102 return (unsigned long *)end;
103 }
104#endif
105 }
106 return NULL;
107}
108
109/*
110 * x86-64 can have up to three kernel stacks:
111 * process stack
112 * interrupt stack
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
161void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data)
164{
165 const unsigned cpu = get_cpu();
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0;
168 struct thread_info *tinfo;
169
170 if (!task)
171 task = current;
172
173 if (!stack) {
174 unsigned long dummy;
175 stack = &dummy;
176 if (task && task != current)
177 stack = (unsigned long *)task->thread.sp;
178 }
179
180#ifdef CONFIG_FRAME_POINTER
181 if (!bp) {
182 if (task == current) {
183 /* Grab bp right from our regs */
184 get_bp(bp);
185 } else {
186 /* bp is the last reg pushed by switch_to */
187 bp = *(unsigned long *) task->thread.sp;
188 }
189 }
190#endif
191
192 /*
193 * Print function call entries in all stacks, starting at the
194 * current stack address. If the stacks consist of nested
195 * exceptions, follow the links from one stack to the next.
196 */
197 tinfo = task_thread_info(task);
198 for (;;) {
199 char *id;
200 unsigned long *estack_end;
201 estack_end = in_exception_stack(cpu, (unsigned long)stack,
202 &used, &id);
203
204 if (estack_end) {
205 if (ops->stack(data, id) < 0)
206 break;
207
208 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end);
210 ops->stack(data, "<EOE>");
211 /*
212 * We link to the next stack via the
213 * second-to-last pointer (index -2 to end) in the
214 * exception stack:
215 */
216 stack = (unsigned long *) estack_end[-2];
217 continue;
218 }
219 if (irqstack_end) {
220 unsigned long *irqstack;
221 irqstack = irqstack_end -
222 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
223
224 if (stack >= irqstack && stack < irqstack_end) {
225 if (ops->stack(data, "IRQ") < 0)
226 break;
227 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end);
229 /*
230 * We link to the next stack (which would be
231 * the process stack normally) the last
232 * pointer (index -1 to end) in the IRQ stack:
233 */
234 stack = (unsigned long *) (irqstack_end[-1]);
235 irqstack_end = NULL;
236 ops->stack(data, "EOI");
237 continue;
238 }
239 }
240 break;
241 }
242
243 /*
244 * This handles the process stack:
245 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
247 put_cpu();
248}
249EXPORT_SYMBOL(dump_trace);
250
251static void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entry per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl)
304{
305 unsigned long *stack;
306 int i;
307 const int cpu = smp_processor_id();
308 unsigned long *irqstack_end =
309 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
310 unsigned long *irqstack =
311 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
312
313 /*
314 * debugging aid: "show_stack(NULL, NULL);" prints the
315 * back trace for this cpu.
316 */
317
318 if (sp == NULL) {
319 if (task)
320 sp = (unsigned long *)task->thread.sp;
321 else
322 sp = (unsigned long *)&sp;
323 }
324
325 stack = sp;
326 for (i = 0; i < kstack_depth_to_print; i++) {
327 if (stack >= irqstack && stack <= irqstack_end) {
328 if (stack == irqstack_end) {
329 stack = (unsigned long *) (irqstack_end[-1]);
330 printk(" <EOI> ");
331 }
332 } else {
333 if (((long) stack & (THREAD_SIZE-1)) == 0)
334 break;
335 }
336 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
337 printk("\n%s", log_lvl);
338 printk(" %016lx", *stack++);
339 touch_nmi_watchdog();
340 }
341 printk("\n");
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343}
344
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs)
373{
374 int i;
375 unsigned long sp;
376 const int cpu = smp_processor_id();
377 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
378
379 sp = regs->sp;
380 printk("CPU %d ", cpu);
381 __show_regs(regs, 1);
382 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
383 cur->comm, cur->pid, task_thread_info(cur), cur);
384
385 /*
386 * When in-kernel, we also print out the stack and code at the
387 * time of the fault.
388 */
389 if (!user_mode(regs)) {
390 unsigned int code_prologue = code_bytes * 43 / 64;
391 unsigned int code_len = code_bytes;
392 unsigned char c;
393 u8 *ip;
394
395 printk(KERN_EMERG "Stack:\n");
396 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
397 regs->bp, KERN_EMERG);
398
399 printk(KERN_EMERG "Code: ");
400
401 ip = (u8 *)regs->ip - code_prologue;
402 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
403 /* try starting at IP */
404 ip = (u8 *)regs->ip;
405 code_len = code_len - code_prologue + 1;
406 }
407 for (i = 0; i < code_len; i++, ip++) {
408 if (ip < (u8 *)PAGE_OFFSET ||
409 probe_kernel_address(ip, c)) {
410 printk(" Bad RIP value.");
411 break;
412 }
413 if (ip == (u8 *)regs->ip)
414 printk("<%02x> ", c);
415 else
416 printk("%02x ", c);
417 }
418 }
419 printk("\n");
420}
421
422int is_valid_bugaddr(unsigned long ip)
423{
424 unsigned short ud2;
425
426 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
427 return 0;
428
429 return ud2 == 0x0b0f;
430}
431
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops; should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reached zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531 * We are in trouble anyway, let's at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 66e48aa2dd1..ce97bf3bed1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
152 printk(KERN_CONT "(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1260 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1261 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1262 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1263 default: return "reserved"; 1267 default: return "reserved";
1264 } 1268 }
1265} 1269}
@@ -1267,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1267/* 1271/*
1268 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1269 */ 1273 */
1274static struct resource __initdata *e820_res;
1270void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1271{ 1276{
1272 int i; 1277 int i;
@@ -1274,20 +1279,26 @@ void __init e820_reserve_resources(void)
1274 u64 end; 1279 u64 end;
1275 1280
1276 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1277 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1278 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1279#ifndef CONFIG_RESOURCES_64BIT 1285 if (end != (resource_size_t)end) {
1280 if (end > 0x100000000ULL) {
1281 res++; 1286 res++;
1282 continue; 1287 continue;
1283 } 1288 }
1284#endif
1285 res->name = e820_type_to_string(e820.map[i].type); 1289 res->name = e820_type_to_string(e820.map[i].type);
1286 res->start = e820.map[i].addr; 1290 res->start = e820.map[i].addr;
1287 res->end = end; 1291 res->end = end;
1288 1292
1289 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1293 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1290 insert_resource(&iomem_resource, res); 1294
1295 /*
1296 * don't register the region that could be conflicted with
1297 * pci device BAR resource and insert them later in
1298 * pcibios_resource_survey()
1299 */
1300 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1301 insert_resource(&iomem_resource, res);
1291 res++; 1302 res++;
1292 } 1303 }
1293 1304
@@ -1299,6 +1310,19 @@ void __init e820_reserve_resources(void)
1299 } 1310 }
1300} 1311}
1301 1312
1313void __init e820_reserve_resources_late(void)
1314{
1315 int i;
1316 struct resource *res;
1317
1318 res = e820_res;
1319 for (i = 0; i < e820.nr_map; i++) {
1320 if (!res->parent && res->end)
1321 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1322 res++;
1323 }
1324}
1325
1302char *__init default_machine_specific_memory_setup(void) 1326char *__init default_machine_specific_memory_setup(void)
1303{ 1327{
1304 char *who = "BIOS-e820"; 1328 char *who = "BIOS-e820";
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index 4353cf5e6fa..3ce029ffaa5 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -95,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func)
95 95
96} 96}
97 97
98#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
99static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
100{
101 u32 d;
102 u8 b;
103
104 b = read_pci_config_byte(num, slot, func, 0xac);
105 b &= ~(1<<5);
106 write_pci_config_byte(num, slot, func, 0xac, b);
107
108 d = read_pci_config(num, slot, func, 0x70);
109 d |= 1<<8;
110 write_pci_config(num, slot, func, 0x70, d);
111
112 d = read_pci_config(num, slot, func, 0x8);
113 d &= 0xff;
114 return d;
115}
116
117static void __init ati_bugs(int num, int slot, int func)
118{
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you got timer trouble "
139 "try acpi_use_timer_override\n");
140 }
141}
142
143static u32 __init ati_sbx00_rev(int num, int slot, int func)
144{
145 u32 old, d;
146
147 d = read_pci_config(num, slot, func, 0x70);
148 old = d;
149 d &= ~(1<<8);
150 write_pci_config(num, slot, func, 0x70, d);
151 d = read_pci_config(num, slot, func, 0x8);
152 d &= 0xff;
153 write_pci_config(num, slot, func, 0x70, old);
154
155 return d;
156}
157
158static void __init ati_bugs_contd(int num, int slot, int func)
159{
160 u32 d, rev;
161
162 if (acpi_use_timer_override)
163 return;
164
165 rev = ati_sbx00_rev(num, slot, func);
166 if (rev > 0x13)
167 return;
168
169 /* check for IRQ0 interrupt swap */
170 d = read_pci_config(num, slot, func, 0x64);
171 if (!(d & (1<<14)))
172 acpi_skip_timer_override = 1;
173
174 if (acpi_skip_timer_override) {
175 printk(KERN_INFO "SB600 revision 0x%x\n", rev);
176 printk(KERN_INFO "Ignoring ACPI timer override.\n");
177 printk(KERN_INFO "If you got timer trouble "
178 "try acpi_use_timer_override\n");
179 }
180}
181#else
182static void __init ati_bugs(int num, int slot, int func)
183{
184}
185
186static void __init ati_bugs_contd(int num, int slot, int func)
187{
188}
189#endif
190
191#ifdef CONFIG_DMAR
192static void __init intel_g33_dmar(int num, int slot, int func)
193{
194 struct acpi_table_header *dmar_tbl;
195 acpi_status status;
196
197 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
198 if (ACPI_SUCCESS(status)) {
199 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
200 dmar_disabled = 1;
201 }
202}
203#endif
204
98#define QFLAG_APPLY_ONCE 0x1 205#define QFLAG_APPLY_ONCE 0x1
99#define QFLAG_APPLIED 0x2 206#define QFLAG_APPLIED 0x2
100#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 207#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -114,6 +221,14 @@ static struct chipset early_qrk[] __initdata = {
114 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 221 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
115 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 222 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
116 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 223 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
224 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
225 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
226 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
227 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
228#ifdef CONFIG_DMAR
229 { PCI_VENDOR_ID_INTEL, 0x29c0,
230 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
231#endif
117 {} 232 {}
118}; 233};
119 234
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da5..34ad997d383 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
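
/*
 * An illustration of the DATA-toggle trick used by dbgp_pid_update()
 * above: XOR-ing a data PID with USB_PID_DATA_TOGGLE (0x88) flips
 * DATA0 (0xc3) to DATA1 (0x4b) and back, and DBGP_DATA_TOGGLE (0x8800)
 * applies the same flip to the data-PID byte of the pids register:
 */
#include <assert.h>

static void pid_toggle_example(void)
{
	assert((0xc3 ^ 0x88) == 0x4b);	/* DATA0 -> DATA1 */
	assert((0x4b ^ 0x88) == 0xc3);	/* DATA1 -> DATA0 */
}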
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or has dropped data,
285 * start pacing ourselves; not necessary, but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290 /* If we get a NAK, reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
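
/*
 * dbgp_set_data()/dbgp_get_data() above marshal up to eight bytes
 * through the two 32-bit debug-port data registers, least-significant
 * byte first.  A stand-alone round trip of the same packing:
 */
#include <assert.h>
#include <stdint.h>

static void dbgp_pack_example(void)
{
	const uint8_t bytes[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	uint32_t lo = 0, hi = 0;
	int i;

	for (i = 0; i < 4; i++)
		lo |= (uint32_t)bytes[i] << (8 * i);
	for (; i < 8; i++)
		hi |= (uint32_t)bytes[i] << (8 * (i - 4));

	assert(lo == 0x04030201);
	assert(hi == 0x08070605);
}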
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
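
/*
 * find_cap() above is the standard PCI capability-list walk, done with
 * the early (type 1) config-space accessors.  A hedged usage sketch --
 * the helper name and return value are illustrative, but this mirrors
 * how __find_dbgp() below feeds ehci_setup():
 */
static u32 __init dbgp_cap_example(u32 bus, u32 slot, u32 func)
{
	u32 cap = find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);

	if (cap)
		/* per the EHCI debug-capability layout, this word
		 * encodes the BAR and offset of the debug registers */
		return read_pci_config(bus, slot, func, cap);
	return 0;
}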
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524 /* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664 dbgp_printk("ehci can be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687 dbgp_printk("debug ported enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746 dbgp_printk("small write doned\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = (debug_port % n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
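/*
 * Illustrative decode, not from this patch (made-up register value):
 * the EHCI debug capability register packs a BAR number in bits 31:29
 * and the register offset in bits 27:16, matching the arithmetic above.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t debug_port = (1u << 29) | (0x0a0u << 16);	/* BAR# 1, offset 0xa0 */
	uint32_t bar = ((debug_port >> 29) & 0x7) * 4 + 0xc;
	uint32_t offset = (debug_port >> 16) & 0xfff;

	/* BAR# 1 decodes to config offset 0x10, i.e. PCI_BASE_ADDRESS_0 */
	printf("bar: %02x offset: %03x\n", bar, offset);
	return 0;
}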
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
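	/*
	 * Illustrative sketch, not from this patch (hypothetical addresses,
	 * 4K pages assumed): the fixmap maps whole pages, so the sub-page
	 * part of bar_val has to be added back to the fixmap virtual
	 * address, as done above.
	 */
	#include <stdio.h>

	#define DEMO_PAGE_MASK	(~0xfffUL)	/* assumes PAGE_SIZE == 4096 */

	int main(void)
	{
		unsigned long bar_val   = 0xfebff400UL;	/* hypothetical 32-bit MMIO BAR */
		unsigned long fixmap_va = 0xffd00000UL;	/* hypothetical fixmap slot */
		unsigned long ehci_bar  = fixmap_va + (bar_val & ~DEMO_PAGE_MASK);

		printf("map phys %#lx -> virt %#lx, ehci_bar %#lx\n",
		       bar_val & DEMO_PAGE_MASK, fixmap_va, ehci_bar);
		return 0;
	}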
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
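/*
 * Illustrative sketch, not from this patch: the write path above splits
 * arbitrary console output into DBGP_MAX_PACKET-byte bulk transfers
 * (8 bytes on the EHCI debug port), so a 20-byte message goes out as
 * 8 + 8 + 4:
 */
#include <stdio.h>

#define DEMO_MAX_PACKET 8

int main(void)
{
	unsigned int n = 20, chunk;

	while (n > 0) {
		chunk = n > DEMO_MAX_PACKET ? DEMO_MAX_PACKET : n;
		printf("bulk write of %u bytes\n", chunk);
		n -= chunk;
	}
	return 0;
}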
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973 * usb subsys will reset ehci controller, so don't keep
974 * that early console
975 */
976 keep_early = 0;
977#endif
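/*
 * Usage note: with this hunk applied, booting with "earlyprintk=dbgp"
 * selects the EHCI debug-port console; a trailing digit, e.g.
 * "earlyprintk=dbgp1", picks the Nth debug-capable EHCI controller via
 * find_dbgp(). keep_early is forced off here because the USB subsystem
 * will later reset the controller.
 */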
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b..1119d247fe1 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369 } else if (!efi_guidcmp(config_tables[i].guid, 369 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table);
373 } else if (!efi_guidcmp(config_tables[i].guid,
370 HCDP_TABLE_GUID)) { 374 HCDP_TABLE_GUID)) {
371 efi.hcdp = config_tables[i].table; 375 efi.hcdp = config_tables[i].table;
372 printk(" HCDP=0x%lx ", config_tables[i].table); 376 printk(" HCDP=0x%lx ", config_tables[i].table);
@@ -414,9 +418,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 418 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 419 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 420 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
421
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 422 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 423 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 424 "Kernel-defined memdesc doesn't match the one from EFI!\n");
425
420 if (add_efi_memmap) 426 if (add_efi_memmap)
421 do_add_efi_memmap(); 427 do_add_efi_memmap();
422 428
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 109792bc7cf..dd65143941a 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -629,7 +629,7 @@ ENTRY(interrupt)
629ENTRY(irq_entries_start) 629ENTRY(irq_entries_start)
630 RING0_INT_FRAME 630 RING0_INT_FRAME
631vector=0 631vector=0
632.rept NR_IRQS 632.rept NR_VECTORS
633 ALIGN 633 ALIGN
634 .if vector 634 .if vector
635 CFI_ADJUST_CFA_OFFSET -4 635 CFI_ADJUST_CFA_OFFSET -4
@@ -730,6 +730,7 @@ error_code:
730 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
731 movl %ecx, %ds 731 movl %ecx, %ds
732 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
733 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
734 call *%edi 735 call *%edi
735 jmp ret_from_exception 736 jmp ret_from_exception
@@ -760,20 +761,9 @@ ENTRY(device_not_available)
760 RING0_INT_FRAME 761 RING0_INT_FRAME
761 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
762 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
763 SAVE_ALL 764 pushl $do_device_not_available
764 GET_CR0_INTO_EAX
765 testl $0x4, %eax # EM (math emulation bit)
766 jne device_not_available_emulate
767 preempt_stop(CLBR_ANY)
768 call math_state_restore
769 jmp ret_from_exception
770device_not_available_emulate:
771 pushl $0 # temporary storage for ORIG_EIP
772 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
773 call math_emulate 766 jmp error_code
774 addl $4, %esp
775 CFI_ADJUST_CFA_OFFSET -4
776 jmp ret_from_exception
777 CFI_ENDPROC 767 CFI_ENDPROC
778END(device_not_available) 768END(device_not_available)
779 769
@@ -814,6 +804,7 @@ debug_stack_correct:
814 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
815 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
816 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
817 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
818 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
819 call do_debug 810 call do_debug
@@ -858,6 +849,7 @@ nmi_stack_correct:
858 pushl %eax 849 pushl %eax
859 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
860 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
861 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
862 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
863 call do_nmi 855 call do_nmi
@@ -898,6 +890,7 @@ nmi_espfix_stack:
898 pushl %eax 890 pushl %eax
899 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
900 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
901 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
902 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
903 call do_nmi 896 call do_nmi
@@ -928,6 +921,7 @@ KPROBE_ENTRY(int3)
928 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
929 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
930 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
931 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
932 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
933 call do_int3 927 call do_int3
@@ -1159,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
1159#ifdef CONFIG_DYNAMIC_FTRACE 1153#ifdef CONFIG_DYNAMIC_FTRACE
1160 1154
1161ENTRY(mcount) 1155ENTRY(mcount)
1162 pushl %eax
1163 pushl %ecx
1164 pushl %edx
1165 movl 0xc(%esp), %eax
1166 subl $MCOUNT_INSN_SIZE, %eax
1167
1168.globl mcount_call
1169mcount_call:
1170 call ftrace_stub
1171
1172 popl %edx
1173 popl %ecx
1174 popl %eax
1175
1176 ret 1156 ret
1177END(mcount) 1157END(mcount)
1178 1158
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 89434d43960..09e7145484c 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -64,32 +64,6 @@
64#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
65#ifdef CONFIG_DYNAMIC_FTRACE 65#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 66ENTRY(mcount)
67
68 subq $0x38, %rsp
69 movq %rax, (%rsp)
70 movq %rcx, 8(%rsp)
71 movq %rdx, 16(%rsp)
72 movq %rsi, 24(%rsp)
73 movq %rdi, 32(%rsp)
74 movq %r8, 40(%rsp)
75 movq %r9, 48(%rsp)
76
77 movq 0x38(%rsp), %rdi
78 subq $MCOUNT_INSN_SIZE, %rdi
79
80.globl mcount_call
81mcount_call:
82 call ftrace_stub
83
84 movq 48(%rsp), %r9
85 movq 40(%rsp), %r8
86 movq 32(%rsp), %rdi
87 movq 24(%rsp), %rsi
88 movq 16(%rsp), %rdx
89 movq 8(%rsp), %rcx
90 movq (%rsp), %rax
91 addq $0x38, %rsp
92
93 retq 67 retq
94END(mcount) 68END(mcount)
95 69
@@ -275,9 +249,9 @@ ENTRY(native_usergs_sysret64)
275ENTRY(ret_from_fork) 249ENTRY(ret_from_fork)
276 CFI_DEFAULT_STACK 250 CFI_DEFAULT_STACK
277 push kernel_eflags(%rip) 251 push kernel_eflags(%rip)
278 CFI_ADJUST_CFA_OFFSET 4 252 CFI_ADJUST_CFA_OFFSET 8
279 popf # reset kernel eflags 253 popf # reset kernel eflags
280 CFI_ADJUST_CFA_OFFSET -4 254 CFI_ADJUST_CFA_OFFSET -8
281 call schedule_tail 255 call schedule_tail
282 GET_THREAD_INFO(%rcx) 256 GET_THREAD_INFO(%rcx)
283 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
@@ -667,6 +641,13 @@ END(stub_rt_sigreturn)
667 SAVE_ARGS 641 SAVE_ARGS
668 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
669 pushq %rbp 643 pushq %rbp
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
670 CFI_ADJUST_CFA_OFFSET 8 651 CFI_ADJUST_CFA_OFFSET 8
671 CFI_REL_OFFSET rbp, 0 652 CFI_REL_OFFSET rbp, 0
672 movq %rsp,%rbp 653 movq %rsp,%rbp
@@ -932,6 +913,9 @@ END(spurious_interrupt)
932 .if \ist 913 .if \ist
933 movq %gs:pda_data_offset, %rbp 914 movq %gs:pda_data_offset, %rbp
934 .endif 915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
935 movq %rsp,%rdi 919 movq %rsp,%rdi
936 movq ORIG_RAX(%rsp),%rsi 920 movq ORIG_RAX(%rsp),%rsi
937 movq $-1,ORIG_RAX(%rsp) 921 movq $-1,ORIG_RAX(%rsp)
@@ -1058,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
1058 je error_kernelspace 1042 je error_kernelspace
1059error_swapgs: 1043error_swapgs:
1060 SWAPGS 1044 SWAPGS
1061error_sti: 1045error_sti:
1046 TRACE_IRQS_OFF
1062 movq %rdi,RDI(%rsp) 1047 movq %rdi,RDI(%rsp)
1063 CFI_REL_OFFSET rdi,RDI 1048 CFI_REL_OFFSET rdi,RDI
1064 movq %rsp,%rdi 1049 movq %rsp,%rdi
@@ -1232,7 +1217,7 @@ ENTRY(simd_coprocessor_error)
1232END(simd_coprocessor_error) 1217END(simd_coprocessor_error)
1233 1218
1234ENTRY(device_not_available) 1219ENTRY(device_not_available)
1235 zeroentry math_state_restore 1220 zeroentry do_device_not_available
1236END(device_not_available) 1221END(device_not_available)
1237 1222
1238 /* runs on exception stack */ 1223 /* runs on exception stack */
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 50189af14b8..f454c78fcef 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/apicdef.h> 41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h> 42#include <mach_mpparse.h>
44 43
45/* 44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
113#endif
114
115struct mip_reg {
116 unsigned long long off_0;
117 unsigned long long off_8;
118 unsigned long long off_10;
119 unsigned long long off_18;
120 unsigned long long off_20;
121 unsigned long long off_28;
122 unsigned long long off_30;
123 unsigned long long off_38;
124};
125
126#define MIP_SW_APIC 0x1020b
127#define MIP_FUNC(VALUE) (VALUE & 0xff)
128
129/*
46 * ES7000 Globals 130 * ES7000 Globals
47 */ 131 */
48 132
@@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi)
72 base += nr_ioapic_registers[i]; 156 base += nr_ioapic_registers[i];
73 } 157 }
74 158
75 if (!ioapic && (gsi < 16)) 159 if (!ioapic && (gsi < 16))
76 gsi += base; 160 gsi += base;
77 return gsi; 161 return gsi;
78} 162}
@@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr)
160} 244}
161 245
162#ifdef CONFIG_ACPI 246#ifdef CONFIG_ACPI
163int __init 247static unsigned long oem_addrX;
164find_unisys_acpi_oem_table(unsigned long *oem_addr) 248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
165{ 250{
166 struct acpi_table_header *header = NULL; 251 struct acpi_table_header *header = NULL;
167 int i = 0; 252 int i = 0;
168 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { 253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
169 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { 256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
170 struct oem_table *t = (struct oem_table *)header; 257 struct oem_table *t = (struct oem_table *)header;
171 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, 258
172 t->OEMTableSize); 259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
173 return 0; 265 return 0;
174 } 266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
175 } 268 }
176 return -1; 269 return -1;
177} 270}
271
272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
178#endif 279#endif
179 280
180static void 281static void
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fd..d073d981a73 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
11 11
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
15#include <linux/percpu.h> 16#include <linux/percpu.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/list.h> 18#include <linux/list.h>
18 19
19#include <asm/alternative.h>
20#include <asm/ftrace.h> 20#include <asm/ftrace.h>
21#include <asm/nops.h>
21 22
22 23
23/* Long is fine, even if it is only 4 bytes ;-) */ 24/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop; 25static unsigned long *ftrace_nop;
25 26
26union ftrace_code_union { 27union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 28 char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 61ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 62 unsigned char *new_code)
62{ 63{
63 unsigned replaced; 64 unsigned char replaced[MCOUNT_INSN_SIZE];
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68 65
69 /* 66 /*
70 * Note: Due to modules and __init, code can 67 * Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
72 * as well as code changing. 69 * as well as code changing.
73 * 70 *
74 * No real locking needed, this code is run through 71 * No real locking needed, this code is run through
75 * kstop_machine. 72 * kstop_machine, or before SMP starts.
76 */ 73 */
77 asm volatile ( 74 if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
78 "1: lock\n" 75 return 1;
79 " cmpxchg %3, (%2)\n" 76
80 " jnz 2f\n" 77 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
81 " movb %b4, 4(%2)\n" 78 return 2;
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93 79
94 if (replaced != old && replaced != new) 80 WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
95 faulted = 2; 81 MCOUNT_INSN_SIZE));
96 82
97 return faulted; 83 sync_core();
84
85 return 0;
98} 86}
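/*
 * Standalone sketch, not from this patch, of the new compare-then-patch
 * flow above: read back the current bytes, refuse to patch unless they
 * match the expected old code, then write the replacement. memcpy()
 * stands in for the __copy_{from,to}_user_inatomic() calls.
 */
#include <stdio.h>
#include <string.h>

#define INSN_SIZE 5

static int demo_modify(unsigned char *ip, const unsigned char *old_code,
		       const unsigned char *new_code)
{
	unsigned char replaced[INSN_SIZE];

	memcpy(replaced, ip, INSN_SIZE);	/* read the live bytes */
	if (memcmp(replaced, old_code, INSN_SIZE) != 0)
		return 2;			/* site no longer matches */
	memcpy(ip, new_code, INSN_SIZE);	/* patch it */
	return 0;
}

int main(void)
{
	unsigned char site[INSN_SIZE] = { 0xe8, 1, 2, 3, 4 };	/* "call" */
	const unsigned char old[INSN_SIZE] = { 0xe8, 1, 2, 3, 4 };
	const unsigned char nop[INSN_SIZE] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

	printf("patch: %d\n", demo_modify(site, old, nop));	/* 0 */
	printf("again: %d\n", demo_modify(site, old, nop));	/* 2 */
	return 0;
}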
99 87
100notrace int ftrace_update_ftrace_func(ftrace_func_t func) 88notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
112 100
113notrace int ftrace_mcount_set(unsigned long *data) 101notrace int ftrace_mcount_set(unsigned long *data)
114{ 102{
115 unsigned long ip = (long)(&mcount_call); 103 /* mcount is initialized as a nop */
116 unsigned long *addr = data; 104 *data = 0;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0; 105 return 0;
128} 106}
129 107
130int __init ftrace_dyn_arch_init(void *data) 108int __init ftrace_dyn_arch_init(void *data)
131{ 109{
132 const unsigned char *const *noptable = find_nop_table(); 110 extern const unsigned char ftrace_test_p6nop[];
133 111 extern const unsigned char ftrace_test_nop5[];
134 /* This is running in kstop_machine */ 112 extern const unsigned char ftrace_test_jmp[];
135 113 int faulted = 0;
136 ftrace_mcount_set(data);
137 114
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; 115 /*
116 * There is no good nop for all x86 archs.
117 * We will default to using the P6_NOP5, but first we
118 * will test to make sure that the nop will actually
119 * work on this CPU. If it faults, we will then
 120 * fall back to a less efficient 5 byte nop. If that fails
 121 * we just use a jmp as our nop. This isn't the most
 122 * efficient nop, but we cannot use a multi-part nop,
 123 * since we would then risk being preempted in the middle
 124 * of that nop; if tracing were then enabled, it might
125 * cause a system crash.
126 *
127 * TODO: check the cpuid to determine the best nop.
128 */
129 asm volatile (
130 "jmp ftrace_test_jmp\n"
131 /* This code needs to stay around */
132 ".section .text, \"ax\"\n"
133 "ftrace_test_jmp:"
134 "jmp ftrace_test_p6nop\n"
135 "nop\n"
136 "nop\n"
137 "nop\n" /* 2 byte jmp + 3 bytes */
138 "ftrace_test_p6nop:"
139 P6_NOP5
140 "jmp 1f\n"
141 "ftrace_test_nop5:"
142 ".byte 0x66,0x66,0x66,0x66,0x90\n"
143 "jmp 1f\n"
144 ".previous\n"
145 "1:"
146 ".section .fixup, \"ax\"\n"
147 "2: movl $1, %0\n"
148 " jmp ftrace_test_nop5\n"
149 "3: movl $2, %0\n"
150 " jmp 1b\n"
151 ".previous\n"
152 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
153 _ASM_EXTABLE(ftrace_test_nop5, 3b)
154 : "=r"(faulted) : "0" (faulted));
155
156 switch (faulted) {
157 case 0:
158 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
159 ftrace_nop = (unsigned long *)ftrace_test_p6nop;
160 break;
161 case 1:
162 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
163 ftrace_nop = (unsigned long *)ftrace_test_nop5;
164 break;
165 case 2:
166 pr_info("ftrace: converting mcount calls to jmp . + 5\n");
167 ftrace_nop = (unsigned long *)ftrace_test_jmp;
168 break;
169 }
170
 171 /* The return code is returned via data */
172 *(unsigned long *)data = 0;
139 173
140 return 0; 174 return 0;
141} 175}
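/*
 * For reference, an illustrative sketch rather than part of the patch:
 * the three 5-byte NOP candidates probed above, as raw bytes. The jmp
 * fallback is an e9 rel32 with zero displacement, i.e. it just falls
 * through to the next instruction.
 */
#include <stdio.h>

int main(void)
{
	const unsigned char p6_nop5[5]  = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };
	const unsigned char nop5_66[5]  = { 0x66, 0x66, 0x66, 0x66, 0x90 };
	const unsigned char jmp_nop5[5] = { 0xe9, 0x00, 0x00, 0x00, 0x00 };
	int i;

	for (i = 0; i < 5; i++)
		printf("%02x %02x %02x\n",
		       p6_nop5[i], nop5_66[i], jmp_nop5[i]);
	return 0;
}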
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index eaff0bbb144..6c9bfc9e1e9 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,87 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
26#endif 27extern struct genapic apic_x2xpic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (max_physical_apicid < 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
102EXPORT_SYMBOL_GPL(is_uv_system);
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 786548a62d3..c0262791bda 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
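/*
 * Illustrative round-trip check, not from this patch: in xAPIC mode the
 * 8-bit APIC ID sits in bits 31:24 of the APIC_ID register, so the two
 * helpers above must be inverses for every 8-bit id.
 */
#include <assert.h>
#include <stdio.h>

static unsigned int demo_get(unsigned long x)
{
	return (x >> 24) & 0xFFu;
}

static unsigned long demo_set(unsigned int id)
{
	return (unsigned long)(id & 0xFFu) << 24;
}

int main(void)
{
	unsigned int id;

	for (id = 0; id < 256; id++)
		assert(demo_get(demo_set(id)) == id);
	printf("flat APIC id round-trip ok\n");
	return 0;
}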
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,23 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
 183 printk(KERN_DEBUG "system APIC can only use physical flat");
184 return 1;
185 }
186#endif
187
188 return 0;
189}
133 190
134static cpumask_t physflat_target_cpus(void) 191static cpumask_t physflat_target_cpus(void)
135{ 192{
@@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 233
177struct genapic apic_physflat = { 234struct genapic apic_physflat = {
178 .name = "physical flat", 235 .name = "physical flat",
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 237 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 238 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 239 .target_cpus = physflat_target_cpus,
@@ -185,6 +243,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 243 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 244 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 245 .send_IPI_mask = physflat_send_IPI_mask,
246 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 248 .phys_pkg_id = phys_pkg_id,
249 .get_apic_id = get_apic_id,
250 .set_apic_id = set_apic_id,
251 .apic_id_mask = (0xFFu<<24),
190}; 252};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 00000000000..f6a2c8eb48a
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
 54 * For now, we send the IPIs one by one in the cpumask.
 55 * TBD: based on the cpu mask, we could send the IPIs to a whole cluster
 56 * at once. We have 16 cpus in a cluster. This would minimize IPI register
57 * writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 00000000000..d042211768b
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index bfa837cb16b..680a06557c5 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,12 +12,12 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/sched.h> 17#include <linux/sched.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
@@ -26,6 +26,36 @@
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h> 27#include <asm/uv/bios.h>
28 28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
58
29DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
30EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
31 61
@@ -84,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
84 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
85 int pnode; 115 int pnode;
86 116
87 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
88 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
89 pnode = uv_apicid_to_pnode(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
90 val = 120 val =
@@ -123,6 +153,10 @@ static int uv_apic_id_registered(void)
123 return 1; 153 return 1;
124} 154}
125 155
156static void uv_init_apic_ldr(void)
157{
158}
159
126static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
127{ 161{
128 int cpu; 162 int cpu;
@@ -138,31 +172,59 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
138 return BAD_APICID; 172 return BAD_APICID;
139} 173}
140 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
 189 /* mask out x2apic_extra_bits ? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
141static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
142{ 201{
143 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
144} 203}
145 204
146#ifdef ZZZ /* Needs x2apic patch */
147static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
148{ 206{
149 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
150} 208}
151#endif
152 209
153struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
154 .name = "UV large system", 211 .name = "UV large system",
212 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
155 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
156 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
157 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
158 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
159 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
218 .init_apic_ldr = uv_init_apic_ldr,
160 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
161 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
162 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
163 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
164 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
165 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
225 .get_apic_id = get_apic_id,
226 .set_apic_id = set_apic_id,
227 .apic_id_mask = (0xFFFFFFFFu),
166}; 228};
167 229
168static __cpuinit void set_x2apic_extra_bits(int pnode) 230static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -222,12 +284,13 @@ static __init void map_low_mmrs(void)
222 284
223enum map_type {map_wb, map_uc}; 285enum map_type {map_wb, map_uc};
224 286
225static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
226{ 289{
227 unsigned long bytes, paddr; 290 unsigned long bytes, paddr;
228 291
229 paddr = base << shift; 292 paddr = base << shift;
230 bytes = (1UL << shift); 293 bytes = (1UL << shift) * (max_pnode + 1);
231 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
232 paddr + bytes); 295 paddr + bytes);
233 if (map_type == map_uc) 296 if (map_type == map_uc)
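/*
 * Illustrative arithmetic, not from this patch (hypothetical values):
 * the change above scales the mapped window by the number of pnodes
 * instead of mapping a single (1UL << shift) stride.
 */
#include <stdio.h>

int main(void)
{
	unsigned long base = 0x5UL, shift = 28;
	int max_pnode = 3;
	unsigned long paddr = base << shift;
	unsigned long bytes = (1UL << shift) * (max_pnode + 1);

	printf("UV: Map DEMO_HI 0x%lx - 0x%lx\n", paddr, paddr + bytes);
	return 0;
}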
@@ -243,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
243 306
244 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
245 if (gru.s.enable) 308 if (gru.s.enable)
246 map_high("GRU", gru.s.base, shift, map_wb); 309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
247} 310}
248 311
249static __init void map_config_high(int max_pnode) 312static __init void map_config_high(int max_pnode)
@@ -253,7 +316,7 @@ static __init void map_config_high(int max_pnode)
253 316
254 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); 317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
255 if (cfg.s.enable) 318 if (cfg.s.enable)
256 map_high("CONFIG", cfg.s.base, shift, map_uc); 319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
257} 320}
258 321
259static __init void map_mmr_high(int max_pnode) 322static __init void map_mmr_high(int max_pnode)
@@ -263,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
263 326
264 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
265 if (mmr.s.enable) 328 if (mmr.s.enable)
266 map_high("MMR", mmr.s.base, shift, map_uc); 329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
267} 330}
268 331
269static __init void map_mmioh_high(int max_pnode) 332static __init void map_mmioh_high(int max_pnode)
@@ -273,17 +336,17 @@ static __init void map_mmioh_high(int max_pnode)
273 336
274 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
275 if (mmioh.s.enable) 338 if (mmioh.s.enable)
276 map_high("MMIOH", mmioh.s.base, shift, map_uc); 339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
277} 340}
278 341
279static __init void uv_rtc_init(void) 342static __init void uv_rtc_init(void)
280{ 343{
281 long status, ticks_per_sec, drift; 344 long status;
345 u64 ticks_per_sec;
282 346
283 status = 347 status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
284 x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, 348 &ticks_per_sec);
285 &drift); 349 if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
286 if (status != 0 || ticks_per_sec < 100000) {
287 printk(KERN_WARNING 350 printk(KERN_WARNING
288 "unable to determine platform RTC clock frequency, " 351 "unable to determine platform RTC clock frequency, "
289 "guessing.\n"); 352 "guessing.\n");
@@ -293,7 +356,22 @@ static __init void uv_rtc_init(void)
293 sn_rtc_cycles_per_second = ticks_per_sec; 356 sn_rtc_cycles_per_second = ticks_per_sec;
294} 357}
295 358
296static bool uv_system_inited; 359/*
360 * Called on each cpu to initialize the per_cpu UV data area.
361 * ZZZ hotplug not supported yet
362 */
363void __cpuinit uv_cpu_init(void)
364{
 365	/* CPU 0 initialization will be done via uv_system_init. */
366 if (!uv_blade_info)
367 return;
368
369 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
370
371 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
372 set_x2apic_extra_bits(uv_hub_info->pnode);
373}
374
297 375
298void __init uv_system_init(void) 376void __init uv_system_init(void)
299{ 377{
@@ -349,6 +427,9 @@ void __init uv_system_init(void)
349 gnode_upper = (((unsigned long)node_id.s.node_id) & 427 gnode_upper = (((unsigned long)node_id.s.node_id) &
350 ~((1 << n_val) - 1)) << m_val; 428 ~((1 << n_val) - 1)) << m_val;
351 429
430 uv_bios_init();
431 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
432 &uv_coherency_id, &uv_region_size);
352 uv_rtc_init(); 433 uv_rtc_init();
353 434
354 for_each_present_cpu(cpu) { 435 for_each_present_cpu(cpu) {
@@ -370,7 +451,7 @@ void __init uv_system_init(void)
370 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 451 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
371 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 452 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
372 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 453 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
373 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 454 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
374 uv_node_to_blade[nid] = blade; 455 uv_node_to_blade[nid] = blade;
375 uv_cpu_to_blade[cpu] = blade; 456 uv_cpu_to_blade[cpu] = blade;
376 max_pnode = max(pnode, max_pnode); 457 max_pnode = max(pnode, max_pnode);
@@ -385,19 +466,6 @@ void __init uv_system_init(void)
385 map_mmr_high(max_pnode); 466 map_mmr_high(max_pnode);
386 map_config_high(max_pnode); 467 map_config_high(max_pnode);
387 map_mmioh_high(max_pnode); 468 map_mmioh_high(max_pnode);
388 uv_system_inited = true;
389}
390
391/*
392 * Called on each cpu to initialize the per_cpu UV data area.
393 * ZZZ hotplug not supported yet
394 */
395void __cpuinit uv_cpu_init(void)
396{
397 BUG_ON(!uv_system_inited);
398 469
399 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; 470 uv_cpu_init();
400
401 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
402 set_x2apic_extra_bits(uv_hub_info->pnode);
403} 471}
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9..1dcb0f13897 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
38 39
39 /* Fixup: bios puts an EBDA in the top 64K segment */ 40 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */ 41 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 9bfc4d72fb2..d16084f9064 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -108,12 +108,11 @@ void __init x86_64_start_kernel(char * real_mode_data)
108 } 108 }
109 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
110 110
111 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
112 early_printk("Kernel alive\n");
112 113
113 x86_64_init_pda(); 114 x86_64_init_pda();
114 115
115 early_printk("Kernel really alive\n");
116
117 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
118} 117}
119 118
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index a7010c3a377..e835b4eea70 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -634,19 +630,19 @@ ENTRY(empty_zero_page)
634 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
635 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
636ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
638# if KPMDS == 3 634# if KPMDS == 3
639 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
640 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
641 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
642# elif KPMDS == 2 638# elif KPMDS == 2
643 .long 0,0 639 .long 0,0
644 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
645 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
646# elif KPMDS == 1 642# elif KPMDS == 1
647 .long 0,0 643 .long 0,0
648 .long 0,0 644 .long 0,0
649 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
650# else 646# else
651# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
652# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index db3280afe88..26cfdc1d7c7 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 73deaffadd0..77017e834cf 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h>
4#include <linux/sysdev.h>
3#include <linux/delay.h> 5#include <linux/delay.h>
4#include <linux/errno.h> 6#include <linux/errno.h>
5#include <linux/hpet.h> 7#include <linux/hpet.h>
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/sysdev.h> 9#include <linux/cpu.h>
8#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/io.h>
9 12
10#include <asm/fixmap.h> 13#include <asm/fixmap.h>
11#include <asm/hpet.h>
12#include <asm/i8253.h> 14#include <asm/i8253.h>
13#include <asm/io.h> 15#include <asm/hpet.h>
14 16
15#define HPET_MASK CLOCKSOURCE_MASK(32) 17#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22 18#define HPET_SHIFT 22
17 19
18/* FSEC = 10^-15 20/* FSEC = 10^-15
19 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000L 22#define FSEC_PER_NSEC 1000000L
23
24#define HPET_DEV_USED_BIT 2
25#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
26#define HPET_DEV_VALID 0x8
27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000
29
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
21 31
22/* 32/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
24 */ 34 */
25unsigned long hpet_address; 35unsigned long hpet_address;
26static void __iomem *hpet_virt_address; 36unsigned long hpet_num_timers;
37static void __iomem *hpet_virt_address;
38
39struct hpet_dev {
40 struct clock_event_device evt;
41 unsigned int num;
42 int cpu;
43 unsigned int irq;
44 unsigned int flags;
45 char name[10];
46};
27 47
28unsigned long hpet_readl(unsigned long a) 48unsigned long hpet_readl(unsigned long a)
29{ 49{
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
59static int boot_hpet_disable; 79static int boot_hpet_disable;
60int hpet_force_user; 80int hpet_force_user;
61 81
62static int __init hpet_setup(char* str) 82static int __init hpet_setup(char *str)
63{ 83{
64 if (str) { 84 if (str) {
65 if (!strncmp("disable", str, 7)) 85 if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
80 100
81static inline int is_hpet_capable(void) 101static inline int is_hpet_capable(void)
82{ 102{
83 return (!boot_hpet_disable && hpet_address); 103 return !boot_hpet_disable && hpet_address;
84} 104}
85 105
86/* 106/*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
102 * timer 0 and timer 1 in case of RTC emulation. 122 * timer 0 and timer 1 in case of RTC emulation.
103 */ 123 */
104#ifdef CONFIG_HPET 124#ifdef CONFIG_HPET
125
126static void hpet_reserve_msi_timers(struct hpet_data *hd);
127
105static void hpet_reserve_platform_timers(unsigned long id) 128static void hpet_reserve_platform_timers(unsigned long id)
106{ 129{
107 struct hpet __iomem *hpet = hpet_virt_address; 130 struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
111 134
112 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; 135 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
113 136
114 memset(&hd, 0, sizeof (hd)); 137 memset(&hd, 0, sizeof(hd));
115 hd.hd_phys_address = hpet_address; 138 hd.hd_phys_address = hpet_address;
116 hd.hd_address = hpet; 139 hd.hd_address = hpet;
117 hd.hd_nirqs = nrtimers; 140 hd.hd_nirqs = nrtimers;
118 hd.hd_flags = HPET_DATA_PLATFORM;
119 hpet_reserve_timer(&hd, 0); 141 hpet_reserve_timer(&hd, 0);
120 142
121#ifdef CONFIG_HPET_EMULATE_RTC 143#ifdef CONFIG_HPET_EMULATE_RTC
122 hpet_reserve_timer(&hd, 1); 144 hpet_reserve_timer(&hd, 1);
123#endif 145#endif
124 146
147 /*
148 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
149 * is wrong for i8259!) not the output IRQ. Many BIOS writers
150 * don't bother configuring *any* comparator interrupts.
151 */
125 hd.hd_irq[0] = HPET_LEGACY_8254; 152 hd.hd_irq[0] = HPET_LEGACY_8254;
126 hd.hd_irq[1] = HPET_LEGACY_RTC; 153 hd.hd_irq[1] = HPET_LEGACY_RTC;
127 154
128 for (i = 2; i < nrtimers; timer++, i++) { 155 for (i = 2; i < nrtimers; timer++, i++) {
129 hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> 156 hd.hd_irq[i] = (readl(&timer->hpet_config) &
130 Tn_INT_ROUTE_CNF_SHIFT; 157 Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
131 } 158 }
132 159
160 hpet_reserve_msi_timers(&hd);
161
133 hpet_alloc(&hd); 162 hpet_alloc(&hd);
134 163
135} 164}
@@ -223,60 +252,70 @@ static void hpet_legacy_clockevent_register(void)
223 printk(KERN_DEBUG "hpet clockevent registered\n"); 252 printk(KERN_DEBUG "hpet clockevent registered\n");
224} 253}
225 254
226static void hpet_legacy_set_mode(enum clock_event_mode mode, 255static int hpet_setup_msi_irq(unsigned int irq);
227 struct clock_event_device *evt) 256
257static void hpet_set_mode(enum clock_event_mode mode,
258 struct clock_event_device *evt, int timer)
228{ 259{
229 unsigned long cfg, cmp, now; 260 unsigned long cfg, cmp, now;
230 uint64_t delta; 261 uint64_t delta;
231 262
232 switch(mode) { 263 switch (mode) {
233 case CLOCK_EVT_MODE_PERIODIC: 264 case CLOCK_EVT_MODE_PERIODIC:
234 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; 265 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
235 delta >>= hpet_clockevent.shift; 266 delta >>= evt->shift;
236 now = hpet_readl(HPET_COUNTER); 267 now = hpet_readl(HPET_COUNTER);
237 cmp = now + (unsigned long) delta; 268 cmp = now + (unsigned long) delta;
238 cfg = hpet_readl(HPET_T0_CFG); 269 cfg = hpet_readl(HPET_Tn_CFG(timer));
239 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 270 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
240 HPET_TN_SETVAL | HPET_TN_32BIT; 271 HPET_TN_SETVAL | HPET_TN_32BIT;
241 hpet_writel(cfg, HPET_T0_CFG); 272 hpet_writel(cfg, HPET_Tn_CFG(timer));
242 /* 273 /*
243 * The first write after writing TN_SETVAL to the 274 * The first write after writing TN_SETVAL to the
244 * config register sets the counter value, the second 275 * config register sets the counter value, the second
245 * write sets the period. 276 * write sets the period.
246 */ 277 */
247 hpet_writel(cmp, HPET_T0_CMP); 278 hpet_writel(cmp, HPET_Tn_CMP(timer));
248 udelay(1); 279 udelay(1);
249 hpet_writel((unsigned long) delta, HPET_T0_CMP); 280 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
250 break; 281 break;
251 282
252 case CLOCK_EVT_MODE_ONESHOT: 283 case CLOCK_EVT_MODE_ONESHOT:
253 cfg = hpet_readl(HPET_T0_CFG); 284 cfg = hpet_readl(HPET_Tn_CFG(timer));
254 cfg &= ~HPET_TN_PERIODIC; 285 cfg &= ~HPET_TN_PERIODIC;
255 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; 286 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
256 hpet_writel(cfg, HPET_T0_CFG); 287 hpet_writel(cfg, HPET_Tn_CFG(timer));
257 break; 288 break;
258 289
259 case CLOCK_EVT_MODE_UNUSED: 290 case CLOCK_EVT_MODE_UNUSED:
260 case CLOCK_EVT_MODE_SHUTDOWN: 291 case CLOCK_EVT_MODE_SHUTDOWN:
261 cfg = hpet_readl(HPET_T0_CFG); 292 cfg = hpet_readl(HPET_Tn_CFG(timer));
262 cfg &= ~HPET_TN_ENABLE; 293 cfg &= ~HPET_TN_ENABLE;
263 hpet_writel(cfg, HPET_T0_CFG); 294 hpet_writel(cfg, HPET_Tn_CFG(timer));
264 break; 295 break;
265 296
266 case CLOCK_EVT_MODE_RESUME: 297 case CLOCK_EVT_MODE_RESUME:
267 hpet_enable_legacy_int(); 298 if (timer == 0) {
299 hpet_enable_legacy_int();
300 } else {
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
305 enable_irq(hdev->irq);
306 }
268 break; 307 break;
269 } 308 }
270} 309}
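The SETVAL comment above describes a hardware quirk worth spelling out: once HPET_TN_SETVAL is latched in the config register, the next two comparator writes mean different things. A toy model of one comparator, purely to show the write ordering (no real HPET registers involved):

#include <stdio.h>
#include <stdint.h>

/* Toy model of one HPET comparator: while TN_SETVAL is latched, the
 * first CMP write sets the absolute comparator value and the second
 * write sets the period. */
struct toy_timer {
	int setval_latched;
	uint32_t cmp;
	uint32_t period;
};

static void toy_write_cmp(struct toy_timer *t, uint32_t val)
{
	if (t->setval_latched) {
		t->cmp = val;		/* first write: comparator */
		t->setval_latched = 0;
	} else {
		t->period = val;	/* second write: period */
	}
}

int main(void)
{
	struct toy_timer t = { .setval_latched = 1 };
	uint32_t now = 1000, delta = 250;

	toy_write_cmp(&t, now + delta);	/* like hpet_writel(cmp, ...)   */
	toy_write_cmp(&t, delta);	/* like hpet_writel(delta, ...) */
	printf("cmp=%u period=%u\n", (unsigned)t.cmp, (unsigned)t.period);
	return 0;
}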
271 310
272static int hpet_legacy_next_event(unsigned long delta, 311static int hpet_next_event(unsigned long delta,
273 struct clock_event_device *evt) 312 struct clock_event_device *evt, int timer)
274{ 313{
275 u32 cnt; 314 u32 cnt;
276 315
277 cnt = hpet_readl(HPET_COUNTER); 316 cnt = hpet_readl(HPET_COUNTER);
278 cnt += (u32) delta; 317 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 318 hpet_writel(cnt, HPET_Tn_CMP(timer));
280 319
281 /* 320 /*
282 * We need to read back the CMP register to make sure that 321 * We need to read back the CMP register to make sure that
@@ -288,6 +327,347 @@ static int hpet_legacy_next_event(unsigned long delta,
288 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 327 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
289} 328}
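The read-back at the end of hpet_next_event() leans on wraparound-safe 32-bit arithmetic: (s32)(counter - cnt) >= 0 is true exactly when the programmed comparator value already lies in the past, even across a counter wrap. A few hand-picked values (standalone sketch, assuming the same 32-bit wraparound semantics as the code above):

#include <stdio.h>
#include <stdint.h>

/* The -ETIME check above: subtraction in u32, sign test in s32. */
static int already_expired(uint32_t counter, uint32_t cnt)
{
	return (int32_t)(counter - cnt) >= 0;
}

int main(void)
{
	printf("%d\n", already_expired(100, 200));	 /* 0: cmp in future  */
	printf("%d\n", already_expired(300, 200));	 /* 1: event missed   */
	printf("%d\n", already_expired(5, 0xfffffff0u)); /* 1: wrapped past cmp */
	return 0;
}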
290 329
330static void hpet_legacy_set_mode(enum clock_event_mode mode,
331 struct clock_event_device *evt)
332{
333 hpet_set_mode(mode, evt, 0);
334}
335
336static int hpet_legacy_next_event(unsigned long delta,
337 struct clock_event_device *evt)
338{
339 return hpet_next_event(delta, evt, 0);
340}
341
342/*
343 * HPET MSI Support
344 */
345#ifdef CONFIG_PCI_MSI
346
347static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
348static struct hpet_dev *hpet_devs;
349
350void hpet_msi_unmask(unsigned int irq)
351{
352 struct hpet_dev *hdev = get_irq_data(irq);
353 unsigned long cfg;
354
355 /* unmask it */
356 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
357 cfg |= HPET_TN_FSB;
358 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
359}
360
361void hpet_msi_mask(unsigned int irq)
362{
363 unsigned long cfg;
364 struct hpet_dev *hdev = get_irq_data(irq);
365
366 /* mask it */
367 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
368 cfg &= ~HPET_TN_FSB;
369 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
370}
371
372void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
373{
374 struct hpet_dev *hdev = get_irq_data(irq);
375
376 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
377 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
378}
379
380void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
381{
382 struct hpet_dev *hdev = get_irq_data(irq);
383
384 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
385 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
386 msg->address_hi = 0;
387}
388
389static void hpet_msi_set_mode(enum clock_event_mode mode,
390 struct clock_event_device *evt)
391{
392 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
393 hpet_set_mode(mode, evt, hdev->num);
394}
395
396static int hpet_msi_next_event(unsigned long delta,
397 struct clock_event_device *evt)
398{
399 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
400 return hpet_next_event(delta, evt, hdev->num);
401}
402
403static int hpet_setup_msi_irq(unsigned int irq)
404{
405 if (arch_setup_hpet_msi(irq)) {
406 destroy_irq(irq);
407 return -EINVAL;
408 }
409 return 0;
410}
411
412static int hpet_assign_irq(struct hpet_dev *dev)
413{
414 unsigned int irq;
415
416 irq = create_irq();
417 if (!irq)
418 return -EINVAL;
419
420 set_irq_data(irq, dev);
421
422 if (hpet_setup_msi_irq(irq))
423 return -EINVAL;
424
425 dev->irq = irq;
426 return 0;
427}
428
429static irqreturn_t hpet_interrupt_handler(int irq, void *data)
430{
431 struct hpet_dev *dev = (struct hpet_dev *)data;
432 struct clock_event_device *hevt = &dev->evt;
433
434 if (!hevt->event_handler) {
435 printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
436 dev->num);
437 return IRQ_HANDLED;
438 }
439
440 hevt->event_handler(hevt);
441 return IRQ_HANDLED;
442}
443
444static int hpet_setup_irq(struct hpet_dev *dev)
445{
446
447 if (request_irq(dev->irq, hpet_interrupt_handler,
448 IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
449 return -1;
450
451 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
453 enable_irq(dev->irq);
454
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
456 dev->name, dev->irq);
457
458 return 0;
459}
460
 461/* This should be called on the target @cpu */
462static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
463{
464 struct clock_event_device *evt = &hdev->evt;
465 uint64_t hpet_freq;
466
467 WARN_ON(cpu != smp_processor_id());
468 if (!(hdev->flags & HPET_DEV_VALID))
469 return;
470
471 if (hpet_setup_msi_irq(hdev->irq))
472 return;
473
474 hdev->cpu = cpu;
475 per_cpu(cpu_hpet_dev, cpu) = hdev;
476 evt->name = hdev->name;
477 hpet_setup_irq(hdev);
478 evt->irq = hdev->irq;
479
480 evt->rating = 110;
481 evt->features = CLOCK_EVT_FEAT_ONESHOT;
482 if (hdev->flags & HPET_DEV_PERI_CAP)
483 evt->features |= CLOCK_EVT_FEAT_PERIODIC;
484
485 evt->set_mode = hpet_msi_set_mode;
486 evt->set_next_event = hpet_msi_next_event;
487 evt->shift = 32;
488
489 /*
490 * The period is a femto seconds value. We need to calculate the
491 * scaled math multiplication factor for nanosecond to hpet tick
492 * conversion.
493 */
494 hpet_freq = 1000000000000000ULL;
495 do_div(hpet_freq, hpet_period);
496 evt->mult = div_sc((unsigned long) hpet_freq,
497 NSEC_PER_SEC, evt->shift);
498 /* Calculate the max delta */
499 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
500 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000;
502
503 evt->cpumask = cpumask_of_cpu(hdev->cpu);
504 clockevents_register_device(evt);
505}
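The mult/shift setup above is ordinary clockevents fixed-point math: hpet_period is a femtosecond (10^-15 s) value, so the frequency is 10^15 / period, and the kernel's div_sc() computes (freq << shift) / NSEC_PER_SEC. A userspace rendition with div_sc() inlined and a hypothetical period (all numbers made up):

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t hpet_period = 69841279;	/* hypothetical: ~14.318 MHz */
	uint64_t hpet_freq = 1000000000000000ULL / hpet_period;
	int shift = 32;
	uint64_t mult = (hpet_freq << shift) / NSEC_PER_SEC;	/* div_sc() */
	uint64_t delta_ns = 1000000;		/* program a 1 ms event */

	/* ticks = delta_ns * mult >> shift, as the clockevents core does */
	printf("freq=%llu Hz mult=%llu ticks(1ms)=%llu\n",
	       (unsigned long long)hpet_freq,
	       (unsigned long long)mult,
	       (unsigned long long)((delta_ns * mult) >> shift));
	return 0;
}

At ~14.318 MHz the 1 ms delta comes out to roughly 14318 ticks, which is the sanity check one would expect.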
506
507#ifdef CONFIG_HPET
508/* Reserve at least one timer for userspace (/dev/hpet) */
509#define RESERVE_TIMERS 1
510#else
511#define RESERVE_TIMERS 0
512#endif
513
514static void hpet_msi_capability_lookup(unsigned int start_timer)
515{
516 unsigned int id;
517 unsigned int num_timers;
518 unsigned int num_timers_used = 0;
519 int i;
520
521 id = hpet_readl(HPET_ID);
522
523 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
524 num_timers++; /* Value read out starts from 0 */
525
526 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
527 if (!hpet_devs)
528 return;
529
530 hpet_num_timers = num_timers;
531
532 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
533 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
534 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
535
536 /* Only consider HPET timer with MSI support */
537 if (!(cfg & HPET_TN_FSB_CAP))
538 continue;
539
540 hdev->flags = 0;
541 if (cfg & HPET_TN_PERIODIC_CAP)
542 hdev->flags |= HPET_DEV_PERI_CAP;
543 hdev->num = i;
544
545 sprintf(hdev->name, "hpet%d", i);
546 if (hpet_assign_irq(hdev))
547 continue;
548
549 hdev->flags |= HPET_DEV_FSB_CAP;
550 hdev->flags |= HPET_DEV_VALID;
551 num_timers_used++;
552 if (num_timers_used == num_possible_cpus())
553 break;
554 }
555
 556	printk(KERN_INFO "HPET: %d timers in total, %d timers will be used as per-cpu timers\n",
557 num_timers, num_timers_used);
558}
559
560#ifdef CONFIG_HPET
561static void hpet_reserve_msi_timers(struct hpet_data *hd)
562{
563 int i;
564
565 if (!hpet_devs)
566 return;
567
568 for (i = 0; i < hpet_num_timers; i++) {
569 struct hpet_dev *hdev = &hpet_devs[i];
570
571 if (!(hdev->flags & HPET_DEV_VALID))
572 continue;
573
574 hd->hd_irq[hdev->num] = hdev->irq;
575 hpet_reserve_timer(hd, hdev->num);
576 }
577}
578#endif
579
580static struct hpet_dev *hpet_get_unused_timer(void)
581{
582 int i;
583
584 if (!hpet_devs)
585 return NULL;
586
587 for (i = 0; i < hpet_num_timers; i++) {
588 struct hpet_dev *hdev = &hpet_devs[i];
589
590 if (!(hdev->flags & HPET_DEV_VALID))
591 continue;
592 if (test_and_set_bit(HPET_DEV_USED_BIT,
593 (unsigned long *)&hdev->flags))
594 continue;
595 return hdev;
596 }
597 return NULL;
598}
599
600struct hpet_work_struct {
601 struct delayed_work work;
602 struct completion complete;
603};
604
605static void hpet_work(struct work_struct *w)
606{
607 struct hpet_dev *hdev;
608 int cpu = smp_processor_id();
609 struct hpet_work_struct *hpet_work;
610
611 hpet_work = container_of(w, struct hpet_work_struct, work.work);
612
613 hdev = hpet_get_unused_timer();
614 if (hdev)
615 init_one_hpet_msi_clockevent(hdev, cpu);
616
617 complete(&hpet_work->complete);
618}
619
620static int hpet_cpuhp_notify(struct notifier_block *n,
621 unsigned long action, void *hcpu)
622{
623 unsigned long cpu = (unsigned long)hcpu;
624 struct hpet_work_struct work;
625 struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
626
627 switch (action & 0xf) {
628 case CPU_ONLINE:
629 INIT_DELAYED_WORK(&work.work, hpet_work);
630 init_completion(&work.complete);
631 /* FIXME: add schedule_work_on() */
632 schedule_delayed_work_on(cpu, &work.work, 0);
633 wait_for_completion(&work.complete);
634 break;
635 case CPU_DEAD:
636 if (hdev) {
637 free_irq(hdev->irq, hdev);
638 hdev->flags &= ~HPET_DEV_USED;
639 per_cpu(cpu_hpet_dev, cpu) = NULL;
640 }
641 break;
642 }
643 return NOTIFY_OK;
644}
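The CPU_ONLINE leg above uses a small but important pattern: the work is queued on the CPU being brought up, so the clockevent registers on the CPU it will serve, and the notifier blocks on a completion until that work has run. A standalone sketch of the handshake, with a plain pthread standing in for the per-cpu workqueue (illustrative; none of the kernel primitives are used):

#include <pthread.h>
#include <stdio.h>

struct completion {
	pthread_mutex_t lock;
	pthread_cond_t cond;
	int done;
};

static void complete(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	c->done = 1;
	pthread_cond_signal(&c->cond);
	pthread_mutex_unlock(&c->lock);
}

static void wait_for_completion(struct completion *c)
{
	pthread_mutex_lock(&c->lock);
	while (!c->done)
		pthread_cond_wait(&c->cond, &c->lock);
	pthread_mutex_unlock(&c->lock);
}

static void *work_fn(void *arg)
{
	printf("init clockevent on target cpu\n");	/* hpet_work() body */
	complete(arg);
	return NULL;
}

int main(void)
{
	struct completion c = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0
	};
	pthread_t t;

	pthread_create(&t, NULL, work_fn, &c);	/* schedule_delayed_work_on */
	wait_for_completion(&c);		/* notifier blocks here */
	pthread_join(&t, NULL);
	return 0;
}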
645#else
646
647static int hpet_setup_msi_irq(unsigned int irq)
648{
649 return 0;
650}
651static void hpet_msi_capability_lookup(unsigned int start_timer)
652{
653 return;
654}
655
656#ifdef CONFIG_HPET
657static void hpet_reserve_msi_timers(struct hpet_data *hd)
658{
659 return;
660}
661#endif
662
663static int hpet_cpuhp_notify(struct notifier_block *n,
664 unsigned long action, void *hcpu)
665{
666 return NOTIFY_OK;
667}
668
669#endif
670
291/* 671/*
292 * Clock source related code 672 * Clock source related code
293 */ 673 */
@@ -423,8 +803,10 @@ int __init hpet_enable(void)
423 803
424 if (id & HPET_ID_LEGSUP) { 804 if (id & HPET_ID_LEGSUP) {
425 hpet_legacy_clockevent_register(); 805 hpet_legacy_clockevent_register();
806 hpet_msi_capability_lookup(2);
426 return 1; 807 return 1;
427 } 808 }
809 hpet_msi_capability_lookup(0);
428 return 0; 810 return 0;
429 811
430out_nohpet: 812out_nohpet:
@@ -441,6 +823,8 @@ out_nohpet:
441 */ 823 */
442static __init int hpet_late_init(void) 824static __init int hpet_late_init(void)
443{ 825{
826 int cpu;
827
444 if (boot_hpet_disable) 828 if (boot_hpet_disable)
445 return -ENODEV; 829 return -ENODEV;
446 830
@@ -456,6 +840,13 @@ static __init int hpet_late_init(void)
456 840
457 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 841 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
458 842
843 for_each_online_cpu(cpu) {
844 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
845 }
846
847 /* This notifier should be called after workqueue is ready */
848 hotcpu_notifier(hpet_cpuhp_notify, -20);
849
459 return 0; 850 return 0;
460} 851}
461fs_initcall(hpet_late_init); 852fs_initcall(hpet_late_init);
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb8..1f20608d4ca 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96 * Boot processor to setup the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
 476	 * For legacy compatibility, we always set the FP/SSE bits in the
 477	 * bit vector while saving the state to the user context.
 478	 * This lets us capture any changes made (during sigreturn) to
 479	 * the FP/SSE bits by legacy applications that don't touch
 480	 * xstate_bv in the xsave header.
481 *
482 * xsave aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
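The framing written above gives sigreturn two things to validate: the _fpx_sw_bytes block stashed in the fxsave image's sw_reserved area, and a trailing FP_XSTATE_MAGIC2 word at the very end of the extended area. A layout sketch (field names follow the kernel's _fpx_sw_bytes, but the sizes and numbers here are made up, not the real ABI constants):

#include <stdio.h>
#include <stdint.h>

/* Sketch of the ia32 FP signal frame implied above: a legacy fxsave
 * image whose sw_reserved field holds _fpx_sw_bytes, then the xsave
 * payload, then a trailing FP_XSTATE_MAGIC2 marker word. */
struct fpx_sw_bytes {
	uint32_t magic1;	/* FP_XSTATE_MAGIC1, checked on sigreturn */
	uint32_t extended_size;	/* total frame size, including MAGIC2 */
	uint64_t xstate_bv;	/* features present in the frame */
	uint32_t xstate_size;	/* size of the xsave image itself */
};

int main(void)
{
	struct fpx_sw_bytes sw = {
		.extended_size = 1024,	/* hypothetical frame size */
		.xstate_bv = 0x3,	/* FP | SSE, as forced above */
		.xstate_size = 960,
	};

	/* sigreturn expects MAGIC2 in the last 4 bytes of the frame */
	printf("MAGIC2 expected at frame + %u\n",
	       (unsigned)(sw.extended_size - sizeof(uint32_t)));
	return 0;
}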
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
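The bit-vector sanitization in restore_i387_xsave() above reduces to two masks: keep only features the OS actually enabled (pcntxt_mask), then clear, i.e. re-init, any enabled feature the user frame did not carry. Worked through with made-up feature bits (standalone sketch):

#include <stdio.h>
#include <stdint.h>

/* Illustrative feature bits: bit 0 = FP, bit 1 = SSE, bit 2 = some
 * extended state.  Mirrors the two masking steps above. */
int main(void)
{
	uint64_t pcntxt_mask = 0x7;	/* OS enables FP, SSE, ext      */
	uint64_t user_bv     = 0x3;	/* user frame carries FP+SSE    */
	uint64_t xstate_bv   = 0x7;	/* current task state           */

	xstate_bv &= pcntxt_mask;		/* drop unsupported bits  */
	xstate_bv &= ~(pcntxt_mask & ~user_bv);	/* init missing features  */

	printf("resulting xstate_bv = 0x%llx\n",
	       (unsigned long long)xstate_bv);	/* 0x3: ext state re-inited */
	return 0;
}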
602
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d920..4b8a53d841f 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
282 282
283device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
284 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
285void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
286{ 310{
287 unsigned long flags; 311 unsigned long flags;
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 61a83b70c18..b764d7429c6 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,16 +27,21 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/pci.h> 28#include <linux/pci.h>
29#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h>
31#include <linux/sysdev.h> 33#include <linux/sysdev.h>
32#include <linux/msi.h> 34#include <linux/msi.h>
33#include <linux/htirq.h> 35#include <linux/htirq.h>
34#include <linux/dmar.h> 36#include <linux/freezer.h>
35#include <linux/jiffies.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
36#ifdef CONFIG_ACPI 39#ifdef CONFIG_ACPI
37#include <acpi/acpi_bus.h> 40#include <acpi/acpi_bus.h>
38#endif 41#endif
39#include <linux/bootmem.h> 42#include <linux/bootmem.h>
43#include <linux/dmar.h>
44#include <linux/hpet.h>
40 45
41#include <asm/idle.h> 46#include <asm/idle.h>
42#include <asm/io.h> 47#include <asm/io.h>
@@ -45,60 +50,28 @@
45#include <asm/proto.h> 50#include <asm/proto.h>
46#include <asm/acpi.h> 51#include <asm/acpi.h>
47#include <asm/dma.h> 52#include <asm/dma.h>
53#include <asm/timer.h>
48#include <asm/i8259.h> 54#include <asm/i8259.h>
49#include <asm/nmi.h> 55#include <asm/nmi.h>
50#include <asm/msidef.h> 56#include <asm/msidef.h>
51#include <asm/hypertransport.h> 57#include <asm/hypertransport.h>
58#include <asm/setup.h>
59#include <asm/irq_remapping.h>
60#include <asm/hpet.h>
61#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h>
52 63
53#include <mach_ipi.h> 64#include <mach_ipi.h>
54#include <mach_apic.h> 65#include <mach_apic.h>
66#include <mach_apicdef.h>
55 67
56struct irq_cfg { 68#define __apicdebuginit(type) static type __init
57 cpumask_t domain;
58 cpumask_t old_domain;
59 unsigned move_cleanup_count;
60 u8 vector;
61 u8 move_in_progress : 1;
62};
63
64/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
65static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
66 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
67 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
68 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
69 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
70 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
71 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
72 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
73 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
74 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
75 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
76 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
77 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
78 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
79 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
80 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
81 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
82};
83
84static int assign_irq_vector(int irq, cpumask_t mask);
85 69
86int first_system_vector = 0xfe; 70/*
 87						 71 * Is the SiS APIC rmw bug present?
88char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 72 * -1 = don't know, 0 = no, 1 = yes
89 73 */
90#define __apicdebuginit __init 74int sis_apic_bug = -1;
91
92int sis_apic_bug; /* not actually supported, dummy for compile */
93
94static int no_timer_check;
95
96static int disable_timer_pin_1 __initdata;
97
98int timer_through_8259 __initdata;
99
100/* Where if anywhere is the i8259 connect in external int mode */
101static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
102 75
103static DEFINE_SPINLOCK(ioapic_lock); 76static DEFINE_SPINLOCK(ioapic_lock);
104static DEFINE_SPINLOCK(vector_lock); 77static DEFINE_SPINLOCK(vector_lock);
@@ -118,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
118/* # of MP IRQ source entries */ 91/* # of MP IRQ source entries */
119int mp_irq_entries; 92int mp_irq_entries;
120 93
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif
97
121DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); 98DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
122 99
100int skip_ioapic_setup;
101
102static int __init parse_noapic(char *str)
103{
104 /* disable IO-APIC */
105 disable_ioapic_setup();
106 return 0;
107}
108early_param("noapic", parse_noapic);
109
110struct irq_pin_list;
111struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain;
115 cpumask_t old_domain;
116 unsigned move_cleanup_count;
117 u8 vector;
118 u8 move_in_progress : 1;
119};
120
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
122static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139};
140
141#define for_each_irq_cfg(irq, cfg) \
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL;
147}
148
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
150{
151 return irq_cfg(irq);
152}
153
123/* 154/*
124 * Rough estimation of how many shared IRQs there are, can 155 * Rough estimation of how many shared IRQs there are, can be changed
125 * be changed anytime. 156 * anytime.
126 */ 157 */
127#define MAX_PLUS_SHARED_IRQS NR_IRQS 158#define MAX_PLUS_SHARED_IRQS NR_IRQS
128#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) 159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -134,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
134 * between pins and IRQs. 165 * between pins and IRQs.
135 */ 166 */
136 167
137static struct irq_pin_list { 168struct irq_pin_list {
138 short apic, pin, next; 169 int apic, pin;
139} irq_2_pin[PIN_MAP_SIZE]; 170 struct irq_pin_list *next;
171};
172
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
174static struct irq_pin_list *irq_2_pin_ptr;
175
176static void __init irq_2_pin_init(void)
177{
178 struct irq_pin_list *pin = irq_2_pin_head;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183
184 irq_2_pin_ptr = &pin[0];
185}
186
187static struct irq_pin_list *get_one_free_irq_2_pin(void)
188{
189 struct irq_pin_list *pin = irq_2_pin_ptr;
190
191 if (!pin)
 192		panic("cannot get more irq_2_pin entries\n");
193
194 irq_2_pin_ptr = pin->next;
195 pin->next = NULL;
196 return pin;
197}
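get_one_free_irq_2_pin() above is a classic static free list: irq_2_pin_init() threads a fixed array into a singly linked list once, and allocation just pops the head. A self-contained version of the same structure (PIN_MAP_SIZE and the failure path are stand-ins for the kernel's):

#include <stdio.h>
#include <stdlib.h>

#define PIN_MAP_SIZE 8

struct pin_entry {
	int apic, pin;
	struct pin_entry *next;
};

static struct pin_entry pool[PIN_MAP_SIZE];
static struct pin_entry *free_head;

/* Thread the array into a list once, like irq_2_pin_init(). */
static void pool_init(void)
{
	for (int i = 1; i < PIN_MAP_SIZE; i++)
		pool[i - 1].next = &pool[i];
	free_head = &pool[0];
}

/* Pop the head, like get_one_free_irq_2_pin(). */
static struct pin_entry *pool_get(void)
{
	struct pin_entry *e = free_head;

	if (!e) {
		fprintf(stderr, "out of pin entries\n");
		exit(1);
	}
	free_head = e->next;
	e->next = NULL;
	return e;
}

int main(void)
{
	pool_init();
	struct pin_entry *e = pool_get();
	e->apic = 0;
	e->pin = 9;
	printf("got entry for apic %d pin %d\n", e->apic, e->pin);
	return 0;
}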
140 198
141struct io_apic { 199struct io_apic {
142 unsigned int index; 200 unsigned int index;
@@ -167,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
167/* 225/*
168 * Re-write a value: to be used for read-modify-write 226 * Re-write a value: to be used for read-modify-write
169 * cycles where the read already set up the index register. 227 * cycles where the read already set up the index register.
228 *
 229 * Older SiS APICs require us to rewrite the index register
170 */ 230 */
171static inline void io_apic_modify(unsigned int apic, unsigned int value) 231static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
172{ 232{
173 struct io_apic __iomem *io_apic = io_apic_base(apic); 233 struct io_apic __iomem *io_apic = io_apic_base(apic);
234
235 if (sis_apic_bug)
236 writel(reg, &io_apic->index);
174 writel(value, &io_apic->data); 237 writel(value, &io_apic->data);
175} 238}
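The sis_apic_bug handling above works around chipsets that lose the index latched by a previous read, so affected parts need the index register rewritten before every data write. A toy model of the indexed register window (illustrative only; not real MMIO):

#include <stdio.h>
#include <stdint.h>

struct toy_ioapic {
	uint32_t index;		/* stands in for io_apic->index */
	uint32_t regs[32];	/* stands in for the data window */
};

static int sis_apic_bug = 1;	/* detected at runtime in the kernel */

static void toy_io_apic_modify(struct toy_ioapic *io, uint32_t reg,
			       uint32_t val)
{
	if (sis_apic_bug)
		io->index = reg;	/* re-latch the register index */
	io->regs[io->index] = val;	/* write through the data window */
}

int main(void)
{
	struct toy_ioapic io = { .index = 0 };

	toy_io_apic_modify(&io, 0x10, 0x1234);
	printf("reg 0x10 = 0x%x\n", io.regs[0x10]);
	return 0;
}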
176 239
@@ -178,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
178{ 241{
179 struct irq_pin_list *entry; 242 struct irq_pin_list *entry;
180 unsigned long flags; 243 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
181 245
182 spin_lock_irqsave(&ioapic_lock, flags); 246 spin_lock_irqsave(&ioapic_lock, flags);
183 entry = irq_2_pin + irq; 247 entry = cfg->irq_2_pin;
184 for (;;) { 248 for (;;) {
185 unsigned int reg; 249 unsigned int reg;
186 int pin; 250 int pin;
187 251
188 pin = entry->pin; 252 if (!entry)
189 if (pin == -1)
190 break; 253 break;
254 pin = entry->pin;
191 reg = io_apic_read(entry->apic, 0x10 + pin*2); 255 reg = io_apic_read(entry->apic, 0x10 + pin*2);
192 /* Is the remote IRR bit set? */ 256 /* Is the remote IRR bit set? */
193 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 257 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -196,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
196 } 260 }
197 if (!entry->next) 261 if (!entry->next)
198 break; 262 break;
199 entry = irq_2_pin + entry->next; 263 entry = entry->next;
200 } 264 }
201 spin_unlock_irqrestore(&ioapic_lock, flags); 265 spin_unlock_irqrestore(&ioapic_lock, flags);
202 266
203 return false; 267 return false;
204} 268}
205 269
206/*
207 * Synchronize the IO-APIC and the CPU by doing
208 * a dummy read from the IO-APIC
209 */
210static inline void io_apic_sync(unsigned int apic)
211{
212 struct io_apic __iomem *io_apic = io_apic_base(apic);
213 readl(&io_apic->data);
214}
215
216#define __DO_ACTION(R, ACTION, FINAL) \
217 \
218{ \
219 int pin; \
220 struct irq_pin_list *entry = irq_2_pin + irq; \
221 \
222 BUG_ON(irq >= NR_IRQS); \
223 for (;;) { \
224 unsigned int reg; \
225 pin = entry->pin; \
226 if (pin == -1) \
227 break; \
228 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
229 reg ACTION; \
230 io_apic_modify(entry->apic, reg); \
231 FINAL; \
232 if (!entry->next) \
233 break; \
234 entry = irq_2_pin + entry->next; \
235 } \
236}
237
238union entry_union { 270union entry_union {
239 struct { u32 w1, w2; }; 271 struct { u32 w1, w2; };
240 struct IO_APIC_route_entry entry; 272 struct IO_APIC_route_entry entry;
@@ -294,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
294static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
295{ 327{
296 int apic, pin; 328 int apic, pin;
297 struct irq_pin_list *entry = irq_2_pin + irq; 329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry;
298 331
299 BUG_ON(irq >= NR_IRQS); 332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin;
300 for (;;) { 334 for (;;) {
301 unsigned int reg; 335 unsigned int reg;
336
337 if (!entry)
338 break;
339
302 apic = entry->apic; 340 apic = entry->apic;
303 pin = entry->pin; 341 pin = entry->pin;
304 if (pin == -1) 342#ifdef CONFIG_INTR_REMAP
305 break; 343 /*
344 * With interrupt-remapping, destination information comes
345 * from interrupt-remapping table entry.
346 */
347 if (!irq_remapped(irq))
348 io_apic_write(apic, 0x11 + pin*2, dest);
349#else
306 io_apic_write(apic, 0x11 + pin*2, dest); 350 io_apic_write(apic, 0x11 + pin*2, dest);
351#endif
307 reg = io_apic_read(apic, 0x10 + pin*2); 352 reg = io_apic_read(apic, 0x10 + pin*2);
308 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 353 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
309 reg |= vector; 354 reg |= vector;
310 io_apic_modify(apic, reg); 355 io_apic_modify(apic, 0x10 + pin*2, reg);
311 if (!entry->next) 356 if (!entry->next)
312 break; 357 break;
313 entry = irq_2_pin + entry->next; 358 entry = entry->next;
314 } 359 }
315} 360}
316 361
362static int assign_irq_vector(int irq, cpumask_t mask);
363
317static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
318{ 365{
319 struct irq_cfg *cfg = irq_cfg + irq; 366 struct irq_cfg *cfg;
320 unsigned long flags; 367 unsigned long flags;
321 unsigned int dest; 368 unsigned int dest;
322 cpumask_t tmp; 369 cpumask_t tmp;
370 struct irq_desc *desc;
323 371
324 cpus_and(tmp, mask, cpu_online_map); 372 cpus_and(tmp, mask, cpu_online_map);
325 if (cpus_empty(tmp)) 373 if (cpus_empty(tmp))
326 return; 374 return;
327 375
376 cfg = irq_cfg(irq);
328 if (assign_irq_vector(irq, mask)) 377 if (assign_irq_vector(irq, mask))
329 return; 378 return;
330 379
331 cpus_and(tmp, cfg->domain, mask); 380 cpus_and(tmp, cfg->domain, mask);
332 dest = cpu_mask_to_apicid(tmp); 381 dest = cpu_mask_to_apicid(tmp);
333
334 /* 382 /*
335 * Only the high 8 bits are valid. 383 * Only the high 8 bits are valid.
336 */ 384 */
337 dest = SET_APIC_LOGICAL_ID(dest); 385 dest = SET_APIC_LOGICAL_ID(dest);
338 386
387 desc = irq_to_desc(irq);
339 spin_lock_irqsave(&ioapic_lock, flags); 388 spin_lock_irqsave(&ioapic_lock, flags);
340 __target_IO_APIC_irq(irq, dest, cfg->vector); 389 __target_IO_APIC_irq(irq, dest, cfg->vector);
341 irq_desc[irq].affinity = mask; 390 desc->affinity = mask;
342 spin_unlock_irqrestore(&ioapic_lock, flags); 391 spin_unlock_irqrestore(&ioapic_lock, flags);
343} 392}
344#endif 393#endif /* CONFIG_SMP */
345 394
346/* 395/*
347 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 396 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -350,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
350 */ 399 */
351static void add_pin_to_irq(unsigned int irq, int apic, int pin) 400static void add_pin_to_irq(unsigned int irq, int apic, int pin)
352{ 401{
353 static int first_free_entry = NR_IRQS; 402 struct irq_cfg *cfg;
354 struct irq_pin_list *entry = irq_2_pin + irq; 403 struct irq_pin_list *entry;
404
 405	/* first reference to this irq_cfg, so allocate its pin entry */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin;
408 if (!entry) {
409 entry = get_one_free_irq_2_pin();
410 cfg->irq_2_pin = entry;
411 entry->apic = apic;
412 entry->pin = pin;
413 return;
414 }
355 415
356 BUG_ON(irq >= NR_IRQS); 416 while (entry->next) {
357 while (entry->next) 417 /* not again, please */
358 entry = irq_2_pin + entry->next; 418 if (entry->apic == apic && entry->pin == pin)
419 return;
359 420
360 if (entry->pin != -1) { 421 entry = entry->next;
361 entry->next = first_free_entry;
362 entry = irq_2_pin + entry->next;
363 if (++first_free_entry >= PIN_MAP_SIZE)
364 panic("io_apic.c: ran out of irq_2_pin entries!");
365 } 422 }
423
424 entry->next = get_one_free_irq_2_pin();
425 entry = entry->next;
366 entry->apic = apic; 426 entry->apic = apic;
367 entry->pin = pin; 427 entry->pin = pin;
368} 428}
@@ -374,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
374 int oldapic, int oldpin, 434 int oldapic, int oldpin,
375 int newapic, int newpin) 435 int newapic, int newpin)
376{ 436{
377 struct irq_pin_list *entry = irq_2_pin + irq; 437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0;
378 440
379 while (1) { 441 while (entry) {
380 if (entry->apic == oldapic && entry->pin == oldpin) { 442 if (entry->apic == oldapic && entry->pin == oldpin) {
381 entry->apic = newapic; 443 entry->apic = newapic;
382 entry->pin = newpin; 444 entry->pin = newpin;
383 } 445 replaced = 1;
384 if (!entry->next) 446 /* every one is different, right? */
385 break; 447 break;
386 entry = irq_2_pin + entry->next; 448 }
449 entry = entry->next;
450 }
451
 452	/* nothing matched: caller replaced a pin that was never added, so add it */
453 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin);
455}
456
457static inline void io_apic_modify_irq(unsigned int irq,
458 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry))
460{
461 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry;
464
465 cfg = irq_cfg(irq);
466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
467 unsigned int reg;
468 pin = entry->pin;
469 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
470 reg &= mask_and;
471 reg |= mask_or;
472 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
473 if (final)
474 final(entry);
387 } 475 }
388} 476}
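
io_apic_modify_irq() folds the old DO_ACTION()/__DO_ACTION() macro family into one read-modify-write walk over the pin list: each caller just supplies an AND mask, an OR mask, and an optional post-write hook (io_apic_sync() on 64-bit). A minimal user-space sketch of the same pattern, with made-up register storage:

	/* Sketch of the mask_and/mask_or/final pattern used by
	 * io_apic_modify_irq(). The "registers" here are just an array. */
	#include <stdio.h>
	#include <stdint.h>

	#define REDIR_MASKED		0x00010000
	#define REDIR_LEVEL_TRIGGER	0x00008000

	static uint32_t regs[4];

	static void modify_all(uint32_t mask_and, uint32_t mask_or,
			       void (*final)(int idx))
	{
		for (int i = 0; i < 4; i++) {
			regs[i] = (regs[i] & mask_and) | mask_or;
			if (final)
				final(i);	/* e.g. a dummy readback to sync */
		}
	}

	static void sync_read(int idx) { (void)regs[idx]; }

	int main(void)
	{
		modify_all(~0u, REDIR_MASKED, sync_read);	/* mask */
		modify_all(~(uint32_t)REDIR_MASKED, 0, NULL);	/* unmask */
		modify_all(~(uint32_t)REDIR_LEVEL_TRIGGER,	/* mask + edge */
			   REDIR_MASKED, NULL);
		printf("reg0 = %08x\n", regs[0]);
		return 0;
	}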
389 477
478static void __unmask_IO_APIC_irq(unsigned int irq)
479{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
481}
390 482
391#define DO_ACTION(name,R,ACTION, FINAL) \ 483#ifdef CONFIG_X86_64
392 \ 484void io_apic_sync(struct irq_pin_list *entry)
393 static void name##_IO_APIC_irq (unsigned int irq) \ 485{
394 __DO_ACTION(R, ACTION, FINAL) 486 /*
487 * Synchronize the IO-APIC and the CPU by doing
488 * a dummy read from the IO-APIC
489 */
490 struct io_apic __iomem *io_apic;
491 io_apic = io_apic_base(entry->apic);
492 readl(&io_apic->data);
493}
395 494
396/* mask = 1 */ 495static void __mask_IO_APIC_irq(unsigned int irq)
397DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) 496{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498}
499#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq)
501{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
503}
504
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
506{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL);
509}
398 510
399/* mask = 0 */ 511static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
400DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) 512{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515}
516#endif /* CONFIG_X86_32 */
401 517
402static void mask_IO_APIC_irq (unsigned int irq) 518static void mask_IO_APIC_irq (unsigned int irq)
403{ 519{
@@ -440,24 +556,145 @@ static void clear_IO_APIC (void)
440 clear_IO_APIC_pin(apic, pin); 556 clear_IO_APIC_pin(apic, pin);
441} 557}
442 558
443int skip_ioapic_setup; 559#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
444int ioapic_force; 560void send_IPI_self(int vector)
561{
562 unsigned int cfg;
445 563
446static int __init parse_noapic(char *str) 564 /*
565 * Wait for idle.
566 */
567 apic_wait_icr_idle();
568 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
569 /*
570 * Send the IPI. The write to APIC_ICR fires this off.
571 */
572 apic_write(APIC_ICR, cfg);
573}
 574#endif /* !CONFIG_SMP && CONFIG_X86_32 */
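
For reference, the ICR word that send_IPI_self() composes is just an OR of the delivery mode, the self destination shorthand, the logical destination flag, and the vector. A sketch, assuming the usual xAPIC constant values:

	/* Sketch: composing the self-IPI ICR word as send_IPI_self() does.
	 * Constant values follow the usual xAPIC encodings (assumed here). */
	#include <stdio.h>

	#define APIC_DM_FIXED		0x00000	/* fixed delivery mode */
	#define APIC_DEST_SELF		0x40000	/* destination shorthand: self */
	#define APIC_DEST_LOGICAL	0x00800	/* logical destination mode */

	int main(void)
	{
		unsigned int vector = 0xef;	/* hypothetical example vector */
		unsigned int cfg = APIC_DM_FIXED | APIC_DEST_SELF |
				   vector | APIC_DEST_LOGICAL;
		printf("ICR = %#x\n", cfg);	/* prints ICR = 0x408ef */
		return 0;
	}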
575
576#ifdef CONFIG_X86_32
577/*
 578 * support for broken MP BIOSes, enables hand-redirection of PIRQ0-7 to
579 * specific CPU-side IRQs.
580 */
581
582#define MAX_PIRQS 8
583static int pirq_entries [MAX_PIRQS];
584static int pirqs_enabled;
585
586static int __init ioapic_pirq_setup(char *str)
447{ 587{
448 disable_ioapic_setup(); 588 int i, max;
589 int ints[MAX_PIRQS+1];
590
591 get_options(str, ARRAY_SIZE(ints), ints);
592
593 for (i = 0; i < MAX_PIRQS; i++)
594 pirq_entries[i] = -1;
595
596 pirqs_enabled = 1;
597 apic_printk(APIC_VERBOSE, KERN_INFO
598 "PIRQ redirection, working around broken MP-BIOS.\n");
599 max = MAX_PIRQS;
600 if (ints[0] < MAX_PIRQS)
601 max = ints[0];
602
603 for (i = 0; i < max; i++) {
604 apic_printk(APIC_VERBOSE, KERN_DEBUG
605 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
606 /*
607 * PIRQs are mapped upside down, usually.
608 */
609 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
610 }
611 return 1;
612}
613
614__setup("pirq=", ioapic_pirq_setup);
615#endif /* CONFIG_X86_32 */
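
As the "mapped upside down" comment says, pirq=A,B assigns A to PIRQ7 and B to PIRQ6 (get_options() stores the argument count in ints[0]). A standalone sketch of the mapping:

	/* Sketch of the upside-down "pirq=" mapping: ints[i+1] lands in
	 * pirq_entries[MAX_PIRQS - i - 1]. */
	#include <stdio.h>

	#define MAX_PIRQS 8

	int main(void)
	{
		int pirq_entries[MAX_PIRQS];
		int ints[] = { 2, 10, 11 };	/* as if booted with pirq=10,11 */
		int i, max = ints[0] < MAX_PIRQS ? ints[0] : MAX_PIRQS;

		for (i = 0; i < MAX_PIRQS; i++)
			pirq_entries[i] = -1;
		for (i = 0; i < max; i++)
			pirq_entries[MAX_PIRQS - i - 1] = ints[i + 1];

		for (i = 0; i < MAX_PIRQS; i++)
			printf("PIRQ%d -> %d\n", i, pirq_entries[i]);
		/* PIRQ7 -> 10, PIRQ6 -> 11, the rest stay -1 */
		return 0;
	}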
616
617#ifdef CONFIG_INTR_REMAP
618/* I/O APIC RTE contents at the OS boot up */
619static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
620
621/*
622 * Saves and masks all the unmasked IO-APIC RTE's
623 */
624int save_mask_IO_APIC_setup(void)
625{
626 union IO_APIC_reg_01 reg_01;
627 unsigned long flags;
628 int apic, pin;
629
630 /*
631 * The number of IO-APIC IRQ registers (== #pins):
632 */
633 for (apic = 0; apic < nr_ioapics; apic++) {
634 spin_lock_irqsave(&ioapic_lock, flags);
635 reg_01.raw = io_apic_read(apic, 1);
636 spin_unlock_irqrestore(&ioapic_lock, flags);
637 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
638 }
639
640 for (apic = 0; apic < nr_ioapics; apic++) {
641 early_ioapic_entries[apic] =
642 kzalloc(sizeof(struct IO_APIC_route_entry) *
643 nr_ioapic_registers[apic], GFP_KERNEL);
644 if (!early_ioapic_entries[apic])
645 goto nomem;
646 }
647
648 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
650 struct IO_APIC_route_entry entry;
651
652 entry = early_ioapic_entries[apic][pin] =
653 ioapic_read_entry(apic, pin);
654 if (!entry.mask) {
655 entry.mask = 1;
656 ioapic_write_entry(apic, pin, entry);
657 }
658 }
659
449 return 0; 660 return 0;
661
662nomem:
663 while (apic >= 0)
664 kfree(early_ioapic_entries[apic--]);
 665	memset(early_ioapic_entries, 0,
 666	       sizeof(early_ioapic_entries));
667
668 return -ENOMEM;
450} 669}
451early_param("noapic", parse_noapic);
452 670
453/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ 671void restore_IO_APIC_setup(void)
454static int __init disable_timer_pin_setup(char *arg)
455{ 672{
456 disable_timer_pin_1 = 1; 673 int apic, pin;
457 return 1; 674
675 for (apic = 0; apic < nr_ioapics; apic++) {
676 if (!early_ioapic_entries[apic])
677 break;
678 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
679 ioapic_write_entry(apic, pin,
680 early_ioapic_entries[apic][pin]);
681 kfree(early_ioapic_entries[apic]);
682 early_ioapic_entries[apic] = NULL;
683 }
458} 684}
459__setup("disable_timer_pin_1", disable_timer_pin_setup);
460 685
686void reinit_intr_remapped_IO_APIC(int intr_remapping)
687{
688 /*
 689	 * For now do a plain restore of the previous settings.
 690	 * TBD: when the OS enables interrupt-remapping, the IO-APIC
 691	 * RTE's need to be set up to point to interrupt-remapping
 692	 * table entries; until then, do a plain restore and let
 693	 * setup_IO_APIC_irqs() do the proper initialization.
694 */
695 restore_IO_APIC_setup();
696}
697#endif
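
The save/restore pair above snapshots every RTE and masks the unmasked ones before interrupt remapping is reconfigured, then writes the snapshot back. A toy sketch of the pattern, with the RTE reduced to a mask bit and a payload:

	/* Sketch of the save-mask/restore pattern used around enabling
	 * interrupt remapping. */
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	struct rte { int mask; int payload; };

	static struct rte live[3] = { {0, 10}, {1, 11}, {0, 12} };
	static struct rte *saved;

	static int save_and_mask(void)
	{
		saved = malloc(sizeof(live));
		if (!saved)
			return -1;		/* -ENOMEM in the real code */
		memcpy(saved, live, sizeof(live));	/* snapshot */
		for (int i = 0; i < 3; i++)
			live[i].mask = 1;		/* mask everything */
		return 0;
	}

	static void restore(void)
	{
		memcpy(live, saved, sizeof(live));	/* plain restore */
		free(saved);
	}

	int main(void)
	{
		if (save_and_mask())
			return 1;
		restore();
		printf("entry 0 mask=%d payload=%d\n", live[0].mask, live[0].payload);
		return 0;
	}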
461 698
462/* 699/*
463 * Find the IRQ entry number of a certain pin. 700 * Find the IRQ entry number of a certain pin.
@@ -561,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
561 best_guess = irq; 798 best_guess = irq;
562 } 799 }
563 } 800 }
564 BUG_ON(best_guess >= NR_IRQS);
565 return best_guess; 801 return best_guess;
566} 802}
567 803
804EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
805
806#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
807/*
808 * EISA Edge/Level control register, ELCR
809 */
810static int EISA_ELCR(unsigned int irq)
811{
812 if (irq < 16) {
813 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1;
815 }
816 apic_printk(APIC_VERBOSE, KERN_INFO
817 "Broken MPtable reports ISA irq %d\n", irq);
818 return 0;
819}
820
821#endif
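
The ELCR addressing in EISA_ELCR() is worth spelling out: the two edge/level control registers sit at ports 0x4d0/0x4d1, the port index is irq >> 3 and the bit is irq & 7. A sketch of the arithmetic (no real port I/O; the register value is faked):

	/* Sketch of the ELCR addressing arithmetic. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int irq = 10;
		unsigned int port = 0x4d0 + (irq >> 3);	/* 0x4d1 */
		unsigned int bit  = irq & 7;		/* 2 */
		unsigned char elcr1 = 0x0c;		/* pretend inb(0x4d1) */

		printf("irq %u: port %#x bit %u level=%u\n",
		       irq, port, bit, (elcr1 >> bit) & 1);
		return 0;
	}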
822
568/* ISA interrupts are always polarity zero edge triggered, 823/* ISA interrupts are always polarity zero edge triggered,
569 * when listed as conforming in the MP table. */ 824 * when listed as conforming in the MP table. */
570 825
571#define default_ISA_trigger(idx) (0) 826#define default_ISA_trigger(idx) (0)
572#define default_ISA_polarity(idx) (0) 827#define default_ISA_polarity(idx) (0)
573 828
829/* EISA interrupts are always polarity zero and can be edge or level
830 * trigger depending on the ELCR value. If an interrupt is listed as
831 * EISA conforming in the MP table, that means its trigger type must
832 * be read in from the ELCR */
833
834#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
835#define default_EISA_polarity(idx) default_ISA_polarity(idx)
836
574/* PCI interrupts are always polarity one level triggered, 837/* PCI interrupts are always polarity one level triggered,
575 * when listed as conforming in the MP table. */ 838 * when listed as conforming in the MP table. */
576 839
577#define default_PCI_trigger(idx) (1) 840#define default_PCI_trigger(idx) (1)
578#define default_PCI_polarity(idx) (1) 841#define default_PCI_polarity(idx) (1)
579 842
843/* MCA interrupts are always polarity zero level triggered,
844 * when listed as conforming in the MP table. */
845
846#define default_MCA_trigger(idx) (1)
847#define default_MCA_polarity(idx) default_ISA_polarity(idx)
848
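
Summarizing the defaults encoded by the macros above (a sketch, not kernel code; trigger 0 = edge, 1 = level, polarity values as in the MP spec):

	/* Sketch: the per-bus defaults from the default_*_trigger/polarity
	 * macros. EISA's trigger comes from the ELCR at runtime. */
	#include <stdio.h>

	struct bus_default { const char *bus; int trigger, polarity; };

	int main(void)
	{
		static const struct bus_default defs[] = {
			{ "ISA",  0, 0 },	/* edge, polarity 0 */
			{ "EISA", -1, 0 },	/* trigger read from ELCR */
			{ "PCI",  1, 1 },	/* level, polarity 1 */
			{ "MCA",  1, 0 },	/* level, polarity 0 */
		};
		for (unsigned i = 0; i < sizeof(defs) / sizeof(defs[0]); i++)
			printf("%-4s trigger=%2d polarity=%d\n",
			       defs[i].bus, defs[i].trigger, defs[i].polarity);
		return 0;
	}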
580static int MPBIOS_polarity(int idx) 849static int MPBIOS_polarity(int idx)
581{ 850{
582 int bus = mp_irqs[idx].mp_srcbus; 851 int bus = mp_irqs[idx].mp_srcbus;
@@ -634,6 +903,36 @@ static int MPBIOS_trigger(int idx)
634 trigger = default_ISA_trigger(idx); 903 trigger = default_ISA_trigger(idx);
635 else 904 else
636 trigger = default_PCI_trigger(idx); 905 trigger = default_PCI_trigger(idx);
906#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
907 switch (mp_bus_id_to_type[bus]) {
908 case MP_BUS_ISA: /* ISA pin */
909 {
910 /* set before the switch */
911 break;
912 }
913 case MP_BUS_EISA: /* EISA pin */
914 {
915 trigger = default_EISA_trigger(idx);
916 break;
917 }
918 case MP_BUS_PCI: /* PCI pin */
919 {
920 /* set before the switch */
921 break;
922 }
923 case MP_BUS_MCA: /* MCA pin */
924 {
925 trigger = default_MCA_trigger(idx);
926 break;
927 }
928 default:
929 {
930 printk(KERN_WARNING "broken BIOS!!\n");
931 trigger = 1;
932 break;
933 }
934 }
935#endif
637 break; 936 break;
638 case 1: /* edge */ 937 case 1: /* edge */
639 { 938 {
@@ -671,6 +970,7 @@ static inline int irq_trigger(int idx)
671 return MPBIOS_trigger(idx); 970 return MPBIOS_trigger(idx);
672} 971}
673 972
973int (*ioapic_renumber_irq)(int ioapic, int irq);
674static int pin_2_irq(int idx, int apic, int pin) 974static int pin_2_irq(int idx, int apic, int pin)
675{ 975{
676 int irq, i; 976 int irq, i;
@@ -692,8 +992,32 @@ static int pin_2_irq(int idx, int apic, int pin)
692 while (i < apic) 992 while (i < apic)
693 irq += nr_ioapic_registers[i++]; 993 irq += nr_ioapic_registers[i++];
694 irq += pin; 994 irq += pin;
995 /*
 996	 * For MPS mode, so far only needed by the ES7000 platform
997 */
998 if (ioapic_renumber_irq)
999 irq = ioapic_renumber_irq(apic, irq);
695 } 1000 }
696 BUG_ON(irq >= NR_IRQS); 1001
1002#ifdef CONFIG_X86_32
1003 /*
1004 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1005 */
1006 if ((pin >= 16) && (pin <= 23)) {
1007 if (pirq_entries[pin-16] != -1) {
1008 if (!pirq_entries[pin-16]) {
1009 apic_printk(APIC_VERBOSE, KERN_DEBUG
1010 "disabling PIRQ%d\n", pin-16);
1011 } else {
1012 irq = pirq_entries[pin-16];
1013 apic_printk(APIC_VERBOSE, KERN_DEBUG
1014 "using PIRQ%d -> IRQ %d\n",
1015 pin-16, irq);
1016 }
1017 }
1018 }
1019#endif
1020
697 return irq; 1021 return irq;
698} 1022}
699 1023
@@ -728,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
728 int cpu; 1052 int cpu;
729 struct irq_cfg *cfg; 1053 struct irq_cfg *cfg;
730 1054
731 BUG_ON((unsigned)irq >= NR_IRQS); 1055 cfg = irq_cfg(irq);
732 cfg = &irq_cfg[irq];
733 1056
734 /* Only try and allocate irqs on cpus that are present */ 1057 /* Only try and allocate irqs on cpus that are present */
735 cpus_and(mask, mask, cpu_online_map); 1058 cpus_and(mask, mask, cpu_online_map);
@@ -764,8 +1087,13 @@ next:
764 } 1087 }
765 if (unlikely(current_vector == vector)) 1088 if (unlikely(current_vector == vector))
766 continue; 1089 continue;
1090#ifdef CONFIG_X86_64
767 if (vector == IA32_SYSCALL_VECTOR) 1091 if (vector == IA32_SYSCALL_VECTOR)
768 goto next; 1092 goto next;
1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next;
1096#endif
769 for_each_cpu_mask_nr(new_cpu, new_mask) 1097 for_each_cpu_mask_nr(new_cpu, new_mask)
770 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
771 goto next; 1099 goto next;
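
The search above walks the vector space per destination domain, skipping the syscall vector (IA32_SYSCALL_VECTOR on 64-bit, SYSCALL_VECTOR, i.e. int 0x80, on 32-bit) and any vector already claimed in a target cpu's vector_irq[] table. A simplified single-cpu sketch; the offsets and limits are illustrative, not the kernel's exact policy:

	/* Simplified sketch of the vector search. */
	#include <stdio.h>

	#define FIRST_VECTOR	0x31
	#define LAST_VECTOR	0xee
	#define SYSCALL_VEC	0x80

	static int vector_irq[256];	/* -1 = free */

	static int find_vector(void)
	{
		for (int v = FIRST_VECTOR; v <= LAST_VECTOR; v++) {
			if (v == SYSCALL_VEC)
				continue;	/* never hand out int 0x80 */
			if (vector_irq[v] != -1)
				continue;	/* taken on this cpu */
			return v;
		}
		return -1;	/* -ENOSPC in the real code */
	}

	int main(void)
	{
		for (int i = 0; i < 256; i++)
			vector_irq[i] = -1;
		vector_irq[0x31] = 5;	/* pretend vector 0x31 is busy */
		printf("got vector %#x\n", find_vector());	/* 0x32 */
		return 0;
	}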
@@ -802,8 +1130,7 @@ static void __clear_irq_vector(int irq)
802 cpumask_t mask; 1130 cpumask_t mask;
803 int cpu, vector; 1131 int cpu, vector;
804 1132
805 BUG_ON((unsigned)irq >= NR_IRQS); 1133 cfg = irq_cfg(irq);
806 cfg = &irq_cfg[irq];
807 BUG_ON(!cfg->vector); 1134 BUG_ON(!cfg->vector);
808 1135
809 vector = cfg->vector; 1136 vector = cfg->vector;
@@ -820,12 +1147,13 @@ void __setup_vector_irq(int cpu)
820 /* Initialize vector_irq on a new cpu */ 1147 /* Initialize vector_irq on a new cpu */
821 /* This function must be called with vector_lock held */ 1148 /* This function must be called with vector_lock held */
822 int irq, vector; 1149 int irq, vector;
1150 struct irq_cfg *cfg;
823 1151
824 /* Mark the inuse vectors */ 1152 /* Mark the inuse vectors */
825 for (irq = 0; irq < NR_IRQS; ++irq) { 1153 for_each_irq_cfg(irq, cfg) {
826 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1154 if (!cpu_isset(cpu, cfg->domain))
827 continue; 1155 continue;
828 vector = irq_cfg[irq].vector; 1156 vector = cfg->vector;
829 per_cpu(vector_irq, cpu)[vector] = irq; 1157 per_cpu(vector_irq, cpu)[vector] = irq;
830 } 1158 }
831 /* Mark the free vectors */ 1159 /* Mark the free vectors */
@@ -833,36 +1161,154 @@ void __setup_vector_irq(int cpu)
833 irq = per_cpu(vector_irq, cpu)[vector]; 1161 irq = per_cpu(vector_irq, cpu)[vector];
834 if (irq < 0) 1162 if (irq < 0)
835 continue; 1163 continue;
836 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1164
1165 cfg = irq_cfg(irq);
1166 if (!cpu_isset(cpu, cfg->domain))
837 per_cpu(vector_irq, cpu)[vector] = -1; 1167 per_cpu(vector_irq, cpu)[vector] = -1;
838 } 1168 }
839} 1169}
840 1170
841static struct irq_chip ioapic_chip; 1171static struct irq_chip ioapic_chip;
1172#ifdef CONFIG_INTR_REMAP
1173static struct irq_chip ir_ioapic_chip;
1174#endif
1175
1176#define IOAPIC_AUTO -1
1177#define IOAPIC_EDGE 0
1178#define IOAPIC_LEVEL 1
1179
1180#ifdef CONFIG_X86_32
1181static inline int IO_APIC_irq_trigger(int irq)
1182{
1183 int apic, idx, pin;
1184
1185 for (apic = 0; apic < nr_ioapics; apic++) {
1186 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1187 idx = find_irq_entry(apic, pin, mp_INT);
1188 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1189 return irq_trigger(idx);
1190 }
1191 }
1192 /*
1193 * nonexistent IRQs are edge default
1194 */
1195 return 0;
1196}
1197#else
1198static inline int IO_APIC_irq_trigger(int irq)
1199{
1200 return 1;
1201}
1202#endif
842 1203
843static void ioapic_register_intr(int irq, unsigned long trigger) 1204static void ioapic_register_intr(int irq, unsigned long trigger)
844{ 1205{
845 if (trigger) { 1206 struct irq_desc *desc;
846 irq_desc[irq].status |= IRQ_LEVEL; 1207
1208 desc = irq_to_desc(irq);
1209
1210 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1211 trigger == IOAPIC_LEVEL)
1212 desc->status |= IRQ_LEVEL;
1213 else
1214 desc->status &= ~IRQ_LEVEL;
1215
1216#ifdef CONFIG_INTR_REMAP
1217 if (irq_remapped(irq)) {
1218 desc->status |= IRQ_MOVE_PCNTXT;
1219 if (trigger)
1220 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1221 handle_fasteoi_irq,
1222 "fasteoi");
1223 else
1224 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1225 handle_edge_irq, "edge");
1226 return;
1227 }
1228#endif
1229 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1230 trigger == IOAPIC_LEVEL)
847 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
848 handle_fasteoi_irq, "fasteoi"); 1232 handle_fasteoi_irq,
849 } else { 1233 "fasteoi");
850 irq_desc[irq].status &= ~IRQ_LEVEL; 1234 else
851 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1235 set_irq_chip_and_handler_name(irq, &ioapic_chip,
852 handle_edge_irq, "edge"); 1236 handle_edge_irq, "edge");
1237}
1238
1239static int setup_ioapic_entry(int apic, int irq,
1240 struct IO_APIC_route_entry *entry,
1241 unsigned int destination, int trigger,
1242 int polarity, int vector)
1243{
1244 /*
1245 * add it to the IO-APIC irq-routing table:
1246 */
 1247	memset(entry, 0, sizeof(*entry));
1248
1249#ifdef CONFIG_INTR_REMAP
1250 if (intr_remapping_enabled) {
1251 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
1252 struct irte irte;
1253 struct IR_IO_APIC_route_entry *ir_entry =
1254 (struct IR_IO_APIC_route_entry *) entry;
1255 int index;
1256
1257 if (!iommu)
1258 panic("No mapping iommu for ioapic %d\n", apic);
1259
1260 index = alloc_irte(iommu, irq, 1);
1261 if (index < 0)
1262 panic("Failed to allocate IRTE for ioapic %d\n", apic);
1263
1264 memset(&irte, 0, sizeof(irte));
1265
1266 irte.present = 1;
1267 irte.dst_mode = INT_DEST_MODE;
1268 irte.trigger_mode = trigger;
1269 irte.dlvry_mode = INT_DELIVERY_MODE;
1270 irte.vector = vector;
1271 irte.dest_id = IRTE_DEST(destination);
1272
1273 modify_irte(irq, &irte);
1274
1275 ir_entry->index2 = (index >> 15) & 0x1;
1276 ir_entry->zero = 0;
1277 ir_entry->format = 1;
1278 ir_entry->index = (index & 0x7fff);
1279 } else
1280#endif
1281 {
1282 entry->delivery_mode = INT_DELIVERY_MODE;
1283 entry->dest_mode = INT_DEST_MODE;
1284 entry->dest = destination;
853 } 1285 }
1286
1287 entry->mask = 0; /* enable IRQ */
1288 entry->trigger = trigger;
1289 entry->polarity = polarity;
1290 entry->vector = vector;
1291
1292 /* Mask level triggered irqs.
1293 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1294 */
1295 if (trigger)
1296 entry->mask = 1;
1297 return 0;
854} 1298}
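
In the remapped branch, the IRTE handle returned by alloc_irte() is wider than the RTE's index field, so bit 15 goes into index2 and the low 15 bits into index. A worked sketch of the split (the handle value is hypothetical):

	/* Sketch of the IRTE-handle split done for the IR RTE. */
	#include <stdio.h>

	int main(void)
	{
		unsigned int index = 0x8123;	/* hypothetical alloc_irte() result */
		unsigned int index2 = (index >> 15) & 0x1;
		unsigned int lo = index & 0x7fff;

		printf("index2=%u index=%#x\n", index2, lo);	/* 1, 0x123 */
		return 0;
	}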
855 1299
856static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1300static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
857 int trigger, int polarity) 1301 int trigger, int polarity)
858{ 1302{
859 struct irq_cfg *cfg = irq_cfg + irq; 1303 struct irq_cfg *cfg;
860 struct IO_APIC_route_entry entry; 1304 struct IO_APIC_route_entry entry;
861 cpumask_t mask; 1305 cpumask_t mask;
862 1306
863 if (!IO_APIC_IRQ(irq)) 1307 if (!IO_APIC_IRQ(irq))
864 return; 1308 return;
865 1309
1310 cfg = irq_cfg(irq);
1311
866 mask = TARGET_CPUS; 1312 mask = TARGET_CPUS;
867 if (assign_irq_vector(irq, mask)) 1313 if (assign_irq_vector(irq, mask))
868 return; 1314 return;
@@ -875,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
875 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1321 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
876 irq, trigger, polarity); 1322 irq, trigger, polarity);
877 1323
878 /*
879 * add it to the IO-APIC irq-routing table:
880 */
881 memset(&entry,0,sizeof(entry));
882
883 entry.delivery_mode = INT_DELIVERY_MODE;
884 entry.dest_mode = INT_DEST_MODE;
885 entry.dest = cpu_mask_to_apicid(mask);
886 entry.mask = 0; /* enable IRQ */
887 entry.trigger = trigger;
888 entry.polarity = polarity;
889 entry.vector = cfg->vector;
890 1324
891 /* Mask level triggered irqs. 1325 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
892 * Use IRQ_DELAYED_DISABLE for edge triggered irqs. 1326 cpu_mask_to_apicid(mask), trigger, polarity,
893 */ 1327 cfg->vector)) {
 894	if (trigger)			 1328		printk("Failed to set up ioapic entry for ioapic %d, pin %d\n",
895 entry.mask = 1; 1329 mp_ioapics[apic].mp_apicid, pin);
1330 __clear_irq_vector(irq);
1331 return;
1332 }
896 1333
897 ioapic_register_intr(irq, trigger); 1334 ioapic_register_intr(irq, trigger);
898 if (irq < 16) 1335 if (irq < 16)
@@ -903,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
903 1340
904static void __init setup_IO_APIC_irqs(void) 1341static void __init setup_IO_APIC_irqs(void)
905{ 1342{
906 int apic, pin, idx, irq, first_notcon = 1; 1343 int apic, pin, idx, irq;
1344 int notcon = 0;
907 1345
908 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1346 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
909 1347
910 for (apic = 0; apic < nr_ioapics; apic++) { 1348 for (apic = 0; apic < nr_ioapics; apic++) {
911 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1349 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
912
913 idx = find_irq_entry(apic,pin,mp_INT);
914 if (idx == -1) {
915 if (first_notcon) {
916 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin);
917 first_notcon = 0;
918 } else
919 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin);
920 continue;
921 }
922 if (!first_notcon) {
923 apic_printk(APIC_VERBOSE, " not connected.\n");
924 first_notcon = 1;
925 }
926 1350
927 irq = pin_2_irq(idx, apic, pin); 1351 idx = find_irq_entry(apic, pin, mp_INT);
928 add_pin_to_irq(irq, apic, pin); 1352 if (idx == -1) {
1353 if (!notcon) {
1354 notcon = 1;
1355 apic_printk(APIC_VERBOSE,
1356 KERN_DEBUG " %d-%d",
1357 mp_ioapics[apic].mp_apicid,
1358 pin);
1359 } else
1360 apic_printk(APIC_VERBOSE, " %d-%d",
1361 mp_ioapics[apic].mp_apicid,
1362 pin);
1363 continue;
1364 }
1365 if (notcon) {
1366 apic_printk(APIC_VERBOSE,
1367 " (apicid-pin) not connected\n");
1368 notcon = 0;
1369 }
929 1370
930 setup_IO_APIC_irq(apic, pin, irq, 1371 irq = pin_2_irq(idx, apic, pin);
931 irq_trigger(idx), irq_polarity(idx)); 1372#ifdef CONFIG_X86_32
932 } 1373 if (multi_timer_check(apic, irq))
1374 continue;
1375#endif
1376 add_pin_to_irq(irq, apic, pin);
1377
1378 setup_IO_APIC_irq(apic, pin, irq,
1379 irq_trigger(idx), irq_polarity(idx));
1380 }
933 } 1381 }
934 1382
935 if (!first_notcon) 1383 if (notcon)
936 apic_printk(APIC_VERBOSE, " not connected.\n"); 1384 apic_printk(APIC_VERBOSE,
1385 " (apicid-pin) not connected\n");
937} 1386}
938 1387
939/* 1388/*
@@ -944,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
944{ 1393{
945 struct IO_APIC_route_entry entry; 1394 struct IO_APIC_route_entry entry;
946 1395
1396#ifdef CONFIG_INTR_REMAP
1397 if (intr_remapping_enabled)
1398 return;
1399#endif
1400
947 memset(&entry, 0, sizeof(entry)); 1401 memset(&entry, 0, sizeof(entry));
948 1402
949 /* 1403 /*
@@ -970,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
970 ioapic_write_entry(apic, pin, entry); 1424 ioapic_write_entry(apic, pin, entry);
971} 1425}
972 1426
973void __apicdebuginit print_IO_APIC(void) 1427
1428__apicdebuginit(void) print_IO_APIC(void)
974{ 1429{
975 int apic, i; 1430 int apic, i;
976 union IO_APIC_reg_00 reg_00; 1431 union IO_APIC_reg_00 reg_00;
977 union IO_APIC_reg_01 reg_01; 1432 union IO_APIC_reg_01 reg_01;
978 union IO_APIC_reg_02 reg_02; 1433 union IO_APIC_reg_02 reg_02;
1434 union IO_APIC_reg_03 reg_03;
979 unsigned long flags; 1435 unsigned long flags;
1436 struct irq_cfg *cfg;
1437 unsigned int irq;
980 1438
981 if (apic_verbosity == APIC_QUIET) 1439 if (apic_verbosity == APIC_QUIET)
982 return; 1440 return;
@@ -999,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void)
999 reg_01.raw = io_apic_read(apic, 1); 1457 reg_01.raw = io_apic_read(apic, 1);
1000 if (reg_01.bits.version >= 0x10) 1458 if (reg_01.bits.version >= 0x10)
1001 reg_02.raw = io_apic_read(apic, 2); 1459 reg_02.raw = io_apic_read(apic, 2);
1460 if (reg_01.bits.version >= 0x20)
1461 reg_03.raw = io_apic_read(apic, 3);
1002 spin_unlock_irqrestore(&ioapic_lock, flags); 1462 spin_unlock_irqrestore(&ioapic_lock, flags);
1003 1463
1004 printk("\n"); 1464 printk("\n");
1005 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1465 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1006 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1466 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1007 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1467 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1468 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1469 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1008 1470
1009 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1471 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1010 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1472 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1012,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void)
1012 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1474 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1013 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1475 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1014 1476
1015 if (reg_01.bits.version >= 0x10) { 1477 /*
1478 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
 1479	 * but reg_02 reads back as the previously read register value,
 1480	 * so ignore it if reg_02 == reg_01.
1481 */
1482 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1016 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1483 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1017 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1484 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1018 } 1485 }
1019 1486
1487 /*
1488 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
 1489	 * or reg_03, but reg_0[23] reads back as the previously read
 1490	 * register value, so ignore it if reg_03 == reg_0[12].
1491 */
1492 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1493 reg_03.raw != reg_01.raw) {
1494 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1495 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1496 }
1497
1020 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1498 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1021 1499
1022 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1500 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1045,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void)
1045 } 1523 }
1046 } 1524 }
1047 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1525 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1048 for (i = 0; i < NR_IRQS; i++) { 1526 for_each_irq_cfg(irq, cfg) {
1049 struct irq_pin_list *entry = irq_2_pin + i; 1527 struct irq_pin_list *entry = cfg->irq_2_pin;
1050 if (entry->pin < 0) 1528 if (!entry)
1051 continue; 1529 continue;
1052 printk(KERN_DEBUG "IRQ%d ", i); 1530 printk(KERN_DEBUG "IRQ%d ", irq);
1053 for (;;) { 1531 for (;;) {
1054 printk("-> %d:%d", entry->apic, entry->pin); 1532 printk("-> %d:%d", entry->apic, entry->pin);
1055 if (!entry->next) 1533 if (!entry->next)
1056 break; 1534 break;
1057 entry = irq_2_pin + entry->next; 1535 entry = entry->next;
1058 } 1536 }
1059 printk("\n"); 1537 printk("\n");
1060 } 1538 }
@@ -1064,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void)
1064 return; 1542 return;
1065} 1543}
1066 1544
1067#if 0 1545__apicdebuginit(void) print_APIC_bitfield(int base)
1068
1069static __apicdebuginit void print_APIC_bitfield (int base)
1070{ 1546{
1071 unsigned int v; 1547 unsigned int v;
1072 int i, j; 1548 int i, j;
@@ -1087,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1087 } 1563 }
1088} 1564}
1089 1565
1090void __apicdebuginit print_local_APIC(void * dummy) 1566__apicdebuginit(void) print_local_APIC(void *dummy)
1091{ 1567{
1092 unsigned int v, ver, maxlvt; 1568 unsigned int v, ver, maxlvt;
1569 u64 icr;
1093 1570
1094 if (apic_verbosity == APIC_QUIET) 1571 if (apic_verbosity == APIC_QUIET)
1095 return; 1572 return;
@@ -1097,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1097 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1574 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1098 smp_processor_id(), hard_smp_processor_id()); 1575 smp_processor_id(), hard_smp_processor_id());
1099 v = apic_read(APIC_ID); 1576 v = apic_read(APIC_ID);
1100 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1577 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1101 v = apic_read(APIC_LVR); 1578 v = apic_read(APIC_LVR);
1102 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1579 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1103 ver = GET_APIC_VERSION(v); 1580 ver = GET_APIC_VERSION(v);
@@ -1106,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy)
1106 v = apic_read(APIC_TASKPRI); 1583 v = apic_read(APIC_TASKPRI);
1107 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1584 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1108 1585
1109 v = apic_read(APIC_ARBPRI); 1586 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1110 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, 1587 if (!APIC_XAPIC(ver)) {
1111 v & APIC_ARBPRI_MASK); 1588 v = apic_read(APIC_ARBPRI);
1112 v = apic_read(APIC_PROCPRI); 1589 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1113 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); 1590 v & APIC_ARBPRI_MASK);
1591 }
1592 v = apic_read(APIC_PROCPRI);
1593 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1594 }
1595
1596 /*
1597 * Remote read supported only in the 82489DX and local APIC for
1598 * Pentium processors.
1599 */
1600 if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
1601 v = apic_read(APIC_RRR);
1602 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1603 }
1114 1604
1115 v = apic_read(APIC_EOI);
1116 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1117 v = apic_read(APIC_RRR);
1118 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1119 v = apic_read(APIC_LDR); 1605 v = apic_read(APIC_LDR);
1120 printk(KERN_DEBUG "... APIC LDR: %08x\n", v); 1606 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1121 v = apic_read(APIC_DFR); 1607 if (!x2apic_enabled()) {
1122 printk(KERN_DEBUG "... APIC DFR: %08x\n", v); 1608 v = apic_read(APIC_DFR);
1609 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1610 }
1123 v = apic_read(APIC_SPIV); 1611 v = apic_read(APIC_SPIV);
1124 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1612 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1125 1613
@@ -1130,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1130 printk(KERN_DEBUG "... APIC IRR field:\n"); 1618 printk(KERN_DEBUG "... APIC IRR field:\n");
1131 print_APIC_bitfield(APIC_IRR); 1619 print_APIC_bitfield(APIC_IRR);
1132 1620
1133 v = apic_read(APIC_ESR); 1621 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1134 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1622 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1623 apic_write(APIC_ESR, 0);
1135 1624
1136 v = apic_read(APIC_ICR); 1625 v = apic_read(APIC_ESR);
1137 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1626 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1138 v = apic_read(APIC_ICR2); 1627 }
1139 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); 1628
1629 icr = apic_icr_read();
1630 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1631 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1140 1632
1141 v = apic_read(APIC_LVTT); 1633 v = apic_read(APIC_LVTT);
1142 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1634 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1164,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1164 printk("\n"); 1656 printk("\n");
1165} 1657}
1166 1658
1167void print_all_local_APICs (void) 1659__apicdebuginit(void) print_all_local_APICs(void)
1168{ 1660{
1169 on_each_cpu(print_local_APIC, NULL, 1); 1661 int cpu;
1662
1663 preempt_disable();
1664 for_each_online_cpu(cpu)
1665 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1666 preempt_enable();
1170} 1667}
1171 1668
1172void __apicdebuginit print_PIC(void) 1669__apicdebuginit(void) print_PIC(void)
1173{ 1670{
1174 unsigned int v; 1671 unsigned int v;
1175 unsigned long flags; 1672 unsigned long flags;
@@ -1201,19 +1698,34 @@ void __apicdebuginit print_PIC(void)
1201 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1698 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1202} 1699}
1203 1700
1204#endif /* 0 */ 1701__apicdebuginit(int) print_all_ICs(void)
1702{
1703 print_PIC();
1704 print_all_local_APICs();
1705 print_IO_APIC();
1706
1707 return 0;
1708}
1709
1710fs_initcall(print_all_ICs);
1711
1712
1713/* Where if anywhere is the i8259 connect in external int mode */
1714static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1205 1715
1206void __init enable_IO_APIC(void) 1716void __init enable_IO_APIC(void)
1207{ 1717{
1208 union IO_APIC_reg_01 reg_01; 1718 union IO_APIC_reg_01 reg_01;
1209 int i8259_apic, i8259_pin; 1719 int i8259_apic, i8259_pin;
1210 int i, apic; 1720 int apic;
1211 unsigned long flags; 1721 unsigned long flags;
1212 1722
1213 for (i = 0; i < PIN_MAP_SIZE; i++) { 1723#ifdef CONFIG_X86_32
1214 irq_2_pin[i].pin = -1; 1724 int i;
1215 irq_2_pin[i].next = 0; 1725 if (!pirqs_enabled)
1216 } 1726 for (i = 0; i < MAX_PIRQS; i++)
1727 pirq_entries[i] = -1;
1728#endif
1217 1729
1218 /* 1730 /*
1219 * The number of IO-APIC IRQ registers (== #pins): 1731 * The number of IO-APIC IRQ registers (== #pins):
@@ -1243,6 +1755,10 @@ void __init enable_IO_APIC(void)
1243 } 1755 }
1244 found_i8259: 1756 found_i8259:
 1245	/* Look to see if the MP table has reported the ExtINT */	 1757	/* Look to see if the MP table has reported the ExtINT */
1758 /* If we could not find the appropriate pin by looking at the ioapic
 1759	 * the i8259 probably is not connected to the ioapic, but give the
1760 * mptable a chance anyway.
1761 */
1246 i8259_pin = find_isa_irq_pin(0, mp_ExtINT); 1762 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1247 i8259_apic = find_isa_irq_apic(0, mp_ExtINT); 1763 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1248 /* Trust the MP table if nothing is setup in the hardware */ 1764 /* Trust the MP table if nothing is setup in the hardware */
@@ -1291,7 +1807,7 @@ void disable_IO_APIC(void)
1291 entry.dest_mode = 0; /* Physical */ 1807 entry.dest_mode = 0; /* Physical */
1292 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1808 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1293 entry.vector = 0; 1809 entry.vector = 0;
1294 entry.dest = GET_APIC_ID(read_apic_id()); 1810 entry.dest = read_apic_id();
1295 1811
1296 /* 1812 /*
1297 * Add it to the IO-APIC irq-routing table: 1813 * Add it to the IO-APIC irq-routing table:
@@ -1302,6 +1818,133 @@ void disable_IO_APIC(void)
1302 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1818 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1303} 1819}
1304 1820
1821#ifdef CONFIG_X86_32
1822/*
1823 * function to set the IO-APIC physical IDs based on the
1824 * values stored in the MPC table.
1825 *
1826 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1827 */
1828
1829static void __init setup_ioapic_ids_from_mpc(void)
1830{
1831 union IO_APIC_reg_00 reg_00;
1832 physid_mask_t phys_id_present_map;
1833 int apic;
1834 int i;
1835 unsigned char old_id;
1836 unsigned long flags;
1837
1838 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1839 return;
1840
1841 /*
1842 * Don't check I/O APIC IDs for xAPIC systems. They have
1843 * no meaning without the serial APIC bus.
1844 */
1845 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1846 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1847 return;
1848 /*
1849 * This is broken; anything with a real cpu count has to
1850 * circumvent this idiocy regardless.
1851 */
1852 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1853
1854 /*
1855 * Set the IOAPIC ID to the value stored in the MPC table.
1856 */
1857 for (apic = 0; apic < nr_ioapics; apic++) {
1858
1859 /* Read the register 0 value */
1860 spin_lock_irqsave(&ioapic_lock, flags);
1861 reg_00.raw = io_apic_read(apic, 0);
1862 spin_unlock_irqrestore(&ioapic_lock, flags);
1863
1864 old_id = mp_ioapics[apic].mp_apicid;
1865
1866 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1867 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1868 apic, mp_ioapics[apic].mp_apicid);
1869 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1870 reg_00.bits.ID);
1871 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1872 }
1873
1874 /*
1875 * Sanity check, is the ID really free? Every APIC in a
1876 * system must have a unique ID or we get lots of nice
1877 * 'stuck on smp_invalidate_needed IPI wait' messages.
1878 */
1879 if (check_apicid_used(phys_id_present_map,
1880 mp_ioapics[apic].mp_apicid)) {
1881 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1882 apic, mp_ioapics[apic].mp_apicid);
1883 for (i = 0; i < get_physical_broadcast(); i++)
1884 if (!physid_isset(i, phys_id_present_map))
1885 break;
1886 if (i >= get_physical_broadcast())
1887 panic("Max APIC ID exceeded!\n");
1888 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1889 i);
1890 physid_set(i, phys_id_present_map);
1891 mp_ioapics[apic].mp_apicid = i;
1892 } else {
1893 physid_mask_t tmp;
1894 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1895 apic_printk(APIC_VERBOSE, "Setting %d in the "
1896 "phys_id_present_map\n",
1897 mp_ioapics[apic].mp_apicid);
1898 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1899 }
1900
1901
1902 /*
1903 * We need to adjust the IRQ routing table
1904 * if the ID changed.
1905 */
1906 if (old_id != mp_ioapics[apic].mp_apicid)
1907 for (i = 0; i < mp_irq_entries; i++)
1908 if (mp_irqs[i].mp_dstapic == old_id)
1909 mp_irqs[i].mp_dstapic
1910 = mp_ioapics[apic].mp_apicid;
1911
1912 /*
1913 * Read the right value from the MPC table and
1914 * write it into the ID register.
1915 */
1916 apic_printk(APIC_VERBOSE, KERN_INFO
1917 "...changing IO-APIC physical APIC ID to %d ...",
1918 mp_ioapics[apic].mp_apicid);
1919
1920 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1921 spin_lock_irqsave(&ioapic_lock, flags);
1922 io_apic_write(apic, 0, reg_00.raw);
1923 spin_unlock_irqrestore(&ioapic_lock, flags);
1924
1925 /*
1926 * Sanity check
1927 */
1928 spin_lock_irqsave(&ioapic_lock, flags);
1929 reg_00.raw = io_apic_read(apic, 0);
1930 spin_unlock_irqrestore(&ioapic_lock, flags);
1931 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1932 printk("could not set ID!\n");
1933 else
1934 apic_printk(APIC_VERBOSE, " ok.\n");
1935 }
1936}
1937#endif
1938
1939int no_timer_check __initdata;
1940
1941static int __init notimercheck(char *s)
1942{
1943 no_timer_check = 1;
1944 return 1;
1945}
1946__setup("no_timer_check", notimercheck);
1947
1305/* 1948/*
1306 * There is a nasty bug in some older SMP boards, their mptable lies 1949 * There is a nasty bug in some older SMP boards, their mptable lies
1307 * about the timer IRQ. We do the following to work around the situation: 1950 * about the timer IRQ. We do the following to work around the situation:
@@ -1315,6 +1958,9 @@ static int __init timer_irq_works(void)
1315 unsigned long t1 = jiffies; 1958 unsigned long t1 = jiffies;
1316 unsigned long flags; 1959 unsigned long flags;
1317 1960
1961 if (no_timer_check)
1962 return 1;
1963
1318 local_save_flags(flags); 1964 local_save_flags(flags);
1319 local_irq_enable(); 1965 local_irq_enable();
1320 /* Let ten ticks pass... */ 1966 /* Let ten ticks pass... */
@@ -1375,9 +2021,11 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1375 return was_pending; 2021 return was_pending;
1376} 2022}
1377 2023
2024#ifdef CONFIG_X86_64
1378static int ioapic_retrigger_irq(unsigned int irq) 2025static int ioapic_retrigger_irq(unsigned int irq)
1379{ 2026{
1380 struct irq_cfg *cfg = &irq_cfg[irq]; 2027
2028 struct irq_cfg *cfg = irq_cfg(irq);
1381 unsigned long flags; 2029 unsigned long flags;
1382 2030
1383 spin_lock_irqsave(&vector_lock, flags); 2031 spin_lock_irqsave(&vector_lock, flags);
@@ -1386,6 +2034,14 @@ static int ioapic_retrigger_irq(unsigned int irq)
1386 2034
1387 return 1; 2035 return 1;
1388} 2036}
2037#else
2038static int ioapic_retrigger_irq(unsigned int irq)
2039{
2040 send_IPI_self(irq_cfg(irq)->vector);
2041
2042 return 1;
2043}
2044#endif
1389 2045
1390/* 2046/*
1391 * Level and edge triggered IO-APIC interrupts need different handling, 2047 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1397,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq)
1397 */ 2053 */
1398 2054
1399#ifdef CONFIG_SMP 2055#ifdef CONFIG_SMP
2056
2057#ifdef CONFIG_INTR_REMAP
2058static void ir_irq_migration(struct work_struct *work);
2059
2060static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2061
2062/*
2063 * Migrate the IO-APIC irq in the presence of intr-remapping.
2064 *
 2065 * For edge triggered, irq migration is a simple atomic update (of vector
 2066 * and cpu destination) of the IRTE and a flush of the hardware cache.
 2067 *
 2068 * For level triggered, we need to modify the io-apic RTE as well with the
 2069 * updated vector information, along with modifying the IRTE with vector and
 2070 * destination. So irq migration for level triggered is a little more complex
 2071 * than edge triggered migration. The good news is that we use the same
 2072 * algorithm for level triggered migration as we have today, the only
 2073 * difference being that we now initiate the irq migration from process
 2074 * context instead of interrupt context.
2075 *
 2076 * In the future, when we do a directed EOI (combined with cpu EOI broadcast
2077 * suppression) to the IO-APIC, level triggered irq migration will also be
2078 * as simple as edge triggered migration and we can do the irq migration
2079 * with a simple atomic update to IO-APIC RTE.
2080 */
2081static void migrate_ioapic_irq(int irq, cpumask_t mask)
2082{
2083 struct irq_cfg *cfg;
2084 struct irq_desc *desc;
2085 cpumask_t tmp, cleanup_mask;
2086 struct irte irte;
2087 int modify_ioapic_rte;
2088 unsigned int dest;
2089 unsigned long flags;
2090
2091 cpus_and(tmp, mask, cpu_online_map);
2092 if (cpus_empty(tmp))
2093 return;
2094
2095 if (get_irte(irq, &irte))
2096 return;
2097
2098 if (assign_irq_vector(irq, mask))
2099 return;
2100
2101 cfg = irq_cfg(irq);
2102 cpus_and(tmp, cfg->domain, mask);
2103 dest = cpu_mask_to_apicid(tmp);
2104
2105 desc = irq_to_desc(irq);
2106 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2107 if (modify_ioapic_rte) {
2108 spin_lock_irqsave(&ioapic_lock, flags);
2109 __target_IO_APIC_irq(irq, dest, cfg->vector);
2110 spin_unlock_irqrestore(&ioapic_lock, flags);
2111 }
2112
2113 irte.vector = cfg->vector;
2114 irte.dest_id = IRTE_DEST(dest);
2115
2116 /*
 2117	 * Modify the IRTE and flush the interrupt entry cache.
2118 */
2119 modify_irte(irq, &irte);
2120
2121 if (cfg->move_in_progress) {
2122 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2123 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2124 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2125 cfg->move_in_progress = 0;
2126 }
2127
2128 desc->affinity = mask;
2129}
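
The cleanup bookkeeping at the end deserves a note: cpus that were in the old domain but are no longer targeted still hold the stale vector, so the cleanup IPI goes to old_domain & cpu_online_map and move_cleanup_count is set to the weight of that mask. A sketch of the accounting with plain bitmasks:

	/* Sketch: count the online cpus in the old domain; each one
	 * decrements the count when it handles IRQ_MOVE_CLEANUP_VECTOR. */
	#include <stdio.h>

	int main(void)
	{
		unsigned long old_domain = 0x0f;	/* cpus 0-3 */
		unsigned long online     = 0x0d;	/* cpu 1 offline */
		unsigned long cleanup    = old_domain & online;
		int count = __builtin_popcountl(cleanup);

		printf("cleanup mask %#lx, move_cleanup_count=%d\n",
		       cleanup, count);
		return 0;
	}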
2130
2131static int migrate_irq_remapped_level(int irq)
2132{
2133 int ret = -1;
2134 struct irq_desc *desc = irq_to_desc(irq);
2135
2136 mask_IO_APIC_irq(irq);
2137
2138 if (io_apic_level_ack_pending(irq)) {
2139 /*
2140 * Interrupt in progress. Migrating irq now will change the
2141 * vector information in the IO-APIC RTE and that will confuse
 2142		 * the EOI broadcast performed by the cpu.
2143 * So, delay the irq migration to the next instance.
2144 */
2145 schedule_delayed_work(&ir_migration_work, 1);
2146 goto unmask;
2147 }
2148
 2149	/* everything is clear, we have right of way */
2150 migrate_ioapic_irq(irq, desc->pending_mask);
2151
2152 ret = 0;
2153 desc->status &= ~IRQ_MOVE_PENDING;
2154 cpus_clear(desc->pending_mask);
2155
2156unmask:
2157 unmask_IO_APIC_irq(irq);
2158 return ret;
2159}
2160
2161static void ir_irq_migration(struct work_struct *work)
2162{
2163 unsigned int irq;
2164 struct irq_desc *desc;
2165
2166 for_each_irq_desc(irq, desc) {
2167 if (desc->status & IRQ_MOVE_PENDING) {
2168 unsigned long flags;
2169
2170 spin_lock_irqsave(&desc->lock, flags);
2171 if (!desc->chip->set_affinity ||
2172 !(desc->status & IRQ_MOVE_PENDING)) {
2173 desc->status &= ~IRQ_MOVE_PENDING;
2174 spin_unlock_irqrestore(&desc->lock, flags);
2175 continue;
2176 }
2177
2178 desc->chip->set_affinity(irq, desc->pending_mask);
2179 spin_unlock_irqrestore(&desc->lock, flags);
2180 }
2181 }
2182}
2183
2184/*
 2185 * Migrate the IRQ destination in process context.
2186 */
2187static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2188{
2189 struct irq_desc *desc = irq_to_desc(irq);
2190
2191 if (desc->status & IRQ_LEVEL) {
2192 desc->status |= IRQ_MOVE_PENDING;
2193 desc->pending_mask = mask;
2194 migrate_irq_remapped_level(irq);
2195 return;
2196 }
2197
2198 migrate_ioapic_irq(irq, mask);
2199}
2200#endif
2201
1400asmlinkage void smp_irq_move_cleanup_interrupt(void) 2202asmlinkage void smp_irq_move_cleanup_interrupt(void)
1401{ 2203{
1402 unsigned vector, me; 2204 unsigned vector, me;
1403 ack_APIC_irq(); 2205 ack_APIC_irq();
2206#ifdef CONFIG_X86_64
1404 exit_idle(); 2207 exit_idle();
2208#endif
1405 irq_enter(); 2209 irq_enter();
1406 2210
1407 me = smp_processor_id(); 2211 me = smp_processor_id();
@@ -1410,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
1410 struct irq_desc *desc; 2214 struct irq_desc *desc;
1411 struct irq_cfg *cfg; 2215 struct irq_cfg *cfg;
1412 irq = __get_cpu_var(vector_irq)[vector]; 2216 irq = __get_cpu_var(vector_irq)[vector];
1413 if (irq >= NR_IRQS) 2217
2218 desc = irq_to_desc(irq);
2219 if (!desc)
1414 continue; 2220 continue;
1415 2221
1416 desc = irq_desc + irq; 2222 cfg = irq_cfg(irq);
1417 cfg = irq_cfg + irq;
1418 spin_lock(&desc->lock); 2223 spin_lock(&desc->lock);
1419 if (!cfg->move_cleanup_count) 2224 if (!cfg->move_cleanup_count)
1420 goto unlock; 2225 goto unlock;
@@ -1433,7 +2238,7 @@ unlock:
1433 2238
1434static void irq_complete_move(unsigned int irq) 2239static void irq_complete_move(unsigned int irq)
1435{ 2240{
1436 struct irq_cfg *cfg = irq_cfg + irq; 2241 struct irq_cfg *cfg = irq_cfg(irq);
1437 unsigned vector, me; 2242 unsigned vector, me;
1438 2243
1439 if (likely(!cfg->move_in_progress)) 2244 if (likely(!cfg->move_in_progress))
@@ -1453,6 +2258,17 @@ static void irq_complete_move(unsigned int irq)
1453#else 2258#else
1454static inline void irq_complete_move(unsigned int irq) {} 2259static inline void irq_complete_move(unsigned int irq) {}
1455#endif 2260#endif
2261#ifdef CONFIG_INTR_REMAP
2262static void ack_x2apic_level(unsigned int irq)
2263{
2264 ack_x2APIC_irq();
2265}
2266
2267static void ack_x2apic_edge(unsigned int irq)
2268{
2269 ack_x2APIC_irq();
2270}
2271#endif
1456 2272
1457static void ack_apic_edge(unsigned int irq) 2273static void ack_apic_edge(unsigned int irq)
1458{ 2274{
@@ -1461,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
1461 ack_APIC_irq(); 2277 ack_APIC_irq();
1462} 2278}
1463 2279
2280atomic_t irq_mis_count;
2281
1464static void ack_apic_level(unsigned int irq) 2282static void ack_apic_level(unsigned int irq)
1465{ 2283{
2284#ifdef CONFIG_X86_32
2285 unsigned long v;
2286 int i;
2287#endif
1466 int do_unmask_irq = 0; 2288 int do_unmask_irq = 0;
1467 2289
1468 irq_complete_move(irq); 2290 irq_complete_move(irq);
1469#ifdef CONFIG_GENERIC_PENDING_IRQ 2291#ifdef CONFIG_GENERIC_PENDING_IRQ
1470 /* If we are moving the irq we need to mask it */ 2292 /* If we are moving the irq we need to mask it */
1471 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 2293 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
1472 do_unmask_irq = 1; 2294 do_unmask_irq = 1;
1473 mask_IO_APIC_irq(irq); 2295 mask_IO_APIC_irq(irq);
1474 } 2296 }
1475#endif 2297#endif
1476 2298
2299#ifdef CONFIG_X86_32
2300 /*
2301 * It appears there is an erratum which affects at least version 0x11
2302 * of I/O APIC (that's the 82093AA and cores integrated into various
2303 * chipsets). Under certain conditions a level-triggered interrupt is
 2304	 * erroneously delivered as an edge-triggered one but the respective IRR
2305 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2306 * message but it will never arrive and further interrupts are blocked
2307 * from the source. The exact reason is so far unknown, but the
2308 * phenomenon was observed when two consecutive interrupt requests
2309 * from a given source get delivered to the same CPU and the source is
2310 * temporarily disabled in between.
2311 *
2312 * A workaround is to simulate an EOI message manually. We achieve it
2313 * by setting the trigger mode to edge and then to level when the edge
2314 * trigger mode gets detected in the TMR of a local APIC for a
2315 * level-triggered interrupt. We mask the source for the time of the
2316 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2317 * The idea is from Manfred Spraul. --macro
2318 */
2319 i = irq_cfg(irq)->vector;
2320
2321 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2322#endif
2323
1477 /* 2324 /*
1478 * We must acknowledge the irq before we move it or the acknowledge will 2325 * We must acknowledge the irq before we move it or the acknowledge will
1479 * not propagate properly. 2326 * not propagate properly.
@@ -1512,24 +2359,51 @@ static void ack_apic_level(unsigned int irq)
1512 move_masked_irq(irq); 2359 move_masked_irq(irq);
1513 unmask_IO_APIC_irq(irq); 2360 unmask_IO_APIC_irq(irq);
1514 } 2361 }
2362
2363#ifdef CONFIG_X86_32
2364 if (!(v & (1 << (i & 0x1f)))) {
2365 atomic_inc(&irq_mis_count);
2366 spin_lock(&ioapic_lock);
2367 __mask_and_edge_IO_APIC_irq(irq);
2368 __unmask_and_level_IO_APIC_irq(irq);
2369 spin_unlock(&ioapic_lock);
2370 }
2371#endif
1515} 2372}
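
For the 32-bit erratum path above, the TMR bit for a vector lives at APIC_TMR + ((vector & ~0x1f) >> 1), bit vector & 0x1f (one 32-bit register per 16-byte stride). A sketch of the lookup and the simulated-EOI decision; the apic read and the two RTE flips are stubbed out:

	/* Sketch of the erratum check: locate the TMR bit for a vector and,
	 * if the level IRQ was latched as edge, replay the
	 * masked-edge -> unmasked-level dance to simulate the missing EOI. */
	#include <stdio.h>

	#define APIC_TMR	0x180

	static void mask_and_edge(void)    { /* set MASKED, clear LEVEL in RTE */ }
	static void unmask_and_level(void) { /* clear MASKED, set LEVEL in RTE */ }

	int main(void)
	{
		unsigned int vector = 0x61;
		unsigned int reg = APIC_TMR + ((vector & ~0x1f) >> 1);	/* 0x1b0 */
		unsigned int bit = vector & 0x1f;			/* 1 */
		unsigned int tmr = 0;	/* pretend apic_read(reg) */

		printf("TMR reg %#x bit %u\n", reg, bit);
		if (!(tmr & (1u << bit))) {
			/* level IRQ latched as edge: simulate the EOI */
			mask_and_edge();
			unmask_and_level();
		}
		return 0;
	}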
1516 2373
1517static struct irq_chip ioapic_chip __read_mostly = { 2374static struct irq_chip ioapic_chip __read_mostly = {
1518 .name = "IO-APIC", 2375 .name = "IO-APIC",
1519 .startup = startup_ioapic_irq, 2376 .startup = startup_ioapic_irq,
1520 .mask = mask_IO_APIC_irq, 2377 .mask = mask_IO_APIC_irq,
1521 .unmask = unmask_IO_APIC_irq, 2378 .unmask = unmask_IO_APIC_irq,
1522 .ack = ack_apic_edge, 2379 .ack = ack_apic_edge,
1523 .eoi = ack_apic_level, 2380 .eoi = ack_apic_level,
2381#ifdef CONFIG_SMP
2382 .set_affinity = set_ioapic_affinity_irq,
2383#endif
2384 .retrigger = ioapic_retrigger_irq,
2385};
2386
2387#ifdef CONFIG_INTR_REMAP
2388static struct irq_chip ir_ioapic_chip __read_mostly = {
2389 .name = "IR-IO-APIC",
2390 .startup = startup_ioapic_irq,
2391 .mask = mask_IO_APIC_irq,
2392 .unmask = unmask_IO_APIC_irq,
2393 .ack = ack_x2apic_edge,
2394 .eoi = ack_x2apic_level,
1524#ifdef CONFIG_SMP 2395#ifdef CONFIG_SMP
1525 .set_affinity = set_ioapic_affinity_irq, 2396 .set_affinity = set_ir_ioapic_affinity_irq,
1526#endif 2397#endif
1527 .retrigger = ioapic_retrigger_irq, 2398 .retrigger = ioapic_retrigger_irq,
1528}; 2399};
2400#endif
1529 2401
1530static inline void init_IO_APIC_traps(void) 2402static inline void init_IO_APIC_traps(void)
1531{ 2403{
1532 int irq; 2404 int irq;
2405 struct irq_desc *desc;
2406 struct irq_cfg *cfg;
1533 2407
1534 /* 2408 /*
1535 * NOTE! The local APIC isn't very good at handling 2409 * NOTE! The local APIC isn't very good at handling
@@ -1542,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
1542 * Also, we've got to be careful not to trash gate 2416 * Also, we've got to be careful not to trash gate
1543 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2417 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1544 */ 2418 */
1545 for (irq = 0; irq < NR_IRQS ; irq++) { 2419 for_each_irq_cfg(irq, cfg) {
1546 if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { 2420 if (IO_APIC_IRQ(irq) && !cfg->vector) {
1547 /* 2421 /*
1548 * Hmm.. We don't have an entry for this, 2422 * Hmm.. We don't have an entry for this,
1549 * so default to an old-fashioned 8259 2423 * so default to an old-fashioned 8259
@@ -1551,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
1551 */ 2425 */
1552 if (irq < 16) 2426 if (irq < 16)
1553 make_8259A_irq(irq); 2427 make_8259A_irq(irq);
1554 else 2428 else {
2429 desc = irq_to_desc(irq);
1555 /* Strange. Oh, well.. */ 2430 /* Strange. Oh, well.. */
1556 irq_desc[irq].chip = &no_irq_chip; 2431 desc->chip = &no_irq_chip;
2432 }
1557 } 2433 }
1558 } 2434 }
1559} 2435}
1560 2436
1561static void unmask_lapic_irq(unsigned int irq) 2437/*
2438 * The local APIC irq-chip implementation:
2439 */
2440
2441static void mask_lapic_irq(unsigned int irq)
1562{ 2442{
1563 unsigned long v; 2443 unsigned long v;
1564 2444
1565 v = apic_read(APIC_LVT0); 2445 v = apic_read(APIC_LVT0);
1566 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2446 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1567} 2447}
1568 2448
1569static void mask_lapic_irq(unsigned int irq) 2449static void unmask_lapic_irq(unsigned int irq)
1570{ 2450{
1571 unsigned long v; 2451 unsigned long v;
1572 2452
1573 v = apic_read(APIC_LVT0); 2453 v = apic_read(APIC_LVT0);
1574 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2454 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1575} 2455}
1576 2456
1577static void ack_lapic_irq (unsigned int irq) 2457static void ack_lapic_irq (unsigned int irq)
@@ -1588,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
1588 2468
1589static void lapic_register_intr(int irq) 2469static void lapic_register_intr(int irq)
1590{ 2470{
1591 irq_desc[irq].status &= ~IRQ_LEVEL; 2471 struct irq_desc *desc;
2472
2473 desc = irq_to_desc(irq);
2474 desc->status &= ~IRQ_LEVEL;
1592 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2475 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1593 "edge"); 2476 "edge");
1594} 2477}
@@ -1596,19 +2479,19 @@ static void lapic_register_intr(int irq)
1596static void __init setup_nmi(void) 2479static void __init setup_nmi(void)
1597{ 2480{
1598 /* 2481 /*
1599 * Dirty trick to enable the NMI watchdog ... 2482 * Dirty trick to enable the NMI watchdog ...
1600 * We put the 8259A master into AEOI mode and 2483 * We put the 8259A master into AEOI mode and
1601 * unmask on all local APICs LVT0 as NMI. 2484 * unmask on all local APICs LVT0 as NMI.
1602 * 2485 *
1603 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2486 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1604 * is from Maciej W. Rozycki - so we do not have to EOI from 2487 * is from Maciej W. Rozycki - so we do not have to EOI from
1605 * the NMI handler or the timer interrupt. 2488 * the NMI handler or the timer interrupt.
1606 */ 2489 */
1607 printk(KERN_INFO "activating NMI Watchdog ..."); 2490 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
1608 2491
1609 enable_NMI_through_LVT0(); 2492 enable_NMI_through_LVT0();
1610 2493
1611 printk(" done.\n"); 2494 apic_printk(APIC_VERBOSE, " done.\n");
1612} 2495}
1613 2496
1614/* 2497/*
@@ -1625,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
1625 unsigned char save_control, save_freq_select; 2508 unsigned char save_control, save_freq_select;
1626 2509
1627 pin = find_isa_irq_pin(8, mp_INT); 2510 pin = find_isa_irq_pin(8, mp_INT);
2511 if (pin == -1) {
2512 WARN_ON_ONCE(1);
2513 return;
2514 }
1628 apic = find_isa_irq_apic(8, mp_INT); 2515 apic = find_isa_irq_apic(8, mp_INT);
1629 if (pin == -1) 2516 if (apic == -1) {
2517 WARN_ON_ONCE(1);
1630 return; 2518 return;
2519 }
1631 2520
1632 entry0 = ioapic_read_entry(apic, pin); 2521 entry0 = ioapic_read_entry(apic, pin);
1633
1634 clear_IO_APIC_pin(apic, pin); 2522 clear_IO_APIC_pin(apic, pin);
1635 2523
1636 memset(&entry1, 0, sizeof(entry1)); 2524 memset(&entry1, 0, sizeof(entry1));
@@ -1665,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
1665 ioapic_write_entry(apic, pin, entry0); 2553 ioapic_write_entry(apic, pin, entry0);
1666} 2554}
1667 2555
2556static int disable_timer_pin_1 __initdata;
2557/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
2558static int __init disable_timer_pin_setup(char *arg)
2559{
2560 disable_timer_pin_1 = 1;
2561 return 0;
2562}
2563early_param("disable_timer_pin_1", disable_timer_pin_setup);
2564
2565int timer_through_8259 __initdata;
2566
1668/* 2567/*
1669 * This code may look a bit paranoid, but it's supposed to cooperate with 2568 * This code may look a bit paranoid, but it's supposed to cooperate with
1670 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ 2569 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1671 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2570 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1672 * fanatically on his truly buggy board. 2571 * fanatically on his truly buggy board.
1673 * 2572 *
1674 * FIXME: really need to revamp this for modern platforms only. 2573 * FIXME: really need to revamp this for all platforms.
1675 */ 2574 */
1676static inline void __init check_timer(void) 2575static inline void __init check_timer(void)
1677{ 2576{
1678 struct irq_cfg *cfg = irq_cfg + 0; 2577 struct irq_cfg *cfg = irq_cfg(0);
1679 int apic1, pin1, apic2, pin2; 2578 int apic1, pin1, apic2, pin2;
1680 unsigned long flags; 2579 unsigned long flags;
2580 unsigned int ver;
1681 int no_pin1 = 0; 2581 int no_pin1 = 0;
1682 2582
1683 local_irq_save(flags); 2583 local_irq_save(flags);
1684 2584
2585 ver = apic_read(APIC_LVR);
2586 ver = GET_APIC_VERSION(ver);
2587
1685 /* 2588 /*
1686 * get/set the timer IRQ vector: 2589 * get/set the timer IRQ vector:
1687 */ 2590 */
@@ -1690,10 +2593,18 @@ static inline void __init check_timer(void)
1690 2593
1691 /* 2594 /*
1692 * As IRQ0 is to be enabled in the 8259A, the virtual 2595 * As IRQ0 is to be enabled in the 8259A, the virtual
1693 * wire has to be disabled in the local APIC. 2596 * wire has to be disabled in the local APIC. Also
2597 * timer interrupts need to be acknowledged manually in
2598 * the 8259A for the i82489DX when using the NMI
2599 * watchdog as that APIC treats NMIs as level-triggered.
2600 * The AEOI mode will finish them in the 8259A
2601 * automatically.
1694 */ 2602 */
1695 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2603 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1696 init_8259A(1); 2604 init_8259A(1);
2605#ifdef CONFIG_X86_32
2606 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2607#endif
1697 2608
1698 pin1 = find_isa_irq_pin(0, mp_INT); 2609 pin1 = find_isa_irq_pin(0, mp_INT);
1699 apic1 = find_isa_irq_apic(0, mp_INT); 2610 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -1712,6 +2623,10 @@ static inline void __init check_timer(void)
1712 * 8259A. 2623 * 8259A.
1713 */ 2624 */
1714 if (pin1 == -1) { 2625 if (pin1 == -1) {
2626#ifdef CONFIG_INTR_REMAP
2627 if (intr_remapping_enabled)
2628 panic("BIOS bug: timer not connected to IO-APIC");
2629#endif
1715 pin1 = pin2; 2630 pin1 = pin2;
1716 apic1 = apic2; 2631 apic1 = apic2;
1717 no_pin1 = 1; 2632 no_pin1 = 1;
@@ -1729,7 +2644,7 @@ static inline void __init check_timer(void)
1729 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2644 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
1730 } 2645 }
1731 unmask_IO_APIC_irq(0); 2646 unmask_IO_APIC_irq(0);
1732 if (!no_timer_check && timer_irq_works()) { 2647 if (timer_irq_works()) {
1733 if (nmi_watchdog == NMI_IO_APIC) { 2648 if (nmi_watchdog == NMI_IO_APIC) {
1734 setup_nmi(); 2649 setup_nmi();
1735 enable_8259A_irq(0); 2650 enable_8259A_irq(0);
@@ -1738,6 +2653,10 @@ static inline void __init check_timer(void)
1738 clear_IO_APIC_pin(0, pin1); 2653 clear_IO_APIC_pin(0, pin1);
1739 goto out; 2654 goto out;
1740 } 2655 }
2656#ifdef CONFIG_INTR_REMAP
2657 if (intr_remapping_enabled)
2658 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2659#endif
1741 clear_IO_APIC_pin(apic1, pin1); 2660 clear_IO_APIC_pin(apic1, pin1);
1742 if (!no_pin1) 2661 if (!no_pin1)
1743 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " 2662 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
@@ -1777,6 +2696,9 @@ static inline void __init check_timer(void)
1777 "through the IO-APIC - disabling NMI Watchdog!\n"); 2696 "through the IO-APIC - disabling NMI Watchdog!\n");
1778 nmi_watchdog = NMI_NONE; 2697 nmi_watchdog = NMI_NONE;
1779 } 2698 }
2699#ifdef CONFIG_X86_32
2700 timer_ack = 0;
2701#endif
1780 2702
1781 apic_printk(APIC_QUIET, KERN_INFO 2703 apic_printk(APIC_QUIET, KERN_INFO
1782 "...trying to set up timer as Virtual Wire IRQ...\n"); 2704 "...trying to set up timer as Virtual Wire IRQ...\n");
@@ -1813,13 +2735,6 @@ out:
1813 local_irq_restore(flags); 2735 local_irq_restore(flags);
1814} 2736}
1815 2737
1816static int __init notimercheck(char *s)
1817{
1818 no_timer_check = 1;
1819 return 1;
1820}
1821__setup("no_timer_check", notimercheck);
1822
1823/* 2738/*
1824 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available 2739 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1825 * to devices. However there may be an I/O APIC pin available for 2740 * to devices. However there may be an I/O APIC pin available for
@@ -1837,27 +2752,49 @@ __setup("no_timer_check", notimercheck);
1837 * the I/O APIC in all cases now. No actual device should request 2752 * the I/O APIC in all cases now. No actual device should request
1838 * it anyway. --macro 2753 * it anyway. --macro
1839 */ 2754 */
1840#define PIC_IRQS (1<<2) 2755#define PIC_IRQS (1 << PIC_CASCADE_IR)
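/*
 * For illustration: PIC_CASCADE_IR is 2, so the new spelling expands to
 * the same mask as the old literal (1 << 2). setup_IO_APIC() below thus
 * still ends up with io_apic_irqs = ~PIC_IRQS, which clears only bit 2:
 * every IRQ except the 8259A cascade is owned by the IO-APIC.
 */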
1841 2756
1842void __init setup_IO_APIC(void) 2757void __init setup_IO_APIC(void)
1843{ 2758{
1844 2759
2760#ifdef CONFIG_X86_32
2761 enable_IO_APIC();
2762#else
1845 /* 2763 /*
1846 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 2764 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1847 */ 2765 */
2766#endif
1848 2767
1849 io_apic_irqs = ~PIC_IRQS; 2768 io_apic_irqs = ~PIC_IRQS;
1850 2769
1851 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 2770 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1852 2771 /*
2772 * Set up IO-APIC IRQ routing.
2773 */
2774#ifdef CONFIG_X86_32
2775 if (!acpi_ioapic)
2776 setup_ioapic_ids_from_mpc();
2777#endif
1853 sync_Arb_IDs(); 2778 sync_Arb_IDs();
1854 setup_IO_APIC_irqs(); 2779 setup_IO_APIC_irqs();
1855 init_IO_APIC_traps(); 2780 init_IO_APIC_traps();
1856 check_timer(); 2781 check_timer();
1857 if (!acpi_ioapic)
1858 print_IO_APIC();
1859} 2782}
1860 2783
2784/*
2785 * Called after all the initialization is done. If we didn't find any

2786 * APIC bugs then we can allow the modify fast path
2787 */
2788
2789static int __init io_apic_bug_finalize(void)
2790{
2791 if (sis_apic_bug == -1)
2792 sis_apic_bug = 0;
2793 return 0;
2794}
2795
2796late_initcall(io_apic_bug_finalize);
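/*
 * A minimal sketch of the fast/slow split this initcall enables, in the
 * style of the 32-bit io_apic_modify() further below: once sis_apic_bug
 * has settled to 0, read-modify-write cycles can skip re-selecting the
 * index register (hypothetical helper, illustrative only).
 */
static inline void io_apic_modify_sketch(struct io_apic __iomem *io_apic,
					 unsigned int reg, unsigned int value)
{
	if (sis_apic_bug)
		writel(reg, &io_apic->index);	/* buggy SiS: re-select index */
	writel(value, &io_apic->data);		/* fast path: data write only */
}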
2797
1861struct sysfs_ioapic_data { 2798struct sysfs_ioapic_data {
1862 struct sys_device dev; 2799 struct sys_device dev;
1863 struct IO_APIC_route_entry entry[0]; 2800 struct IO_APIC_route_entry entry[0];
@@ -1945,38 +2882,60 @@ device_initcall(ioapic_init_sysfs);
1945/* 2882/*
1946 * Dynamic irq allocate and deallocation 2883 * Dynamic irq allocate and deallocation
1947 */ 2884 */
1948int create_irq(void) 2885unsigned int create_irq_nr(unsigned int irq_want)
1949{ 2886{
1950 /* Allocate an unused irq */ 2887 /* Allocate an unused irq */
1951 int irq; 2888 unsigned int irq;
1952 int new; 2889 unsigned int new;
1953 unsigned long flags; 2890 unsigned long flags;
2891 struct irq_cfg *cfg_new;
1954 2892
1955 irq = -ENOSPC; 2893 irq_want = nr_irqs - 1;
2894
2895 irq = 0;
1956 spin_lock_irqsave(&vector_lock, flags); 2896 spin_lock_irqsave(&vector_lock, flags);
1957 for (new = (NR_IRQS - 1); new >= 0; new--) { 2897 for (new = irq_want; new > 0; new--) {
1958 if (platform_legacy_irq(new)) 2898 if (platform_legacy_irq(new))
1959 continue; 2899 continue;
1960 if (irq_cfg[new].vector != 0) 2900 cfg_new = irq_cfg(new);
2901 if (cfg_new && cfg_new->vector != 0)
1961 continue; 2902 continue;
2903 /* check if need to create one */
2904 if (!cfg_new)
2905 cfg_new = irq_cfg_alloc(new);
1962 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 2906 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1963 irq = new; 2907 irq = new;
1964 break; 2908 break;
1965 } 2909 }
1966 spin_unlock_irqrestore(&vector_lock, flags); 2910 spin_unlock_irqrestore(&vector_lock, flags);
1967 2911
1968 if (irq >= 0) { 2912 if (irq > 0) {
1969 dynamic_irq_init(irq); 2913 dynamic_irq_init(irq);
1970 } 2914 }
1971 return irq; 2915 return irq;
1972} 2916}
1973 2917
2918int create_irq(void)
2919{
2920 int irq;
2921
2922 irq = create_irq_nr(nr_irqs - 1);
2923
2924 if (irq == 0)
2925 irq = -1;
2926
2927 return irq;
2928}
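/*
 * Hypothetical usage of the allocator pair above (illustrative only;
 * note that in this version create_irq_nr() overrides its irq_want
 * hint with nr_irqs - 1 before scanning):
 */
static int grab_dynamic_irq(void)
{
	unsigned int irq = create_irq_nr(nr_irqs - 1);

	return irq ? (int)irq : -ENOSPC;	/* 0 now means "none available" */
}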
2929
1974void destroy_irq(unsigned int irq) 2930void destroy_irq(unsigned int irq)
1975{ 2931{
1976 unsigned long flags; 2932 unsigned long flags;
1977 2933
1978 dynamic_irq_cleanup(irq); 2934 dynamic_irq_cleanup(irq);
1979 2935
2936#ifdef CONFIG_INTR_REMAP
2937 free_irte(irq);
2938#endif
1980 spin_lock_irqsave(&vector_lock, flags); 2939 spin_lock_irqsave(&vector_lock, flags);
1981 __clear_irq_vector(irq); 2940 __clear_irq_vector(irq);
1982 spin_unlock_irqrestore(&vector_lock, flags); 2941 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1988,17 +2947,49 @@ void destroy_irq(unsigned int irq)
1988#ifdef CONFIG_PCI_MSI 2947#ifdef CONFIG_PCI_MSI
1989static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 2948static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1990{ 2949{
1991 struct irq_cfg *cfg = irq_cfg + irq; 2950 struct irq_cfg *cfg;
1992 int err; 2951 int err;
1993 unsigned dest; 2952 unsigned dest;
1994 cpumask_t tmp; 2953 cpumask_t tmp;
1995 2954
1996 tmp = TARGET_CPUS; 2955 tmp = TARGET_CPUS;
1997 err = assign_irq_vector(irq, tmp); 2956 err = assign_irq_vector(irq, tmp);
1998 if (!err) { 2957 if (err)
1999 cpus_and(tmp, cfg->domain, tmp); 2958 return err;
2000 dest = cpu_mask_to_apicid(tmp); 2959
2960 cfg = irq_cfg(irq);
2961 cpus_and(tmp, cfg->domain, tmp);
2962 dest = cpu_mask_to_apicid(tmp);
2963
2964#ifdef CONFIG_INTR_REMAP
2965 if (irq_remapped(irq)) {
2966 struct irte irte;
2967 int ir_index;
2968 u16 sub_handle;
2969
2970 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2971 BUG_ON(ir_index == -1);
2972
2973 memset (&irte, 0, sizeof(irte));
2001 2974
2975 irte.present = 1;
2976 irte.dst_mode = INT_DEST_MODE;
2977 irte.trigger_mode = 0; /* edge */
2978 irte.dlvry_mode = INT_DELIVERY_MODE;
2979 irte.vector = cfg->vector;
2980 irte.dest_id = IRTE_DEST(dest);
2981
2982 modify_irte(irq, &irte);
2983
2984 msg->address_hi = MSI_ADDR_BASE_HI;
2985 msg->data = sub_handle;
2986 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2987 MSI_ADDR_IR_SHV |
2988 MSI_ADDR_IR_INDEX1(ir_index) |
2989 MSI_ADDR_IR_INDEX2(ir_index);
2990 } else
2991#endif
2992 {
2002 msg->address_hi = MSI_ADDR_BASE_HI; 2993 msg->address_hi = MSI_ADDR_BASE_HI;
2003 msg->address_lo = 2994 msg->address_lo =
2004 MSI_ADDR_BASE_LO | 2995 MSI_ADDR_BASE_LO |
@@ -2024,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2024#ifdef CONFIG_SMP 3015#ifdef CONFIG_SMP
2025static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3016static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2026{ 3017{
2027 struct irq_cfg *cfg = irq_cfg + irq; 3018 struct irq_cfg *cfg;
2028 struct msi_msg msg; 3019 struct msi_msg msg;
2029 unsigned int dest; 3020 unsigned int dest;
2030 cpumask_t tmp; 3021 cpumask_t tmp;
3022 struct irq_desc *desc;
2031 3023
2032 cpus_and(tmp, mask, cpu_online_map); 3024 cpus_and(tmp, mask, cpu_online_map);
2033 if (cpus_empty(tmp)) 3025 if (cpus_empty(tmp))
@@ -2036,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2036 if (assign_irq_vector(irq, mask)) 3028 if (assign_irq_vector(irq, mask))
2037 return; 3029 return;
2038 3030
3031 cfg = irq_cfg(irq);
2039 cpus_and(tmp, cfg->domain, mask); 3032 cpus_and(tmp, cfg->domain, mask);
2040 dest = cpu_mask_to_apicid(tmp); 3033 dest = cpu_mask_to_apicid(tmp);
2041 3034
@@ -2047,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2047 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3040 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2048 3041
2049 write_msi_msg(irq, &msg); 3042 write_msi_msg(irq, &msg);
2050 irq_desc[irq].affinity = mask; 3043 desc = irq_to_desc(irq);
3044 desc->affinity = mask;
3045}
3046
3047#ifdef CONFIG_INTR_REMAP
3048/*
3049 * Migrate the MSI irq to another cpumask. This migration is
3050 * done in the process context using interrupt-remapping hardware.
3051 */
3052static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3053{
3054 struct irq_cfg *cfg;
3055 unsigned int dest;
3056 cpumask_t tmp, cleanup_mask;
3057 struct irte irte;
3058 struct irq_desc *desc;
3059
3060 cpus_and(tmp, mask, cpu_online_map);
3061 if (cpus_empty(tmp))
3062 return;
3063
3064 if (get_irte(irq, &irte))
3065 return;
3066
3067 if (assign_irq_vector(irq, mask))
3068 return;
3069
3070 cfg = irq_cfg(irq);
3071 cpus_and(tmp, cfg->domain, mask);
3072 dest = cpu_mask_to_apicid(tmp);
3073
3074 irte.vector = cfg->vector;
3075 irte.dest_id = IRTE_DEST(dest);
3076
3077 /*
3078 * atomically update the IRTE with the new destination and vector.
3079 */
3080 modify_irte(irq, &irte);
3081
3082 /*
3083 * After this point, all the interrupts will start arriving
3084 * at the new destination. So, time to cleanup the previous
3085 * vector allocation.
3086 */
3087 if (cfg->move_in_progress) {
3088 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
3089 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3090 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3091 cfg->move_in_progress = 0;
3092 }
3093
3094 desc = irq_to_desc(irq);
3095 desc->affinity = mask;
2051} 3096}
3097#endif
2052#endif /* CONFIG_SMP */ 3098#endif /* CONFIG_SMP */
2053 3099
2054/* 3100/*
@@ -2066,26 +3112,179 @@ static struct irq_chip msi_chip = {
2066 .retrigger = ioapic_retrigger_irq, 3112 .retrigger = ioapic_retrigger_irq,
2067}; 3113};
2068 3114
2069int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 3115#ifdef CONFIG_INTR_REMAP
3116static struct irq_chip msi_ir_chip = {
3117 .name = "IR-PCI-MSI",
3118 .unmask = unmask_msi_irq,
3119 .mask = mask_msi_irq,
3120 .ack = ack_x2apic_edge,
3121#ifdef CONFIG_SMP
3122 .set_affinity = ir_set_msi_irq_affinity,
3123#endif
3124 .retrigger = ioapic_retrigger_irq,
3125};
3126
3127/*
3128 * Map the PCI dev to the corresponding remapping hardware unit
3129 * and allocate 'nvec' consecutive interrupt-remapping table entries
3130 * in it.
3131 */
3132static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3133{
3134 struct intel_iommu *iommu;
3135 int index;
3136
3137 iommu = map_dev_to_ir(dev);
3138 if (!iommu) {
3139 printk(KERN_ERR
3140 "Unable to map PCI %s to iommu\n", pci_name(dev));
3141 return -ENOENT;
3142 }
3143
3144 index = alloc_irte(iommu, irq, nvec);
3145 if (index < 0) {
3146 printk(KERN_ERR
3147 "Unable to allocate %d IRTE for PCI %s\n", nvec,
3148 pci_name(dev));
3149 return -ENOSPC;
3150 }
3151 return index;
3152}
3153#endif
3154
3155static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
2070{ 3156{
3157 int ret;
2071 struct msi_msg msg; 3158 struct msi_msg msg;
2072 int irq, ret;
2073 irq = create_irq();
2074 if (irq < 0)
2075 return irq;
2076 3159
2077 ret = msi_compose_msg(dev, irq, &msg); 3160 ret = msi_compose_msg(dev, irq, &msg);
3161 if (ret < 0)
3162 return ret;
3163
3164 set_irq_msi(irq, desc);
3165 write_msi_msg(irq, &msg);
3166
3167#ifdef CONFIG_INTR_REMAP
3168 if (irq_remapped(irq)) {
3169 struct irq_desc *desc = irq_to_desc(irq);
3170 /*
3171 * irq migration in process context
3172 */
3173 desc->status |= IRQ_MOVE_PCNTXT;
3174 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3175 } else
3176#endif
3177 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3178
3179 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3180
3181 return 0;
3182}
3183
3184static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
3185{
3186 unsigned int irq;
3187
3188 irq = dev->bus->number;
3189 irq <<= 8;
3190 irq |= dev->devfn;
3191 irq <<= 12;
3192
3193 return irq;
3194}
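/*
 * Worked example for a hypothetical device 0000:02:03.0 (bus 0x02,
 * devfn 0x18 = slot 3, function 0):
 *
 *     irq  = 0x02;             bus number
 *     irq <<= 8;    ->  0x0200
 *     irq |= 0x18;  ->  0x0218
 *     irq <<= 12;   ->  0x218000
 *
 * so arch_setup_msi_irq() below starts scanning from
 * 0x218000 + 0x100 = 0x218100, giving each device a distinct window.
 */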
3195
3196int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3197{
3198 unsigned int irq;
3199 int ret;
3200 unsigned int irq_want;
3201
3202 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3203
3204 irq = create_irq_nr(irq_want);
3205 if (irq == 0)
3206 return -1;
3207
3208#ifdef CONFIG_INTR_REMAP
3209 if (!intr_remapping_enabled)
3210 goto no_ir;
3211
3212 ret = msi_alloc_irte(dev, irq, 1);
3213 if (ret < 0)
3214 goto error;
3215no_ir:
3216#endif
3217 ret = setup_msi_irq(dev, desc, irq);
2078 if (ret < 0) { 3218 if (ret < 0) {
2079 destroy_irq(irq); 3219 destroy_irq(irq);
2080 return ret; 3220 return ret;
2081 } 3221 }
3222 return 0;
2082 3223
2083 set_irq_msi(irq, desc); 3224#ifdef CONFIG_INTR_REMAP
2084 write_msi_msg(irq, &msg); 3225error:
3226 destroy_irq(irq);
3227 return ret;
3228#endif
3229}
3230
3231int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3232{
3233 unsigned int irq;
3234 int ret, sub_handle;
3235 struct msi_desc *desc;
3236 unsigned int irq_want;
2085 3237
2086 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3238#ifdef CONFIG_INTR_REMAP
3239 struct intel_iommu *iommu = 0;
3240 int index = 0;
3241#endif
2087 3242
3243 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3244 sub_handle = 0;
3245 list_for_each_entry(desc, &dev->msi_list, list) {
3246 irq = create_irq_nr(irq_want--);
3247 if (irq == 0)
3248 return -1;
3249#ifdef CONFIG_INTR_REMAP
3250 if (!intr_remapping_enabled)
3251 goto no_ir;
3252
3253 if (!sub_handle) {
3254 /*
3255 * allocate the consecutive block of IRTE's
3256 * for 'nvec'
3257 */
3258 index = msi_alloc_irte(dev, irq, nvec);
3259 if (index < 0) {
3260 ret = index;
3261 goto error;
3262 }
3263 } else {
3264 iommu = map_dev_to_ir(dev);
3265 if (!iommu) {
3266 ret = -ENOENT;
3267 goto error;
3268 }
3269 /*
3270 * setup the mapping between the irq and the IRTE
3271 * base index, the sub_handle pointing to the
3272 * appropriate interrupt remap table entry.
3273 */
3274 set_irte_irq(irq, iommu, index, sub_handle);
3275 }
3276no_ir:
3277#endif
3278 ret = setup_msi_irq(dev, desc, irq);
3279 if (ret < 0)
3280 goto error;
3281 sub_handle++;
3282 }
2088 return 0; 3283 return 0;
3284
3285error:
3286 destroy_irq(irq);
3287 return ret;
2089} 3288}
2090 3289
2091void arch_teardown_msi_irq(unsigned int irq) 3290void arch_teardown_msi_irq(unsigned int irq)
@@ -2097,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
2097#ifdef CONFIG_SMP 3296#ifdef CONFIG_SMP
2098static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3297static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2099{ 3298{
2100 struct irq_cfg *cfg = irq_cfg + irq; 3299 struct irq_cfg *cfg;
2101 struct msi_msg msg; 3300 struct msi_msg msg;
2102 unsigned int dest; 3301 unsigned int dest;
2103 cpumask_t tmp; 3302 cpumask_t tmp;
3303 struct irq_desc *desc;
2104 3304
2105 cpus_and(tmp, mask, cpu_online_map); 3305 cpus_and(tmp, mask, cpu_online_map);
2106 if (cpus_empty(tmp)) 3306 if (cpus_empty(tmp))
@@ -2109,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2109 if (assign_irq_vector(irq, mask)) 3309 if (assign_irq_vector(irq, mask))
2110 return; 3310 return;
2111 3311
3312 cfg = irq_cfg(irq);
2112 cpus_and(tmp, cfg->domain, mask); 3313 cpus_and(tmp, cfg->domain, mask);
2113 dest = cpu_mask_to_apicid(tmp); 3314 dest = cpu_mask_to_apicid(tmp);
2114 3315
@@ -2120,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2120 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3321 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2121 3322
2122 dmar_msi_write(irq, &msg); 3323 dmar_msi_write(irq, &msg);
2123 irq_desc[irq].affinity = mask; 3324 desc = irq_to_desc(irq);
3325 desc->affinity = mask;
2124} 3326}
2125#endif /* CONFIG_SMP */ 3327#endif /* CONFIG_SMP */
2126 3328
@@ -2150,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
2150} 3352}
2151#endif 3353#endif
2152 3354
3355#ifdef CONFIG_HPET_TIMER
3356
3357#ifdef CONFIG_SMP
3358static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3359{
3360 struct irq_cfg *cfg;
3361 struct irq_desc *desc;
3362 struct msi_msg msg;
3363 unsigned int dest;
3364 cpumask_t tmp;
3365
3366 cpus_and(tmp, mask, cpu_online_map);
3367 if (cpus_empty(tmp))
3368 return;
3369
3370 if (assign_irq_vector(irq, mask))
3371 return;
3372
3373 cfg = irq_cfg(irq);
3374 cpus_and(tmp, cfg->domain, mask);
3375 dest = cpu_mask_to_apicid(tmp);
3376
3377 hpet_msi_read(irq, &msg);
3378
3379 msg.data &= ~MSI_DATA_VECTOR_MASK;
3380 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3381 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3382 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3383
3384 hpet_msi_write(irq, &msg);
3385 desc = irq_to_desc(irq);
3386 desc->affinity = mask;
3387}
3388#endif /* CONFIG_SMP */
3389
3390struct irq_chip hpet_msi_type = {
3391 .name = "HPET_MSI",
3392 .unmask = hpet_msi_unmask,
3393 .mask = hpet_msi_mask,
3394 .ack = ack_apic_edge,
3395#ifdef CONFIG_SMP
3396 .set_affinity = hpet_msi_set_affinity,
3397#endif
3398 .retrigger = ioapic_retrigger_irq,
3399};
3400
3401int arch_setup_hpet_msi(unsigned int irq)
3402{
3403 int ret;
3404 struct msi_msg msg;
3405
3406 ret = msi_compose_msg(NULL, irq, &msg);
3407 if (ret < 0)
3408 return ret;
3409
3410 hpet_msi_write(irq, &msg);
3411 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
3412 "edge");
3413
3414 return 0;
3415}
3416#endif
3417
2153#endif /* CONFIG_PCI_MSI */ 3418#endif /* CONFIG_PCI_MSI */
2154/* 3419/*
2155 * Hypertransport interrupt support 3420 * Hypertransport interrupt support
@@ -2174,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2174 3439
2175static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3440static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2176{ 3441{
2177 struct irq_cfg *cfg = irq_cfg + irq; 3442 struct irq_cfg *cfg;
2178 unsigned int dest; 3443 unsigned int dest;
2179 cpumask_t tmp; 3444 cpumask_t tmp;
3445 struct irq_desc *desc;
2180 3446
2181 cpus_and(tmp, mask, cpu_online_map); 3447 cpus_and(tmp, mask, cpu_online_map);
2182 if (cpus_empty(tmp)) 3448 if (cpus_empty(tmp))
@@ -2185,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2185 if (assign_irq_vector(irq, mask)) 3451 if (assign_irq_vector(irq, mask))
2186 return; 3452 return;
2187 3453
3454 cfg = irq_cfg(irq);
2188 cpus_and(tmp, cfg->domain, mask); 3455 cpus_and(tmp, cfg->domain, mask);
2189 dest = cpu_mask_to_apicid(tmp); 3456 dest = cpu_mask_to_apicid(tmp);
2190 3457
2191 target_ht_irq(irq, dest, cfg->vector); 3458 target_ht_irq(irq, dest, cfg->vector);
2192 irq_desc[irq].affinity = mask; 3459 desc = irq_to_desc(irq);
3460 desc->affinity = mask;
2193} 3461}
2194#endif 3462#endif
2195 3463
@@ -2206,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
2206 3474
2207int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2208{ 3476{
2209 struct irq_cfg *cfg = irq_cfg + irq; 3477 struct irq_cfg *cfg;
2210 int err; 3478 int err;
2211 cpumask_t tmp; 3479 cpumask_t tmp;
2212 3480
@@ -2216,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2216 struct ht_irq_msg msg; 3484 struct ht_irq_msg msg;
2217 unsigned dest; 3485 unsigned dest;
2218 3486
3487 cfg = irq_cfg(irq);
2219 cpus_and(tmp, cfg->domain, tmp); 3488 cpus_and(tmp, cfg->domain, tmp);
2220 dest = cpu_mask_to_apicid(tmp); 3489 dest = cpu_mask_to_apicid(tmp);
2221 3490
@@ -2238,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2238 3507
2239 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3508 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2240 handle_edge_irq, "edge"); 3509 handle_edge_irq, "edge");
3510
3511 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
2241 } 3512 }
2242 return err; 3513 return err;
2243} 3514}
2244#endif /* CONFIG_HT_IRQ */ 3515#endif /* CONFIG_HT_IRQ */
2245 3516
3517#ifdef CONFIG_X86_64
3518/*
3519 * Re-target the irq to the specified CPU and enable the specified MMR located
3520 * on the specified blade to allow the sending of MSIs to the specified CPU.
3521 */
3522int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3523 unsigned long mmr_offset)
3524{
3525 const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
3526 struct irq_cfg *cfg;
3527 int mmr_pnode;
3528 unsigned long mmr_value;
3529 struct uv_IO_APIC_route_entry *entry;
3530 unsigned long flags;
3531 int err;
3532
3533 err = assign_irq_vector(irq, *eligible_cpu);
3534 if (err != 0)
3535 return err;
3536
3537 spin_lock_irqsave(&vector_lock, flags);
3538 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3539 irq_name);
3540 spin_unlock_irqrestore(&vector_lock, flags);
3541
3542 cfg = irq_cfg(irq);
3543
3544 mmr_value = 0;
3545 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3546 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3547
3548 entry->vector = cfg->vector;
3549 entry->delivery_mode = INT_DELIVERY_MODE;
3550 entry->dest_mode = INT_DEST_MODE;
3551 entry->polarity = 0;
3552 entry->trigger = 0;
3553 entry->mask = 0;
3554 entry->dest = cpu_mask_to_apicid(*eligible_cpu);
3555
3556 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3557 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3558
3559 return irq;
3560}
3561
3562/*
3563 * Disable the specified MMR located on the specified blade so that MSIs are
3564 * no longer allowed to be sent.
3565 */
3566void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3567{
3568 unsigned long mmr_value;
3569 struct uv_IO_APIC_route_entry *entry;
3570 int mmr_pnode;
3571
3572 mmr_value = 0;
3573 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3574 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3575
3576 entry->mask = 1;
3577
3578 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3579 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3580}
3581#endif /* CONFIG_X86_64 */
3582
3583int __init io_apic_get_redir_entries (int ioapic)
3584{
3585 union IO_APIC_reg_01 reg_01;
3586 unsigned long flags;
3587
3588 spin_lock_irqsave(&ioapic_lock, flags);
3589 reg_01.raw = io_apic_read(ioapic, 1);
3590 spin_unlock_irqrestore(&ioapic_lock, flags);
3591
3592 return reg_01.bits.entries;
3593}
3594
3595int __init probe_nr_irqs(void)
3596{
3597 int idx;
3598 int nr = 0;
3599#ifndef CONFIG_XEN
3600 int nr_min = 32;
3601#else
3602 int nr_min = NR_IRQS;
3603#endif
3604
3605 for (idx = 0; idx < nr_ioapics; idx++)
3606 nr += io_apic_get_redir_entries(idx) + 1;
3607
3608 /* double it for hotplug and msi and nmi */
3609 nr <<= 1;
3610
3611 /* something wrong ? */
3612 if (nr < nr_min)
3613 nr = nr_min;
3614
3615 return nr;
3616}
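/*
 * Worked example: a single IO-APIC whose reg 01 reports 23 (i.e. 24
 * redirection entries) gives nr = 24; doubled for hotplug/MSI/NMI
 * headroom that is 48, which already clears the non-Xen floor of 32,
 * so probe_nr_irqs() returns 48.
 */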
3617
2246/* -------------------------------------------------------------------------- 3618/* --------------------------------------------------------------------------
2247 ACPI-based IOAPIC Configuration 3619 ACPI-based IOAPIC Configuration
2248 -------------------------------------------------------------------------- */ 3620 -------------------------------------------------------------------------- */
2249 3621
2250#ifdef CONFIG_ACPI 3622#ifdef CONFIG_ACPI
2251 3623
2252#define IO_APIC_MAX_ID 0xFE 3624#ifdef CONFIG_X86_32
3625int __init io_apic_get_unique_id(int ioapic, int apic_id)
3626{
3627 union IO_APIC_reg_00 reg_00;
3628 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
3629 physid_mask_t tmp;
3630 unsigned long flags;
3631 int i = 0;
2253 3632
2254int __init io_apic_get_redir_entries (int ioapic) 3633 /*
3634 * The P4 platform supports up to 256 APIC IDs on two separate APIC
3635 * buses (one for LAPICs, one for IOAPICs), where predecessors only
3636 * supported up to 16 on one shared APIC bus.
3637 *
3638 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
3639 * advantage of new APIC bus architecture.
3640 */
3641
3642 if (physids_empty(apic_id_map))
3643 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
3644
3645 spin_lock_irqsave(&ioapic_lock, flags);
3646 reg_00.raw = io_apic_read(ioapic, 0);
3647 spin_unlock_irqrestore(&ioapic_lock, flags);
3648
3649 if (apic_id >= get_physical_broadcast()) {
3650 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
3651 "%d\n", ioapic, apic_id, reg_00.bits.ID);
3652 apic_id = reg_00.bits.ID;
3653 }
3654
3655 /*
3656 * Every APIC in a system must have a unique ID or we get lots of nice
3657 * 'stuck on smp_invalidate_needed IPI wait' messages.
3658 */
3659 if (check_apicid_used(apic_id_map, apic_id)) {
3660
3661 for (i = 0; i < get_physical_broadcast(); i++) {
3662 if (!check_apicid_used(apic_id_map, i))
3663 break;
3664 }
3665
3666 if (i == get_physical_broadcast())
3667 panic("Max apic_id exceeded!\n");
3668
3669 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
3670 "trying %d\n", ioapic, apic_id, i);
3671
3672 apic_id = i;
3673 }
3674
3675 tmp = apicid_to_cpu_present(apic_id);
3676 physids_or(apic_id_map, apic_id_map, tmp);
3677
3678 if (reg_00.bits.ID != apic_id) {
3679 reg_00.bits.ID = apic_id;
3680
3681 spin_lock_irqsave(&ioapic_lock, flags);
3682 io_apic_write(ioapic, 0, reg_00.raw);
3683 reg_00.raw = io_apic_read(ioapic, 0);
3684 spin_unlock_irqrestore(&ioapic_lock, flags);
3685
3686 /* Sanity check */
3687 if (reg_00.bits.ID != apic_id) {
3688 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
3689 return -1;
3690 }
3691 }
3692
3693 apic_printk(APIC_VERBOSE, KERN_INFO
3694 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
3695
3696 return apic_id;
3697}
3698
3699int __init io_apic_get_version(int ioapic)
2255{ 3700{
2256 union IO_APIC_reg_01 reg_01; 3701 union IO_APIC_reg_01 reg_01;
2257 unsigned long flags; 3702 unsigned long flags;
@@ -2260,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
2260 reg_01.raw = io_apic_read(ioapic, 1); 3705 reg_01.raw = io_apic_read(ioapic, 1);
2261 spin_unlock_irqrestore(&ioapic_lock, flags); 3706 spin_unlock_irqrestore(&ioapic_lock, flags);
2262 3707
2263 return reg_01.bits.entries; 3708 return reg_01.bits.version;
2264} 3709}
2265 3710#endif
2266 3711
2267int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3712int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2268{ 3713{
@@ -2314,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2314void __init setup_ioapic_dest(void) 3759void __init setup_ioapic_dest(void)
2315{ 3760{
2316 int pin, ioapic, irq, irq_entry; 3761 int pin, ioapic, irq, irq_entry;
3762 struct irq_cfg *cfg;
2317 3763
2318 if (skip_ioapic_setup == 1) 3764 if (skip_ioapic_setup == 1)
2319 return; 3765 return;
@@ -2329,10 +3775,15 @@ void __init setup_ioapic_dest(void)
2329 * when you have too many devices, because at that time only boot 3775 * when you have too many devices, because at that time only boot
2330 * cpu is online. 3776 * cpu is online.
2331 */ 3777 */
2332 if (!irq_cfg[irq].vector) 3778 cfg = irq_cfg(irq);
3779 if (!cfg->vector)
2333 setup_IO_APIC_irq(ioapic, pin, irq, 3780 setup_IO_APIC_irq(ioapic, pin, irq,
2334 irq_trigger(irq_entry), 3781 irq_trigger(irq_entry),
2335 irq_polarity(irq_entry)); 3782 irq_polarity(irq_entry));
3783#ifdef CONFIG_INTR_REMAP
3784 else if (intr_remapping_enabled)
3785 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
3786#endif
2336 else 3787 else
2337 set_ioapic_affinity_irq(irq, TARGET_CPUS); 3788 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2338 } 3789 }
@@ -2383,18 +3834,33 @@ void __init ioapic_init_mappings(void)
2383 struct resource *ioapic_res; 3834 struct resource *ioapic_res;
2384 int i; 3835 int i;
2385 3836
3837 irq_2_pin_init();
2386 ioapic_res = ioapic_setup_resources(); 3838 ioapic_res = ioapic_setup_resources();
2387 for (i = 0; i < nr_ioapics; i++) { 3839 for (i = 0; i < nr_ioapics; i++) {
2388 if (smp_found_config) { 3840 if (smp_found_config) {
2389 ioapic_phys = mp_ioapics[i].mp_apicaddr; 3841 ioapic_phys = mp_ioapics[i].mp_apicaddr;
3842#ifdef CONFIG_X86_32
3843 if (!ioapic_phys) {
3844 printk(KERN_ERR
3845 "WARNING: bogus zero IO-APIC "
3846 "address found in MPTABLE, "
3847 "disabling IO/APIC support!\n");
3848 smp_found_config = 0;
3849 skip_ioapic_setup = 1;
3850 goto fake_ioapic_page;
3851 }
3852#endif
2390 } else { 3853 } else {
3854#ifdef CONFIG_X86_32
3855fake_ioapic_page:
3856#endif
2391 ioapic_phys = (unsigned long) 3857 ioapic_phys = (unsigned long)
2392 alloc_bootmem_pages(PAGE_SIZE); 3858 alloc_bootmem_pages(PAGE_SIZE);
2393 ioapic_phys = __pa(ioapic_phys); 3859 ioapic_phys = __pa(ioapic_phys);
2394 } 3860 }
2395 set_fixmap_nocache(idx, ioapic_phys); 3861 set_fixmap_nocache(idx, ioapic_phys);
2396 apic_printk(APIC_VERBOSE, 3862 apic_printk(APIC_VERBOSE,
2397 "mapped IOAPIC to %016lx (%016lx)\n", 3863 "mapped IOAPIC to %08lx (%08lx)\n",
2398 __fix_to_virt(idx), ioapic_phys); 3864 __fix_to_virt(idx), ioapic_phys);
2399 idx++; 3865 idx++;
2400 3866
@@ -2428,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
2428/* Insert the IO APIC resources after PCI initialization has occurred to handle 3894/* Insert the IO APIC resources after PCI initialization has occurred to handle
2429 * IO APICS that are mapped in on a BAR in PCI space. */ 3895 * IO APICS that are mapped in on a BAR in PCI space. */
2430late_initcall(ioapic_insert_resources); 3896late_initcall(ioapic_insert_resources);
2431
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index 09cddb57bec..00000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2901 +0,0 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/bootmem.h>
29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
31#include <linux/acpi.h>
32#include <linux/module.h>
33#include <linux/sysdev.h>
34#include <linux/pci.h>
35#include <linux/msi.h>
36#include <linux/htirq.h>
37#include <linux/freezer.h>
38#include <linux/kthread.h>
39#include <linux/jiffies.h> /* time_after() */
40
41#include <asm/io.h>
42#include <asm/smp.h>
43#include <asm/desc.h>
44#include <asm/timer.h>
45#include <asm/i8259.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50#include <mach_apic.h>
51#include <mach_apicdef.h>
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where, if anywhere, the i8259 is connected in external int mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60DEFINE_SPINLOCK(vector_lock);
61
62int timer_through_8259 __initdata;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75/* I/O APIC entries */
76struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
77int nr_ioapics;
78
79/* MP IRQ source entries */
80struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
81
82/* # of MP IRQ source entries */
83int mp_irq_entries;
84
85#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
86int mp_bus_id_to_type[MAX_MP_BUSSES];
87#endif
88
89DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
90
91static int disable_timer_pin_1 __initdata;
92
93/*
94 * Rough estimation of how many shared IRQs there are, can
95 * be changed anytime.
96 */
97#define MAX_PLUS_SHARED_IRQS NR_IRQS
98#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
99
100/*
101 * This is performance-critical, we want to do it O(1)
102 *
103 * the indexing order of this array favors 1:1 mappings
104 * between pins and IRQs.
105 */
106
107static struct irq_pin_list {
108 int apic, pin, next;
109} irq_2_pin[PIN_MAP_SIZE];
110
111struct io_apic {
112 unsigned int index;
113 unsigned int unused[3];
114 unsigned int data;
115};
116
117static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
118{
119 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
120 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
121}
122
123static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
124{
125 struct io_apic __iomem *io_apic = io_apic_base(apic);
126 writel(reg, &io_apic->index);
127 return readl(&io_apic->data);
128}
129
130static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
131{
132 struct io_apic __iomem *io_apic = io_apic_base(apic);
133 writel(reg, &io_apic->index);
134 writel(value, &io_apic->data);
135}
136
137/*
138 * Re-write a value: to be used for read-modify-write
139 * cycles where the read already set up the index register.
140 *
141 * Older SiS APIC requires we rewrite the index register
142 */
143static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
144{
145 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
146 if (sis_apic_bug)
147 writel(reg, &io_apic->index);
148 writel(value, &io_apic->data);
149}
150
151union entry_union {
152 struct { u32 w1, w2; };
153 struct IO_APIC_route_entry entry;
154};
155
156static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
157{
158 union entry_union eu;
159 unsigned long flags;
160 spin_lock_irqsave(&ioapic_lock, flags);
161 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
162 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
163 spin_unlock_irqrestore(&ioapic_lock, flags);
164 return eu.entry;
165}
166
167/*
168 * When we write a new IO APIC routing entry, we need to write the high
169 * word first! If the mask bit in the low word is clear, we will enable
170 * the interrupt, and we need to make sure the entry is fully populated
171 * before that happens.
172 */
173static void
174__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
175{
176 union entry_union eu;
177 eu.entry = e;
178 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
179 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
180}
181
182static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
183{
184 unsigned long flags;
185 spin_lock_irqsave(&ioapic_lock, flags);
186 __ioapic_write_entry(apic, pin, e);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * When we mask an IO APIC routing entry, we need to write the low
192 * word first, in order to set the mask bit before we change the
193 * high bits!
194 */
195static void ioapic_mask_entry(int apic, int pin)
196{
197 unsigned long flags;
198 union entry_union eu = { .entry.mask = 1 };
199
200 spin_lock_irqsave(&ioapic_lock, flags);
201 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
202 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
203 spin_unlock_irqrestore(&ioapic_lock, flags);
204}
205
206/*
207 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
208 * shared ISA-space IRQs, so we have to support them. We are super
209 * fast in the common case, and fast for shared ISA-space IRQs.
210 */
211static void add_pin_to_irq(unsigned int irq, int apic, int pin)
212{
213 static int first_free_entry = NR_IRQS;
214 struct irq_pin_list *entry = irq_2_pin + irq;
215
216 while (entry->next)
217 entry = irq_2_pin + entry->next;
218
219 if (entry->pin != -1) {
220 entry->next = first_free_entry;
221 entry = irq_2_pin + entry->next;
222 if (++first_free_entry >= PIN_MAP_SIZE)
223 panic("io_apic.c: whoops");
224 }
225 entry->apic = apic;
226 entry->pin = pin;
227}
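/*
 * For illustration, how the flat-array list above chains: after the pin
 * fields have been initialized to -1 (done in enable_IO_APIC()), the
 * first add_pin_to_irq(J, A, P1) fills slot J in place; a second call
 * add_pin_to_irq(J, A, P2) links slot J to the first free overflow slot
 * (indices starting at NR_IRQS), which receives (A, P2). The common
 * 1:1 case therefore stays O(1), with short walks only for shared
 * ISA IRQs.
 */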
228
229/*
230 * Reroute an IRQ to a different pin.
231 */
232static void __init replace_pin_at_irq(unsigned int irq,
233 int oldapic, int oldpin,
234 int newapic, int newpin)
235{
236 struct irq_pin_list *entry = irq_2_pin + irq;
237
238 while (1) {
239 if (entry->apic == oldapic && entry->pin == oldpin) {
240 entry->apic = newapic;
241 entry->pin = newpin;
242 }
243 if (!entry->next)
244 break;
245 entry = irq_2_pin + entry->next;
246 }
247}
248
249static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
250{
251 struct irq_pin_list *entry = irq_2_pin + irq;
252 unsigned int pin, reg;
253
254 for (;;) {
255 pin = entry->pin;
256 if (pin == -1)
257 break;
258 reg = io_apic_read(entry->apic, 0x10 + pin*2);
259 reg &= ~disable;
260 reg |= enable;
261 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
262 if (!entry->next)
263 break;
264 entry = irq_2_pin + entry->next;
265 }
266}
267
268/* mask = 1 */
269static void __mask_IO_APIC_irq(unsigned int irq)
270{
271 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
272}
273
274/* mask = 0 */
275static void __unmask_IO_APIC_irq(unsigned int irq)
276{
277 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
278}
279
280/* mask = 1, trigger = 0 */
281static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
282{
283 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
284 IO_APIC_REDIR_LEVEL_TRIGGER);
285}
286
287/* mask = 0, trigger = 1 */
288static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
289{
290 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
291 IO_APIC_REDIR_MASKED);
292}
293
294static void mask_IO_APIC_irq(unsigned int irq)
295{
296 unsigned long flags;
297
298 spin_lock_irqsave(&ioapic_lock, flags);
299 __mask_IO_APIC_irq(irq);
300 spin_unlock_irqrestore(&ioapic_lock, flags);
301}
302
303static void unmask_IO_APIC_irq(unsigned int irq)
304{
305 unsigned long flags;
306
307 spin_lock_irqsave(&ioapic_lock, flags);
308 __unmask_IO_APIC_irq(irq);
309 spin_unlock_irqrestore(&ioapic_lock, flags);
310}
311
312static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
313{
314 struct IO_APIC_route_entry entry;
315
316 /* Check delivery_mode to be sure we're not clearing an SMI pin */
317 entry = ioapic_read_entry(apic, pin);
318 if (entry.delivery_mode == dest_SMI)
319 return;
320
321 /*
322 * Disable it in the IO-APIC irq-routing table:
323 */
324 ioapic_mask_entry(apic, pin);
325}
326
327static void clear_IO_APIC(void)
328{
329 int apic, pin;
330
331 for (apic = 0; apic < nr_ioapics; apic++)
332 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
333 clear_IO_APIC_pin(apic, pin);
334}
335
336#ifdef CONFIG_SMP
337static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
338{
339 unsigned long flags;
340 int pin;
341 struct irq_pin_list *entry = irq_2_pin + irq;
342 unsigned int apicid_value;
343 cpumask_t tmp;
344
345 cpus_and(tmp, cpumask, cpu_online_map);
346 if (cpus_empty(tmp))
347 tmp = TARGET_CPUS;
348
349 cpus_and(cpumask, tmp, CPU_MASK_ALL);
350
351 apicid_value = cpu_mask_to_apicid(cpumask);
352 /* Prepare to do the io_apic_write */
353 apicid_value = apicid_value << 24;
354 spin_lock_irqsave(&ioapic_lock, flags);
355 for (;;) {
356 pin = entry->pin;
357 if (pin == -1)
358 break;
359 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
360 if (!entry->next)
361 break;
362 entry = irq_2_pin + entry->next;
363 }
364 irq_desc[irq].affinity = cpumask;
365 spin_unlock_irqrestore(&ioapic_lock, flags);
366}
367
368#if defined(CONFIG_IRQBALANCE)
369# include <asm/processor.h> /* kernel_thread() */
370# include <linux/kernel_stat.h> /* kstat */
371# include <linux/slab.h> /* kmalloc() */
372# include <linux/timer.h>
373
374#define IRQBALANCE_CHECK_ARCH -999
375#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
376#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
377#define BALANCED_IRQ_MORE_DELTA (HZ/10)
378#define BALANCED_IRQ_LESS_DELTA (HZ)
379
380static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
381static int physical_balance __read_mostly;
382static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
383
384static struct irq_cpu_info {
385 unsigned long *last_irq;
386 unsigned long *irq_delta;
387 unsigned long irq;
388} irq_cpu_data[NR_CPUS];
389
390#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
391#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
392#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
393
394#define IDLE_ENOUGH(cpu,now) \
395 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
396
397#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
398
399#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
400
401static cpumask_t balance_irq_affinity[NR_IRQS] = {
402 [0 ... NR_IRQS-1] = CPU_MASK_ALL
403};
404
405void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
406{
407 balance_irq_affinity[irq] = mask;
408}
409
410static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
411 unsigned long now, int direction)
412{
413 int search_idle = 1;
414 int cpu = curr_cpu;
415
416 goto inside;
417
418 do {
419 if (unlikely(cpu == curr_cpu))
420 search_idle = 0;
421inside:
422 if (direction == 1) {
423 cpu++;
424 if (cpu >= NR_CPUS)
425 cpu = 0;
426 } else {
427 cpu--;
428 if (cpu == -1)
429 cpu = NR_CPUS-1;
430 }
431 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
432 (search_idle && !IDLE_ENOUGH(cpu, now)));
433
434 return cpu;
435}
436
437static inline void balance_irq(int cpu, int irq)
438{
439 unsigned long now = jiffies;
440 cpumask_t allowed_mask;
441 unsigned int new_cpu;
442
443 if (irqbalance_disabled)
444 return;
445
446 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
447 new_cpu = move(cpu, allowed_mask, now, 1);
448 if (cpu != new_cpu)
449 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
450}
451
452static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
453{
454 int i, j;
455
456 for_each_online_cpu(i) {
457 for (j = 0; j < NR_IRQS; j++) {
458 if (!irq_desc[j].action)
459 continue;
460 /* Is it a significant load ? */
461 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
462 useful_load_threshold)
463 continue;
464 balance_irq(i, j);
465 }
466 }
467 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
468 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
469 return;
470}
471
472static void do_irq_balance(void)
473{
474 int i, j;
475 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
476 unsigned long move_this_load = 0;
477 int max_loaded = 0, min_loaded = 0;
478 int load;
479 unsigned long useful_load_threshold = balanced_irq_interval + 10;
480 int selected_irq;
481 int tmp_loaded, first_attempt = 1;
482 unsigned long tmp_cpu_irq;
483 unsigned long imbalance = 0;
484 cpumask_t allowed_mask, target_cpu_mask, tmp;
485
486 for_each_possible_cpu(i) {
487 int package_index;
488 CPU_IRQ(i) = 0;
489 if (!cpu_online(i))
490 continue;
491 package_index = CPU_TO_PACKAGEINDEX(i);
492 for (j = 0; j < NR_IRQS; j++) {
493 unsigned long value_now, delta;
494 /* Is this an active IRQ or balancing disabled ? */
495 if (!irq_desc[j].action || irq_balancing_disabled(j))
496 continue;
497 if (package_index == i)
498 IRQ_DELTA(package_index, j) = 0;
499 /* Determine the total count per processor per IRQ */
500 value_now = (unsigned long) kstat_cpu(i).irqs[j];
501
502 /* Determine the activity per processor per IRQ */
503 delta = value_now - LAST_CPU_IRQ(i, j);
504
505 /* Update last_cpu_irq[][] for the next time */
506 LAST_CPU_IRQ(i, j) = value_now;
507
508 /* Ignore IRQs whose rate is less than the clock */
509 if (delta < useful_load_threshold)
510 continue;
511 /* update the load for the processor or package total */
512 IRQ_DELTA(package_index, j) += delta;
513
514 /* Keep track of the higher numbered sibling as well */
515 if (i != package_index)
516 CPU_IRQ(i) += delta;
517 /*
518 * We have sibling A and sibling B in the package
519 *
520 * cpu_irq[A] = load for cpu A + load for cpu B
521 * cpu_irq[B] = load for cpu B
522 */
523 CPU_IRQ(package_index) += delta;
524 }
525 }
526 /* Find the least loaded processor package */
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (min_cpu_irq > CPU_IRQ(i)) {
531 min_cpu_irq = CPU_IRQ(i);
532 min_loaded = i;
533 }
534 }
535 max_cpu_irq = ULONG_MAX;
536
537tryanothercpu:
538 /*
539 * Look for heaviest loaded processor.
540 * We may come back to get the next heaviest loaded processor.
541 * Skip processors with trivial loads.
542 */
543 tmp_cpu_irq = 0;
544 tmp_loaded = -1;
545 for_each_online_cpu(i) {
546 if (i != CPU_TO_PACKAGEINDEX(i))
547 continue;
548 if (max_cpu_irq <= CPU_IRQ(i))
549 continue;
550 if (tmp_cpu_irq < CPU_IRQ(i)) {
551 tmp_cpu_irq = CPU_IRQ(i);
552 tmp_loaded = i;
553 }
554 }
555
556 if (tmp_loaded == -1) {
557 /*
558 * In the case of a small number of heavy interrupt sources,
559 * some of the cpus get loaded too much. We use Ingo's original
560 * approach to rotate them around.
561 */
562 if (!first_attempt && imbalance >= useful_load_threshold) {
563 rotate_irqs_among_cpus(useful_load_threshold);
564 return;
565 }
566 goto not_worth_the_effort;
567 }
568
569 first_attempt = 0; /* heaviest search */
570 max_cpu_irq = tmp_cpu_irq; /* load */
571 max_loaded = tmp_loaded; /* processor */
572 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
573
574 /*
575 * if imbalance is less than approx 10% of max load, then
576 * observe diminishing returns action. - quit
577 */
578 if (imbalance < (max_cpu_irq >> 3))
579 goto not_worth_the_effort;
580
581tryanotherirq:
582 /* if we select an IRQ to move that can't go where we want, then
583 * see if there is another one to try.
584 */
585 move_this_load = 0;
586 selected_irq = -1;
587 for (j = 0; j < NR_IRQS; j++) {
588 /* Is this an active IRQ? */
589 if (!irq_desc[j].action)
590 continue;
591 if (imbalance <= IRQ_DELTA(max_loaded, j))
592 continue;
593 /* Try to find the IRQ that is closest to the imbalance
594 * without going over.
595 */
596 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
597 move_this_load = IRQ_DELTA(max_loaded, j);
598 selected_irq = j;
599 }
600 }
601 if (selected_irq == -1)
602 goto tryanothercpu;
603
604 imbalance = move_this_load;
605
606 /* For physical_balance case, we accumulated both load
607 * values in the one of the siblings cpu_irq[],
608 * to use the same code for physical and logical processors
609 * as much as possible.
610 *
611 * NOTE: the cpu_irq[] array holds the sum of the load for
612 * sibling A and sibling B in the slot for the lowest numbered
613 * sibling (A), _AND_ the load for sibling B in the slot for
614 * the higher numbered sibling.
615 *
616 * We seek the least loaded sibling by making the comparison
617 * (A+B)/2 vs B
618 */
619 load = CPU_IRQ(min_loaded) >> 1;
620 for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
621 if (load > CPU_IRQ(j)) {
622 /* This won't change cpu_sibling_map[min_loaded] */
623 load = CPU_IRQ(j);
624 min_loaded = j;
625 }
626 }
627
628 cpus_and(allowed_mask,
629 cpu_online_map,
630 balance_irq_affinity[selected_irq]);
631 target_cpu_mask = cpumask_of_cpu(min_loaded);
632 cpus_and(tmp, target_cpu_mask, allowed_mask);
633
634 if (!cpus_empty(tmp)) {
635 /* mark for change destination */
636 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
637
638 /* Since we made a change, come back sooner to
639 * check for more variation.
640 */
641 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
642 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
643 return;
644 }
645 goto tryanotherirq;
646
647not_worth_the_effort:
648 /*
649 * if we did not find an IRQ to move, then adjust the time interval
650 * upward
651 */
652 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
653 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
654 return;
655}
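/*
 * Worked example of the sibling accounting above: if sibling A handled
 * 600 interrupts and sibling B 400 over the interval, the array holds
 * CPU_IRQ(A) = 1000 (A + B) and CPU_IRQ(B) = 400. The least-loaded
 * sibling test "(A+B)/2 vs B" compares 500 with 400 and therefore
 * picks B as the migration target.
 */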
656
657static int balanced_irq(void *unused)
658{
659 int i;
660 unsigned long prev_balance_time = jiffies;
661 long time_remaining = balanced_irq_interval;
662
663 /* push everything to CPU 0 to give us a starting point. */
664 for (i = 0 ; i < NR_IRQS ; i++) {
665 irq_desc[i].pending_mask = cpumask_of_cpu(0);
666 set_pending_irq(i, cpumask_of_cpu(0));
667 }
668
669 set_freezable();
670 for ( ; ; ) {
671 time_remaining = schedule_timeout_interruptible(time_remaining);
672 try_to_freeze();
673 if (time_after(jiffies,
674 prev_balance_time+balanced_irq_interval)) {
675 preempt_disable();
676 do_irq_balance();
677 prev_balance_time = jiffies;
678 time_remaining = balanced_irq_interval;
679 preempt_enable();
680 }
681 }
682 return 0;
683}
684
685static int __init balanced_irq_init(void)
686{
687 int i;
688 struct cpuinfo_x86 *c;
689 cpumask_t tmp;
690
691 cpus_shift_right(tmp, cpu_online_map, 2);
692 c = &boot_cpu_data;
693 /* When not overwritten by the command line, ask the subarchitecture. */
694 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
695 irqbalance_disabled = NO_BALANCE_IRQ;
696 if (irqbalance_disabled)
697 return 0;
698
699 /* disable irqbalance completely if there is only one processor online */
700 if (num_online_cpus() < 2) {
701 irqbalance_disabled = 1;
702 return 0;
703 }
704 /*
705 * Enable physical balance only if more than 1 physical processor
706 * is present
707 */
708 if (smp_num_siblings > 1 && !cpus_empty(tmp))
709 physical_balance = 1;
710
711 for_each_online_cpu(i) {
712 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
713 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
714 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
715 printk(KERN_ERR "balanced_irq_init: out of memory");
716 goto failed;
717 }
718 }
719
720 printk(KERN_INFO "Starting balanced_irq\n");
721 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
722 return 0;
723 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
724failed:
725 for_each_possible_cpu(i) {
726 kfree(irq_cpu_data[i].irq_delta);
727 irq_cpu_data[i].irq_delta = NULL;
728 kfree(irq_cpu_data[i].last_irq);
729 irq_cpu_data[i].last_irq = NULL;
730 }
731 return 0;
732}
733
734int __devinit irqbalance_disable(char *str)
735{
736 irqbalance_disabled = 1;
737 return 1;
738}
739
740__setup("noirqbalance", irqbalance_disable);
741
742late_initcall(balanced_irq_init);
743#endif /* CONFIG_IRQBALANCE */
744#endif /* CONFIG_SMP */
745
746#ifndef CONFIG_SMP
747void send_IPI_self(int vector)
748{
749 unsigned int cfg;
750
751 /*
752 * Wait for idle.
753 */
754 apic_wait_icr_idle();
755 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
756 /*
757 * Send the IPI. The write to APIC_ICR fires this off.
758 */
759 apic_write(APIC_ICR, cfg);
760}
761#endif /* !CONFIG_SMP */
762
763
764/*
765 * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to
766 * specific CPU-side IRQs.
767 */
768
769#define MAX_PIRQS 8
770static int pirq_entries [MAX_PIRQS];
771static int pirqs_enabled;
772int skip_ioapic_setup;
773
774static int __init ioapic_pirq_setup(char *str)
775{
776 int i, max;
777 int ints[MAX_PIRQS+1];
778
779 get_options(str, ARRAY_SIZE(ints), ints);
780
781 for (i = 0; i < MAX_PIRQS; i++)
782 pirq_entries[i] = -1;
783
784 pirqs_enabled = 1;
785 apic_printk(APIC_VERBOSE, KERN_INFO
786 "PIRQ redirection, working around broken MP-BIOS.\n");
787 max = MAX_PIRQS;
788 if (ints[0] < MAX_PIRQS)
789 max = ints[0];
790
791 for (i = 0; i < max; i++) {
792 apic_printk(APIC_VERBOSE, KERN_DEBUG
793 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
794 /*
795 * PIRQs are mapped upside down, usually.
796 */
797 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
798 }
799 return 1;
800}
801
802__setup("pirq=", ioapic_pirq_setup);
803
804/*
805 * Find the IRQ entry number of a certain pin.
806 */
807static int find_irq_entry(int apic, int pin, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++)
812 if (mp_irqs[i].mp_irqtype == type &&
813 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
814 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
815 mp_irqs[i].mp_dstirq == pin)
816 return i;
817
818 return -1;
819}
820
821/*
822 * Find the pin to which IRQ[irq] (ISA) is connected
823 */
824static int __init find_isa_irq_pin(int irq, int type)
825{
826 int i;
827
828 for (i = 0; i < mp_irq_entries; i++) {
829 int lbus = mp_irqs[i].mp_srcbus;
830
831 if (test_bit(lbus, mp_bus_not_pci) &&
832 (mp_irqs[i].mp_irqtype == type) &&
833 (mp_irqs[i].mp_srcbusirq == irq))
834
835 return mp_irqs[i].mp_dstirq;
836 }
837 return -1;
838}
839
840static int __init find_isa_irq_apic(int irq, int type)
841{
842 int i;
843
844 for (i = 0; i < mp_irq_entries; i++) {
845 int lbus = mp_irqs[i].mp_srcbus;
846
847 if (test_bit(lbus, mp_bus_not_pci) &&
848 (mp_irqs[i].mp_irqtype == type) &&
849 (mp_irqs[i].mp_srcbusirq == irq))
850 break;
851 }
852 if (i < mp_irq_entries) {
853 int apic;
854 for (apic = 0; apic < nr_ioapics; apic++) {
855 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
856 return apic;
857 }
858 }
859
860 return -1;
861}
862
863/*
864 * Find a specific PCI IRQ entry.
865 * Not an __init, possibly needed by modules
866 */
867static int pin_2_irq(int idx, int apic, int pin);
868
869int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
870{
871 int apic, i, best_guess = -1;
872
873 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
874 "slot:%d, pin:%d.\n", bus, slot, pin);
875 if (test_bit(bus, mp_bus_not_pci)) {
876 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
877 return -1;
878 }
879 for (i = 0; i < mp_irq_entries; i++) {
880 int lbus = mp_irqs[i].mp_srcbus;
881
882 for (apic = 0; apic < nr_ioapics; apic++)
883 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
884 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
885 break;
886
887 if (!test_bit(lbus, mp_bus_not_pci) &&
888 !mp_irqs[i].mp_irqtype &&
889 (bus == lbus) &&
890 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
891 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
892
893 if (!(apic || IO_APIC_IRQ(irq)))
894 continue;
895
896 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
897 return irq;
898 /*
899 * Use the first all-but-pin matching entry as a
900 * best-guess fuzzy result for broken mptables.
901 */
902 if (best_guess < 0)
903 best_guess = irq;
904 }
905 }
906 return best_guess;
907}
908EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
909
910/*
911 * This function currently is only a helper for the i386 SMP boot process,
912 * where we need to reprogram the ioredtbls to cater for the CPUs which have
913 * come online, so the mask in all cases should simply be TARGET_CPUS.
914 */
915#ifdef CONFIG_SMP
916void __init setup_ioapic_dest(void)
917{
918 int pin, ioapic, irq, irq_entry;
919
920 if (skip_ioapic_setup == 1)
921 return;
922
923 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
924 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
925 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
926 if (irq_entry == -1)
927 continue;
928 irq = pin_2_irq(irq_entry, ioapic, pin);
929 set_ioapic_affinity_irq(irq, TARGET_CPUS);
930 }
931
932 }
933}
934#endif
935
936#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
937/*
938 * EISA Edge/Level control register, ELCR
939 */
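/*
 * Worked example: for irq 10 the helper below reads I/O port 0x4d1
 * (0x4d0 + (10 >> 3), the ELCR register covering IRQs 8-15) and tests
 * bit 2 (10 & 7); a set bit means level trigger.
 */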
940static int EISA_ELCR(unsigned int irq)
941{
942 if (irq < 16) {
943 unsigned int port = 0x4d0 + (irq >> 3);
944 return (inb(port) >> (irq & 7)) & 1;
945 }
946 apic_printk(APIC_VERBOSE, KERN_INFO
947 "Broken MPtable reports ISA irq %d\n", irq);
948 return 0;
949}
950#endif
951
952/* ISA interrupts are always polarity zero edge triggered,
953 * when listed as conforming in the MP table. */
954
955#define default_ISA_trigger(idx) (0)
956#define default_ISA_polarity(idx) (0)
957
958/* EISA interrupts are always polarity zero and can be edge or level
959 * trigger depending on the ELCR value. If an interrupt is listed as
960 * EISA conforming in the MP table, that means its trigger type must
961 * be read in from the ELCR */
962
963#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
964#define default_EISA_polarity(idx) default_ISA_polarity(idx)
965
966/* PCI interrupts are always polarity one level triggered,
967 * when listed as conforming in the MP table. */
968
969#define default_PCI_trigger(idx) (1)
970#define default_PCI_polarity(idx) (1)
971
972/* MCA interrupts are always polarity zero level triggered,
973 * when listed as conforming in the MP table. */
974
975#define default_MCA_trigger(idx) (1)
976#define default_MCA_polarity(idx) default_ISA_polarity(idx)
977
978static int MPBIOS_polarity(int idx)
979{
980 int bus = mp_irqs[idx].mp_srcbus;
981 int polarity;
982
983 /*
984 * Determine IRQ line polarity (high active or low active):
985 */
986 switch (mp_irqs[idx].mp_irqflag & 3) {
987	case 0: /* conforms, i.e. bus-type dependent polarity */
988 {
989 polarity = test_bit(bus, mp_bus_not_pci)?
990 default_ISA_polarity(idx):
991 default_PCI_polarity(idx);
992 break;
993 }
994 case 1: /* high active */
995 {
996 polarity = 0;
997 break;
998 }
999 case 2: /* reserved */
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 case 3: /* low active */
1006 {
1007 polarity = 1;
1008 break;
1009 }
1010 default: /* invalid */
1011 {
1012 printk(KERN_WARNING "broken BIOS!!\n");
1013 polarity = 1;
1014 break;
1015 }
1016 }
1017 return polarity;
1018}
1019
1020static int MPBIOS_trigger(int idx)
1021{
1022 int bus = mp_irqs[idx].mp_srcbus;
1023 int trigger;
1024
1025 /*
1026 * Determine IRQ trigger mode (edge or level sensitive):
1027 */
1028 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1029	case 0: /* conforms, i.e. bus-type dependent */
1030 {
1031 trigger = test_bit(bus, mp_bus_not_pci)?
1032 default_ISA_trigger(idx):
1033 default_PCI_trigger(idx);
1034#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1035 switch (mp_bus_id_to_type[bus]) {
1036 case MP_BUS_ISA: /* ISA pin */
1037 {
1038 /* set before the switch */
1039 break;
1040 }
1041 case MP_BUS_EISA: /* EISA pin */
1042 {
1043 trigger = default_EISA_trigger(idx);
1044 break;
1045 }
1046 case MP_BUS_PCI: /* PCI pin */
1047 {
1048 /* set before the switch */
1049 break;
1050 }
1051 case MP_BUS_MCA: /* MCA pin */
1052 {
1053 trigger = default_MCA_trigger(idx);
1054 break;
1055 }
1056 default:
1057 {
1058 printk(KERN_WARNING "broken BIOS!!\n");
1059 trigger = 1;
1060 break;
1061 }
1062 }
1063#endif
1064 break;
1065 }
1066 case 1: /* edge */
1067 {
1068 trigger = 0;
1069 break;
1070 }
1071 case 2: /* reserved */
1072 {
1073 printk(KERN_WARNING "broken BIOS!!\n");
1074 trigger = 1;
1075 break;
1076 }
1077 case 3: /* level */
1078 {
1079 trigger = 1;
1080 break;
1081 }
1082 default: /* invalid */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 0;
1086 break;
1087 }
1088 }
1089 return trigger;
1090}
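/*
 * Worked example (per the MP-spec irqflag encoding used above): an
 * mp_irqflag of 0x0f decodes as polarity (0x0f & 3) == 3, active low,
 * and trigger ((0x0f >> 2) & 3) == 3, level -- the typical explicit
 * PCI setting; 0x00 means "conforms to bus type" for both fields.
 */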
1091
1092static inline int irq_polarity(int idx)
1093{
1094 return MPBIOS_polarity(idx);
1095}
1096
1097static inline int irq_trigger(int idx)
1098{
1099 return MPBIOS_trigger(idx);
1100}
1101
1102static int pin_2_irq(int idx, int apic, int pin)
1103{
1104 int irq, i;
1105 int bus = mp_irqs[idx].mp_srcbus;
1106
1107 /*
1108 * Debugging check, we are in big trouble if this message pops up!
1109 */
1110 if (mp_irqs[idx].mp_dstirq != pin)
1111 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1112
1113 if (test_bit(bus, mp_bus_not_pci))
1114 irq = mp_irqs[idx].mp_srcbusirq;
1115 else {
1116 /*
1117 * PCI IRQs are mapped in order
1118 */
1119 i = irq = 0;
1120 while (i < apic)
1121 irq += nr_ioapic_registers[i++];
1122 irq += pin;
1123
1124 /*
1125 * For MPS mode, so far only needed by ES7000 platform
1126 */
1127 if (ioapic_renumber_irq)
1128 irq = ioapic_renumber_irq(apic, irq);
1129 }
1130
1131 /*
1132 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1133 */
1134 if ((pin >= 16) && (pin <= 23)) {
1135 if (pirq_entries[pin-16] != -1) {
1136 if (!pirq_entries[pin-16]) {
1137 apic_printk(APIC_VERBOSE, KERN_DEBUG
1138 "disabling PIRQ%d\n", pin-16);
1139 } else {
1140 irq = pirq_entries[pin-16];
1141 apic_printk(APIC_VERBOSE, KERN_DEBUG
1142 "using PIRQ%d -> IRQ %d\n",
1143 pin-16, irq);
1144 }
1145 }
1146 }
1147 return irq;
1148}
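/*
 * Worked example: with two I/O APICs of 24 pins each, a PCI interrupt
 * entry for apic 1, pin 3 yields irq = 24 + 3 = 27, since PCI pins are
 * numbered sequentially across I/O APICs; an ISA source instead keeps
 * its bus-local IRQ number (mp_srcbusirq).
 */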
1149
1150static inline int IO_APIC_irq_trigger(int irq)
1151{
1152 int apic, idx, pin;
1153
1154 for (apic = 0; apic < nr_ioapics; apic++) {
1155 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1156 idx = find_irq_entry(apic, pin, mp_INT);
1157 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1158 return irq_trigger(idx);
1159 }
1160 }
1161 /*
1162 * nonexistent IRQs are edge default
1163 */
1164 return 0;
1165}
1166
1167/* irq_vector[] is indexed by IRQ number, whose range covers the sum of all RTEs in all I/O APICs. */
1168static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR, 0 };
1169
1170static int __assign_irq_vector(int irq)
1171{
1172 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1173 int vector, offset;
1174
1175 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1176
1177 if (irq_vector[irq] > 0)
1178 return irq_vector[irq];
1179
1180 vector = current_vector;
1181 offset = current_offset;
1182next:
1183 vector += 8;
1184 if (vector >= first_system_vector) {
1185 offset = (offset + 1) % 8;
1186 vector = FIRST_DEVICE_VECTOR + offset;
1187 }
1188 if (vector == current_vector)
1189 return -ENOSPC;
1190 if (test_and_set_bit(vector, used_vectors))
1191 goto next;
1192
1193 current_vector = vector;
1194 current_offset = offset;
1195 irq_vector[irq] = vector;
1196
1197 return vector;
1198}
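/*
 * Illustrative: the stride of 8 spreads allocations across local-APIC
 * priority classes (a class is vector >> 4). Assuming FIRST_DEVICE_VECTOR
 * were 0x41, successive calls would hand out 0x41, 0x49, 0x51, ... and
 * revisit a class only after the walk wraps at first_system_vector and
 * moves on to the next offset.
 */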
1199
1200static int assign_irq_vector(int irq)
1201{
1202 unsigned long flags;
1203 int vector;
1204
1205 spin_lock_irqsave(&vector_lock, flags);
1206 vector = __assign_irq_vector(irq);
1207 spin_unlock_irqrestore(&vector_lock, flags);
1208
1209 return vector;
1210}
1211
1212static struct irq_chip ioapic_chip;
1213
1214#define IOAPIC_AUTO -1
1215#define IOAPIC_EDGE 0
1216#define IOAPIC_LEVEL 1
1217
1218static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1219{
1220 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1221 trigger == IOAPIC_LEVEL) {
1222 irq_desc[irq].status |= IRQ_LEVEL;
1223 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1224 handle_fasteoi_irq, "fasteoi");
1225 } else {
1226 irq_desc[irq].status &= ~IRQ_LEVEL;
1227 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1228 handle_edge_irq, "edge");
1229 }
1230 set_intr_gate(vector, interrupt[irq]);
1231}
1232
1233static void __init setup_IO_APIC_irqs(void)
1234{
1235 struct IO_APIC_route_entry entry;
1236 int apic, pin, idx, irq, first_notcon = 1, vector;
1237
1238 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1239
1240 for (apic = 0; apic < nr_ioapics; apic++) {
1241 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1242
1243 /*
1244 * add it to the IO-APIC irq-routing table:
1245 */
1246 memset(&entry, 0, sizeof(entry));
1247
1248 entry.delivery_mode = INT_DELIVERY_MODE;
1249 entry.dest_mode = INT_DEST_MODE;
1250 entry.mask = 0; /* enable IRQ */
1251 entry.dest.logical.logical_dest =
1252 cpu_mask_to_apicid(TARGET_CPUS);
1253
1254 idx = find_irq_entry(apic, pin, mp_INT);
1255 if (idx == -1) {
1256 if (first_notcon) {
1257 apic_printk(APIC_VERBOSE, KERN_DEBUG
1258 " IO-APIC (apicid-pin) %d-%d",
1259 mp_ioapics[apic].mp_apicid,
1260 pin);
1261 first_notcon = 0;
1262 } else
1263 apic_printk(APIC_VERBOSE, ", %d-%d",
1264 mp_ioapics[apic].mp_apicid, pin);
1265 continue;
1266 }
1267
1268 if (!first_notcon) {
1269 apic_printk(APIC_VERBOSE, " not connected.\n");
1270 first_notcon = 1;
1271 }
1272
1273 entry.trigger = irq_trigger(idx);
1274 entry.polarity = irq_polarity(idx);
1275
1276 if (irq_trigger(idx)) {
1277 entry.trigger = 1;
1278 entry.mask = 1;
1279 }
1280
1281 irq = pin_2_irq(idx, apic, pin);
1282 /*
1283 * skip adding the timer int on secondary nodes, which causes
1284 * a small but painful rift in the time-space continuum
1285 */
1286 if (multi_timer_check(apic, irq))
1287 continue;
1288 else
1289 add_pin_to_irq(irq, apic, pin);
1290
1291 if (!apic && !IO_APIC_IRQ(irq))
1292 continue;
1293
1294 if (IO_APIC_IRQ(irq)) {
1295 vector = assign_irq_vector(irq);
1296 entry.vector = vector;
1297 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1298
1299 if (!apic && (irq < 16))
1300 disable_8259A_irq(irq);
1301 }
1302 ioapic_write_entry(apic, pin, entry);
1303 }
1304 }
1305
1306 if (!first_notcon)
1307 apic_printk(APIC_VERBOSE, " not connected.\n");
1308}
1309
1310/*
1311 * Set up the timer pin, possibly with the 8259A-master behind.
1312 */
1313static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1314 int vector)
1315{
1316 struct IO_APIC_route_entry entry;
1317
1318 memset(&entry, 0, sizeof(entry));
1319
1320 /*
1321 * We use logical delivery to get the timer IRQ
1322 * to the first CPU.
1323 */
1324 entry.dest_mode = INT_DEST_MODE;
1325 entry.mask = 1; /* mask IRQ now */
1326 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1327 entry.delivery_mode = INT_DELIVERY_MODE;
1328 entry.polarity = 0;
1329 entry.trigger = 0;
1330 entry.vector = vector;
1331
1332 /*
1333 * The timer IRQ doesn't have to know that behind the
1334	 * scenes we may have an 8259A-master in AEOI mode ...
1335 */
1336 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1337
1338 /*
1339 * Add it to the IO-APIC irq-routing table:
1340 */
1341 ioapic_write_entry(apic, pin, entry);
1342}
1343
1344void __init print_IO_APIC(void)
1345{
1346 int apic, i;
1347 union IO_APIC_reg_00 reg_00;
1348 union IO_APIC_reg_01 reg_01;
1349 union IO_APIC_reg_02 reg_02;
1350 union IO_APIC_reg_03 reg_03;
1351 unsigned long flags;
1352
1353 if (apic_verbosity == APIC_QUIET)
1354 return;
1355
1356 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1357 for (i = 0; i < nr_ioapics; i++)
1358 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1359 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1360
1361 /*
1362 * We are a bit conservative about what we expect. We have to
1363 * know about every hardware change ASAP.
1364 */
1365 printk(KERN_INFO "testing the IO APIC.......................\n");
1366
1367 for (apic = 0; apic < nr_ioapics; apic++) {
1368
1369 spin_lock_irqsave(&ioapic_lock, flags);
1370 reg_00.raw = io_apic_read(apic, 0);
1371 reg_01.raw = io_apic_read(apic, 1);
1372 if (reg_01.bits.version >= 0x10)
1373 reg_02.raw = io_apic_read(apic, 2);
1374 if (reg_01.bits.version >= 0x20)
1375 reg_03.raw = io_apic_read(apic, 3);
1376 spin_unlock_irqrestore(&ioapic_lock, flags);
1377
1378 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1379 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1380 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1381 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1382 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1383
1384 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1385 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1386
1387 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1388 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1389
1390 /*
1391 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1392 * but the value of reg_02 is read as the previous read register
1393 * value, so ignore it if reg_02 == reg_01.
1394 */
1395 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1396 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1397 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1398 }
1399
1400 /*
1401 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1402 * or reg_03, but the value of reg_0[23] is read as the previous read
1403 * register value, so ignore it if reg_03 == reg_0[12].
1404 */
1405 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1406 reg_03.raw != reg_01.raw) {
1407 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1408 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1409 }
1410
1411 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1412
1413 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1414 " Stat Dest Deli Vect: \n");
1415
1416 for (i = 0; i <= reg_01.bits.entries; i++) {
1417 struct IO_APIC_route_entry entry;
1418
1419 entry = ioapic_read_entry(apic, i);
1420
1421 printk(KERN_DEBUG " %02x %03X %02X ",
1422 i,
1423 entry.dest.logical.logical_dest,
1424 entry.dest.physical.physical_dest
1425 );
1426
1427 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1428 entry.mask,
1429 entry.trigger,
1430 entry.irr,
1431 entry.polarity,
1432 entry.delivery_status,
1433 entry.dest_mode,
1434 entry.delivery_mode,
1435 entry.vector
1436 );
1437 }
1438 }
1439 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1440 for (i = 0; i < NR_IRQS; i++) {
1441 struct irq_pin_list *entry = irq_2_pin + i;
1442 if (entry->pin < 0)
1443 continue;
1444 printk(KERN_DEBUG "IRQ%d ", i);
1445 for (;;) {
1446 printk("-> %d:%d", entry->apic, entry->pin);
1447 if (!entry->next)
1448 break;
1449 entry = irq_2_pin + entry->next;
1450 }
1451 printk("\n");
1452 }
1453
1454 printk(KERN_INFO ".................................... done.\n");
1455
1456 return;
1457}
1458
1459#if 0
1460
1461static void print_APIC_bitfield(int base)
1462{
1463 unsigned int v;
1464 int i, j;
1465
1466 if (apic_verbosity == APIC_QUIET)
1467 return;
1468
1469 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1470 for (i = 0; i < 8; i++) {
1471 v = apic_read(base + i*0x10);
1472 for (j = 0; j < 32; j++) {
1473 if (v & (1<<j))
1474 printk("1");
1475 else
1476 printk("0");
1477 }
1478 printk("\n");
1479 }
1480}
1481
1482void /*__init*/ print_local_APIC(void *dummy)
1483{
1484 unsigned int v, ver, maxlvt;
1485
1486 if (apic_verbosity == APIC_QUIET)
1487 return;
1488
1489 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1490 smp_processor_id(), hard_smp_processor_id());
1491 v = apic_read(APIC_ID);
1492 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1493 GET_APIC_ID(read_apic_id()));
1494 v = apic_read(APIC_LVR);
1495 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1496 ver = GET_APIC_VERSION(v);
1497 maxlvt = lapic_get_maxlvt();
1498
1499 v = apic_read(APIC_TASKPRI);
1500 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1501
1502 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1503 v = apic_read(APIC_ARBPRI);
1504 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1505 v & APIC_ARBPRI_MASK);
1506 v = apic_read(APIC_PROCPRI);
1507 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1508 }
1509
1510 v = apic_read(APIC_EOI);
1511 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1512 v = apic_read(APIC_RRR);
1513 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1514 v = apic_read(APIC_LDR);
1515 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1516 v = apic_read(APIC_DFR);
1517 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1518 v = apic_read(APIC_SPIV);
1519 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1520
1521 printk(KERN_DEBUG "... APIC ISR field:\n");
1522 print_APIC_bitfield(APIC_ISR);
1523 printk(KERN_DEBUG "... APIC TMR field:\n");
1524 print_APIC_bitfield(APIC_TMR);
1525 printk(KERN_DEBUG "... APIC IRR field:\n");
1526 print_APIC_bitfield(APIC_IRR);
1527
1528 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1529 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1530 apic_write(APIC_ESR, 0);
1531 v = apic_read(APIC_ESR);
1532 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1533 }
1534
1535 v = apic_read(APIC_ICR);
1536 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1537 v = apic_read(APIC_ICR2);
1538 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1539
1540 v = apic_read(APIC_LVTT);
1541 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1542
1543 if (maxlvt > 3) { /* PC is LVT#4. */
1544 v = apic_read(APIC_LVTPC);
1545 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1546 }
1547 v = apic_read(APIC_LVT0);
1548 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1549 v = apic_read(APIC_LVT1);
1550 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1551
1552 if (maxlvt > 2) { /* ERR is LVT#3. */
1553 v = apic_read(APIC_LVTERR);
1554 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1555 }
1556
1557 v = apic_read(APIC_TMICT);
1558 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1559 v = apic_read(APIC_TMCCT);
1560 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1561 v = apic_read(APIC_TDCR);
1562 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1563 printk("\n");
1564}
1565
1566void print_all_local_APICs(void)
1567{
1568 on_each_cpu(print_local_APIC, NULL, 1);
1569}
1570
1571void /*__init*/ print_PIC(void)
1572{
1573 unsigned int v;
1574 unsigned long flags;
1575
1576 if (apic_verbosity == APIC_QUIET)
1577 return;
1578
1579 printk(KERN_DEBUG "\nprinting PIC contents\n");
1580
1581 spin_lock_irqsave(&i8259A_lock, flags);
1582
1583 v = inb(0xa1) << 8 | inb(0x21);
1584 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1585
1586 v = inb(0xa0) << 8 | inb(0x20);
1587 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1588
1589 outb(0x0b, 0xa0);
1590 outb(0x0b, 0x20);
1591 v = inb(0xa0) << 8 | inb(0x20);
1592 outb(0x0a, 0xa0);
1593 outb(0x0a, 0x20);
1594
1595 spin_unlock_irqrestore(&i8259A_lock, flags);
1596
1597 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1598
1599 v = inb(0x4d1) << 8 | inb(0x4d0);
1600 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1601}
1602
1603#endif /* 0 */
1604
1605static void __init enable_IO_APIC(void)
1606{
1607 union IO_APIC_reg_01 reg_01;
1608 int i8259_apic, i8259_pin;
1609 int i, apic;
1610 unsigned long flags;
1611
1612 for (i = 0; i < PIN_MAP_SIZE; i++) {
1613 irq_2_pin[i].pin = -1;
1614 irq_2_pin[i].next = 0;
1615 }
1616 if (!pirqs_enabled)
1617 for (i = 0; i < MAX_PIRQS; i++)
1618 pirq_entries[i] = -1;
1619
1620 /*
1621 * The number of IO-APIC IRQ registers (== #pins):
1622 */
1623 for (apic = 0; apic < nr_ioapics; apic++) {
1624 spin_lock_irqsave(&ioapic_lock, flags);
1625 reg_01.raw = io_apic_read(apic, 1);
1626 spin_unlock_irqrestore(&ioapic_lock, flags);
1627 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1628 }
1629 for (apic = 0; apic < nr_ioapics; apic++) {
1630 int pin;
1631 /* See if any of the pins is in ExtINT mode */
1632 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1633 struct IO_APIC_route_entry entry;
1634 entry = ioapic_read_entry(apic, pin);
1635
1636
1637 /* If the interrupt line is enabled and in ExtInt mode
1638 * I have found the pin where the i8259 is connected.
1639 */
1640 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1641 ioapic_i8259.apic = apic;
1642 ioapic_i8259.pin = pin;
1643 goto found_i8259;
1644 }
1645 }
1646 }
1647 found_i8259:
1648	/* Look to see if the MP table has reported the ExtINT */
1649	/* If we could not find the appropriate pin by looking at the ioapic,
1650	 * the i8259 probably is not connected to the ioapic, but give the
1651	 * mptable a chance anyway.
1652 */
1653 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1654 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1655 /* Trust the MP table if nothing is setup in the hardware */
1656 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1657 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1658 ioapic_i8259.pin = i8259_pin;
1659 ioapic_i8259.apic = i8259_apic;
1660 }
1661 /* Complain if the MP table and the hardware disagree */
1662 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1663 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1664 {
1665 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1666 }
1667
1668 /*
1669 * Do not trust the IO-APIC being empty at bootup
1670 */
1671 clear_IO_APIC();
1672}
1673
1674/*
1675 * Not an __init, needed by the reboot code
1676 */
1677void disable_IO_APIC(void)
1678{
1679 /*
1680 * Clear the IO-APIC before rebooting:
1681 */
1682 clear_IO_APIC();
1683
1684 /*
1685 * If the i8259 is routed through an IOAPIC
1686 * Put that IOAPIC in virtual wire mode
1687 * so legacy interrupts can be delivered.
1688 */
1689 if (ioapic_i8259.pin != -1) {
1690 struct IO_APIC_route_entry entry;
1691
1692 memset(&entry, 0, sizeof(entry));
1693 entry.mask = 0; /* Enabled */
1694 entry.trigger = 0; /* Edge */
1695 entry.irr = 0;
1696 entry.polarity = 0; /* High */
1697 entry.delivery_status = 0;
1698 entry.dest_mode = 0; /* Physical */
1699 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1700 entry.vector = 0;
1701 entry.dest.physical.physical_dest =
1702 GET_APIC_ID(read_apic_id());
1703
1704 /*
1705 * Add it to the IO-APIC irq-routing table:
1706 */
1707 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1708 }
1709 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1710}
1711
1712/*
1713 * function to set the IO-APIC physical IDs based on the
1714 * values stored in the MPC table.
1715 *
1716 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1717 */
1718
1719static void __init setup_ioapic_ids_from_mpc(void)
1720{
1721 union IO_APIC_reg_00 reg_00;
1722 physid_mask_t phys_id_present_map;
1723 int apic;
1724 int i;
1725 unsigned char old_id;
1726 unsigned long flags;
1727
1728#ifdef CONFIG_X86_NUMAQ
1729 if (found_numaq)
1730 return;
1731#endif
1732
1733 /*
1734 * Don't check I/O APIC IDs for xAPIC systems. They have
1735 * no meaning without the serial APIC bus.
1736 */
1737 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1738 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1739 return;
1740 /*
1741 * This is broken; anything with a real cpu count has to
1742 * circumvent this idiocy regardless.
1743 */
1744 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1745
1746 /*
1747 * Set the IOAPIC ID to the value stored in the MPC table.
1748 */
1749 for (apic = 0; apic < nr_ioapics; apic++) {
1750
1751 /* Read the register 0 value */
1752 spin_lock_irqsave(&ioapic_lock, flags);
1753 reg_00.raw = io_apic_read(apic, 0);
1754 spin_unlock_irqrestore(&ioapic_lock, flags);
1755
1756 old_id = mp_ioapics[apic].mp_apicid;
1757
1758 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1759 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1760 apic, mp_ioapics[apic].mp_apicid);
1761 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1762 reg_00.bits.ID);
1763 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1764 }
1765
1766 /*
1767 * Sanity check, is the ID really free? Every APIC in a
1768 * system must have a unique ID or we get lots of nice
1769 * 'stuck on smp_invalidate_needed IPI wait' messages.
1770 */
1771 if (check_apicid_used(phys_id_present_map,
1772 mp_ioapics[apic].mp_apicid)) {
1773 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1774 apic, mp_ioapics[apic].mp_apicid);
1775 for (i = 0; i < get_physical_broadcast(); i++)
1776 if (!physid_isset(i, phys_id_present_map))
1777 break;
1778 if (i >= get_physical_broadcast())
1779 panic("Max APIC ID exceeded!\n");
1780 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1781 i);
1782 physid_set(i, phys_id_present_map);
1783 mp_ioapics[apic].mp_apicid = i;
1784 } else {
1785 physid_mask_t tmp;
1786 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1787 apic_printk(APIC_VERBOSE, "Setting %d in the "
1788 "phys_id_present_map\n",
1789 mp_ioapics[apic].mp_apicid);
1790 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1791 }
1792
1793
1794 /*
1795 * We need to adjust the IRQ routing table
1796 * if the ID changed.
1797 */
1798 if (old_id != mp_ioapics[apic].mp_apicid)
1799 for (i = 0; i < mp_irq_entries; i++)
1800 if (mp_irqs[i].mp_dstapic == old_id)
1801 mp_irqs[i].mp_dstapic
1802 = mp_ioapics[apic].mp_apicid;
1803
1804 /*
1805 * Read the right value from the MPC table and
1806 * write it into the ID register.
1807 */
1808 apic_printk(APIC_VERBOSE, KERN_INFO
1809 "...changing IO-APIC physical APIC ID to %d ...",
1810 mp_ioapics[apic].mp_apicid);
1811
1812 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1813 spin_lock_irqsave(&ioapic_lock, flags);
1814 io_apic_write(apic, 0, reg_00.raw);
1815 spin_unlock_irqrestore(&ioapic_lock, flags);
1816
1817 /*
1818 * Sanity check
1819 */
1820 spin_lock_irqsave(&ioapic_lock, flags);
1821 reg_00.raw = io_apic_read(apic, 0);
1822 spin_unlock_irqrestore(&ioapic_lock, flags);
1823 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1824 printk("could not set ID!\n");
1825 else
1826 apic_printk(APIC_VERBOSE, " ok.\n");
1827 }
1828}
1829
1830int no_timer_check __initdata;
1831
1832static int __init notimercheck(char *s)
1833{
1834 no_timer_check = 1;
1835 return 1;
1836}
1837__setup("no_timer_check", notimercheck);
1838
1839/*
1840 * There is a nasty bug in some older SMP boards, their mptable lies
1841 * about the timer IRQ. We do the following to work around the situation:
1842 *
1843 * - timer IRQ defaults to IO-APIC IRQ
1844 * - if this function detects that timer IRQs are defunct, then we fall
1845 * back to ISA timer IRQs
1846 */
1847static int __init timer_irq_works(void)
1848{
1849 unsigned long t1 = jiffies;
1850 unsigned long flags;
1851
1852 if (no_timer_check)
1853 return 1;
1854
1855 local_save_flags(flags);
1856 local_irq_enable();
1857 /* Let ten ticks pass... */
1858 mdelay((10 * 1000) / HZ);
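	/*
	 * Illustrative: with HZ=100 this is mdelay(100), i.e. 100 ms; the
	 * time_after() check below then only insists that more than 4 of
	 * those ~10 expected ticks actually arrived.
	 */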
1859 local_irq_restore(flags);
1860
1861 /*
1862 * Expect a few ticks at least, to be sure some possible
1863 * glue logic does not lock up after one or two first
1864 * ticks in a non-ExtINT mode. Also the local APIC
1865 * might have cached one ExtINT interrupt. Finally, at
1866 * least one tick may be lost due to delays.
1867 */
1868 if (time_after(jiffies, t1 + 4))
1869 return 1;
1870
1871 return 0;
1872}
1873
1874/*
1875 * In the SMP+IOAPIC case it might happen that there are an unspecified
1876 * number of pending IRQ events unhandled. These cases are very rare,
1877 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1878 * better to do it this way as thus we do not have to be aware of
1879 * 'pending' interrupts in the IRQ path, except at this point.
1880 */
1881/*
1882 * Edge-triggered IRQs need to resend any interrupt
1883 * that was delayed, but this is now handled in the device
1884 * independent code.
1885 */
1886
1887/*
1888 * Startup quirk:
1889 *
1890 * Starting up an edge-triggered IO-APIC interrupt is
1891 * nasty - we need to make sure that we get the edge.
1892 * If it is already asserted for some reason, we need to
1893 * return 1 to indicate that it was pending.
1894 *
1895 * This is not complete - we should be able to fake
1896 * an edge even if it isn't on the 8259A...
1897 *
1898 * (We do this for level-triggered IRQs too - it cannot hurt.)
1899 */
1900static unsigned int startup_ioapic_irq(unsigned int irq)
1901{
1902 int was_pending = 0;
1903 unsigned long flags;
1904
1905 spin_lock_irqsave(&ioapic_lock, flags);
1906 if (irq < 16) {
1907 disable_8259A_irq(irq);
1908 if (i8259A_irq_pending(irq))
1909 was_pending = 1;
1910 }
1911 __unmask_IO_APIC_irq(irq);
1912 spin_unlock_irqrestore(&ioapic_lock, flags);
1913
1914 return was_pending;
1915}
1916
1917static void ack_ioapic_irq(unsigned int irq)
1918{
1919 move_native_irq(irq);
1920 ack_APIC_irq();
1921}
1922
1923static void ack_ioapic_quirk_irq(unsigned int irq)
1924{
1925 unsigned long v;
1926 int i;
1927
1928 move_native_irq(irq);
1929/*
1930 * It appears there is an erratum which affects at least version 0x11
1931 * of I/O APIC (that's the 82093AA and cores integrated into various
1932 * chipsets). Under certain conditions a level-triggered interrupt is
1933 * erroneously delivered as an edge-triggered one but the respective IRR
1934 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1935 * message but it will never arrive and further interrupts are blocked
1936 * from the source. The exact reason is so far unknown, but the
1937 * phenomenon was observed when two consecutive interrupt requests
1938 * from a given source get delivered to the same CPU and the source is
1939 * temporarily disabled in between.
1940 *
1941 * A workaround is to simulate an EOI message manually. We achieve it
1942 * by setting the trigger mode to edge and then to level when the edge
1943 * trigger mode gets detected in the TMR of a local APIC for a
1944 * level-triggered interrupt. We mask the source for the time of the
1945 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1946 * The idea is from Manfred Spraul. --macro
1947 */
1948 i = irq_vector[irq];
1949
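	/*
	 * Worked example: each TMR word covers 32 vectors at 0x10-byte
	 * register spacing, hence (i & ~0x1f) >> 1. For vector i = 0x51
	 * the read below hits APIC_TMR + 0x20 and the test afterwards
	 * uses bit (0x51 & 0x1f) == 0x11.
	 */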
1950 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1951
1952 ack_APIC_irq();
1953
1954 if (!(v & (1 << (i & 0x1f)))) {
1955 atomic_inc(&irq_mis_count);
1956 spin_lock(&ioapic_lock);
1957 __mask_and_edge_IO_APIC_irq(irq);
1958 __unmask_and_level_IO_APIC_irq(irq);
1959 spin_unlock(&ioapic_lock);
1960 }
1961}
1962
1963static int ioapic_retrigger_irq(unsigned int irq)
1964{
1965 send_IPI_self(irq_vector[irq]);
1966
1967 return 1;
1968}
1969
1970static struct irq_chip ioapic_chip __read_mostly = {
1971 .name = "IO-APIC",
1972 .startup = startup_ioapic_irq,
1973 .mask = mask_IO_APIC_irq,
1974 .unmask = unmask_IO_APIC_irq,
1975 .ack = ack_ioapic_irq,
1976 .eoi = ack_ioapic_quirk_irq,
1977#ifdef CONFIG_SMP
1978 .set_affinity = set_ioapic_affinity_irq,
1979#endif
1980 .retrigger = ioapic_retrigger_irq,
1981};
1982
1983
1984static inline void init_IO_APIC_traps(void)
1985{
1986 int irq;
1987
1988 /*
1989 * NOTE! The local APIC isn't very good at handling
1990 * multiple interrupts at the same interrupt level.
1991 * As the interrupt level is determined by taking the
1992 * vector number and shifting that right by 4, we
1993 * want to spread these out a bit so that they don't
1994 * all fall in the same interrupt level.
1995 *
1996 * Also, we've got to be careful not to trash gate
1997 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1998 */
1999	for (irq = 0; irq < NR_IRQS; irq++) {
2000 if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2001 /*
2002 * Hmm.. We don't have an entry for this,
2003 * so default to an old-fashioned 8259
2004 * interrupt if we can..
2005 */
2006 if (irq < 16)
2007 make_8259A_irq(irq);
2008 else
2009 /* Strange. Oh, well.. */
2010 irq_desc[irq].chip = &no_irq_chip;
2011 }
2012 }
2013}
2014
2015/*
2016 * The local APIC irq-chip implementation:
2017 */
2018
2019static void ack_lapic_irq(unsigned int irq)
2020{
2021 ack_APIC_irq();
2022}
2023
2024static void mask_lapic_irq(unsigned int irq)
2025{
2026 unsigned long v;
2027
2028 v = apic_read(APIC_LVT0);
2029 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2030}
2031
2032static void unmask_lapic_irq(unsigned int irq)
2033{
2034 unsigned long v;
2035
2036 v = apic_read(APIC_LVT0);
2037 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2038}
2039
2040static struct irq_chip lapic_chip __read_mostly = {
2041 .name = "local-APIC",
2042 .mask = mask_lapic_irq,
2043 .unmask = unmask_lapic_irq,
2044 .ack = ack_lapic_irq,
2045};
2046
2047static void lapic_register_intr(int irq, int vector)
2048{
2049 irq_desc[irq].status &= ~IRQ_LEVEL;
2050 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2051 "edge");
2052 set_intr_gate(vector, interrupt[irq]);
2053}
2054
2055static void __init setup_nmi(void)
2056{
2057 /*
2058 * Dirty trick to enable the NMI watchdog ...
2059 * We put the 8259A master into AEOI mode and
2060 * unmask on all local APICs LVT0 as NMI.
2061 *
2062 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2063 * is from Maciej W. Rozycki - so we do not have to EOI from
2064 * the NMI handler or the timer interrupt.
2065 */
2066 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2067
2068 enable_NMI_through_LVT0();
2069
2070 apic_printk(APIC_VERBOSE, " done.\n");
2071}
2072
2073/*
2074 * This looks a bit hackish but it's about the only way of sending
2075 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2076 * not support the ExtINT mode, unfortunately. We need to send these
2077 * cycles as some i82489DX-based boards have glue logic that keeps the
2078 * 8259A interrupt line asserted until INTA. --macro
2079 */
2080static inline void __init unlock_ExtINT_logic(void)
2081{
2082 int apic, pin, i;
2083 struct IO_APIC_route_entry entry0, entry1;
2084 unsigned char save_control, save_freq_select;
2085
2086 pin = find_isa_irq_pin(8, mp_INT);
2087 if (pin == -1) {
2088 WARN_ON_ONCE(1);
2089 return;
2090 }
2091 apic = find_isa_irq_apic(8, mp_INT);
2092 if (apic == -1) {
2093 WARN_ON_ONCE(1);
2094 return;
2095 }
2096
2097 entry0 = ioapic_read_entry(apic, pin);
2098 clear_IO_APIC_pin(apic, pin);
2099
2100 memset(&entry1, 0, sizeof(entry1));
2101
2102 entry1.dest_mode = 0; /* physical delivery */
2103 entry1.mask = 0; /* unmask IRQ now */
2104 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2105 entry1.delivery_mode = dest_ExtINT;
2106 entry1.polarity = entry0.polarity;
2107 entry1.trigger = 0;
2108 entry1.vector = 0;
2109
2110 ioapic_write_entry(apic, pin, entry1);
2111
2112 save_control = CMOS_READ(RTC_CONTROL);
2113 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2114 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2115 RTC_FREQ_SELECT);
2116 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2117
2118 i = 100;
2119 while (i-- > 0) {
2120 mdelay(10);
2121 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2122 i -= 10;
2123 }
2124
2125 CMOS_WRITE(save_control, RTC_CONTROL);
2126 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2127 clear_IO_APIC_pin(apic, pin);
2128
2129 ioapic_write_entry(apic, pin, entry0);
2130}
2131
2132/*
2133 * This code may look a bit paranoid, but it's supposed to cooperate with
2134 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2135 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2136 * fanatically on his truly buggy board.
2137 */
2138static inline void __init check_timer(void)
2139{
2140 int apic1, pin1, apic2, pin2;
2141 int no_pin1 = 0;
2142 int vector;
2143 unsigned int ver;
2144 unsigned long flags;
2145
2146 local_irq_save(flags);
2147
2148 ver = apic_read(APIC_LVR);
2149 ver = GET_APIC_VERSION(ver);
2150
2151 /*
2152 * get/set the timer IRQ vector:
2153 */
2154 disable_8259A_irq(0);
2155 vector = assign_irq_vector(0);
2156 set_intr_gate(vector, interrupt[0]);
2157
2158 /*
2159 * As IRQ0 is to be enabled in the 8259A, the virtual
2160 * wire has to be disabled in the local APIC. Also
2161 * timer interrupts need to be acknowledged manually in
2162 * the 8259A for the i82489DX when using the NMI
2163 * watchdog as that APIC treats NMIs as level-triggered.
2164 * The AEOI mode will finish them in the 8259A
2165 * automatically.
2166 */
2167 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2168 init_8259A(1);
2169 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2170
2171 pin1 = find_isa_irq_pin(0, mp_INT);
2172 apic1 = find_isa_irq_apic(0, mp_INT);
2173 pin2 = ioapic_i8259.pin;
2174 apic2 = ioapic_i8259.apic;
2175
2176 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
2177 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2178 vector, apic1, pin1, apic2, pin2);
2179
2180 /*
2181 * Some BIOS writers are clueless and report the ExtINTA
2182 * I/O APIC input from the cascaded 8259A as the timer
2183 * interrupt input. So just in case, if only one pin
2184 * was found above, try it both directly and through the
2185 * 8259A.
2186 */
2187 if (pin1 == -1) {
2188 pin1 = pin2;
2189 apic1 = apic2;
2190 no_pin1 = 1;
2191 } else if (pin2 == -1) {
2192 pin2 = pin1;
2193 apic2 = apic1;
2194 }
2195
2196 if (pin1 != -1) {
2197 /*
2198 * Ok, does IRQ0 through the IOAPIC work?
2199 */
2200 if (no_pin1) {
2201 add_pin_to_irq(0, apic1, pin1);
2202 setup_timer_IRQ0_pin(apic1, pin1, vector);
2203 }
2204 unmask_IO_APIC_irq(0);
2205 if (timer_irq_works()) {
2206 if (nmi_watchdog == NMI_IO_APIC) {
2207 setup_nmi();
2208 enable_8259A_irq(0);
2209 }
2210 if (disable_timer_pin_1 > 0)
2211 clear_IO_APIC_pin(0, pin1);
2212 goto out;
2213 }
2214 clear_IO_APIC_pin(apic1, pin1);
2215 if (!no_pin1)
2216 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
2217 "8254 timer not connected to IO-APIC\n");
2218
2219 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
2220 "(IRQ0) through the 8259A ...\n");
2221 apic_printk(APIC_QUIET, KERN_INFO
2222 "..... (found apic %d pin %d) ...\n", apic2, pin2);
2223 /*
2224 * legacy devices should be connected to IO APIC #0
2225 */
2226 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2227 setup_timer_IRQ0_pin(apic2, pin2, vector);
2228 unmask_IO_APIC_irq(0);
2229 enable_8259A_irq(0);
2230 if (timer_irq_works()) {
2231 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2232 timer_through_8259 = 1;
2233 if (nmi_watchdog == NMI_IO_APIC) {
2234 disable_8259A_irq(0);
2235 setup_nmi();
2236 enable_8259A_irq(0);
2237 }
2238 goto out;
2239 }
2240 /*
2241 * Cleanup, just in case ...
2242 */
2243 disable_8259A_irq(0);
2244 clear_IO_APIC_pin(apic2, pin2);
2245 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2246 }
2247
2248 if (nmi_watchdog == NMI_IO_APIC) {
2249 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2250 "through the IO-APIC - disabling NMI Watchdog!\n");
2251 nmi_watchdog = NMI_NONE;
2252 }
2253 timer_ack = 0;
2254
2255 apic_printk(APIC_QUIET, KERN_INFO
2256 "...trying to set up timer as Virtual Wire IRQ...\n");
2257
2258 lapic_register_intr(0, vector);
2259 apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0);
2261
2262 if (timer_irq_works()) {
2263 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2264 goto out;
2265 }
2266 disable_8259A_irq(0);
2267 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2269
2270 apic_printk(APIC_QUIET, KERN_INFO
2271 "...trying to set up timer as ExtINT IRQ...\n");
2272
2273 init_8259A(0);
2274 make_8259A_irq(0);
2275 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2276
2277 unlock_ExtINT_logic();
2278
2279 if (timer_irq_works()) {
2280 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2281 goto out;
2282 }
2283 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
2284 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2285 "report. Then try booting with the 'noapic' option.\n");
2286out:
2287 local_irq_restore(flags);
2288}
2289
2290/*
2291 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2292 * to devices. However there may be an I/O APIC pin available for
2293 * this interrupt regardless. The pin may be left unconnected, but
2294 * typically it will be reused as an ExtINT cascade interrupt for
2295 * the master 8259A. In the MPS case such a pin will normally be
2296 * reported as an ExtINT interrupt in the MP table. With ACPI
2297 * there is no provision for ExtINT interrupts, and in the absence
2298 * of an override it would be treated as an ordinary ISA I/O APIC
2299 * interrupt, that is edge-triggered and unmasked by default. We
2300 * used to do this, but it caused problems on some systems because
2301 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2302 * the same ExtINT cascade interrupt to drive the local APIC of the
2303 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2304 * the I/O APIC in all cases now. No actual device should request
2305 * it anyway. --macro
2306 */
2307#define PIC_IRQS (1 << PIC_CASCADE_IR)
2308
2309void __init setup_IO_APIC(void)
2310{
2311 int i;
2312
2313 /* Reserve all the system vectors. */
2314 for (i = first_system_vector; i < NR_VECTORS; i++)
2315 set_bit(i, used_vectors);
2316
2317 enable_IO_APIC();
2318
2319 io_apic_irqs = ~PIC_IRQS;
2320
2321 printk("ENABLING IO-APIC IRQs\n");
2322
2323 /*
2324 * Set up IO-APIC IRQ routing.
2325 */
2326 if (!acpi_ioapic)
2327 setup_ioapic_ids_from_mpc();
2328 sync_Arb_IDs();
2329 setup_IO_APIC_irqs();
2330 init_IO_APIC_traps();
2331 check_timer();
2332 if (!acpi_ioapic)
2333 print_IO_APIC();
2334}
2335
2336/*
2337 * Called after all the initialization is done. If we didn't find any
2338 * APIC bugs, we can allow the modify fast path.
2339 */
2340
2341static int __init io_apic_bug_finalize(void)
2342{
2343 if (sis_apic_bug == -1)
2344 sis_apic_bug = 0;
2345 return 0;
2346}
2347
2348late_initcall(io_apic_bug_finalize);
2349
2350struct sysfs_ioapic_data {
2351 struct sys_device dev;
2352 struct IO_APIC_route_entry entry[0];
2353};
2354static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2355
2356static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2357{
2358 struct IO_APIC_route_entry *entry;
2359 struct sysfs_ioapic_data *data;
2360 int i;
2361
2362 data = container_of(dev, struct sysfs_ioapic_data, dev);
2363 entry = data->entry;
2364 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2365 entry[i] = ioapic_read_entry(dev->id, i);
2366
2367 return 0;
2368}
2369
2370static int ioapic_resume(struct sys_device *dev)
2371{
2372 struct IO_APIC_route_entry *entry;
2373 struct sysfs_ioapic_data *data;
2374 unsigned long flags;
2375 union IO_APIC_reg_00 reg_00;
2376 int i;
2377
2378 data = container_of(dev, struct sysfs_ioapic_data, dev);
2379 entry = data->entry;
2380
2381 spin_lock_irqsave(&ioapic_lock, flags);
2382 reg_00.raw = io_apic_read(dev->id, 0);
2383 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2384 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2385 io_apic_write(dev->id, 0, reg_00.raw);
2386 }
2387 spin_unlock_irqrestore(&ioapic_lock, flags);
2388 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2389 ioapic_write_entry(dev->id, i, entry[i]);
2390
2391 return 0;
2392}
2393
2394static struct sysdev_class ioapic_sysdev_class = {
2395 .name = "ioapic",
2396 .suspend = ioapic_suspend,
2397 .resume = ioapic_resume,
2398};
2399
2400static int __init ioapic_init_sysfs(void)
2401{
2402 struct sys_device *dev;
2403 int i, size, error = 0;
2404
2405 error = sysdev_class_register(&ioapic_sysdev_class);
2406 if (error)
2407 return error;
2408
2409 for (i = 0; i < nr_ioapics; i++) {
2410 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2411 * sizeof(struct IO_APIC_route_entry);
2412 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2413 if (!mp_ioapic_data[i]) {
2414 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2415 continue;
2416 }
2417 dev = &mp_ioapic_data[i]->dev;
2418 dev->id = i;
2419 dev->cls = &ioapic_sysdev_class;
2420 error = sysdev_register(dev);
2421 if (error) {
2422 kfree(mp_ioapic_data[i]);
2423 mp_ioapic_data[i] = NULL;
2424 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2425 continue;
2426 }
2427 }
2428
2429 return 0;
2430}
2431
2432device_initcall(ioapic_init_sysfs);
2433
2434/*
2435 * Dynamic irq allocate and deallocation
2436 */
2437int create_irq(void)
2438{
2439 /* Allocate an unused irq */
2440 int irq, new, vector = 0;
2441 unsigned long flags;
2442
2443 irq = -ENOSPC;
2444 spin_lock_irqsave(&vector_lock, flags);
2445 for (new = (NR_IRQS - 1); new >= 0; new--) {
2446 if (platform_legacy_irq(new))
2447 continue;
2448 if (irq_vector[new] != 0)
2449 continue;
2450 vector = __assign_irq_vector(new);
2451 if (likely(vector > 0))
2452 irq = new;
2453 break;
2454 }
2455 spin_unlock_irqrestore(&vector_lock, flags);
2456
2457 if (irq >= 0) {
2458 set_intr_gate(vector, interrupt[irq]);
2459 dynamic_irq_init(irq);
2460 }
2461 return irq;
2462}
2463
2464void destroy_irq(unsigned int irq)
2465{
2466 unsigned long flags;
2467
2468 dynamic_irq_cleanup(irq);
2469
2470 spin_lock_irqsave(&vector_lock, flags);
2471 clear_bit(irq_vector[irq], used_vectors);
2472 irq_vector[irq] = 0;
2473 spin_unlock_irqrestore(&vector_lock, flags);
2474}
2475
2476/*
2477 * MSI message composition
2478 */
2479#ifdef CONFIG_PCI_MSI
2480static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2481{
2482 int vector;
2483 unsigned dest;
2484
2485 vector = assign_irq_vector(irq);
2486 if (vector >= 0) {
2487 dest = cpu_mask_to_apicid(TARGET_CPUS);
2488
2489 msg->address_hi = MSI_ADDR_BASE_HI;
2490 msg->address_lo =
2491 MSI_ADDR_BASE_LO |
2492 ((INT_DEST_MODE == 0) ?
2493MSI_ADDR_DEST_MODE_PHYSICAL:
2494 MSI_ADDR_DEST_MODE_LOGICAL) |
2495 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2496 MSI_ADDR_REDIRECTION_CPU:
2497 MSI_ADDR_REDIRECTION_LOWPRI) |
2498 MSI_ADDR_DEST_ID(dest);
2499
2500 msg->data =
2501 MSI_DATA_TRIGGER_EDGE |
2502 MSI_DATA_LEVEL_ASSERT |
2503 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2504MSI_DATA_DELIVERY_FIXED:
2505 MSI_DATA_DELIVERY_LOWPRI) |
2506 MSI_DATA_VECTOR(vector);
2507 }
2508 return vector;
2509}
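/*
 * Illustrative (assuming the conventional MSI address layout): the
 * address built above is 0xfee00000 (MSI_ADDR_BASE_LO) with the
 * destination ID in bits 12-19 plus the mode/redirection-hint bits,
 * and msg->data carries the vector in its low byte -- which is why
 * set_msi_irq_affinity() below only needs to patch the vector and
 * destination-ID fields.
 */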
2510
2511#ifdef CONFIG_SMP
2512static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2513{
2514 struct msi_msg msg;
2515 unsigned int dest;
2516 cpumask_t tmp;
2517 int vector;
2518
2519 cpus_and(tmp, mask, cpu_online_map);
2520 if (cpus_empty(tmp))
2521 tmp = TARGET_CPUS;
2522
2523 vector = assign_irq_vector(irq);
2524 if (vector < 0)
2525 return;
2526
2527 dest = cpu_mask_to_apicid(mask);
2528
2529 read_msi_msg(irq, &msg);
2530
2531 msg.data &= ~MSI_DATA_VECTOR_MASK;
2532 msg.data |= MSI_DATA_VECTOR(vector);
2533 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2534 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2535
2536 write_msi_msg(irq, &msg);
2537 irq_desc[irq].affinity = mask;
2538}
2539#endif /* CONFIG_SMP */
2540
2541/*
2542 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2543 * which implement the MSI or MSI-X Capability Structure.
2544 */
2545static struct irq_chip msi_chip = {
2546 .name = "PCI-MSI",
2547 .unmask = unmask_msi_irq,
2548 .mask = mask_msi_irq,
2549 .ack = ack_ioapic_irq,
2550#ifdef CONFIG_SMP
2551 .set_affinity = set_msi_irq_affinity,
2552#endif
2553 .retrigger = ioapic_retrigger_irq,
2554};
2555
2556int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2557{
2558 struct msi_msg msg;
2559 int irq, ret;
2560 irq = create_irq();
2561 if (irq < 0)
2562 return irq;
2563
2564 ret = msi_compose_msg(dev, irq, &msg);
2565 if (ret < 0) {
2566 destroy_irq(irq);
2567 return ret;
2568 }
2569
2570 set_irq_msi(irq, desc);
2571 write_msi_msg(irq, &msg);
2572
2573 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2574 "edge");
2575
2576 return 0;
2577}
2578
2579void arch_teardown_msi_irq(unsigned int irq)
2580{
2581 destroy_irq(irq);
2582}
2583
2584#endif /* CONFIG_PCI_MSI */
2585
2586/*
2587 * Hypertransport interrupt support
2588 */
2589#ifdef CONFIG_HT_IRQ
2590
2591#ifdef CONFIG_SMP
2592
2593static void target_ht_irq(unsigned int irq, unsigned int dest)
2594{
2595 struct ht_irq_msg msg;
2596 fetch_ht_irq_msg(irq, &msg);
2597
2598 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2599 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2600
2601 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2602 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2603
2604 write_ht_irq_msg(irq, &msg);
2605}
2606
2607static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2608{
2609 unsigned int dest;
2610 cpumask_t tmp;
2611
2612 cpus_and(tmp, mask, cpu_online_map);
2613 if (cpus_empty(tmp))
2614 tmp = TARGET_CPUS;
2615
2616 cpus_and(mask, tmp, CPU_MASK_ALL);
2617
2618 dest = cpu_mask_to_apicid(mask);
2619
2620 target_ht_irq(irq, dest);
2621 irq_desc[irq].affinity = mask;
2622}
2623#endif
2624
2625static struct irq_chip ht_irq_chip = {
2626 .name = "PCI-HT",
2627 .mask = mask_ht_irq,
2628 .unmask = unmask_ht_irq,
2629 .ack = ack_ioapic_irq,
2630#ifdef CONFIG_SMP
2631 .set_affinity = set_ht_irq_affinity,
2632#endif
2633 .retrigger = ioapic_retrigger_irq,
2634};
2635
2636int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2637{
2638 int vector;
2639
2640 vector = assign_irq_vector(irq);
2641 if (vector >= 0) {
2642 struct ht_irq_msg msg;
2643 unsigned dest;
2644 cpumask_t tmp;
2645
2646 cpus_clear(tmp);
2647 cpu_set(vector >> 8, tmp);
2648 dest = cpu_mask_to_apicid(tmp);
2649
2650 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2651
2652 msg.address_lo =
2653 HT_IRQ_LOW_BASE |
2654 HT_IRQ_LOW_DEST_ID(dest) |
2655 HT_IRQ_LOW_VECTOR(vector) |
2656 ((INT_DEST_MODE == 0) ?
2657 HT_IRQ_LOW_DM_PHYSICAL :
2658 HT_IRQ_LOW_DM_LOGICAL) |
2659 HT_IRQ_LOW_RQEOI_EDGE |
2660 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2661 HT_IRQ_LOW_MT_FIXED :
2662 HT_IRQ_LOW_MT_ARBITRATED) |
2663 HT_IRQ_LOW_IRQ_MASKED;
2664
2665 write_ht_irq_msg(irq, &msg);
2666
2667 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2668 handle_edge_irq, "edge");
2669 }
2670 return vector;
2671}
2672#endif /* CONFIG_HT_IRQ */
2673
2674/* --------------------------------------------------------------------------
2675 ACPI-based IOAPIC Configuration
2676 -------------------------------------------------------------------------- */
2677
2678#ifdef CONFIG_ACPI
2679
2680int __init io_apic_get_unique_id(int ioapic, int apic_id)
2681{
2682 union IO_APIC_reg_00 reg_00;
2683 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2684 physid_mask_t tmp;
2685 unsigned long flags;
2686 int i = 0;
2687
2688 /*
2689 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2690 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
2691 * only support up to 16 on one shared APIC bus.
2692 *
2693 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2694 * advantage of new APIC bus architecture.
2695 */
2696
2697 if (physids_empty(apic_id_map))
2698 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2699
2700 spin_lock_irqsave(&ioapic_lock, flags);
2701 reg_00.raw = io_apic_read(ioapic, 0);
2702 spin_unlock_irqrestore(&ioapic_lock, flags);
2703
2704 if (apic_id >= get_physical_broadcast()) {
2705 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2706 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2707 apic_id = reg_00.bits.ID;
2708 }
2709
2710 /*
2711 * Every APIC in a system must have a unique ID or we get lots of nice
2712 * 'stuck on smp_invalidate_needed IPI wait' messages.
2713 */
2714 if (check_apicid_used(apic_id_map, apic_id)) {
2715
2716 for (i = 0; i < get_physical_broadcast(); i++) {
2717 if (!check_apicid_used(apic_id_map, i))
2718 break;
2719 }
2720
2721 if (i == get_physical_broadcast())
2722 panic("Max apic_id exceeded!\n");
2723
2724 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2725 "trying %d\n", ioapic, apic_id, i);
2726
2727 apic_id = i;
2728 }
2729
2730 tmp = apicid_to_cpu_present(apic_id);
2731 physids_or(apic_id_map, apic_id_map, tmp);
2732
2733 if (reg_00.bits.ID != apic_id) {
2734 reg_00.bits.ID = apic_id;
2735
2736 spin_lock_irqsave(&ioapic_lock, flags);
2737 io_apic_write(ioapic, 0, reg_00.raw);
2738 reg_00.raw = io_apic_read(ioapic, 0);
2739 spin_unlock_irqrestore(&ioapic_lock, flags);
2740
2741 /* Sanity check */
2742 if (reg_00.bits.ID != apic_id) {
2743 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2744 return -1;
2745 }
2746 }
2747
2748 apic_printk(APIC_VERBOSE, KERN_INFO
2749 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2750
2751 return apic_id;
2752}
2753
2754
2755int __init io_apic_get_version(int ioapic)
2756{
2757 union IO_APIC_reg_01 reg_01;
2758 unsigned long flags;
2759
2760 spin_lock_irqsave(&ioapic_lock, flags);
2761 reg_01.raw = io_apic_read(ioapic, 1);
2762 spin_unlock_irqrestore(&ioapic_lock, flags);
2763
2764 return reg_01.bits.version;
2765}
2766
2767
2768int __init io_apic_get_redir_entries(int ioapic)
2769{
2770 union IO_APIC_reg_01 reg_01;
2771 unsigned long flags;
2772
2773 spin_lock_irqsave(&ioapic_lock, flags);
2774 reg_01.raw = io_apic_read(ioapic, 1);
2775 spin_unlock_irqrestore(&ioapic_lock, flags);
2776
2777 return reg_01.bits.entries;
2778}
2779
2780
2781int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2782{
2783 struct IO_APIC_route_entry entry;
2784
2785 if (!IO_APIC_IRQ(irq)) {
2786 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2787 ioapic);
2788 return -EINVAL;
2789 }
2790
2791 /*
2792 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2793 * Note that we mask (disable) IRQs now -- these get enabled when the
2794 * corresponding device driver registers for this IRQ.
2795 */
2796
2797 memset(&entry, 0, sizeof(entry));
2798
2799 entry.delivery_mode = INT_DELIVERY_MODE;
2800 entry.dest_mode = INT_DEST_MODE;
2801 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2802 entry.trigger = edge_level;
2803 entry.polarity = active_high_low;
2804 entry.mask = 1;
2805
2806 /*
2807 * IRQs < 16 are already in the irq_2_pin[] map
2808 */
2809 if (irq >= 16)
2810 add_pin_to_irq(irq, ioapic, pin);
2811
2812 entry.vector = assign_irq_vector(irq);
2813
2814 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2815 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2816 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2817 edge_level, active_high_low);
2818
2819 ioapic_register_intr(irq, entry.vector, edge_level);
2820
2821 if (!ioapic && (irq < 16))
2822 disable_8259A_irq(irq);
2823
2824 ioapic_write_entry(ioapic, pin, entry);
2825
2826 return 0;
2827}
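/*
 * Illustrative: the usual caller here is the ACPI code (mp_register_gsi()
 * in kernels of this vintage), which for an ordinary PCI GSI passes
 * edge_level=1, active_high_low=1 -- level-triggered, active low --
 * leaving the entry masked until a driver actually requests the IRQ.
 */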
2828
2829int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2830{
2831 int i;
2832
2833 if (skip_ioapic_setup)
2834 return -1;
2835
2836 for (i = 0; i < mp_irq_entries; i++)
2837 if (mp_irqs[i].mp_irqtype == mp_INT &&
2838 mp_irqs[i].mp_srcbusirq == bus_irq)
2839 break;
2840 if (i >= mp_irq_entries)
2841 return -1;
2842
2843 *trigger = irq_trigger(i);
2844 *polarity = irq_polarity(i);
2845 return 0;
2846}
2847
2848#endif /* CONFIG_ACPI */
2849
2850static int __init parse_disable_timer_pin_1(char *arg)
2851{
2852 disable_timer_pin_1 = 1;
2853 return 0;
2854}
2855early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2856
2857static int __init parse_enable_timer_pin_1(char *arg)
2858{
2859 disable_timer_pin_1 = -1;
2860 return 0;
2861}
2862early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2863
2864static int __init parse_noapic(char *arg)
2865{
2866 /* disable IO-APIC */
2867 disable_ioapic_setup();
2868 return 0;
2869}
2870early_param("noapic", parse_noapic);
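The three handlers above are instances of one boilerplate shape. A minimal sketch of the same early_param pattern with a hypothetical flag name (not part of this patch):

    static int example_flag;

    static int __init parse_example_flag(char *arg)
    {
            example_flag = 1;
            return 0;       /* 0 tells the early-param parser the option was consumed */
    }
    early_param("example_flag", parse_example_flag);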
2871
2872void __init ioapic_init_mappings(void)
2873{
2874 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2875 int i;
2876
2877 for (i = 0; i < nr_ioapics; i++) {
2878 if (smp_found_config) {
2879 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2880 if (!ioapic_phys) {
2881 printk(KERN_ERR
2882 "WARNING: bogus zero IO-APIC "
2883 "address found in MPTABLE, "
2884 "disabling IO/APIC support!\n");
2885 smp_found_config = 0;
2886 skip_ioapic_setup = 1;
2887 goto fake_ioapic_page;
2888 }
2889 } else {
2890fake_ioapic_page:
2891 ioapic_phys = (unsigned long)
2892 alloc_bootmem_pages(PAGE_SIZE);
2893 ioapic_phys = __pa(ioapic_phys);
2894 }
2895 set_fixmap_nocache(idx, ioapic_phys);
2896 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2897 __fix_to_virt(idx), ioapic_phys);
2898 idx++;
2899 }
2900}
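The fallback path above is worth spelling out: when the MP table is missing or carries a bogus address, the fixmap slot still gets backed by a real page, so stray IO-APIC accesses hit harmless scratch memory instead of unmapped space. Condensed to its three steps, assuming the same helpers:

    void *page = alloc_bootmem_pages(PAGE_SIZE);   /* zeroed scratch page    */
    unsigned long phys = __pa(page);               /* virtual -> physical    */
    set_fixmap_nocache(FIX_IO_APIC_BASE_0, phys);  /* uncached fixmap window */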
2901
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c8..19191430274 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <asm/syscalls.h>
17 18
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, 20static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 3f7537b669d..f1c688e46f3 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
20 20
21#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
22#include <mach_apic.h> 22#include <mach_apic.h>
23#include <mach_ipi.h>
24
23/* 25/*
24 * the following functions deal with sending IPIs between CPUs. 26 * the following functions deal with sending IPIs between CPUs.
25 * 27 *
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
147} 149}
148 150
149/* must come after the send_IPI functions above for inlining */ 151/* must come after the send_IPI functions above for inlining */
150#include <mach_ipi.h>
151static int convert_apicid_to_cpu(int apic_id) 152static int convert_apicid_to_cpu(int apic_id)
152{ 153{
153 int i; 154 int i;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 00000000000..d1d4dc52f64
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
1/*
2 * Common interrupt code for 32 and 64 bit
3 */
4#include <linux/cpu.h>
5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h>
8
9#include <asm/apic.h>
10#include <asm/io_apic.h>
11#include <asm/smp.h>
12
13atomic_t irq_err_count;
14
15/*
16 * 'What should we do if we get a hw irq event on an illegal vector?'
17 * Each architecture has to answer this itself.
18 */
19void ack_bad_irq(unsigned int irq)
20{
21 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
22
23#ifdef CONFIG_X86_LOCAL_APIC
24 /*
25 * Currently unexpected vectors happen only on SMP and APIC.
26 * We _must_ ack these because every local APIC has only N
27 * irq slots per priority level, and a 'hanging, unacked' IRQ
28 * holds up an irq slot - in excessive cases (when multiple
29 * unexpected vectors occur) that might lock up the APIC
30 * completely.
31 * But only ack when the APIC is enabled -AK
32 */
33 if (cpu_has_apic)
34 ack_APIC_irq();
35#endif
36}
37
38#ifdef CONFIG_X86_32
39# define irq_stats(x) (&per_cpu(irq_stat, x))
40#else
41# define irq_stats(x) cpu_pda(x)
42#endif
43/*
44 * /proc/interrupts printing:
45 */
46static int show_other_interrupts(struct seq_file *p)
47{
48 int j;
49
50 seq_printf(p, "NMI: ");
51 for_each_online_cpu(j)
52 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
53 seq_printf(p, " Non-maskable interrupts\n");
54#ifdef CONFIG_X86_LOCAL_APIC
55 seq_printf(p, "LOC: ");
56 for_each_online_cpu(j)
57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
58 seq_printf(p, " Local timer interrupts\n");
59#endif
60#ifdef CONFIG_SMP
61 seq_printf(p, "RES: ");
62 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
64 seq_printf(p, " Rescheduling interrupts\n");
65 seq_printf(p, "CAL: ");
66 for_each_online_cpu(j)
67 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
68 seq_printf(p, " Function call interrupts\n");
69 seq_printf(p, "TLB: ");
70 for_each_online_cpu(j)
71 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
72 seq_printf(p, " TLB shootdowns\n");
73#endif
74#ifdef CONFIG_X86_MCE
75 seq_printf(p, "TRM: ");
76 for_each_online_cpu(j)
77 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
78 seq_printf(p, " Thermal event interrupts\n");
79# ifdef CONFIG_X86_64
80 seq_printf(p, "THR: ");
81 for_each_online_cpu(j)
82 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
83 seq_printf(p, " Threshold APIC interrupts\n");
84# endif
85#endif
86#ifdef CONFIG_X86_LOCAL_APIC
87 seq_printf(p, "SPU: ");
88 for_each_online_cpu(j)
89 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
90 seq_printf(p, " Spurious interrupts\n");
91#endif
92 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
93#if defined(CONFIG_X86_IO_APIC)
94 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
95#endif
96 return 0;
97}
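Every row printed above follows the same tag / per-CPU counters / description shape. A hypothetical helper that would factor that shape out (a sketch, not code from this patch):

    static void show_irq_row(struct seq_file *p, const char *tag,
                             const char *desc, u32 (*get)(int cpu))
    {
            int j;

            seq_printf(p, "%s: ", tag);
            for_each_online_cpu(j)
                    seq_printf(p, "%10u ", get(j));
            seq_printf(p, " %s\n", desc);
    }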
98
99int show_interrupts(struct seq_file *p, void *v)
100{
101 unsigned long flags, any_count = 0;
102 int i = *(loff_t *) v, j;
103 struct irqaction *action;
104 struct irq_desc *desc;
105
106 if (i > nr_irqs)
107 return 0;
108
109 if (i == nr_irqs)
110 return show_other_interrupts(p);
111
112 /* print header */
113 if (i == 0) {
114 seq_printf(p, " ");
115 for_each_online_cpu(j)
116 seq_printf(p, "CPU%-8d", j);
117 seq_putc(p, '\n');
118 }
119
120 desc = irq_to_desc(i);
121 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i);
124#else
125 for_each_online_cpu(j)
126 any_count |= kstat_irqs_cpu(i, j);
127#endif
128 action = desc->action;
129 if (!action && !any_count)
130 goto out;
131
132 seq_printf(p, "%3d: ", i);
133#ifndef CONFIG_SMP
134 seq_printf(p, "%10u ", kstat_irqs(i));
135#else
136 for_each_online_cpu(j)
137 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
138#endif
139 seq_printf(p, " %8s", desc->chip->name);
140 seq_printf(p, "-%-8s", desc->name);
141
142 if (action) {
143 seq_printf(p, " %s", action->name);
144 while ((action = action->next) != NULL)
145 seq_printf(p, ", %s", action->name);
146 }
147
148 seq_putc(p, '\n');
149out:
150 spin_unlock_irqrestore(&desc->lock, flags);
151 return 0;
152}
153
154/*
155 * /proc/stat helpers
156 */
157u64 arch_irq_stat_cpu(unsigned int cpu)
158{
159 u64 sum = irq_stats(cpu)->__nmi_count;
160
161#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs;
163#endif
164#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count;
166 sum += irq_stats(cpu)->irq_call_count;
167 sum += irq_stats(cpu)->irq_tlb_count;
168#endif
169#ifdef CONFIG_X86_MCE
170 sum += irq_stats(cpu)->irq_thermal_count;
171# ifdef CONFIG_X86_64
172 sum += irq_stats(cpu)->irq_threshold_count;
173#endif
174#endif
175#ifdef CONFIG_X86_LOCAL_APIC
176 sum += irq_stats(cpu)->irq_spurious_count;
177#endif
178 return sum;
179}
180
181u64 arch_irq_stat(void)
182{
183 u64 sum = atomic_read(&irq_err_count);
184
185#ifdef CONFIG_X86_IO_APIC
186 sum += atomic_read(&irq_mis_count);
187#endif
188 return sum;
189}
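For orientation: the generic /proc/stat code combines the two hooks above roughly as sketched below (a paraphrase of the caller's intent, not the exact fs/proc source):

    u64 total = arch_irq_stat();                /* global ERR/MIS counters  */
    for_each_possible_cpu(cpu)
            total += arch_irq_stat_cpu(cpu);    /* NMI, LOC, RES, CAL, ...  */
    /* the ordinary per-IRQ kstat counters are added separately */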
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 1cf8c1fcc08..a51382672de 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
25DEFINE_PER_CPU(struct pt_regs *, irq_regs); 25DEFINE_PER_CPU(struct pt_regs *, irq_regs);
26EXPORT_PER_CPU_SYMBOL(irq_regs); 26EXPORT_PER_CPU_SYMBOL(irq_regs);
27 27
28/*
29 * 'what should we do if we get a hw irq event on an illegal vector'.
30 * each architecture has to answer this themselves.
31 */
32void ack_bad_irq(unsigned int irq)
33{
34 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37 /*
38 * Currently unexpected vectors happen only on SMP and APIC.
39 * We _must_ ack these because every local APIC has only N
40 * irq slots per priority level, and a 'hanging, unacked' IRQ
41 * holds up an irq slot - in excessive cases (when multiple
42 * unexpected vectors occur) that might lock up the APIC
43 * completely.
44 * But only ack when the APIC is enabled -AK
45 */
46 if (cpu_has_apic)
47 ack_APIC_irq();
48#endif
49}
50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW 28#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */ 29/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void) 30static int check_stack_overflow(void)
@@ -223,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
223{ 200{
224 struct pt_regs *old_regs; 201 struct pt_regs *old_regs;
225 /* high bit used in ret_from_ code */ 202 /* high bit used in ret_from_ code */
226 int overflow, irq = ~regs->orig_ax; 203 int overflow;
227 struct irq_desc *desc = irq_desc + irq; 204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc;
206 unsigned irq;
228 207
229 if (unlikely((unsigned)irq >= NR_IRQS)) {
230 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
231 __func__, irq);
232 BUG();
233 }
234 208
235 old_regs = set_irq_regs(regs); 209 old_regs = set_irq_regs(regs);
236 irq_enter(); 210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
237 212
238 overflow = check_stack_overflow(); 213 overflow = check_stack_overflow();
239 214
215 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) {
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221
240 if (!execute_on_irq_stack(overflow, desc, irq)) { 222 if (!execute_on_irq_stack(overflow, desc, irq)) {
241 if (unlikely(overflow)) 223 if (unlikely(overflow))
242 print_stack_overflow(); 224 print_stack_overflow();
@@ -248,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
248 return 1; 230 return 1;
249} 231}
250 232
251/*
252 * Interrupt statistics:
253 */
254
255atomic_t irq_err_count;
256
257/*
258 * /proc/interrupts printing:
259 */
260
261int show_interrupts(struct seq_file *p, void *v)
262{
263 int i = *(loff_t *) v, j;
264 struct irqaction * action;
265 unsigned long flags;
266
267 if (i == 0) {
268 seq_printf(p, " ");
269 for_each_online_cpu(j)
270 seq_printf(p, "CPU%-8d",j);
271 seq_putc(p, '\n');
272 }
273
274 if (i < NR_IRQS) {
275 unsigned any_count = 0;
276
277 spin_lock_irqsave(&irq_desc[i].lock, flags);
278#ifndef CONFIG_SMP
279 any_count = kstat_irqs(i);
280#else
281 for_each_online_cpu(j)
282 any_count |= kstat_cpu(j).irqs[i];
283#endif
284 action = irq_desc[i].action;
285 if (!action && !any_count)
286 goto skip;
287 seq_printf(p, "%3d: ",i);
288#ifndef CONFIG_SMP
289 seq_printf(p, "%10u ", kstat_irqs(i));
290#else
291 for_each_online_cpu(j)
292 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
293#endif
294 seq_printf(p, " %8s", irq_desc[i].chip->name);
295 seq_printf(p, "-%-8s", irq_desc[i].name);
296
297 if (action) {
298 seq_printf(p, " %s", action->name);
299 while ((action = action->next) != NULL)
300 seq_printf(p, ", %s", action->name);
301 }
302
303 seq_putc(p, '\n');
304skip:
305 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
306 } else if (i == NR_IRQS) {
307 seq_printf(p, "NMI: ");
308 for_each_online_cpu(j)
309 seq_printf(p, "%10u ", nmi_count(j));
310 seq_printf(p, " Non-maskable interrupts\n");
311#ifdef CONFIG_X86_LOCAL_APIC
312 seq_printf(p, "LOC: ");
313 for_each_online_cpu(j)
314 seq_printf(p, "%10u ",
315 per_cpu(irq_stat,j).apic_timer_irqs);
316 seq_printf(p, " Local timer interrupts\n");
317#endif
318#ifdef CONFIG_SMP
319 seq_printf(p, "RES: ");
320 for_each_online_cpu(j)
321 seq_printf(p, "%10u ",
322 per_cpu(irq_stat,j).irq_resched_count);
323 seq_printf(p, " Rescheduling interrupts\n");
324 seq_printf(p, "CAL: ");
325 for_each_online_cpu(j)
326 seq_printf(p, "%10u ",
327 per_cpu(irq_stat,j).irq_call_count);
328 seq_printf(p, " function call interrupts\n");
329 seq_printf(p, "TLB: ");
330 for_each_online_cpu(j)
331 seq_printf(p, "%10u ",
332 per_cpu(irq_stat,j).irq_tlb_count);
333 seq_printf(p, " TLB shootdowns\n");
334#endif
335#ifdef CONFIG_X86_MCE
336 seq_printf(p, "TRM: ");
337 for_each_online_cpu(j)
338 seq_printf(p, "%10u ",
339 per_cpu(irq_stat,j).irq_thermal_count);
340 seq_printf(p, " Thermal event interrupts\n");
341#endif
342#ifdef CONFIG_X86_LOCAL_APIC
343 seq_printf(p, "SPU: ");
344 for_each_online_cpu(j)
345 seq_printf(p, "%10u ",
346 per_cpu(irq_stat,j).irq_spurious_count);
347 seq_printf(p, " Spurious interrupts\n");
348#endif
349 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
350#if defined(CONFIG_X86_IO_APIC)
351 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
352#endif
353 }
354 return 0;
355}
356
357/*
358 * /proc/stat helpers
359 */
360u64 arch_irq_stat_cpu(unsigned int cpu)
361{
362 u64 sum = nmi_count(cpu);
363
364#ifdef CONFIG_X86_LOCAL_APIC
365 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
366#endif
367#ifdef CONFIG_SMP
368 sum += per_cpu(irq_stat, cpu).irq_resched_count;
369 sum += per_cpu(irq_stat, cpu).irq_call_count;
370 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
371#endif
372#ifdef CONFIG_X86_MCE
373 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
374#endif
375#ifdef CONFIG_X86_LOCAL_APIC
376 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
377#endif
378 return sum;
379}
380
381u64 arch_irq_stat(void)
382{
383 u64 sum = atomic_read(&irq_err_count);
384
385#ifdef CONFIG_X86_IO_APIC
386 sum += atomic_read(&irq_mis_count);
387#endif
388 return sum;
389}
390
391#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
392#include <mach_apic.h> 234#include <mach_apic.h>
393 235
@@ -395,20 +237,22 @@ void fixup_irqs(cpumask_t map)
395{ 237{
396 unsigned int irq; 238 unsigned int irq;
397 static int warned; 239 static int warned;
240 struct irq_desc *desc;
398 241
399 for (irq = 0; irq < NR_IRQS; irq++) { 242 for_each_irq_desc(irq, desc) {
400 cpumask_t mask; 243 cpumask_t mask;
244
401 if (irq == 2) 245 if (irq == 2)
402 continue; 246 continue;
403 247
404 cpus_and(mask, irq_desc[irq].affinity, map); 248 cpus_and(mask, desc->affinity, map);
405 if (any_online_cpu(mask) == NR_CPUS) { 249 if (any_online_cpu(mask) == NR_CPUS) {
406 printk("Breaking affinity for irq %i\n", irq); 250 printk("Breaking affinity for irq %i\n", irq);
407 mask = map; 251 mask = map;
408 } 252 }
409 if (irq_desc[irq].chip->set_affinity) 253 if (desc->chip->set_affinity)
410 irq_desc[irq].chip->set_affinity(irq, mask); 254 desc->chip->set_affinity(irq, mask);
411 else if (irq_desc[irq].action && !(warned++)) 255 else if (desc->action && !(warned++))
412 printk("Cannot set affinity for irq %i\n", irq); 256 printk("Cannot set affinity for irq %i\n", irq);
413 } 257 }
414 258
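The do_IRQ rework in the diff above drops the old assumption that the vector number pushed by the entry code is the IRQ number, and translates through the per-CPU vector_irq table instead. The new flow, reduced to a sketch (the 32-bit path additionally switches to the IRQ stack):

    unsigned vector = ~regs->orig_ax;               /* entry code pushes ~vector */
    unsigned irq = __get_cpu_var(vector_irq)[vector];
    struct irq_desc *desc = irq_to_desc(irq);

    if (likely(desc))
            generic_handle_irq_desc(irq, desc);     /* normal handling path */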
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d..60eb84eb77a 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
21atomic_t irq_err_count;
22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
43#ifdef CONFIG_DEBUG_STACKOVERFLOW 21#ifdef CONFIG_DEBUG_STACKOVERFLOW
44/* 22/*
45 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
65#endif 43#endif
66 44
67/* 45/*
68 * Generic, controller-independent functions:
69 */
70
71int show_interrupts(struct seq_file *p, void *v)
72{
73 int i = *(loff_t *) v, j;
74 struct irqaction * action;
75 unsigned long flags;
76
77 if (i == 0) {
78 seq_printf(p, " ");
79 for_each_online_cpu(j)
80 seq_printf(p, "CPU%-8d",j);
81 seq_putc(p, '\n');
82 }
83
84 if (i < NR_IRQS) {
85 unsigned any_count = 0;
86
87 spin_lock_irqsave(&irq_desc[i].lock, flags);
88#ifndef CONFIG_SMP
89 any_count = kstat_irqs(i);
90#else
91 for_each_online_cpu(j)
92 any_count |= kstat_cpu(j).irqs[i];
93#endif
94 action = irq_desc[i].action;
95 if (!action && !any_count)
96 goto skip;
97 seq_printf(p, "%3d: ",i);
98#ifndef CONFIG_SMP
99 seq_printf(p, "%10u ", kstat_irqs(i));
100#else
101 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
103#endif
104 seq_printf(p, " %8s", irq_desc[i].chip->name);
105 seq_printf(p, "-%-8s", irq_desc[i].name);
106
107 if (action) {
108 seq_printf(p, " %s", action->name);
109 while ((action = action->next) != NULL)
110 seq_printf(p, ", %s", action->name);
111 }
112 seq_putc(p, '\n');
113skip:
114 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
115 } else if (i == NR_IRQS) {
116 seq_printf(p, "NMI: ");
117 for_each_online_cpu(j)
118 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
119 seq_printf(p, " Non-maskable interrupts\n");
120 seq_printf(p, "LOC: ");
121 for_each_online_cpu(j)
122 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
123 seq_printf(p, " Local timer interrupts\n");
124#ifdef CONFIG_SMP
125 seq_printf(p, "RES: ");
126 for_each_online_cpu(j)
127 seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
128 seq_printf(p, " Rescheduling interrupts\n");
129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n");
133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n");
137#endif
138#ifdef CONFIG_X86_MCE
139 seq_printf(p, "TRM: ");
140 for_each_online_cpu(j)
141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
142 seq_printf(p, " Thermal event interrupts\n");
143 seq_printf(p, "THR: ");
144 for_each_online_cpu(j)
145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
148 seq_printf(p, "SPU: ");
149 for_each_online_cpu(j)
150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
151 seq_printf(p, " Spurious interrupts\n");
152 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
153 }
154 return 0;
155}
156
157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
184 * do_IRQ handles all normal device IRQ's (the special 46 * do_IRQ handles all normal device IRQ's (the special
185 * SMP cross-CPU interrupts have their own specific 47 * SMP cross-CPU interrupts have their own specific
186 * handlers). 48 * handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
188asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 50asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
189{ 51{
190 struct pt_regs *old_regs = set_irq_regs(regs); 52 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc;
191 54
192 /* high bit used in ret_from_ code */ 55 /* high bit used in ret_from_ code */
193 unsigned vector = ~regs->orig_ax; 56 unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
201 stack_overflow_check(regs); 64 stack_overflow_check(regs);
202#endif 65#endif
203 66
204 if (likely(irq < NR_IRQS)) 67 desc = irq_to_desc(irq);
205 generic_handle_irq(irq); 68 if (likely(desc))
69 generic_handle_irq_desc(irq, desc);
206 else { 70 else {
207 if (!disable_apic) 71 if (!disable_apic)
208 ack_APIC_irq(); 72 ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
223{ 87{
224 unsigned int irq; 88 unsigned int irq;
225 static int warned; 89 static int warned;
90 struct irq_desc *desc;
226 91
227 for (irq = 0; irq < NR_IRQS; irq++) { 92 for_each_irq_desc(irq, desc) {
228 cpumask_t mask; 93 cpumask_t mask;
229 int break_affinity = 0; 94 int break_affinity = 0;
230 int set_affinity = 1; 95 int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
233 continue; 98 continue;
234 99
235 /* interrupts are disabled at this point */ 100 /* interrupts are disabled at this point */
236 spin_lock(&irq_desc[irq].lock); 101 spin_lock(&desc->lock);
237 102
238 if (!irq_has_action(irq) || 103 if (!irq_has_action(irq) ||
239 cpus_equal(irq_desc[irq].affinity, map)) { 104 cpus_equal(desc->affinity, map)) {
240 spin_unlock(&irq_desc[irq].lock); 105 spin_unlock(&desc->lock);
241 continue; 106 continue;
242 } 107 }
243 108
244 cpus_and(mask, irq_desc[irq].affinity, map); 109 cpus_and(mask, desc->affinity, map);
245 if (cpus_empty(mask)) { 110 if (cpus_empty(mask)) {
246 break_affinity = 1; 111 break_affinity = 1;
247 mask = map; 112 mask = map;
248 } 113 }
249 114
250 if (irq_desc[irq].chip->mask) 115 if (desc->chip->mask)
251 irq_desc[irq].chip->mask(irq); 116 desc->chip->mask(irq);
252 117
253 if (irq_desc[irq].chip->set_affinity) 118 if (desc->chip->set_affinity)
254 irq_desc[irq].chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, mask);
255 else if (!(warned++)) 120 else if (!(warned++))
256 set_affinity = 0; 121 set_affinity = 0;
257 122
258 if (irq_desc[irq].chip->unmask) 123 if (desc->chip->unmask)
259 irq_desc[irq].chip->unmask(irq); 124 desc->chip->unmask(irq);
260 125
261 spin_unlock(&irq_desc[irq].lock); 126 spin_unlock(&desc->lock);
262 127
263 if (break_affinity && set_affinity) 128 if (break_affinity && set_affinity)
264 printk("Broke affinity for irq %i\n", irq); 129 printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee..845aa9803e8 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,11 +69,48 @@ void __init init_ISA_irqs (void)
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < 16; i++) {
72 /* first call for this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i);
74
75 desc->status = IRQ_DISABLED;
76 desc->action = NULL;
77 desc->depth = 1;
78
72 set_irq_chip_and_handler_name(i, &i8259A_chip, 79 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT"); 80 handle_level_irq, "XT");
74 } 81 }
75} 82}
76 83
84/*
85 * IRQ2 is the cascade interrupt to the second interrupt controller
86 */
87static struct irqaction irq2 = {
88 .handler = no_action,
89 .mask = CPU_MASK_NONE,
90 .name = "cascade",
91};
92
93DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
94 [0 ... IRQ0_VECTOR - 1] = -1,
95 [IRQ0_VECTOR] = 0,
96 [IRQ1_VECTOR] = 1,
97 [IRQ2_VECTOR] = 2,
98 [IRQ3_VECTOR] = 3,
99 [IRQ4_VECTOR] = 4,
100 [IRQ5_VECTOR] = 5,
101 [IRQ6_VECTOR] = 6,
102 [IRQ7_VECTOR] = 7,
103 [IRQ8_VECTOR] = 8,
104 [IRQ9_VECTOR] = 9,
105 [IRQ10_VECTOR] = 10,
106 [IRQ11_VECTOR] = 11,
107 [IRQ12_VECTOR] = 12,
108 [IRQ13_VECTOR] = 13,
109 [IRQ14_VECTOR] = 14,
110 [IRQ15_VECTOR] = 15,
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112};
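The table above relies on GCC's designated range initializers, [A ... B] = value. A tiny stand-alone illustration of the syntax:

    /* every slot defaults to -1 except the ones pinned explicitly */
    static int map[16] = {
            [0 ... 15] = -1,
            [3]        = 42,
    };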
113
77/* Overridden in paravirt.c */ 114/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 116
@@ -89,15 +126,50 @@ void __init native_init_IRQ(void)
89 * us. (some of these will be overridden and become 126 * us. (some of these will be overridden and become
90 * 'special' SMP interrupts) 127 * 'special' SMP interrupts)
91 */ 128 */
92 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
93 int vector = FIRST_EXTERNAL_VECTOR + i;
94 if (i >= NR_IRQS)
95 break;
96 /* SYSCALL_VECTOR was reserved in trap_init. */ 130 /* SYSCALL_VECTOR was reserved in trap_init. */
97 if (!test_bit(vector, used_vectors)) 131 if (i != SYSCALL_VECTOR)
98 set_intr_gate(vector, interrupt[i]); 132 set_intr_gate(i, interrupt[i]);
99 } 133 }
100 134
135
136#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
137 /*
138 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
139 * IPI, driven by wakeup.
140 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142
143 /* IPI for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
145
146 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148
149 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
151
152 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
154#endif
155
156#ifdef CONFIG_X86_LOCAL_APIC
157 /* self generated IPI for local APIC timer */
158 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
159
160 /* IPI vectors for APIC spurious and error interrupts */
161 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
162 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
163#endif
164
165#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
166 /* thermal monitor LVT interrupt */
167 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
168#endif
169
170 if (!acpi_ioapic)
171 setup_irq(2, &irq2);
172
101 /* setup after call gates are initialised (usually add in 173 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 174 * the architecture specific gates)
103 */ 175 */
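Note the mix in the hunk above: alloc_intr_gate() both installs the handler and marks the vector in used_vectors, while set_intr_gate() only installs it. Roughly (a paraphrase of the helper's intent, not its exact body):

    alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
    /* ~ set_bit(RESCHEDULE_VECTOR, used_vectors);
     *   set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); */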
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 1f26fd9ec4f..ff023539128 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -135,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
136}; 136};
137 137
138static void __init init_ISA_irqs (void) 138void __init init_ISA_irqs(void)
139{ 139{
140 int i; 140 int i;
141 141
142 init_bsp_APIC(); 142 init_bsp_APIC();
143 init_8259A(0); 143 init_8259A(0);
144 144
145 for (i = 0; i < NR_IRQS; i++) { 145 for (i = 0; i < 16; i++) {
146 irq_desc[i].status = IRQ_DISABLED; 146 /* first call for this irq_desc */
147 irq_desc[i].action = NULL; 147 struct irq_desc *desc = irq_to_desc(i);
148 irq_desc[i].depth = 1;
149 148
150 if (i < 16) { 149 desc->status = IRQ_DISABLED;
151 /* 150 desc->action = NULL;
152 * 16 old-style INTA-cycle interrupts: 151 desc->depth = 1;
153 */ 152
154 set_irq_chip_and_handler_name(i, &i8259A_chip, 153 /*
154 * 16 old-style INTA-cycle interrupts:
155 */
156 set_irq_chip_and_handler_name(i, &i8259A_chip,
155 handle_level_irq, "XT"); 157 handle_level_irq, "XT");
156 } else {
157 /*
158 * 'high' PCI IRQs filled in on demand
159 */
160 irq_desc[i].chip = &no_irq_chip;
161 }
162 } 158 }
163} 159}
164 160
165void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 161void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
166 162
167void __init native_init_IRQ(void) 163static void __init smp_intr_init(void)
168{ 164{
169 int i;
170
171 init_ISA_irqs();
172 /*
173 * Cover the whole vector space, no vector can escape
174 * us. (some of these will be overridden and become
175 * 'special' SMP interrupts)
176 */
177 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
178 int vector = FIRST_EXTERNAL_VECTOR + i;
179 if (vector != IA32_SYSCALL_VECTOR)
180 set_intr_gate(vector, interrupt[i]);
181 }
182
183#ifdef CONFIG_SMP 165#ifdef CONFIG_SMP
184 /* 166 /*
185 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 167 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -207,6 +189,12 @@ void __init native_init_IRQ(void)
207 /* Low priority IPI to cleanup after moving an irq */ 189 /* Low priority IPI to cleanup after moving an irq */
208 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
209#endif 191#endif
192}
193
194static void __init apic_intr_init(void)
195{
196 smp_intr_init();
197
210 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 198 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
211 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 199 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
212 200
@@ -216,6 +204,25 @@ void __init native_init_IRQ(void)
216 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
217 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
218 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207}
208
209void __init native_init_IRQ(void)
210{
211 int i;
212
213 init_ISA_irqs();
214 /*
215 * Cover the whole vector space, no vector can escape
216 * us. (some of these will be overridden and become
217 * 'special' SMP interrupts)
218 */
219 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
220 int vector = FIRST_EXTERNAL_VECTOR + i;
221 if (vector != IA32_SYSCALL_VECTOR)
222 set_intr_gate(vector, interrupt[i]);
223 }
224
225 apic_intr_init();
219 226
220 if (!acpi_ioapic) 227 if (!acpi_ioapic)
221 setup_irq(2, &irq2); 228 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb2133..304d8bad655 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b..10435a120d2 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single-stepping
485 * a system call, which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2..478bca986ec 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index d02def06ca9..774ac499156 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
78 return ret; 78 return ret;
79} 79}
80 80
81/*
82 * If we don't preset lpj, there is the possibility that the guest
83 * will calibrate under heavy load - thus getting a lower lpj -
84 * and then execute its delays without load. This is wrong,
85 * because no delay loop can finish early.
86 * Any heuristic is bound to fail, because ultimately a large
87 * pool of guests can be running and disturb each other. So we preset
88 * lpj here.
89 */
90static unsigned long kvm_get_tsc_khz(void)
91{
92 return preset_lpj;
93}
94
95static void kvm_get_preset_lpj(void)
96{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz;
99 u64 lpj;
100
101 src = &per_cpu(hv_clock, 0);
102 khz = pvclock_tsc_khz(src);
103
104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ);
106 preset_lpj = lpj;
107}
108
81static struct clocksource kvm_clock = { 109static struct clocksource kvm_clock = {
82 .name = "kvm-clock", 110 .name = "kvm-clock",
83 .read = kvm_clock_read, 111 .read = kvm_clock_read,
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
153 pv_time_ops.get_wallclock = kvm_get_wallclock; 181 pv_time_ops.get_wallclock = kvm_get_wallclock;
154 pv_time_ops.set_wallclock = kvm_set_wallclock; 182 pv_time_ops.set_wallclock = kvm_set_wallclock;
155 pv_time_ops.sched_clock = kvm_clock_read; 183 pv_time_ops.sched_clock = kvm_clock_read;
184 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
156#ifdef CONFIG_X86_LOCAL_APIC 185#ifdef CONFIG_X86_LOCAL_APIC
157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 186 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
158#endif 187#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
163#ifdef CONFIG_KEXEC 192#ifdef CONFIG_KEXEC
164 machine_ops.crash_shutdown = kvm_crash_shutdown; 193 machine_ops.crash_shutdown = kvm_crash_shutdown;
165#endif 194#endif
195 kvm_get_preset_lpj();
166 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
167 } 197 }
168} 198}
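The lpj arithmetic above is easiest to check with numbers. Assuming a hypothetical 2,000,000 kHz (2 GHz) TSC and HZ = 1000:

    u64 lpj = (u64)2000000 * 1000;   /* kHz * 1000 = 2e9 loops per second  */
    do_div(lpj, 1000);               /* / HZ -> 2,000,000 loops per jiffy  */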
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index b68e21f06f4..eee32b43fee 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/syscalls.h>
21 22
22#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
23static void flush_ldt(void *current_mm) 24static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 54
55 paravirt_alloc_ldt(newldt, mincount);
56
54#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
56 wmb(); 59 wmb();
@@ -73,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
73#endif 76#endif
74 } 77 }
75 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
76 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
77 vfree(oldldt); 81 vfree(oldldt);
78 else 82 else
@@ -84,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
84static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
85{ 89{
86 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
87 92
88 if (err < 0) 93 if (err < 0)
89 return err; 94 return err;
90 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
91 return 0; 98 return 0;
92} 99}
93 100
@@ -124,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
124 if (mm == current->active_mm) 131 if (mm == current->active_mm)
125 clear_LDT(); 132 clear_LDT();
126#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
127 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
128 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
129 else 137 else
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 652fa5c38eb..00000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,853 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
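Restating the sigmatch() macro above in words, with illustrative values (the numbers below are made up for the example):

    /* match := sigs equal AND (pf masks intersect OR both pf == 0)       */
    /*   sigmatch(0x6f6, 0x6f6, 0x01, 0x01) -> 1  sig and pf both match   */
    /*   sigmatch(0x6f6, 0x6f6, 0x01, 0x02) -> 0  pf masks disjoint       */
    /*   sigmatch(0x6f6, 0x6f6, 0x00, 0x00) -> 1  legacy pf == 0 CPUs     */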
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
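The checksum convention enforced by microcode_sanity_check() above, restated: every 32-bit word of header plus data must sum to zero modulo 2^32 (the vendor chooses hdr.cksum to make it so), and each extended signature is validated by swapping one (sig, pf, cksum) triple for another:

    /* sum over all dwords of MC_HEADER_SIZE + data_size must be 0;       */
    /* for extended signature i:                                          */
    /*   sum_i = sum - (hdr.sig + hdr.pf + hdr.cksum)                     */
    /*               + (ext.sig + ext.pf + ext.cksum)  -- must also be 0  */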
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* its size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode entries;
400 * let's keep searching until the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed_ptr(current, &old);
422 return error;
423}
424
425 static int microcode_open(struct inode *unused1, struct file *unused2)
426{
427 cycle_kernel_lock();
428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
429}
430
431 static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos)
432{
433 ssize_t ret;
434
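 /* an update image larger than all of physical memory cannot be valid */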
435 if ((len >> PAGE_SHIFT) > num_physpages) {
436 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
437 return -EINVAL;
438 }
439
440 get_online_cpus();
441 mutex_lock(&microcode_mutex);
442
443 user_buffer = (void __user *) buf;
444 user_buffer_size = (int) len;
445
446 ret = do_microcode_update();
447 if (!ret)
448 ret = (ssize_t)len;
449
450 mutex_unlock(&microcode_mutex);
451 put_online_cpus();
452
453 return ret;
454}
455
456static const struct file_operations microcode_fops = {
457 .owner = THIS_MODULE,
458 .write = microcode_write,
459 .open = microcode_open,
460};
461
462static struct miscdevice microcode_dev = {
463 .minor = MICROCODE_MINOR,
464 .name = "microcode",
465 .fops = &microcode_fops,
466};
467
468 static int __init microcode_dev_init(void)
469{
470 int error;
471
472 error = misc_register(&microcode_dev);
473 if (error) {
474 printk(KERN_ERR
475 "microcode: can't misc_register on minor=%d\n",
476 MICROCODE_MINOR);
477 return error;
478 }
479
480 return 0;
481}
482
483 static void microcode_dev_exit(void)
484{
485 misc_deregister(&microcode_dev);
486}
487
488MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#else
490#define microcode_dev_init() 0
491 #define microcode_dev_exit() do { } while (0)
492#endif
493
494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
495 unsigned long size, long offset)
496{
497 microcode_header_t *mc_header;
498 unsigned long total_size;
499
500 /* No more data */
501 if (offset >= size)
502 return 0;
503 mc_header = (microcode_header_t *)(buf + offset);
504 total_size = get_totalsize(mc_header);
505
506 if (offset + total_size > size) {
507 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
508 return -EINVAL;
509 }
510
511 *mc = vmalloc(total_size);
512 if (!*mc) {
513 printk(KERN_ERR "microcode: error! Cannot allocate memory\n");
514 return -ENOMEM;
515 }
516 memcpy(*mc, buf + offset, total_size);
517 return offset + total_size;
518}
519
520/* fake device for request_firmware */
521static struct platform_device *microcode_pdev;
522
523static int cpu_request_microcode(int cpu)
524{
525 char name[30];
526 struct cpuinfo_x86 *c = &cpu_data(cpu);
527 const struct firmware *firmware;
528 const u8 *buf;
529 unsigned long size;
530 long offset = 0;
531 int error;
532 void *mc;
533
534 /* We should bind the task to the CPU */
535 BUG_ON(cpu != raw_smp_processor_id());
536 sprintf(name, "intel-ucode/%02x-%02x-%02x",
537 c->x86, c->x86_model, c->x86_mask);
538 error = request_firmware(&firmware, name, &microcode_pdev->dev);
539 if (error) {
540 pr_debug("microcode: data file %s load failed\n", name);
541 return error;
542 }
543 buf = firmware->data;
544 size = firmware->size;
545 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
546 > 0) {
547 error = microcode_sanity_check(mc);
548 if (error)
549 break;
550 error = get_maching_microcode(mc, cpu);
551 if (error < 0)
552 break;
553 /*
554 * It's possible the data file has multiple matching ucode,
555 * let's keep searching until the latest version
556 */
557 if (error == 1) {
558 apply_microcode(cpu);
559 error = 0;
560 }
561 vfree(mc);
562 }
563 if (offset > 0)
564 vfree(mc);
565 if (offset < 0)
566 error = offset;
567 release_firmware(firmware);
568
569 return error;
570}
571
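/*
 * Called on resume: re-check that the cached microcode still matches this
 * CPU (signature, platform flags, current revision) before re-applying it.
 */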
572static int apply_microcode_check_cpu(int cpu)
573{
574 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old;
577 unsigned int val[2];
578 int err = 0;
579
580 /* Check if the microcode is available */
581 if (!uci->mc)
582 return 0;
583
584 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
586
587 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
589 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
590 err = -EINVAL;
591
592 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
593 /* get processor flags from MSR 0x17 */
594 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
595 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
596 err = -EINVAL;
597 }
598
599 if (!err) {
600 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
601 /* see notes above for revision 1.07. Apparent chip bug */
602 sync_core();
603 /* get the current revision from MSR 0x8B */
604 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
605 if (uci->rev != val[1])
606 err = -EINVAL;
607 }
608
609 if (!err)
610 apply_microcode(cpu);
611 else
612 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
613 " sig=0x%x, pf=0x%x, rev=0x%x\n",
614 cpu, uci->sig, uci->pf, uci->rev);
615
616 set_cpus_allowed_ptr(current, &old);
617 return err;
618}
619
620static void microcode_init_cpu(int cpu, int resume)
621{
622 cpumask_t old;
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624
625 old = current->cpus_allowed;
626
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
628 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
631 cpu_request_microcode(cpu);
632 mutex_unlock(&microcode_mutex);
633 set_cpus_allowed_ptr(current, &old);
634}
635
636static void microcode_fini_cpu(int cpu)
637{
638 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
639
640 mutex_lock(&microcode_mutex);
641 uci->valid = 0;
642 vfree(uci->mc);
643 uci->mc = NULL;
644 mutex_unlock(&microcode_mutex);
645}
646
647static ssize_t reload_store(struct sys_device *dev,
648 struct sysdev_attribute *attr,
649 const char *buf, size_t sz)
650{
651 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
652 char *end;
653 unsigned long val = simple_strtoul(buf, &end, 0);
654 int err = 0;
655 int cpu = dev->id;
656
657 if (end == buf)
658 return -EINVAL;
659 if (val == 1) {
660 cpumask_t old = current->cpus_allowed;
661
662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
664
665 mutex_lock(&microcode_mutex);
666 if (uci->valid)
667 err = cpu_request_microcode(cpu);
668 mutex_unlock(&microcode_mutex);
669 put_online_cpus();
670 set_cpus_allowed_ptr(current, &old);
671 }
672 if (err)
673 return err;
674 return sz;
675}
676
677static ssize_t version_show(struct sys_device *dev,
678 struct sysdev_attribute *attr, char *buf)
679{
680 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
681
682 return sprintf(buf, "0x%x\n", uci->rev);
683}
684
685static ssize_t pf_show(struct sys_device *dev,
686 struct sysdev_attribute *attr, char *buf)
687{
688 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
689
690 return sprintf(buf, "0x%x\n", uci->pf);
691}
692
693static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
694static SYSDEV_ATTR(version, 0400, version_show, NULL);
695static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
696
697static struct attribute *mc_default_attrs[] = {
698 &attr_reload.attr,
699 &attr_version.attr,
700 &attr_processor_flags.attr,
701 NULL
702};
703
704static struct attribute_group mc_attr_group = {
705 .attrs = mc_default_attrs,
706 .name = "microcode",
707};
708
709static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
710{
711 int err, cpu = sys_dev->id;
712 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
713
714 if (!cpu_online(cpu))
715 return 0;
716
717 pr_debug("microcode: CPU%d added\n", cpu);
718 memset(uci, 0, sizeof(*uci));
719
720 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
721 if (err)
722 return err;
723
724 microcode_init_cpu(cpu, resume);
725
726 return 0;
727}
728
729static int mc_sysdev_add(struct sys_device *sys_dev)
730{
731 return __mc_sysdev_add(sys_dev, 0);
732}
733
734static int mc_sysdev_remove(struct sys_device *sys_dev)
735{
736 int cpu = sys_dev->id;
737
738 if (!cpu_online(cpu))
739 return 0;
740
741 pr_debug("microcode: CPU%d removed\n", cpu);
742 microcode_fini_cpu(cpu);
743 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
744 return 0;
745}
746
747static int mc_sysdev_resume(struct sys_device *dev)
748{
749 int cpu = dev->id;
750
751 if (!cpu_online(cpu))
752 return 0;
753 pr_debug("microcode: CPU%d resumed\n", cpu);
754 /* only CPU 0 will apply ucode here */
755 apply_microcode(0);
756 return 0;
757}
758
759static struct sysdev_driver mc_sysdev_driver = {
760 .add = mc_sysdev_add,
761 .remove = mc_sysdev_remove,
762 .resume = mc_sysdev_resume,
763};
764
765static __cpuinit int
766mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
767{
768 unsigned int cpu = (unsigned long)hcpu;
769 struct sys_device *sys_dev;
770
771 sys_dev = get_cpu_sysdev(cpu);
772 switch (action) {
773 case CPU_UP_CANCELED_FROZEN:
774 /* The CPU refused to come up during a system resume */
775 microcode_fini_cpu(cpu);
776 break;
777 case CPU_ONLINE:
778 case CPU_DOWN_FAILED:
779 mc_sysdev_add(sys_dev);
780 break;
781 case CPU_ONLINE_FROZEN:
782 /* System-wide resume is in progress, try to apply microcode */
783 if (apply_microcode_check_cpu(cpu)) {
784 /* The application of microcode failed */
785 microcode_fini_cpu(cpu);
786 __mc_sysdev_add(sys_dev, 1);
787 break;
788 }
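 /* fall through: the update succeeded, recreate the sysfs group */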
789 case CPU_DOWN_FAILED_FROZEN:
790 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
791 printk(KERN_ERR "microcode: Failed to create the sysfs "
792 "group for CPU%d\n", cpu);
793 break;
794 case CPU_DOWN_PREPARE:
795 mc_sysdev_remove(sys_dev);
796 break;
797 case CPU_DOWN_PREPARE_FROZEN:
798 /* Suspend is in progress, only remove the interface */
799 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
800 break;
801 }
802 return NOTIFY_OK;
803}
804
805static struct notifier_block __refdata mc_cpu_notifier = {
806 .notifier_call = mc_cpu_callback,
807};
808
809 static int __init microcode_init(void)
810{
811 int error;
812
813 printk(KERN_INFO
814 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
815
816 error = microcode_dev_init();
817 if (error)
818 return error;
819 microcode_pdev = platform_device_register_simple("microcode", -1,
820 NULL, 0);
821 if (IS_ERR(microcode_pdev)) {
822 microcode_dev_exit();
823 return PTR_ERR(microcode_pdev);
824 }
825
826 get_online_cpus();
827 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
828 put_online_cpus();
829 if (error) {
830 microcode_dev_exit();
831 platform_device_unregister(microcode_pdev);
832 return error;
833 }
834
835 register_hotcpu_notifier(&mc_cpu_notifier);
836 return 0;
837}
838
839 static void __exit microcode_exit(void)
840{
841 microcode_dev_exit();
842
843 unregister_hotcpu_notifier(&mc_cpu_notifier);
844
845 get_online_cpus();
846 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
847 put_online_cpus();
848
849 platform_device_unregister(microcode_pdev);
850}
851
852module_init(microcode_init)
853module_exit(microcode_exit)
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 00000000000..7a1f8eeac2c
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
10 * This driver allows upgrading microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15 */
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
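 /* read the currently applied patch level from MSR 0x8B */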
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd: patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch\n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch\n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
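 /* writing the patch address to MSR 0xc0010020 (the patch loader MSR) triggers the update */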
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239 static void *get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
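 /* 8-byte section header: a type dword followed by a size dword (only the low 16 bits of the size are read below) */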
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
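 /* 12-byte container header: three dwords - presumably the file magic, then the section type and table size */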
289 if (get_ucode_data(container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
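 /* walk the remaining sections, keeping only the newest patch that matches this CPU */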
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void *)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 00000000000..936d8d55f23
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
328 * Check if the system resume is in progress (uci->valid is set),
329 * otherwise just request a firmware:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
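 /* fall through: (re)create the per-CPU sysfs group */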
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 00000000000..622dc4a2178
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to the IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
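/* signatures must be equal and the platform-flag masks must intersect; pf == 0 on both sides (old CPUs) also counts as a match */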
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info: sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
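 /* replacing the base sig/pf/cksum with each extended entry must keep the dword sum at zero */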
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
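 /* the double 16-bit shift yields the upper half of the address without a full-width shift, which is undefined on 32-bit kernels */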
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
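 /* the header date is BCD-packed MMDDYYYY; printing the fields with %x renders YYYY-MM-DD */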
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
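 /* scan the image one update at a time, remembering the newest matching revision */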
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
374 printk(KERN_ERR "microcode: error! "
375 "Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void *)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index b3fb430725c..f98f4e1dba0 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -397,7 +397,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
397 generic_bigsmp_probe(); 397 generic_bigsmp_probe();
398#endif 398#endif
399 399
400#ifdef CONFIG_X86_32
400 setup_apic_routing(); 401 setup_apic_routing();
402#endif
401 if (!num_processors) 403 if (!num_processors)
402 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
403 return num_processors; 405 return num_processors;
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 2e2af5d1819..82a7c7ed6d4 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu)
163{ 163{
164 struct device *dev; 164 struct device *dev;
165 165
166 dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
167 NULL, "msr%d", cpu); 167 "msr%d", cpu);
168 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
169} 169}
170 170
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index abb78a2cc4a..2c97f07f1c2 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -299,6 +299,15 @@ void acpi_nmi_disable(void)
299 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
300} 300}
301 301
302/*
303 * This function is called as soon as the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
302void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
303{ 312{
304 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
311 320
312 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
313 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
314 /* enable it before to avoid race with handler */
315 __get_cpu_var(wd_enabled) = 1;
316 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
317 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
318 return; 325 return;
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index eecc8c18f01..4caff39078e 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
229 } 229 }
230} 230}
231 231
232static int __init numaq_setup_ioapic_ids(void)
233{
234 /* return 1 so the caller skips IO-APIC ID setup on NUMAQ */
235 return 1;
236}
237
232static struct x86_quirks numaq_x86_quirks __initdata = { 238static struct x86_quirks numaq_x86_quirks __initdata = {
233 .arch_pre_time_init = numaq_pre_time_init, 239 .arch_pre_time_init = numaq_pre_time_init,
234 .arch_time_init = NULL, 240 .arch_time_init = NULL,
@@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = {
243 .mpc_oem_bus_info = mpc_oem_bus_info, 249 .mpc_oem_bus_info = mpc_oem_bus_info,
244 .mpc_oem_pci_bus = mpc_oem_pci_bus, 250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
245 .smp_read_mpc_oem = smp_read_mpc_oem, 251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
246}; 253};
247 254
248void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, 255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
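
The numaq_32.c hunk grows the x86_quirks table by a .setup_ioapic_ids hook; returning 1 tells the generic code that the I/O APIC IDs need no fixup on NUMAQ. The caller side is presumably the usual optional-hook test, sketched here with hypothetical names:

struct example_quirks {
        int (*setup_ioapic_ids)(void);  /* optional hook; may be NULL */
};

static struct example_quirks *example_x86_quirks;

static void example_setup_ioapic_ids(void)
{
        /* a platform that returns non-zero claims the work is already done */
        if (example_x86_quirks && example_x86_quirks->setup_ioapic_ids &&
            example_x86_quirks->setup_ioapic_ids())
                return;

        /* ... generic path: rewrite I/O APIC IDs from the MP table ... */
}
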
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e667227480..7a13fac63a1 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
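
The olpc.c fix is an endianness cleanup: the board revision comes back from Open Firmware big-endian, so the buffer is now typed __be32, the failure store goes through cpu_to_be32(), and be32_to_cpu() remains the single conversion point; the stopgap constant, already in host order, loses its spurious conversion instead. The underlying arithmetic, as a self-contained userspace sketch:

#include <stdint.h>
#include <stdio.h>

/* convert a big-endian 32-bit value to host order, byte by byte */
static uint32_t example_be32_to_cpu(const unsigned char b[4])
{
        return ((uint32_t)b[0] << 24) | ((uint32_t)b[1] << 16) |
               ((uint32_t)b[2] << 8)  |  (uint32_t)b[3];
}

int main(void)
{
        /* a board revision of 0xc2 as it would arrive from firmware */
        unsigned char raw[4] = { 0x00, 0x00, 0x00, 0xc2 };

        printf("boardrev = 0x%x\n", example_be32_to_cpu(raw)); /* 0xc2 */
        return 0;
}
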
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 00000000000..0e9f1982b1d
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
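
The new paravirt-spinlocks.c exists, per its header comment, so the lock ops table and the byte-lock override can be built in an ftrace-compatible way, separate from the rest of paravirt.c. Every lock operation then costs one indirect call through the table; a condensed, self-contained model of that dispatch, with hypothetical names:

/* a stripped-down model of the pv_lock_ops indirection */
struct example_lock;

struct example_lock_ops {
        void (*lock)(struct example_lock *l);
        void (*unlock)(struct example_lock *l);
};

static void ticket_lock(struct example_lock *l)   { /* ... ticket protocol ... */ }
static void ticket_unlock(struct example_lock *l) { /* ... */ }
static void byte_lock(struct example_lock *l)     { /* ... byte protocol ... */ }
static void byte_unlock(struct example_lock *l)   { /* ... */ }

/* defaults to ticket locks, as pv_lock_ops does */
static struct example_lock_ops example_lock_ops = {
        .lock   = ticket_lock,
        .unlock = ticket_unlock,
};

/* a hypervisor guest flips the table, presumably early in boot,
 * before any contended lock is taken */
void example_use_bytelocks(void)
{
        example_lock_ops.lock   = byte_lock;
        example_lock_ops.unlock = byte_unlock;
}

static inline void example_spin_lock(struct example_lock *l)
{
        example_lock_ops.lock(l);       /* one indirect call per operation */
}
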
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 300da17e61c..e4c8fb60887 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
268 return __get_cpu_var(paravirt_lazy_mode); 268 return __get_cpu_var(paravirt_lazy_mode);
269} 269}
270 270
271void __init paravirt_use_bytelocks(void)
272{
273#ifdef CONFIG_SMP
274 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
275 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
276 pv_lock_ops.spin_lock = __byte_spin_lock;
277 pv_lock_ops.spin_trylock = __byte_spin_trylock;
278 pv_lock_ops.spin_unlock = __byte_spin_unlock;
279#endif
280}
281
282struct pv_info pv_info = { 271struct pv_info pv_info = {
283 .name = "bare hardware", 272 .name = "bare hardware",
284 .paravirt_enabled = 0, 273 .paravirt_enabled = 0,
@@ -330,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
330#endif 319#endif
331 .wbinvd = native_wbinvd, 320 .wbinvd = native_wbinvd,
332 .read_msr = native_read_msr_safe, 321 .read_msr = native_read_msr_safe,
322 .read_msr_amd = native_read_msr_amd_safe,
333 .write_msr = native_write_msr_safe, 323 .write_msr = native_write_msr_safe,
334 .read_tsc = native_read_tsc, 324 .read_tsc = native_read_tsc,
335 .read_pmc = native_read_pmc, 325 .read_pmc = native_read_pmc,
@@ -348,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
348 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
349 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
350 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
351 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
352 346
353#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -373,8 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
373 367
374struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
375#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
376 .apic_write = native_apic_write,
377 .apic_read = native_apic_read,
378 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
379 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
380 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -461,18 +453,6 @@ struct pv_mmu_ops pv_mmu_ops = {
461 .set_fixmap = native_set_fixmap, 453 .set_fixmap = native_set_fixmap,
462}; 454};
463 455
464struct pv_lock_ops pv_lock_ops = {
465#ifdef CONFIG_SMP
466 .spin_is_locked = __ticket_spin_is_locked,
467 .spin_is_contended = __ticket_spin_is_contended,
468
469 .spin_lock = __ticket_spin_lock,
470 .spin_trylock = __ticket_spin_trylock,
471 .spin_unlock = __ticket_spin_unlock,
472#endif
473};
474EXPORT_SYMBOL(pv_lock_ops);
475
476EXPORT_SYMBOL_GPL(pv_time_ops); 456EXPORT_SYMBOL_GPL(pv_time_ops);
477EXPORT_SYMBOL (pv_cpu_ops); 457EXPORT_SYMBOL (pv_cpu_ops);
478EXPORT_SYMBOL (pv_mmu_ops); 458EXPORT_SYMBOL (pv_mmu_ops);
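
The paravirt.c side of the shuffle drops pv_lock_ops (now in the file above) and the apic_read/apic_write hooks (presumably absorbed by the genapic ops elsewhere in this merge), while gaining a read_msr_amd slot plus alloc_ldt/free_ldt slots that default to paravirt_nop, since bare hardware has no LDT bookkeeping to do. The nop-default convention, sketched with hypothetical, properly typed stand-ins:

/* one do-nothing hook per slot; the kernel itself uses a single
 * untyped paravirt_nop and casts it, relying on the C calling
 * convention to make the mismatch harmless */
static void example_alloc_ldt_nop(void *ldt, unsigned int entries) { }
static void example_free_ldt_nop(void *ldt, unsigned int entries)  { }

struct example_cpu_ops {
        void (*alloc_ldt)(void *ldt, unsigned int entries);
        void (*free_ldt)(void *ldt, unsigned int entries);
};

/* native hardware needs no LDT bookkeeping, so the defaults are
 * nops; a hypervisor backend overrides them at boot */
static struct example_cpu_ops example_cpu_ops = {
        .alloc_ldt = example_alloc_ldt_nop,
        .free_ldt  = example_free_ldt_nop,
};
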
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781..9fe644f4861 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index dcdac6c826e..e1e731d78f3 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
217 217
218#endif /* CONFIG_IOMMU_DEBUG */ 218#endif /* CONFIG_IOMMU_DEBUG */
219 219
220static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
221{
222 unsigned int npages;
223
224 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
225 npages >>= PAGE_SHIFT;
226
227 return npages;
228}
229
230static inline int translation_enabled(struct iommu_table *tbl) 220static inline int translation_enabled(struct iommu_table *tbl)
231{ 221{
232 /* only PHBs with translation enabled have an IOMMU table */ 222 /* only PHBs with translation enabled have an IOMMU table */
@@ -261,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
261 badbit, tbl, start_addr, npages); 251 badbit, tbl, start_addr, npages);
262 } 252 }
263 253
264 set_bit_string(tbl->it_map, index, npages); 254 iommu_area_reserve(tbl->it_map, index, npages);
265 255
266 spin_unlock_irqrestore(&tbl->it_lock, flags); 256 spin_unlock_irqrestore(&tbl->it_lock, flags);
267} 257}
@@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev,
408 if (dmalen == 0) 398 if (dmalen == 0)
409 break; 399 break;
410 400
411 npages = num_dma_pages(dma, dmalen); 401 npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
412 iommu_free(tbl, dma, npages); 402 iommu_free(tbl, dma, npages);
413 } 403 }
414} 404}
@@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
427 BUG_ON(!sg_page(s)); 417 BUG_ON(!sg_page(s));
428 418
429 vaddr = (unsigned long) sg_virt(s); 419 vaddr = (unsigned long) sg_virt(s);
430 npages = num_dma_pages(vaddr, s->length); 420 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
431 421
432 entry = iommu_range_alloc(dev, tbl, npages); 422 entry = iommu_range_alloc(dev, tbl, npages);
433 if (entry == bad_dma_address) { 423 if (entry == bad_dma_address) {
@@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
464 struct iommu_table *tbl = find_iommu_table(dev); 454 struct iommu_table *tbl = find_iommu_table(dev);
465 455
466 uaddr = (unsigned long)vaddr; 456 uaddr = (unsigned long)vaddr;
467 npages = num_dma_pages(uaddr, size); 457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
468 458
469 return iommu_alloc(dev, tbl, vaddr, npages, direction); 459 return iommu_alloc(dev, tbl, vaddr, npages, direction);
470} 460}
@@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
475 struct iommu_table *tbl = find_iommu_table(dev); 465 struct iommu_table *tbl = find_iommu_table(dev);
476 unsigned int npages; 466 unsigned int npages;
477 467
478 npages = num_dma_pages(dma_handle, size); 468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
479 iommu_free(tbl, dma_handle, npages); 469 iommu_free(tbl, dma_handle, npages);
480} 470}
481 471
@@ -491,6 +481,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
491 npages = size >> PAGE_SHIFT; 481 npages = size >> PAGE_SHIFT;
492 order = get_order(size); 482 order = get_order(size);
493 483
484 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
485
494 /* alloc enough pages (and possibly more) */ 486 /* alloc enough pages (and possibly more) */
495 ret = (void *)__get_free_pages(flag, order); 487 ret = (void *)__get_free_pages(flag, order);
496 if (!ret) 488 if (!ret)
@@ -510,8 +502,22 @@ error:
510 return ret; 502 return ret;
511} 503}
512 504
505static void calgary_free_coherent(struct device *dev, size_t size,
506 void *vaddr, dma_addr_t dma_handle)
507{
508 unsigned int npages;
509 struct iommu_table *tbl = find_iommu_table(dev);
510
511 size = PAGE_ALIGN(size);
512 npages = size >> PAGE_SHIFT;
513
514 iommu_free(tbl, dma_handle, npages);
515 free_pages((unsigned long)vaddr, get_order(size));
516}
517
513static struct dma_mapping_ops calgary_dma_ops = { 518static struct dma_mapping_ops calgary_dma_ops = {
514 .alloc_coherent = calgary_alloc_coherent, 519 .alloc_coherent = calgary_alloc_coherent,
520 .free_coherent = calgary_free_coherent,
515 .map_single = calgary_map_single, 521 .map_single = calgary_map_single,
516 .unmap_single = calgary_unmap_single, 522 .unmap_single = calgary_unmap_single,
517 .map_sg = calgary_map_sg, 523 .map_sg = calgary_map_sg,
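
The Calgary driver drops its private num_dma_pages() for the shared iommu_num_pages() helper, which now takes an explicit page size. The value is the count of pages touched by [addr, addr + len): round the in-page offset plus the length up to a page multiple. A standalone sketch of the arithmetic:

#include <stdio.h>

#define EXAMPLE_PAGE_SIZE 4096UL

/* number of pages spanned by a buffer at addr covering len bytes */
static unsigned long example_iommu_num_pages(unsigned long addr,
                                             unsigned long len,
                                             unsigned long page_size)
{
        unsigned long offset = addr & (page_size - 1);

        return (offset + len + page_size - 1) / page_size;
}

int main(void)
{
        /* 8 bytes straddling a page boundary still cost two pages */
        printf("%lu\n", example_iommu_num_pages(0x1ffc, 8, EXAMPLE_PAGE_SIZE));
        return 0;
}
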
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 87d4d6964ec..1972266e8ba 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -9,8 +9,6 @@
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12static int forbid_dac __read_mostly;
13
14struct dma_mapping_ops *dma_ops; 12struct dma_mapping_ops *dma_ops;
15EXPORT_SYMBOL(dma_ops); 13EXPORT_SYMBOL(dma_ops);
16 14
@@ -41,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address);
41/* Dummy device used for NULL arguments (normally ISA). Better would 39/* Dummy device used for NULL arguments (normally ISA). Better would
42 be probably a smaller DMA mask, but this is bug-to-bug compatible 40 be probably a smaller DMA mask, but this is bug-to-bug compatible
43 to older i386. */ 41 to older i386. */
44struct device fallback_dev = { 42struct device x86_dma_fallback_dev = {
45 .bus_id = "fallback device", 43 .bus_id = "fallback device",
46 .coherent_dma_mask = DMA_32BIT_MASK, 44 .coherent_dma_mask = DMA_32BIT_MASK,
47 .dma_mask = &fallback_dev.coherent_dma_mask, 45 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
48}; 46};
47EXPORT_SYMBOL(x86_dma_fallback_dev);
49 48
50int dma_set_mask(struct device *dev, u64 mask) 49int dma_set_mask(struct device *dev, u64 mask)
51{ 50{
@@ -82,7 +81,7 @@ void __init dma32_reserve_bootmem(void)
82 * using 512M as goal 81 * using 512M as goal
83 */ 82 */
84 align = 64ULL<<20; 83 align = 64ULL<<20;
85 size = round_up(dma32_bootmem_size, align); 84 size = roundup(dma32_bootmem_size, align);
86 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 85 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
87 512ULL<<20); 86 512ULL<<20);
88 if (dma32_bootmem_ptr) 87 if (dma32_bootmem_ptr)
@@ -124,15 +123,46 @@ void __init pci_iommu_alloc(void)
124 pci_swiotlb_init(); 123 pci_swiotlb_init();
125} 124}
126 125
127unsigned long iommu_num_pages(unsigned long addr, unsigned long len) 126unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
128{ 127{
129 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); 128 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
130 129
131 return size >> PAGE_SHIFT; 130 return size >> PAGE_SHIFT;
132} 131}
133EXPORT_SYMBOL(iommu_num_pages); 132EXPORT_SYMBOL(iommu_nr_pages);
134#endif 133#endif
135 134
135void *dma_generic_alloc_coherent(struct device *dev, size_t size,
136 dma_addr_t *dma_addr, gfp_t flag)
137{
138 unsigned long dma_mask;
139 struct page *page;
140 dma_addr_t addr;
141
142 dma_mask = dma_alloc_coherent_mask(dev, flag);
143
144 flag |= __GFP_ZERO;
145again:
146 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
147 if (!page)
148 return NULL;
149
150 addr = page_to_phys(page);
151 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
152 __free_pages(page, get_order(size));
153
154 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
155 flag = (flag & ~GFP_DMA32) | GFP_DMA;
156 goto again;
157 }
158
159 return NULL;
160 }
161
162 *dma_addr = addr;
163 return page_address(page);
164}
165
136/* 166/*
137 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 167 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
138 * documentation. 168 * documentation.
@@ -241,147 +271,6 @@ int dma_supported(struct device *dev, u64 mask)
241} 271}
242EXPORT_SYMBOL(dma_supported); 272EXPORT_SYMBOL(dma_supported);
243 273
244/* Allocate DMA memory on node near device */
245static noinline struct page *
246dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
247{
248 int node;
249
250 node = dev_to_node(dev);
251
252 return alloc_pages_node(node, gfp, order);
253}
254
255/*
256 * Allocate memory for a coherent mapping.
257 */
258void *
259dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
260 gfp_t gfp)
261{
262 struct dma_mapping_ops *ops = get_dma_ops(dev);
263 void *memory = NULL;
264 struct page *page;
265 unsigned long dma_mask = 0;
266 dma_addr_t bus;
267 int noretry = 0;
268
269 /* ignore region specifiers */
270 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
271
272 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
273 return memory;
274
275 if (!dev) {
276 dev = &fallback_dev;
277 gfp |= GFP_DMA;
278 }
279 dma_mask = dev->coherent_dma_mask;
280 if (dma_mask == 0)
281 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
282
283 /* Device not DMA able */
284 if (dev->dma_mask == NULL)
285 return NULL;
286
287 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
288 if (gfp & __GFP_DMA)
289 noretry = 1;
290
291#ifdef CONFIG_X86_64
292 /* Why <=? Even when the mask is smaller than 4GB it is often
293 larger than 16MB and in this case we have a chance of
294 finding fitting memory in the next higher zone first. If
295 not retry with true GFP_DMA. -AK */
296 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
297 gfp |= GFP_DMA32;
298 if (dma_mask < DMA_32BIT_MASK)
299 noretry = 1;
300 }
301#endif
302
303 again:
304 page = dma_alloc_pages(dev,
305 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
306 if (page == NULL)
307 return NULL;
308
309 {
310 int high, mmu;
311 bus = page_to_phys(page);
312 memory = page_address(page);
313 high = (bus + size) >= dma_mask;
314 mmu = high;
315 if (force_iommu && !(gfp & GFP_DMA))
316 mmu = 1;
317 else if (high) {
318 free_pages((unsigned long)memory,
319 get_order(size));
320
321 /* Don't use the 16MB ZONE_DMA unless absolutely
322 needed. It's better to use remapping first. */
323 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
324 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
325 goto again;
326 }
327
328 /* Let low level make its own zone decisions */
329 gfp &= ~(GFP_DMA32|GFP_DMA);
330
331 if (ops->alloc_coherent)
332 return ops->alloc_coherent(dev, size,
333 dma_handle, gfp);
334 return NULL;
335 }
336
337 memset(memory, 0, size);
338 if (!mmu) {
339 *dma_handle = bus;
340 return memory;
341 }
342 }
343
344 if (ops->alloc_coherent) {
345 free_pages((unsigned long)memory, get_order(size));
346 gfp &= ~(GFP_DMA|GFP_DMA32);
347 return ops->alloc_coherent(dev, size, dma_handle, gfp);
348 }
349
350 if (ops->map_simple) {
351 *dma_handle = ops->map_simple(dev, virt_to_phys(memory),
352 size,
353 PCI_DMA_BIDIRECTIONAL);
354 if (*dma_handle != bad_dma_address)
355 return memory;
356 }
357
358 if (panic_on_overflow)
359 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
360 (unsigned long)size);
361 free_pages((unsigned long)memory, get_order(size));
362 return NULL;
363}
364EXPORT_SYMBOL(dma_alloc_coherent);
365
366/*
367 * Unmap coherent memory.
368 * The caller must ensure that the device has finished accessing the mapping.
369 */
370void dma_free_coherent(struct device *dev, size_t size,
371 void *vaddr, dma_addr_t bus)
372{
373 struct dma_mapping_ops *ops = get_dma_ops(dev);
374
375 int order = get_order(size);
376 WARN_ON(irqs_disabled()); /* for portability */
377 if (dma_release_from_coherent(dev, order, vaddr))
378 return;
379 if (ops->unmap_single)
380 ops->unmap_single(dev, bus, size, 0);
381 free_pages((unsigned long)vaddr, order);
382}
383EXPORT_SYMBOL(dma_free_coherent);
384
385static int __init pci_iommu_init(void) 274static int __init pci_iommu_init(void)
386{ 275{
387 calgary_iommu_init(); 276 calgary_iommu_init();
@@ -402,17 +291,3 @@ void pci_iommu_shutdown(void)
402} 291}
403/* Must execute after PCI subsystem */ 292/* Must execute after PCI subsystem */
404fs_initcall(pci_iommu_init); 293fs_initcall(pci_iommu_init);
405
406#ifdef CONFIG_PCI
407/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
408
409static __devinit void via_no_dac(struct pci_dev *dev)
410{
411 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
412 printk(KERN_INFO "PCI: VIA PCI bridge detected."
413 "Disabling DAC.\n");
414 forbid_dac = 1;
415 }
416}
417DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
418#endif
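
pci-dma.c loses its sprawling open-coded dma_alloc_coherent() in favor of dma_generic_alloc_coherent(), which allocates near the device, checks the resulting page against the device mask with is_buffer_dma_capable(), and falls back to the 16MB GFP_DMA zone exactly once when the mask is narrower than 32 bits. The capability test is presumably a plain range check; a hedged userspace sketch:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* presumably what is_buffer_dma_capable() boils down to: does the
 * buffer end at or below the device's addressable limit? */
static bool example_buffer_dma_capable(uint64_t mask, uint64_t addr,
                                       uint64_t size)
{
        return addr + size <= mask;
}

int main(void)
{
        uint64_t mask32 = 0xffffffffULL;        /* a 32-bit DMA mask */

        /* fits: the buffer ends below 4GB */
        printf("%d\n", example_buffer_dma_capable(mask32, 0x10000000ULL, 4096));
        /* fails: crosses the 4GB limit, which triggers the GFP_DMA retry */
        printf("%d\n", example_buffer_dma_capable(mask32, 0xfffff000ULL, 8192));
        return 0;
}
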
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 49285f8fd4d..e3f75bbcede 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,8 +27,8 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
@@ -80,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
80AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
81 81
82static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
83static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
84 84
85static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
86{ 87{
87 unsigned long offset, flags; 88 unsigned long offset, flags;
88 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -90,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
90 91
91 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
92 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
93 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
94 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
95 96
96 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
97 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
98 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
99 if (offset == -1) { 100 if (offset == -1) {
100 need_flush = 1; 101 need_flush = true;
101 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
102 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
103 } 105 }
104 if (offset != -1) { 106 if (offset != -1) {
105 next_bit = offset+size; 107 next_bit = offset+size;
106 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
107 next_bit = 0; 109 next_bit = 0;
108 need_flush = 1; 110 need_flush = true;
109 } 111 }
110 } 112 }
111 if (iommu_fullflush) 113 if (iommu_fullflush)
112 need_flush = 1; 114 need_flush = true;
113 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
114 116
115 return offset; 117 return offset;
@@ -134,7 +136,7 @@ static void flush_gart(void)
134 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
135 if (need_flush) { 137 if (need_flush) {
136 k8_flush_garts(); 138 k8_flush_garts();
137 need_flush = 0; 139 need_flush = false;
138 } 140 }
139 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
140} 142}
@@ -173,7 +175,8 @@ static void dump_leak(void)
173 iommu_leak_pages); 175 iommu_leak_pages);
174 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
175 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
176 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
177 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
178 } 181 }
179 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -212,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
212static inline int 215static inline int
213need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
214{ 217{
215 u64 mask = *dev->dma_mask; 218 return force_iommu ||
216 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
217 int mmu = high;
218
219 if (force_iommu)
220 mmu = 1;
221
222 return mmu;
223} 220}
224 221
225static inline int 222static inline int
226nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
227{ 224{
228 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
229 int high = addr + size > mask;
230 int mmu = high;
231
232 return mmu;
233} 226}
234 227
235/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
236 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
237 */ 230 */
238static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
239 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
240{ 233{
241 unsigned long npages = iommu_num_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
242 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
243 int i; 236 int i;
244 237
245 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -259,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
259 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
260} 253}
261 254
262static dma_addr_t
263gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
264{
265 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
266
267 flush_gart();
268
269 return map;
270}
271
272/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
273static dma_addr_t 256static dma_addr_t
274gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -276,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
276 unsigned long bus; 259 unsigned long bus;
277 260
278 if (!dev) 261 if (!dev)
279 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
280 263
281 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
282 return paddr; 265 return paddr;
283 266
284 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
285 269
286 return bus; 270 return bus;
287} 271}
@@ -301,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
301 return; 285 return;
302 286
303 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
304 npages = iommu_num_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
305 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
306 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
307 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
@@ -340,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
340 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
341 325
342 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
343 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
344 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
345 if (i > 0) 329 if (i > 0)
346 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -362,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
362 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
363 unsigned long pages) 347 unsigned long pages)
364{ 348{
365 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
366 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
367 struct scatterlist *s; 351 struct scatterlist *s;
368 int i; 352 int i;
@@ -384,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
384 } 368 }
385 369
386 addr = phys_addr; 370 addr = phys_addr;
387 pages = iommu_num_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
388 while (pages--) { 372 while (pages--) {
389 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
390 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -427,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
427 return 0; 411 return 0;
428 412
429 if (!dev) 413 if (!dev)
430 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
431 415
432 out = 0; 416 out = 0;
433 start = 0; 417 start = 0;
@@ -467,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
467 451
468 seg_size += s->length; 452 seg_size += s->length;
469 need = nextneed; 453 need = nextneed;
470 pages += iommu_num_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
471 ps = s; 455 ps = s;
472 } 456 }
473 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -499,6 +483,46 @@ error:
499 return 0; 483 return 0;
500} 484}
501 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
502static int no_agp; 526static int no_agp;
503 527
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -626,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
626 struct pci_dev *dev; 650 struct pci_dev *dev;
627 void *gatt; 651 void *gatt;
628 int i, error; 652 int i, error;
629 unsigned long start_pfn, end_pfn;
630 653
631 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
632 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -650,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
650 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
651 674
652 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
653 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
654 if (!gatt) 678 if (!gatt)
655 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
656 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
657 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
658 682
659 memset(gatt, 0, gatt_size);
660 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
661 684
662 enable_gart_translations(); 685 enable_gart_translations();
@@ -665,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
665 if (!error) 688 if (!error)
666 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
667 if (error) 690 if (error)
668 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
669 693
670 flush_gart(); 694 flush_gart();
671 695
672 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
673 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
674 698
675 /* need to map that range */
676 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
677 if (end_pfn > max_low_pfn_mapped) {
678 start_pfn = (aper_base>>PAGE_SHIFT);
679 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
680 }
681 return 0; 699 return 0;
682 700
683 nommu: 701 nommu:
@@ -687,20 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
687 return -1; 705 return -1;
688} 706}
689 707
690extern int agp_amd64_init(void);
691
692static struct dma_mapping_ops gart_dma_ops = { 708static struct dma_mapping_ops gart_dma_ops = {
693 .map_single = gart_map_single, 709 .map_single = gart_map_single,
694 .map_simple = gart_map_simple,
695 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
696 .sync_single_for_cpu = NULL,
697 .sync_single_for_device = NULL,
698 .sync_single_range_for_cpu = NULL,
699 .sync_single_range_for_device = NULL,
700 .sync_sg_for_cpu = NULL,
701 .sync_sg_for_device = NULL,
702 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
703 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
704}; 715};
705 716
706void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -727,7 +738,8 @@ void __init gart_iommu_init(void)
727{ 738{
728 struct agp_kern_info info; 739 struct agp_kern_info info;
729 unsigned long iommu_start; 740 unsigned long iommu_start;
730 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
731 unsigned long scratch; 743 unsigned long scratch;
732 long i; 744 long i;
733 745
@@ -759,30 +771,35 @@ void __init gart_iommu_init(void)
759 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
760 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
761 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
762 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
763 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
764 } 776 }
765 return; 777 return;
766 } 778 }
767 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
768 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
769 aper_size = info.aper_size * 1024 * 1024;
770 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
771 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
772 792
773 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
774 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
775 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
776 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
777 memset(iommu_gart_bitmap, 0, iommu_pages/8);
778 797
779#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
780 if (leak_trace) { 799 if (leak_trace) {
781 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
782 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
783 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
784 memset(iommu_leak_tab, 0, iommu_pages * 8);
785 else
786 printk(KERN_DEBUG 803 printk(KERN_DEBUG
787 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
788 } 805 }
@@ -792,7 +809,7 @@ void __init gart_iommu_init(void)
792 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
793 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
794 */ 811 */
795 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
796 813
797 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
798 printk(KERN_INFO 815 printk(KERN_INFO
@@ -850,7 +867,8 @@ void __init gart_parse_options(char *p)
850 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
851 leak_trace = 1; 868 leak_trace = 1;
852 p += 4; 869 p += 4;
853 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
854 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
855 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
856 } 874 }
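
The GART rework threads an align_mask through alloc_iommu() so gart_alloc_coherent() can demand naturally aligned IOMMU space: a buffer rounded up to order n must start on a 2^n-page boundary, hence align_mask = (1UL << get_order(size)) - 1 in the hunk above. How that mask falls out of the size, standalone (example_get_order() is a stand-in for the kernel's get_order()):

#include <stdio.h>

#define EXAMPLE_PAGE_SHIFT 12

/* smallest n such that 2^n pages hold size bytes */
static int example_get_order(unsigned long size)
{
        int order = 0;

        size = (size - 1) >> EXAMPLE_PAGE_SHIFT;
        while (size) {
                order++;
                size >>= 1;
        }
        return order;
}

int main(void)
{
        unsigned long size = 6 * 4096;  /* a 6-page buffer */
        int order = example_get_order(size);

        /* rounds up to order 3 (8 pages); the mask covers 7 pages */
        printf("order=%d align_mask=%#lx\n", order, (1UL << order) - 1);
        return 0;
}
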
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index 3f91f71cdc3..c70ab5a5d4c 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -14,7 +14,7 @@
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,7 +72,15 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76 dma_addr_t dma_addr)
77{
78 free_pages((unsigned long)vaddr, get_order(size));
79}
80
75struct dma_mapping_ops nommu_dma_ops = { 81struct dma_mapping_ops nommu_dma_ops = {
82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
76 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
77 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
78 .is_phys = 1, 86 .is_phys = 1,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea27..a311ffcaad1 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
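
The pcspeaker.c rewrite is a straight substitution: platform_device_register_simple() bundles the alloc/add/put-on-failure dance and reports errors as an ERR_PTR, so the initcall shrinks to one call plus the errno extraction. The generic shape of the pattern, with a hypothetical device name:

#include <linux/platform_device.h>
#include <linux/err.h>
#include <linux/init.h>

static __init int example_add_device(void)
{
        struct platform_device *pd;

        /* allocates, names and registers the device in one call; on
         * failure it cleans up after itself and returns an ERR_PTR() */
        pd = platform_device_register_simple("exampledev", -1, NULL, 0);

        return IS_ERR(pd) ? PTR_ERR(pd) : 0;
}
device_initcall(example_add_device);
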
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7fc4d5b0a6a..c622772744d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -15,7 +15,6 @@ unsigned long idle_nomwait;
15EXPORT_SYMBOL(idle_nomwait); 15EXPORT_SYMBOL(idle_nomwait);
16 16
17struct kmem_cache *task_xstate_cachep; 17struct kmem_cache *task_xstate_cachep;
18static int force_mwait __cpuinitdata;
19 18
20int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) 19int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
21{ 20{
@@ -185,7 +184,8 @@ static void mwait_idle(void)
185static void poll_idle(void) 184static void poll_idle(void)
186{ 185{
187 local_irq_enable(); 186 local_irq_enable();
188 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
189} 189}
190 190
191/* 191/*
@@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
246 return 1; 246 return 1;
247} 247}
248 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
249/* 257/*
250 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
251 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
253 */ 261 */
254static void c1e_idle(void) 262static void c1e_idle(void)
255{ 263{
256 static cpumask_t c1e_mask = CPU_MASK_NONE;
257 static int c1e_detected;
258
259 if (need_resched()) 264 if (need_resched())
260 return; 265 return;
261 266
@@ -265,8 +270,10 @@ static void c1e_idle(void)
265 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
266 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
267 c1e_detected = 1; 272 c1e_detected = 1;
268 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
269 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
270 } 277 }
271 } 278 }
272 279
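
Hoisting c1e_mask and c1e_detected to file scope in process.c lets the new c1e_remove_cpu(), presumably run from the CPU-offline path, clear a departing CPU's bit so the per-CPU C1E setup is redone if that CPU comes back online. The moved-state pattern in miniature, as a userspace sketch with a plain bitmask standing in for the cpumask:

#include <stdio.h>

/* file-scope so both the idle loop and the hotplug path can reach it;
 * stands in for the kernel's cpumask_t c1e_mask */
static unsigned long example_c1e_mask;

static void example_c1e_idle(int cpu)
{
        if (!(example_c1e_mask & (1UL << cpu))) {
                example_c1e_mask |= 1UL << cpu;
                /* ... one-time per-CPU setup, e.g. broadcast timer ... */
                printf("cpu%d: switched to C1E-aware idle\n", cpu);
        }
}

/* hotplug teardown: forget the CPU so setup reruns when it returns */
void example_c1e_remove_cpu(int cpu)
{
        example_c1e_mask &= ~(1UL << cpu);
}

int main(void)
{
        example_c1e_idle(2);
        example_c1e_remove_cpu(2);
        example_c1e_idle(2);    /* setup runs again after offline/online */
        return 0;
}
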
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 3b7a1ddcc0b..0a1302fe6d4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,9 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
60#include <asm/syscalls.h>
61#include <asm/smp.h>
58 62
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 64
@@ -72,46 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
72 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
73} 77}
74 78
75#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
76#include <asm/nmi.h>
77
78static void cpu_exit_clear(void)
79{
80 int cpu = raw_smp_processor_id();
81
82 idle_task_exit();
83
84 cpu_uninit();
85 irq_ctx_exit(cpu);
86
87 cpu_clear(cpu, cpu_callout_map);
88 cpu_clear(cpu, cpu_callin_map);
89
90 numa_remove_cpu(cpu);
91}
92
93/* We don't actually take CPU down, just spin without interrupts. */
94static inline void play_dead(void)
95{
96 /* This must be done before dead CPU ack */
97 cpu_exit_clear();
98 mb();
99 /* Ack it */
100 __get_cpu_var(cpu_state) = CPU_DEAD;
101
102 /*
103 * With physical CPU hotplug, we should halt the cpu
104 */
105 local_irq_disable();
106 /* mask all interrupts, flush any and all caches, and halt */
107 wbinvd_halt();
108}
109#else
110static inline void play_dead(void) 80static inline void play_dead(void)
111{ 81{
112 BUG(); 82 BUG();
113} 83}
114#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
115 85
116/* 86/*
117 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -153,12 +123,13 @@ void cpu_idle(void)
153 } 123 }
154} 124}
155 125
156void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
157{ 127{
158 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
159 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
160 unsigned long sp; 130 unsigned long sp;
161 unsigned short ss, gs; 131 unsigned short ss, gs;
132 const char *board;
162 133
163 if (user_mode_vm(regs)) { 134 if (user_mode_vm(regs)) {
164 sp = regs->sp; 135 sp = regs->sp;
@@ -171,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
171 } 142 }
172 143
173 printk("\n"); 144 printk("\n");
174 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 145
146 board = dmi_get_system_info(DMI_PRODUCT_NAME);
147 if (!board)
148 board = "";
149 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
175 task_pid_nr(current), current->comm, 150 task_pid_nr(current), current->comm,
176 print_tainted(), init_utsname()->release, 151 print_tainted(), init_utsname()->release,
177 (int)strcspn(init_utsname()->version, " "), 152 (int)strcspn(init_utsname()->version, " "),
178 init_utsname()->version); 153 init_utsname()->version, board);
179 154
180 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 155 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
181 (u16)regs->cs, regs->ip, regs->flags, 156 (u16)regs->cs, regs->ip, regs->flags,
@@ -214,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
214 189
215void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
216{ 191{
217 __show_registers(regs, 1); 192 __show_regs(regs, 1);
218 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
219} 194}
220 195
@@ -275,6 +250,14 @@ void exit_thread(void)
275 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
276 put_cpu(); 251 put_cpu();
277 } 252 }
253#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
278} 261}
279 262
280void flush_thread(void) 263void flush_thread(void)
@@ -436,6 +419,35 @@ int set_tsc_mode(unsigned int val)
436 return 0; 419 return 0;
437} 420}
438 421
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
439static noinline void 451static noinline void
440__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
441 struct tss_struct *tss) 453 struct tss_struct *tss)
@@ -446,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
446 prev = &prev_p->thread; 458 prev = &prev_p->thread;
447 next = &next_p->thread; 459 next = &next_p->thread;
448 460
449 debugctl = prev->debugctlmsr; 461 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
450 if (next->ds_area_msr != prev->ds_area_msr) {
451 /* we clear debugctl to make sure DS
452 * is not in use when we change it */
453 debugctl = 0;
454 update_debugctlmsr(0);
455 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
456 }
457 462
458 if (next->debugctlmsr != debugctl) 463 if (next->debugctlmsr != debugctl)
459 update_debugctlmsr(next->debugctlmsr); 464 update_debugctlmsr(next->debugctlmsr);
@@ -477,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
477 hard_enable_TSC(); 482 hard_enable_TSC();
478 } 483 }
479 484
480#ifdef X86_BTS 485#ifdef CONFIG_X86_PTRACE_BTS
481 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
482 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
483 488
484 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
485 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
486#endif 491#endif /* CONFIG_X86_PTRACE_BTS */
487 492
488 493
489 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71553b664e2..c958120fb1b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
51#include <asm/proto.h> 51#include <asm/proto.h>
52#include <asm/ia32.h> 52#include <asm/ia32.h>
53#include <asm/idle.h> 53#include <asm/idle.h>
54#include <asm/syscalls.h>
54 55
55asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
56 57
@@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n)
62{ 63{
63 atomic_notifier_chain_register(&idle_notifier, n); 64 atomic_notifier_chain_register(&idle_notifier, n);
64} 65}
66EXPORT_SYMBOL_GPL(idle_notifier_register);
67
68void idle_notifier_unregister(struct notifier_block *n)
69{
70 atomic_notifier_chain_unregister(&idle_notifier, n);
71}
72EXPORT_SYMBOL_GPL(idle_notifier_unregister);
65 73
66void enter_idle(void) 74void enter_idle(void)
67{ 75{
@@ -85,28 +93,12 @@ void exit_idle(void)
85 __exit_idle(); 93 __exit_idle();
86} 94}
87 95
88#ifdef CONFIG_HOTPLUG_CPU 96#ifndef CONFIG_SMP
89DECLARE_PER_CPU(int, cpu_state);
90
91#include <asm/nmi.h>
92/* We halt the CPU with physical CPU hotplug */
93static inline void play_dead(void)
94{
95 idle_task_exit();
96 mb();
97 /* Ack it */
98 __get_cpu_var(cpu_state) = CPU_DEAD;
99
100 local_irq_disable();
101 /* mask all interrupts, flush any and all caches, and halt */
102 wbinvd_halt();
103}
104#else
105static inline void play_dead(void) 97static inline void play_dead(void)
106{ 98{
107 BUG(); 99 BUG();
108} 100}
109#endif /* CONFIG_HOTPLUG_CPU */ 101#endif
110 102
111/* 103/*
112 * The idle thread. There's no useful work to be 104 * The idle thread. There's no useful work to be
@@ -151,7 +143,7 @@ void cpu_idle(void)
151} 143}
152 144
153/* Prints also some state that isn't saved in the pt_regs */ 145/* Prints also some state that isn't saved in the pt_regs */
154void __show_regs(struct pt_regs * regs) 146void __show_regs(struct pt_regs *regs, int all)
155{ 147{
156 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 148 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
157 unsigned long d0, d1, d2, d3, d6, d7; 149 unsigned long d0, d1, d2, d3, d6, d7;
@@ -160,60 +152,65 @@ void __show_regs(struct pt_regs * regs)
160 152
161 printk("\n"); 153 printk("\n");
162 print_modules(); 154 print_modules();
163 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 155 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
164 current->pid, current->comm, print_tainted(), 156 current->pid, current->comm, print_tainted(),
165 init_utsname()->release, 157 init_utsname()->release,
166 (int)strcspn(init_utsname()->version, " "), 158 (int)strcspn(init_utsname()->version, " "),
167 init_utsname()->version); 159 init_utsname()->version);
168 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 160 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
169 printk_address(regs->ip, 1); 161 printk_address(regs->ip, 1);
170 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 162 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
171 regs->flags); 163 regs->sp, regs->flags);
172 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 164 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
173 regs->ax, regs->bx, regs->cx); 165 regs->ax, regs->bx, regs->cx);
174 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 166 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
175 regs->dx, regs->si, regs->di); 167 regs->dx, regs->si, regs->di);
176 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 168 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
177 regs->bp, regs->r8, regs->r9); 169 regs->bp, regs->r8, regs->r9);
178 printk("R10: %016lx R11: %016lx R12: %016lx\n", 170 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
179 regs->r10, regs->r11, regs->r12); 171 regs->r10, regs->r11, regs->r12);
180 printk("R13: %016lx R14: %016lx R15: %016lx\n", 172 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
181 regs->r13, regs->r14, regs->r15); 173 regs->r13, regs->r14, regs->r15);
182 174
183 asm("movl %%ds,%0" : "=r" (ds)); 175 asm("movl %%ds,%0" : "=r" (ds));
184 asm("movl %%cs,%0" : "=r" (cs)); 176 asm("movl %%cs,%0" : "=r" (cs));
185 asm("movl %%es,%0" : "=r" (es)); 177 asm("movl %%es,%0" : "=r" (es));
186 asm("movl %%fs,%0" : "=r" (fsindex)); 178 asm("movl %%fs,%0" : "=r" (fsindex));
187 asm("movl %%gs,%0" : "=r" (gsindex)); 179 asm("movl %%gs,%0" : "=r" (gsindex));
188 180
189 rdmsrl(MSR_FS_BASE, fs); 181 rdmsrl(MSR_FS_BASE, fs);
190 rdmsrl(MSR_GS_BASE, gs); 182 rdmsrl(MSR_GS_BASE, gs);
191 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 183 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
184
185 if (!all)
186 return;
192 187
193 cr0 = read_cr0(); 188 cr0 = read_cr0();
194 cr2 = read_cr2(); 189 cr2 = read_cr2();
195 cr3 = read_cr3(); 190 cr3 = read_cr3();
196 cr4 = read_cr4(); 191 cr4 = read_cr4();
197 192
198 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 193 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
199 fs,fsindex,gs,gsindex,shadowgs); 194 fs, fsindex, gs, gsindex, shadowgs);
200 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 195 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
201 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 196 es, cr0);
197 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
198 cr4);
202 199
203 get_debugreg(d0, 0); 200 get_debugreg(d0, 0);
204 get_debugreg(d1, 1); 201 get_debugreg(d1, 1);
205 get_debugreg(d2, 2); 202 get_debugreg(d2, 2);
206 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 203 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
207 get_debugreg(d3, 3); 204 get_debugreg(d3, 3);
208 get_debugreg(d6, 6); 205 get_debugreg(d6, 6);
209 get_debugreg(d7, 7); 206 get_debugreg(d7, 7);
210 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 207 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
211} 208}
212 209
213void show_regs(struct pt_regs *regs) 210void show_regs(struct pt_regs *regs)
214{ 211{
215 printk("CPU %d:", smp_processor_id()); 212 printk(KERN_INFO "CPU %d:", smp_processor_id());
216 __show_regs(regs); 213 __show_regs(regs, 1);
217 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 214 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
218} 215}
219 216
@@ -238,6 +235,14 @@ void exit_thread(void)
238 t->io_bitmap_max = 0; 235 t->io_bitmap_max = 0;
239 put_cpu(); 236 put_cpu();
240 } 237 }
238#ifdef CONFIG_X86_DS
239 /* Free any DS contexts that have not been properly released. */
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
241} 246}
242 247
243void flush_thread(void) 248void flush_thread(void)
@@ -313,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk)
313 318
314int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 319int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
315 unsigned long unused, 320 unsigned long unused,
316 struct task_struct * p, struct pt_regs * regs) 321 struct task_struct *p, struct pt_regs *regs)
317{ 322{
318 int err; 323 int err;
319 struct pt_regs * childregs; 324 struct pt_regs *childregs;
320 struct task_struct *me = current; 325 struct task_struct *me = current;
321 326
322 childregs = ((struct pt_regs *) 327 childregs = ((struct pt_regs *)
@@ -361,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
361 if (test_thread_flag(TIF_IA32)) 366 if (test_thread_flag(TIF_IA32))
362 err = do_set_thread_area(p, -1, 367 err = do_set_thread_area(p, -1,
363 (struct user_desc __user *)childregs->si, 0); 368 (struct user_desc __user *)childregs->si, 0);
364 else 369 else
365#endif 370#endif
366 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 371 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
367 if (err) 372 if (err)
368 goto out; 373 goto out;
369 } 374 }
370 err = 0; 375 err = 0;
@@ -471,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
471 next = &next_p->thread; 476 next = &next_p->thread;
472 477
473 debugctl = prev->debugctlmsr; 478 debugctl = prev->debugctlmsr;
474 if (next->ds_area_msr != prev->ds_area_msr) { 479
475 /* we clear debugctl to make sure DS 480#ifdef CONFIG_X86_DS
476 * is not in use when we change it */ 481 {
477 debugctl = 0; 482 unsigned long ds_prev = 0, ds_next = 0;
478 update_debugctlmsr(0); 483
479 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
480 } 498 }
499#endif /* CONFIG_X86_DS */
481 500
482 if (next->debugctlmsr != debugctl) 501 if (next->debugctlmsr != debugctl)
483 update_debugctlmsr(next->debugctlmsr); 502 update_debugctlmsr(next->debugctlmsr);
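
The ordering in the hunk above is deliberate: DEBUGCTL is cleared to quiesce BTS/PEBS before MSR_IA32_DS_AREA is repointed at the incoming task's buffer, and the incoming DEBUGCTL value is only written back afterwards. A minimal, hedged userspace distillation of that sequence; the stub MSR accessors and sample values below are assumptions for illustration, standing in for the kernel's update_debugctlmsr()/wrmsrl():

    #include <stdio.h>

    #define MSR_IA32_DS_AREA 0x600

    /* stubs: assumed stand-ins for the kernel MSR accessors, trace-only */
    static void update_debugctlmsr(unsigned long v) { printf("DEBUGCTL <- %#lx\n", v); }
    static void wrmsrl(unsigned int msr, unsigned long v) { printf("MSR %#x <- %#lx\n", msr, v); }

    static void switch_ds_area(unsigned long prev_debugctl, unsigned long next_debugctl,
                               unsigned long ds_prev, unsigned long ds_next)
    {
        unsigned long cur = prev_debugctl;

        if (ds_next != ds_prev) {
            cur = 0;
            update_debugctlmsr(0);              /* stop logging first */
            wrmsrl(MSR_IA32_DS_AREA, ds_next);  /* then repoint the DS area */
        }
        if (next_debugctl != cur)               /* finally restore next's DEBUGCTL */
            update_debugctlmsr(next_debugctl);
    }

    int main(void)
    {
        switch_ds_area(0x40, 0x40, 0x1000, 0x2000);
        return 0;
    }
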
@@ -515,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
515 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
516 } 535 }
517 536
518#ifdef X86_BTS 537#ifdef CONFIG_X86_PTRACE_BTS
519 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
520 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
521 540
522 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
523 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
524#endif 543#endif /* CONFIG_X86_PTRACE_BTS */
525} 544}
526 545
527/* 546/*
@@ -543,7 +562,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
543 unsigned fsindex, gsindex; 562 unsigned fsindex, gsindex;
544 563
545 /* we're going to use this soon, after a few expensive things */ 564 /* we're going to use this soon, after a few expensive things */
546 if (next_p->fpu_counter>5) 565 if (next_p->fpu_counter > 5)
547 prefetch(next->xstate); 566 prefetch(next->xstate);
548 567
549 /* 568 /*
@@ -551,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
551 */ 570 */
552 load_sp0(tss, next); 571 load_sp0(tss, next);
553 572
554 /* 573 /*
555 * Switch DS and ES. 574 * Switch DS and ES.
556 * This won't pick up thread selector changes, but I guess that is ok. 575 * This won't pick up thread selector changes, but I guess that is ok.
557 */ 576 */
558 savesegment(es, prev->es); 577 savesegment(es, prev->es);
559 if (unlikely(next->es | prev->es)) 578 if (unlikely(next->es | prev->es))
560 loadsegment(es, next->es); 579 loadsegment(es, next->es);
561 580
562 savesegment(ds, prev->ds); 581 savesegment(ds, prev->ds);
563 if (unlikely(next->ds | prev->ds)) 582 if (unlikely(next->ds | prev->ds))
@@ -583,7 +602,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
583 */ 602 */
584 arch_leave_lazy_cpu_mode(); 603 arch_leave_lazy_cpu_mode();
585 604
586 /* 605 /*
587 * Switch FS and GS. 606 * Switch FS and GS.
588 * 607 *
589 * Segment register != 0 always requires a reload. Also 608 * Segment register != 0 always requires a reload. Also
@@ -592,13 +611,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
592 */ 611 */
593 if (unlikely(fsindex | next->fsindex | prev->fs)) { 612 if (unlikely(fsindex | next->fsindex | prev->fs)) {
594 loadsegment(fs, next->fsindex); 613 loadsegment(fs, next->fsindex);
595 /* 614 /*
596 * Check if the user used a selector != 0; if yes 615 * Check if the user used a selector != 0; if yes
597 * clear 64bit base, since overloaded base is always 616 * clear 64bit base, since overloaded base is always
598 * mapped to the Null selector 617 * mapped to the Null selector
599 */ 618 */
600 if (fsindex) 619 if (fsindex)
601 prev->fs = 0; 620 prev->fs = 0;
602 } 621 }
603 /* when next process has a 64bit base use it */ 622 /* when next process has a 64bit base use it */
604 if (next->fs) 623 if (next->fs)
@@ -608,7 +627,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
608 if (unlikely(gsindex | next->gsindex | prev->gs)) { 627 if (unlikely(gsindex | next->gsindex | prev->gs)) {
609 load_gs_index(next->gsindex); 628 load_gs_index(next->gsindex);
610 if (gsindex) 629 if (gsindex)
611 prev->gs = 0; 630 prev->gs = 0;
612 } 631 }
613 if (next->gs) 632 if (next->gs)
614 wrmsrl(MSR_KERNEL_GS_BASE, next->gs); 633 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
@@ -617,12 +636,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
617 /* Must be after DS reload */ 636 /* Must be after DS reload */
618 unlazy_fpu(prev_p); 637 unlazy_fpu(prev_p);
619 638
620 /* 639 /*
621 * Switch the PDA and FPU contexts. 640 * Switch the PDA and FPU contexts.
622 */ 641 */
623 prev->usersp = read_pda(oldrsp); 642 prev->usersp = read_pda(oldrsp);
624 write_pda(oldrsp, next->usersp); 643 write_pda(oldrsp, next->usersp);
625 write_pda(pcurrent, next_p); 644 write_pda(pcurrent, next_p);
626 645
627 write_pda(kernelstack, 646 write_pda(kernelstack,
628 (unsigned long)task_stack_page(next_p) + 647 (unsigned long)task_stack_page(next_p) +
@@ -663,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
663 char __user * __user *envp, struct pt_regs *regs) 682 char __user * __user *envp, struct pt_regs *regs)
664{ 683{
665 long error; 684 long error;
666 char * filename; 685 char *filename;
667 686
668 filename = getname(name); 687 filename = getname(name);
669 error = PTR_ERR(filename); 688 error = PTR_ERR(filename);
@@ -721,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
721unsigned long get_wchan(struct task_struct *p) 740unsigned long get_wchan(struct task_struct *p)
722{ 741{
723 unsigned long stack; 742 unsigned long stack;
724 u64 fp,ip; 743 u64 fp, ip;
725 int count = 0; 744 int count = 0;
726 745
727 if (!p || p == current || p->state==TASK_RUNNING) 746 if (!p || p == current || p->state == TASK_RUNNING)
728 return 0; 747 return 0;
729 stack = (unsigned long)task_stack_page(p); 748 stack = (unsigned long)task_stack_page(p);
730 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 749 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
731 return 0; 750 return 0;
732 fp = *(u64 *)(p->thread.sp); 751 fp = *(u64 *)(p->thread.sp);
733 do { 752 do {
734 if (fp < (unsigned long)stack || 753 if (fp < (unsigned long)stack ||
735 fp > (unsigned long)stack+THREAD_SIZE) 754 fp >= (unsigned long)stack+THREAD_SIZE)
736 return 0; 755 return 0;
737 ip = *(u64 *)(fp+8); 756 ip = *(u64 *)(fp+8);
738 if (!in_sched_functions(ip)) 757 if (!in_sched_functions(ip))
739 return ip; 758 return ip;
740 fp = *(u64 *)fp; 759 fp = *(u64 *)fp;
741 } while (count++ < 16); 760 } while (count++ < 16);
742 return 0; 761 return 0;
743} 762}
744 763
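
get_wchan() above walks the saved-%rbp chain on the sleeping task's stack: the word at fp is the caller's frame pointer, the word at fp+8 is the return address, and the walk gives up after 16 frames or on the first address outside the scheduler. A hedged userspace analogue of the same walk, assuming a build with frame pointers (-fno-omit-frame-pointer) and a C runtime that zeroes the outermost frame pointer:

    #include <stdio.h>
    #include <stdint.h>

    static void walk_frames(void)
    {
        uint64_t fp = (uint64_t)__builtin_frame_address(0);
        int depth;

        for (depth = 0; fp && depth < 16; depth++) {
            uint64_t ip = *(uint64_t *)(fp + 8);    /* return address above saved rbp */
            printf("frame %d: return ip %#lx\n", depth, (unsigned long)ip);
            fp = *(uint64_t *)fp;                   /* follow saved rbp to the caller */
        }
    }

    int main(void)
    {
        walk_frames();
        return 0;
    }
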
745long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 764long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
746{ 765{
747 int ret = 0; 766 int ret = 0;
748 int doit = task == current; 767 int doit = task == current;
749 int cpu; 768 int cpu;
750 769
751 switch (code) { 770 switch (code) {
752 case ARCH_SET_GS: 771 case ARCH_SET_GS:
753 if (addr >= TASK_SIZE_OF(task)) 772 if (addr >= TASK_SIZE_OF(task))
754 return -EPERM; 773 return -EPERM;
755 cpu = get_cpu(); 774 cpu = get_cpu();
756 /* handle small bases via the GDT because that's faster to 775 /* handle small bases via the GDT because that's faster to
757 switch. */ 776 switch. */
758 if (addr <= 0xffffffff) { 777 if (addr <= 0xffffffff) {
759 set_32bit_tls(task, GS_TLS, addr); 778 set_32bit_tls(task, GS_TLS, addr);
760 if (doit) { 779 if (doit) {
761 load_TLS(&task->thread, cpu); 780 load_TLS(&task->thread, cpu);
762 load_gs_index(GS_TLS_SEL); 781 load_gs_index(GS_TLS_SEL);
763 } 782 }
764 task->thread.gsindex = GS_TLS_SEL; 783 task->thread.gsindex = GS_TLS_SEL;
765 task->thread.gs = 0; 784 task->thread.gs = 0;
766 } else { 785 } else {
767 task->thread.gsindex = 0; 786 task->thread.gsindex = 0;
768 task->thread.gs = addr; 787 task->thread.gs = addr;
769 if (doit) { 788 if (doit) {
770 load_gs_index(0); 789 load_gs_index(0);
771 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 790 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
772 } 791 }
773 } 792 }
774 put_cpu(); 793 put_cpu();
775 break; 794 break;
@@ -823,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
823 rdmsrl(MSR_KERNEL_GS_BASE, base); 842 rdmsrl(MSR_KERNEL_GS_BASE, base);
824 else 843 else
825 base = task->thread.gs; 844 base = task->thread.gs;
826 } 845 } else
827 else
828 base = task->thread.gs; 846 base = task->thread.gs;
829 ret = put_user(base, (unsigned long __user *)addr); 847 ret = put_user(base, (unsigned long __user *)addr);
830 break; 848 break;
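
do_arch_prctl() above backs the arch_prctl(2) system call: bases that fit in 32 bits go through a GDT TLS slot because reloading a selector is cheaper than an MSR write, while larger bases hit MSR_KERNEL_GS_BASE directly. A hedged userspace sketch driving the ARCH_GET_FS/ARCH_GET_GS read paths via the raw syscall; the constants come from the exported <asm/prctl.h>:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <asm/prctl.h>    /* ARCH_GET_FS, ARCH_GET_GS */

    int main(void)
    {
        unsigned long fs_base = 0, gs_base = 0;

        /* reads go through the rdmsrl()/thread.gs logic shown above */
        if (syscall(SYS_arch_prctl, ARCH_GET_FS, &fs_base) ||
            syscall(SYS_arch_prctl, ARCH_GET_GS, &gs_base)) {
            perror("arch_prctl");
            return 1;
        }
        printf("FS base: %#lx, GS base: %#lx\n", fs_base, gs_base);
        return 0;
    }
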
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e37dccce85d..0a6d8c12e10 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
39 REGSET_GENERAL, 40 REGSET_GENERAL,
40 REGSET_FP, 41 REGSET_FP,
41 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
42 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
43}; 46};
44 47
45/* 48/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
69 72
70#define FLAG_MASK FLAG_MASK_32 73#define FLAG_MASK FLAG_MASK_32
71 74
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{ 76{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2; 78 regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 557 return 0;
555} 558}
556 559
557#ifdef X86_BTS 560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
558 569
559static int ptrace_bts_get_size(struct task_struct *child) 570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
560{ 574{
561 if (!child->thread.ds_area_msr) 575 if (!target->thread.io_bitmap_ptr)
562 return -ENXIO; 576 return -ENXIO;
563 577
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
636}
637
638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
639{
 640 base += (bts_cfg.sizeof_field * field);
641 (*(unsigned long *)base) = val;
642}
643
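
bts_get()/bts_set() above address a raw record as consecutive sizeof_field-wide slots, so with 8-byte fields bts_from, bts_to and bts_flags sit at byte offsets 0, 8 and 16. A hedged standalone check of that arithmetic:

    #include <assert.h>

    enum bts_field { bts_from = 0, bts_to, bts_flags };

    static const unsigned int sizeof_field = 8;   /* Core2-style 64bit fields */

    static unsigned long bts_get(const char *base, enum bts_field field)
    {
        return *(const unsigned long *)(base + sizeof_field * field);
    }

    static void bts_set(char *base, enum bts_field field, unsigned long val)
    {
        *(unsigned long *)(base + sizeof_field * field) = val;
    }

    int main(void)
    {
        unsigned long slots[3] = { 0, 0, 0 };
        char *rec = (char *)slots;

        bts_set(rec, bts_to, 0xdeadbeefUL);
        assert(bts_get(rec, bts_from) == 0);          /* offset 0 untouched */
        assert(bts_get(rec, bts_to) == 0xdeadbeefUL); /* offset 8 */
        return 0;
    }
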
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
565} 661}
566 662
567static int ptrace_bts_read_record(struct task_struct *child, 663static int ptrace_bts_read_record(struct task_struct *child, size_t index,
568 long index,
569 struct bts_struct __user *out) 664 struct bts_struct __user *out)
570{ 665{
571 struct bts_struct ret; 666 struct bts_struct ret;
572 int retval; 667 const void *bts_record;
573 int bts_end; 668 size_t bts_index, bts_end;
574 int bts_index; 669 int error;
575
576 if (!child->thread.ds_area_msr)
577 return -ENXIO;
578 670
579 if (index < 0) 671 error = ds_get_bts_end(child, &bts_end);
580 return -EINVAL; 672 if (error < 0)
673 return error;
581 674
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 675 if (bts_end <= index)
584 return -EINVAL; 676 return -EINVAL;
585 677
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681
586 /* translate the ptrace bts index into the ds bts index */ 682 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 683 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 684 if (bts_end <= bts_index)
589 if (bts_index < 0) 685 bts_index -= bts_end;
590 bts_index += bts_end; 686
687 error = ds_access_bts(child, bts_index, &bts_record);
688 if (error < 0)
689 return error;
591 690
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 691 ptrace_bts_translate_record(&ret, bts_record);
593 bts_index, &ret);
594 if (retval < 0)
595 return retval;
596 692
597 if (copy_to_user(out, &ret, sizeof(ret))) 693 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 694 return -EFAULT;
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 696 return sizeof(ret);
601} 697}
602 698
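
The translation above treats the DS buffer as a ring whose write cursor points just past the newest record, while ptrace counts index 0 as the most recent entry: raw slot = (cursor + end - (index + 1)) mod end. With a 4-record buffer and the cursor at slot 2, for instance, ptrace index 0 maps to raw slot 1 and index 3 to slot 2. A hedged sketch of the same arithmetic:

    #include <assert.h>
    #include <stddef.h>

    /* ptrace index 0 = newest record; ds_index = slot written next */
    static size_t ptrace_to_ds_index(size_t index, size_t ds_index, size_t ds_end)
    {
        size_t raw = ds_index + ds_end - (index + 1);

        if (raw >= ds_end)
            raw -= ds_end;
        return raw;
    }

    int main(void)
    {
        /* 4-slot ring, cursor at 2: newest record sits in slot 1 */
        assert(ptrace_to_ds_index(0, 2, 4) == 1);
        assert(ptrace_to_ds_index(3, 2, 4) == 2);   /* oldest record */
        return 0;
    }
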
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 699static int ptrace_bts_drain(struct task_struct *child,
612 long size, 700 long size,
613 struct bts_struct __user *out) 701 struct bts_struct __user *out)
614{ 702{
615 int end, i; 703 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 704 const unsigned char *raw;
617 705 size_t end, i;
618 if (!ds) 706 int error;
619 return -ENXIO;
620 707
621 end = ds_get_bts_index(ds); 708 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 709 if (error < 0)
623 return end; 710 return error;
624 711
625 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 713 return -EIO;
627 714
628 for (i = 0; i < end; i++, out++) { 715 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 716 if (error < 0)
630 int retval; 717 return error;
631 718
632 retval = ds_read_bts(ds, i, &ret); 719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 720 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 721
636 if (copy_to_user(out, &ret, sizeof(ret))) 722 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 723 return -EFAULT;
638 } 724 }
639 725
640 ds_clear(ds); 726 error = ds_clear_bts(child);
727 if (error < 0)
728 return error;
641 729
642 return end; 730 return end;
643} 731}
644 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
645static int ptrace_bts_config(struct task_struct *child, 738static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 739 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 740 const struct ptrace_bts_config __user *ucfg)
648{ 741{
649 struct ptrace_bts_config cfg; 742 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 743 int error = 0;
651 void *ds;
652 744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748
749 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 750 if (cfg_size < sizeof(cfg))
654 return -EIO; 751 goto errout;
655 752
753 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 754 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 755 goto errout;
658 756
659 if ((int)cfg.size < 0) 757 error = -EINVAL;
660 return -EINVAL; 758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
661 761
662 bts_size = 0; 762 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 763 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 764 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds); 765
666 if (bts_size < 0) 766 /* we ignore the error in case we were not tracing child */
667 return bts_size; 767 (void)ds_release_bts(child);
668 } 768
669 cfg.size = PAGE_ALIGN(cfg.size); 769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal)
771 goto errout;
772
773 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 }
670 776
671 if (bts_size != cfg.size) { 777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
672 ret = ptrace_bts_realloc(child, cfg.size, 778 if (error < 0)
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE);
674 if (ret < 0)
675 goto errout; 779 goto errout;
676 780
677 ds = (void *)child->thread.ds_area_msr; 781 child->thread.bts_ovfl_signal = sig;
678 } 782 }
679 783
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 784 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 785 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 786 goto errout;
686 787
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 788 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 790 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 792
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 793 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 795 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 797
697 ret = sizeof(cfg); 798 error = sizeof(cfg);
698 799
699out: 800out:
700 if (child->thread.debugctlmsr) 801 if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
702 else 803 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 805
705 return ret; 806 return error;
706 807
707errout: 808errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 811 goto out;
711} 812}
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 815 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 816 struct ptrace_bts_config __user *ucfg)
716{ 817{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 818 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
719 822
720 if (cfg_size < sizeof(cfg)) 823 if (cfg_size < sizeof(cfg))
721 return -EIO; 824 return -EIO;
722 825
723 memset(&cfg, 0, sizeof(cfg)); 826 error = ds_get_bts_end(child, &end);
827 if (error < 0)
828 return error;
724 829
725 if (ds) { 830 error = ds_access_bts(child, /* index = */ 0, &base);
726 cfg.size = ds_get_bts_size(ds); 831 if (error < 0)
832 return error;
727 833
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 834 error = ds_access_bts(child, /* index = */ end, &max);
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 835 if (error < 0)
836 return error;
730 837
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 838 memset(&cfg, 0, sizeof(cfg));
732 child->thread.debugctlmsr & ds_debugctl_mask()) 839 cfg.size = (max - base);
733 cfg.flags |= PTRACE_BTS_O_TRACE; 840 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct);
734 842
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 843 if (cfg.signal)
736 cfg.flags |= PTRACE_BTS_O_SCHED; 844 cfg.flags |= PTRACE_BTS_O_SIGNAL;
737 }
738 845
739 cfg.bts_size = sizeof(struct bts_struct); 846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
848 cfg.flags |= PTRACE_BTS_O_TRACE;
849
850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
851 cfg.flags |= PTRACE_BTS_O_SCHED;
740 852
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 853 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 854 return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 856 return sizeof(cfg);
745} 857}
746 858
747
748static int ptrace_bts_write_record(struct task_struct *child, 859static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 860 const struct bts_struct *in)
750{ 861{
751 int retval; 862 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 863
753 if (!child->thread.ds_area_msr) 864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 865
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 866 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 867 switch (in->qualifier) {
758 return retval; 868 case BTS_INVALID:
869 break;
759 870
760 return sizeof(*in); 871 case BTS_BRANCH:
761} 872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
762 875
763static int ptrace_bts_realloc(struct task_struct *child, 876 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 877 case BTS_TASK_DEPARTS:
765{ 878 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 879 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
768 882
769 if (size < 0) 883 default:
770 return -EINVAL; 884 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 }
801
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
803 vm = current->mm->locked_vm + size;
804 if (rlim < vm) {
805 ret = -ENOMEM;
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 } 885 }
814 886
815 ret = ds_allocate((void **)&child->thread.ds_area_msr, 887 /* The writing task will be the switched-to task on a context
816 size << PAGE_SHIFT); 888 * switch. It needs to write into the switched-from task's BTS
817 if (ret < 0) 889 * buffer. */
818 goto out; 890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 891}
831 892
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 893void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 900
840 ptrace_bts_write_record(tsk, &rec); 901 ptrace_bts_write_record(tsk, &rec);
841} 902}
842#endif /* X86_BTS */ 903
904static const struct bts_configuration bts_cfg_netburst = {
905 .sizeof_bts = sizeof(long) * 3,
906 .sizeof_field = sizeof(long),
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909
910static const struct bts_configuration bts_cfg_pentium_m = {
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961}
962#endif /* CONFIG_X86_PTRACE_BTS */
843 963
844/* 964/*
845 * Called by kernel/ptrace.c when detaching.. 965 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 972#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 974#endif
855 if (child->thread.ds_area_msr) { 975#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 976 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 977
858#endif 978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 979 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 981
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 983#endif /* CONFIG_X86_PTRACE_BTS */
864} 984}
865 985
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1100 /*
981 * These bits need more cooking - not enabled yet: 1101 * These bits need more cooking - not enabled yet:
982 */ 1102 */
983#ifdef X86_BTS 1103#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1104 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1105 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1106 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1112 break;
993 1113
994 case PTRACE_BTS_SIZE: 1114 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1115 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1116 break;
997 1117
998 case PTRACE_BTS_GET: 1118 case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1121 break;
1002 1122
1003 case PTRACE_BTS_CLEAR: 1123 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1124 ret = ds_clear_bts(child);
1005 break; 1125 break;
1006 1126
1007 case PTRACE_BTS_DRAIN: 1127 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1128 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1129 (child, data, (struct bts_struct __user *) addr);
1010 break; 1130 break;
1011#endif 1131#endif /* CONFIG_X86_PTRACE_BTS */
1012 1132
1013 default: 1133 default:
1014 ret = ptrace_request(child, request, addr, data); 1134 ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1290 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1291 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1292 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1293}; 1419};
1294 1420
1295static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1336 .active = regset_tls_active, 1462 .active = regset_tls_active,
1337 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1338 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1339}; 1471};
1340 1472
1341static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1357,7 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1489#endif
1358} 1490}
1359 1491
1360void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1493 int error_code, int si_code)
1361{ 1494{
1362 struct siginfo info; 1495 struct siginfo info;
1363 1496
@@ -1366,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1366 1499
1367 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1368 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1369 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1370 1503
1371 /* User-mode ip? */ 1504 /* User-mode ip? */
1372 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1375,30 +1508,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1375 force_sig_info(SIGTRAP, &info, tsk); 1508 force_sig_info(SIGTRAP, &info, tsk);
1376} 1509}
1377 1510
1378static void syscall_trace(struct pt_regs *regs)
1379{
1380 if (!(current->ptrace & PT_PTRACED))
1381 return;
1382
1383#if 0
1384 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1385 current->comm,
1386 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1387 current_thread_info()->flags, current->ptrace);
1388#endif
1389
1390 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1391 ? 0x80 : 0));
1392 /*
1393 * this isn't the same as continuing with a signal, but it will do
1394 * for normal use. strace only continues with a signal if the
1395 * stopping signal is not SIGTRAP. -brl
1396 */
1397 if (current->exit_code) {
1398 send_sig(current->exit_code, current, 1);
1399 current->exit_code = 0;
1400 }
1401}
1402 1511
1403#ifdef CONFIG_X86_32 1512#ifdef CONFIG_X86_32
1404# define IS_IA32 1 1513# define IS_IA32 1
@@ -1432,8 +1541,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1432 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) 1541 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1433 ret = -1L; 1542 ret = -1L;
1434 1543
1435 if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) 1544 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1436 syscall_trace(regs); 1545 tracehook_report_syscall_entry(regs))
1546 ret = -1L;
1437 1547
1438 if (unlikely(current->audit_context)) { 1548 if (unlikely(current->audit_context)) {
1439 if (IS_IA32) 1549 if (IS_IA32)
@@ -1459,7 +1569,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1459 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1569 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1460 1570
1461 if (test_thread_flag(TIF_SYSCALL_TRACE)) 1571 if (test_thread_flag(TIF_SYSCALL_TRACE))
1462 syscall_trace(regs); 1572 tracehook_report_syscall_exit(regs, 0);
1463 1573
1464 /* 1574 /*
1465 * If TIF_SYSCALL_EMU is set, we only get here because of 1575 * If TIF_SYSCALL_EMU is set, we only get here because of
@@ -1475,6 +1585,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
1475 * system call instruction. 1585 * system call instruction.
1476 */ 1586 */
1477 if (test_thread_flag(TIF_SINGLESTEP) && 1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1478 (current->ptrace & PT_PTRACED)) 1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1479 send_sigtrap(current, regs, 0); 1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1480} 1590}
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325..4f9c55f3a7c 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
97 return dst->version; 97 return dst->version;
98} 98}
99 99
100unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
101{
102 u64 pv_tsc_khz = 1000000ULL << 32;
103
104 do_div(pv_tsc_khz, src->tsc_to_system_mul);
105 if (src->tsc_shift < 0)
106 pv_tsc_khz <<= -src->tsc_shift;
107 else
108 pv_tsc_khz >>= src->tsc_shift;
109 return pv_tsc_khz;
110}
111
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{ 113{
102 struct pvclock_shadow_time shadow; 114 struct pvclock_shadow_time shadow;
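
pvclock scales a TSC delta to nanoseconds as ((delta << tsc_shift) * tsc_to_system_mul) >> 32, shifting right instead when tsc_shift is negative, so pvclock_tsc_khz() above simply inverts that: khz = (10^6 << 32) / mul, with the shift applied in the opposite direction. A hedged userspace replay with sample values; mul = 2^31 together with shift = 1 encodes a 1 ns/tick (1 GHz) clock:

    #include <stdio.h>
    #include <stdint.h>

    static unsigned long tsc_khz_from(uint32_t tsc_to_system_mul, int8_t tsc_shift)
    {
        uint64_t khz = 1000000ULL << 32;   /* same constant as the kernel */

        khz /= tsc_to_system_mul;          /* do_div() in the kernel */
        if (tsc_shift < 0)
            khz <<= -tsc_shift;
        else
            khz >>= tsc_shift;
        return (unsigned long)khz;
    }

    int main(void)
    {
        /* 1 ns/tick expressed as mul = 2^31, shift = 1 (2^32 overflows u32) */
        printf("%lu kHz\n", tsc_khz_from(1u << 31, 1));   /* prints 1000000 */
        return 0;
    }
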
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d1385881810..67465ed8931 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
35 if (!(word & (1 << 13))) { 35 if (!(word & (1 << 13))) {
36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " 36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
37 "disabling irq balancing and affinity\n"); 37 "disabling irq balancing and affinity\n");
38#ifdef CONFIG_IRQBALANCE
39 irqbalance_disable("");
40#endif
41 noirqdebug_setup(""); 38 noirqdebug_setup("");
42#ifdef CONFIG_PROC_FS 39#ifdef CONFIG_PROC_FS
43 no_irq_affinity = 1; 40 no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
354 printk(KERN_DEBUG "Force enabled HPET at resume\n"); 351 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355} 352}
356 353
354static u32 ati_ixp4x0_rev(struct pci_dev *dev)
355{
356 u32 d;
357 u8 b;
358
359 pci_read_config_byte(dev, 0xac, &b);
360 b &= ~(1<<5);
361 pci_write_config_byte(dev, 0xac, b);
362 pci_read_config_dword(dev, 0x70, &d);
363 d |= 1<<8;
364 pci_write_config_dword(dev, 0x70, d);
365 pci_read_config_dword(dev, 0x8, &d);
366 d &= 0xff;
367 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
368 return d;
369}
370
357static void ati_force_enable_hpet(struct pci_dev *dev) 371static void ati_force_enable_hpet(struct pci_dev *dev)
358{ 372{
359 u32 uninitialized_var(val); 373 u32 d, val;
374 u8 b;
360 375
361 if (hpet_address || force_hpet_address) 376 if (hpet_address || force_hpet_address)
362 return; 377 return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
366 return; 381 return;
367 } 382 }
368 383
384 d = ati_ixp4x0_rev(dev);
385 if (d < 0x82)
386 return;
387
388 /* base address */
369 pci_write_config_dword(dev, 0x14, 0xfed00000); 389 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val); 390 pci_read_config_dword(dev, 0x14, &val);
391
392 /* enable interrupt */
393 outb(0x72, 0xcd6); b = inb(0xcd7);
394 b |= 0x1;
395 outb(0x72, 0xcd6); outb(b, 0xcd7);
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 if (!(b & 0x1))
398 return;
399 pci_read_config_dword(dev, 0x64, &d);
400 d |= (1<<10);
401 pci_write_config_dword(dev, 0x64, d);
402 pci_read_config_dword(dev, 0x64, &d);
403 if (!(d & (1<<10)))
404 return;
405
371 force_hpet_address = val; 406 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME; 407 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", 408 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address); 409 force_hpet_address);
375 cached_dev = dev; 410 cached_dev = dev;
376 return;
377} 411}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, 412DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet); 413 ati_force_enable_hpet);
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 724adfc63cb..f4c93f1cfc1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b..dd6f2b71561 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
52 52
53 cmos_minutes = CMOS_READ(RTC_MINUTES); 53 cmos_minutes = CMOS_READ(RTC_MINUTES);
54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
55 BCD_TO_BIN(cmos_minutes); 55 cmos_minutes = bcd2bin(cmos_minutes);
56 56
57 /* 57 /*
58 * since we're only adjusting minutes and seconds, 58 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
69 69
70 if (abs(real_minutes - cmos_minutes) < 30) { 70 if (abs(real_minutes - cmos_minutes) < 30) {
71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { 71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
72 BIN_TO_BCD(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 BIN_TO_BCD(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds,RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); 124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
125 125
126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { 126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec); 127 sec = bcd2bin(sec);
128 BCD_TO_BIN(min); 128 min = bcd2bin(min);
129 BCD_TO_BIN(hour); 129 hour = bcd2bin(hour);
130 BCD_TO_BIN(day); 130 day = bcd2bin(day);
131 BCD_TO_BIN(mon); 131 mon = bcd2bin(mon);
132 BCD_TO_BIN(year); 132 year = bcd2bin(year);
133 } 133 }
134 134
135 if (century) { 135 if (century) {
136 BCD_TO_BIN(century); 136 century = bcd2bin(century);
137 year += century * 100; 137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); 138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else 139 } else
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
223static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
224{ 224{
225#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices) 226 static const char *ids[] __initconst =
227 platform_device_register(&rtc_device); 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
228#else 228 struct pnp_dev *dev;
229 struct pnp_id *id;
230 int i;
231
232 pnp_for_each_dev(dev) {
233 for (id = dev->id; id; id = id->next) {
234 for (i = 0; i < ARRAY_SIZE(ids); i++) {
235 if (compare_pnp_id(id, ids[i]) != 0)
236 return 0;
237 }
238 }
239 }
240#endif
241
229 platform_device_register(&rtc_device); 242 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */ 243 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n");
231 return 0; 245 return 0;
232} 246}
233device_initcall(add_rtc_cmos); 247device_initcall(add_rtc_cmos);
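
Unlike the old BCD_TO_BIN()/BIN_TO_BCD() macros, which rewrote their argument in place, bcd2bin()/bin2bcd() return the converted value, hence the explicit assignments above. BCD stores one decimal digit per nibble, so the RTC byte 0x59 denotes 59. A hedged sketch matching the helpers in <linux/bcd.h>:

    #include <assert.h>

    static unsigned int bcd2bin(unsigned char val)
    {
        return (val & 0x0f) + (val >> 4) * 10;
    }

    static unsigned char bin2bcd(unsigned int val)
    {
        return ((val / 10) << 4) + val % 10;
    }

    int main(void)
    {
        assert(bcd2bin(0x59) == 59);   /* RTC byte 0x59 -> 59 minutes */
        assert(bin2bcd(59) == 0x59);
        return 0;
    }
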
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 9838f2539df..0fa6790c1dd 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -223,6 +223,9 @@ unsigned long saved_video_mode;
223#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
224 224
225static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
226 229
227#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
228struct edd edd; 231struct edd edd;
@@ -299,7 +302,7 @@ static void __init relocate_initrd(void)
299 if (clen > MAX_MAP_CHUNK-slop) 302 if (clen > MAX_MAP_CHUNK-slop)
300 clen = MAX_MAP_CHUNK-slop; 303 clen = MAX_MAP_CHUNK-slop;
301 mapaddr = ramdisk_image & PAGE_MASK; 304 mapaddr = ramdisk_image & PAGE_MASK;
302 p = early_ioremap(mapaddr, clen+slop); 305 p = early_memremap(mapaddr, clen+slop);
303 memcpy(q, p+slop, clen); 306 memcpy(q, p+slop, clen);
304 early_iounmap(p, clen+slop); 307 early_iounmap(p, clen+slop);
305 q += clen; 308 q += clen;
@@ -376,7 +379,7 @@ static void __init parse_setup_data(void)
376 return; 379 return;
377 pa_data = boot_params.hdr.setup_data; 380 pa_data = boot_params.hdr.setup_data;
378 while (pa_data) { 381 while (pa_data) {
379 data = early_ioremap(pa_data, PAGE_SIZE); 382 data = early_memremap(pa_data, PAGE_SIZE);
380 switch (data->type) { 383 switch (data->type) {
381 case SETUP_E820_EXT: 384 case SETUP_E820_EXT:
382 parse_e820_ext(data, pa_data); 385 parse_e820_ext(data, pa_data);
@@ -399,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
399 return; 402 return;
400 pa_data = boot_params.hdr.setup_data; 403 pa_data = boot_params.hdr.setup_data;
401 while (pa_data) { 404 while (pa_data) {
402 data = early_ioremap(pa_data, sizeof(*data)); 405 data = early_memremap(pa_data, sizeof(*data));
403 e820_update_range(pa_data, sizeof(*data)+data->len, 406 e820_update_range(pa_data, sizeof(*data)+data->len,
404 E820_RAM, E820_RESERVED_KERN); 407 E820_RAM, E820_RESERVED_KERN);
405 found = 1; 408 found = 1;
@@ -425,7 +428,7 @@ static void __init reserve_early_setup_data(void)
425 return; 428 return;
426 pa_data = boot_params.hdr.setup_data; 429 pa_data = boot_params.hdr.setup_data;
427 while (pa_data) { 430 while (pa_data) {
428 data = early_ioremap(pa_data, sizeof(*data)); 431 data = early_memremap(pa_data, sizeof(*data));
429 sprintf(buf, "setup data %x", data->type); 432 sprintf(buf, "setup data %x", data->type);
430 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
431 pa_data = data->next; 434 pa_data = data->next;
@@ -558,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
558 561
559} 562}
560 563
561#ifdef CONFIG_PROC_VMCORE 564/*
565 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
566 * is_kdump_kernel() to determine if we are booting after a panic. Hence
567 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
568 */
569
570#ifdef CONFIG_CRASH_DUMP
562/* elfcorehdr= specifies the location of elf core header 571/* elfcorehdr= specifies the location of elf core header
563 * stored by the crashed kernel. This option will be passed 572 * stored by the crashed kernel. This option will be passed
564 * by kexec loader to the capture kernel. 573 * by kexec loader to the capture kernel.
@@ -579,6 +588,190 @@ static struct x86_quirks default_x86_quirks __initdata;
579struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; 588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
580 589
581/* 590/*
591 * Some BIOSes seem to corrupt the low 64k of memory during events
592 * like suspend/resume and unplugging an HDMI cable. Reserve all
593 * remaining free memory in that area and fill it with a distinct
594 * pattern.
595 */
596#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
597#define MAX_SCAN_AREAS 8
598
599static int __read_mostly memory_corruption_check = -1;
600
601static unsigned __read_mostly corruption_check_size = 64*1024;
602static unsigned __read_mostly corruption_check_period = 60; /* seconds */
603
604static struct e820entry scan_areas[MAX_SCAN_AREAS];
605static int num_scan_areas;
606
607
608static int set_corruption_check(char *arg)
609{
610 char *end;
611
612 memory_corruption_check = simple_strtol(arg, &end, 10);
613
614 return (*end == 0) ? 0 : -EINVAL;
615}
616early_param("memory_corruption_check", set_corruption_check);
617
618static int set_corruption_check_period(char *arg)
619{
620 char *end;
621
622 corruption_check_period = simple_strtoul(arg, &end, 10);
623
624 return (*end == 0) ? 0 : -EINVAL;
625}
626early_param("memory_corruption_check_period", set_corruption_check_period);
627
628static int set_corruption_check_size(char *arg)
629{
630 char *end;
631 unsigned size;
632
633 size = memparse(arg, &end);
634
635 if (*end == '\0')
636 corruption_check_size = size;
637
638 return (size == corruption_check_size) ? 0 : -EINVAL;
639}
640early_param("memory_corruption_check_size", set_corruption_check_size);
641
642
643static void __init setup_bios_corruption_check(void)
644{
645 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
646
647 if (memory_corruption_check == -1) {
648 memory_corruption_check =
649#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
650 1
651#else
652 0
653#endif
654 ;
655 }
656
657 if (corruption_check_size == 0)
658 memory_corruption_check = 0;
659
660 if (!memory_corruption_check)
661 return;
662
663 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
664
 665 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
666 u64 size;
667 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
668
669 if (addr == 0)
670 break;
671
672 if ((addr + size) > corruption_check_size)
673 size = corruption_check_size - addr;
674
675 if (size == 0)
676 break;
677
678 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
679 scan_areas[num_scan_areas].addr = addr;
680 scan_areas[num_scan_areas].size = size;
681 num_scan_areas++;
682
683 /* Assume we've already mapped this early memory */
684 memset(__va(addr), 0, size);
685
686 addr += size;
687 }
688
689 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
690 num_scan_areas);
691 update_e820();
692}
693
694static struct timer_list periodic_check_timer;
695
696void check_for_bios_corruption(void)
697{
698 int i;
699 int corruption = 0;
700
701 if (!memory_corruption_check)
702 return;
703
 704 for (i = 0; i < num_scan_areas; i++) {
705 unsigned long *addr = __va(scan_areas[i].addr);
706 unsigned long size = scan_areas[i].size;
707
 708 for (; size; addr++, size -= sizeof(unsigned long)) {
709 if (!*addr)
710 continue;
711 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
712 addr, __pa(addr), *addr);
713 corruption = 1;
714 *addr = 0;
715 }
716 }
717
718 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
719}
720
721static void periodic_check_for_corruption(unsigned long data)
722{
723 check_for_bios_corruption();
724 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
725}
726
727void start_periodic_check_for_corruption(void)
728{
729 if (!memory_corruption_check || corruption_check_period == 0)
730 return;
731
732 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
733 corruption_check_period);
734
735 init_timer(&periodic_check_timer);
736 periodic_check_timer.function = &periodic_check_for_corruption;
737 periodic_check_for_corruption(0);
738}
739#endif
740
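
The scan above works because setup_bios_corruption_check() zero-filled the reserved low-memory areas up front: any word found nonzero later must have been written behind the kernel's back, so it is reported and re-zeroed to re-arm the canary. A hedged userspace distillation of the inner loop:

    #include <stdio.h>
    #include <string.h>

    static int scan_area(unsigned long *addr, unsigned long size)
    {
        int corruption = 0;

        for (; size; addr++, size -= sizeof(unsigned long)) {
            if (!*addr)
                continue;
            printf("corrupted word at %p = %08lx\n", (void *)addr, *addr);
            *addr = 0;   /* re-zero, as the kernel does, to catch repeats */
            corruption = 1;
        }
        return corruption;
    }

    int main(void)
    {
        unsigned long area[16];

        memset(area, 0, sizeof(area));
        area[5] = 0x1234;   /* simulate a stray BIOS write */
        return scan_area(area, sizeof(area)) ? 1 : 0;
    }
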
741static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
742{
743 printk(KERN_NOTICE
744 "%s detected: BIOS may corrupt low RAM, working it around.\n",
745 d->ident);
746
747 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
748 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
749
750 return 0;
751}
752
753/* List of systems that have known low memory corruption BIOS problems */
754static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
755#ifdef CONFIG_X86_RESERVE_LOW_64K
756 {
757 .callback = dmi_low_memory_corruption,
758 .ident = "AMI BIOS",
759 .matches = {
760 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
761 },
762 },
763 {
764 .callback = dmi_low_memory_corruption,
765 .ident = "Phoenix BIOS",
766 .matches = {
767 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
768 },
769 },
770#endif
771 {}
772};
773
774/*
582 * Determine if we were loaded by an EFI loader. If so, then we have also been 775 * Determine if we were loaded by an EFI loader. If so, then we have also been
583 * passed the efi memmap, systab, etc., so we should use these data structures 776 * passed the efi memmap, systab, etc., so we should use these data structures
584 * for initialization. Note, the efi init code path is determined by the 777 * for initialization. Note, the efi init code path is determined by the
@@ -665,6 +858,19 @@ void __init setup_arch(char **cmdline_p)
665 bss_resource.start = virt_to_phys(&__bss_start); 858 bss_resource.start = virt_to_phys(&__bss_start);
666 bss_resource.end = virt_to_phys(&__bss_stop)-1; 859 bss_resource.end = virt_to_phys(&__bss_stop)-1;
667 860
861#ifdef CONFIG_CMDLINE_BOOL
862#ifdef CONFIG_CMDLINE_OVERRIDE
863 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
864#else
865 if (builtin_cmdline[0]) {
866 /* append boot loader cmdline to builtin */
867 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
868 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
869 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
870 }
871#endif
872#endif
873
668 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 874 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
669 *cmdline_p = command_line; 875 *cmdline_p = command_line;
670 876
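
With CONFIG_CMDLINE_BOOL set and CONFIG_CMDLINE_OVERRIDE unset, the hunk above prepends the built-in CONFIG_CMDLINE to whatever the boot loader passed: a built-in "console=ttyS0,115200" plus a boot-loader "root=/dev/sda1 ro" yields "console=ttyS0,115200 root=/dev/sda1 ro". A hedged userspace sketch of the merge, with snprintf standing in for the kernel's strlcat()/strlcpy():

    #include <stdio.h>

    #define COMMAND_LINE_SIZE 2048

    int main(void)
    {
        const char builtin[] = "console=ttyS0,115200";   /* stands in for CONFIG_CMDLINE */
        const char bootloader[] = "root=/dev/sda1 ro";   /* what the loader passed */
        char merged[COMMAND_LINE_SIZE];

        /* built-in first, then a space, then the boot loader string */
        snprintf(merged, sizeof(merged), "%s %s", builtin, bootloader);
        printf("%s\n", merged);   /* console=ttyS0,115200 root=/dev/sda1 ro */
        return 0;
    }
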
@@ -699,6 +905,10 @@ void __init setup_arch(char **cmdline_p)
699 905
700 finish_e820_parsing(); 906 finish_e820_parsing();
701 907
908 dmi_scan_machine();
909
910 dmi_check_system(bad_bios_dmi_table);
911
702#ifdef CONFIG_X86_32 912#ifdef CONFIG_X86_32
703 probe_roms(); 913 probe_roms();
704#endif 914#endif
@@ -742,6 +952,8 @@ void __init setup_arch(char **cmdline_p)
742#else 952#else
743 num_physpages = max_pfn; 953 num_physpages = max_pfn;
744 954
955 if (cpu_has_x2apic)
956 check_x2apic();
745 957
746 /* How many end-of-memory variables you have, grandma! */ 958 /* How many end-of-memory variables you have, grandma! */
747 /* need this before calling reserve_initrd */ 959 /* need this before calling reserve_initrd */
@@ -753,6 +965,10 @@ void __init setup_arch(char **cmdline_p)
753 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 965 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
754#endif 966#endif
755 967
968#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
969 setup_bios_corruption_check();
970#endif
971
756 /* max_pfn_mapped is updated here */ 972 /* max_pfn_mapped is updated here */
757 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 973 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
758 max_pfn_mapped = max_low_pfn_mapped; 974 max_pfn_mapped = max_low_pfn_mapped;
@@ -781,8 +997,6 @@ void __init setup_arch(char **cmdline_p)
781 vsmp_init(); 997 vsmp_init();
782#endif 998#endif
783 999
784 dmi_scan_machine();
785
786 io_delay_init(); 1000 io_delay_init();
787 1001
788 /* 1002 /*
@@ -790,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
790 */ 1004 */
791 acpi_boot_table_init(); 1005 acpi_boot_table_init();
792 1006
1007 early_acpi_boot_init();
1008
793#ifdef CONFIG_ACPI_NUMA 1009#ifdef CONFIG_ACPI_NUMA
794 /* 1010 /*
795 * Parse SRAT to discover nodes. 1011 * Parse SRAT to discover nodes.
@@ -857,6 +1073,7 @@ void __init setup_arch(char **cmdline_p)
857#endif 1073#endif
858 1074
859 prefill_possible_map(); 1075 prefill_possible_map();
1076
860#ifdef CONFIG_X86_64 1077#ifdef CONFIG_X86_64
861 init_cpu_to_node(); 1078 init_cpu_to_node();
862#endif 1079#endif
@@ -864,6 +1081,9 @@ void __init setup_arch(char **cmdline_p)
864 init_apic_mappings(); 1081 init_apic_mappings();
865 ioapic_init_mappings(); 1082 ioapic_init_mappings();
866 1083
 1084 /* need to wait until the io_apic is mapped */
1085 nr_irqs = probe_nr_irqs();
1086
867 kvm_guest_init(); 1087 kvm_guest_init();
868 1088
869 e820_reserve_resources(); 1089 e820_reserve_resources();
@@ -885,3 +1105,5 @@ void __init setup_arch(char **cmdline_p)
885#endif 1105#endif
886#endif 1106#endif
887} 1107}
1108
1109
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 76e305e064f..ae0c0d3bb77 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,35 +140,47 @@ static void __init setup_cpu_pda_map(void)
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size = PERCPU_ENOUGH_ROOM;
+	ssize_t size, old_size;
 	char *ptr;
 	int cpu;
+	unsigned long align = 1;
 
 	/* Setup cpu_pda map */
 	setup_cpu_pda_map();
 
 	/* Copy section for each CPU (we discard the original) */
-	size = PERCPU_ENOUGH_ROOM;
+	old_size = PERCPU_ENOUGH_ROOM;
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = roundup(old_size, align);
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
+		ptr = __alloc_bootmem(size, align,
+					__pa(MAX_DMA_ADDRESS));
 #else
 		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
+			ptr = __alloc_bootmem(size, align,
+						__pa(MAX_DMA_ADDRESS));
 			printk(KERN_INFO
 			       "cpu %d has no node %d or node-local memory\n",
 				cpu, node);
+			if (ptr)
+				printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
+					 cpu, __pa(ptr));
+		}
+		else {
+			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+							__pa(MAX_DMA_ADDRESS));
+			if (ptr)
+				printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
+					cpu, node, __pa(ptr));
 		}
-		else
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
 #endif
 		per_cpu_offset(cpu) = ptr - __per_cpu_start;
 		memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
-
 	}
 
 	printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -206,7 +218,7 @@ static void __init setup_node_to_cpumask_map(void)
 	/* allocate the map */
 	map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
 
-	pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n",
+	pr_debug("Node to cpumask map at %p for %d nodes\n",
 		 map, nr_node_ids);
 
 	/* node_to_cpumask() will now work */
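The setup_per_cpu_areas() change above stops assuming page-sized allocations
and instead rounds the per-cpu chunk up to an explicit alignment before
handing (size, align) to the bootmem allocator. A stand-alone sketch of that
size computation, with the kernel helpers approximated in plain C:

	#include <stdio.h>

	#define PAGE_SIZE		4096UL
	#define PERCPU_ENOUGH_ROOM	65536UL		/* placeholder value */

	/* round x up to a multiple of a (a power of two), like roundup() */
	static unsigned long roundup_pow2(unsigned long x, unsigned long a)
	{
		return (x + a - 1) & ~(a - 1);
	}

	int main(void)
	{
		unsigned long align = 1, old_size, size;

		old_size = PERCPU_ENOUGH_ROOM;
		if (align < PAGE_SIZE)	/* max_t(unsigned long, PAGE_SIZE, align) */
			align = PAGE_SIZE;
		size = roundup_pow2(old_size, align);

		/* the kernel then calls __alloc_bootmem(size, align, ...) */
		printf("per-cpu chunk: %lu bytes, %lu-byte aligned\n", size, align);
		return 0;
	}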
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2d..cc673aa55ce 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
 	char __user *pretcode;
 	int sig;
 	struct sigcontext sc;
-	struct _fpstate fpstate;
+	/*
+	 * fpstate is unused. fpstate is moved/allocated after
+	 * retcode[] below. Moving it there lets the FP state and
+	 * future state extensions (xsave) stay together, while
+	 * retaining the unused fpstate keeps the offset of
+	 * extramask[] in the sigframe unchanged, so any legacy
+	 * application that accesses or modifies it keeps working.
+	 */
+	struct _fpstate fpstate_unused;
 	unsigned long extramask[_NSIG_WORDS-1];
 	char retcode[8];
+	/* fp state follows here */
 };
 
 struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
 	void __user *puc;
 	struct siginfo info;
 	struct ucontext uc;
-	struct _fpstate fpstate;
 	char retcode[8];
+	/* fp state follows here */
 };
 #else
 struct rt_sigframe {
 	char __user *pretcode;
 	struct ucontext uc;
 	struct siginfo info;
+	/* fp state follows here */
 };
+
+int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+		sigset_t *set, struct pt_regs *regs);
+int ia32_setup_frame(int sig, struct k_sigaction *ka,
+		sigset_t *set, struct pt_regs *regs);
 #endif
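The net effect of the sigframe.h changes is that the live FP/xstate save area
no longer has a fixed slot inside the frame; only a dead placeholder remains
so that the extramask[] offset seen by legacy applications is unchanged.
Schematically (a sketch of the resulting 32-bit layout, not authoritative):

	struct sigframe_layout {		/* illustrative only */
		char __user *pretcode;
		int sig;
		struct sigcontext sc;
		struct _fpstate fpstate_unused;	/* dead slot, keeps offsets stable */
		unsigned long extramask[_NSIG_WORDS - 1];
		char retcode[8];
		/* variable-sized FP/xstate area follows here; get_sigframe()
		 * carves it out below the signal stack pointer first */
	};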
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index 6fb5bcdd893..d6dd057d0f2 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
 #include <linux/errno.h>
 #include <linux/sched.h>
 #include <linux/wait.h>
+#include <linux/tracehook.h>
 #include <linux/elf.h>
 #include <linux/smp.h>
 #include <linux/mm.h>
@@ -26,6 +27,8 @@
 #include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/vdso.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
 
 #include "sigframe.h"
 
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
 
+#define COPY(x)			{		\
+	err |= __get_user(regs->x, &sc->x);	\
+}
+
+#define COPY_SEG(seg)		{		\
+	unsigned short tmp;			\
+	err |= __get_user(tmp, &sc->seg);	\
+	regs->seg = tmp;			\
+}
+
+#define COPY_SEG_STRICT(seg)	{		\
+	unsigned short tmp;			\
+	err |= __get_user(tmp, &sc->seg);	\
+	regs->seg = tmp | 3;			\
+}
+
+#define GET_SEG(seg)		{		\
+	unsigned short tmp;			\
+	err |= __get_user(tmp, &sc->seg);	\
+	loadsegment(seg, tmp);			\
+}
 
 /*
  * Do a signal return; undo the signal stack.
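With the macros hoisted to file scope they can be shared by both sigreturn
paths. For reference, a use site such as COPY_SEG(fs) expands to roughly:

	{
		unsigned short tmp;
		err |= __get_user(tmp, &sc->fs);	/* fetch saved selector */
		regs->fs = tmp;				/* install into pt_regs */
	}

COPY_SEG_STRICT additionally ORs in 3 so the selector's RPL always names user
mode, which is what the strict CS/SS copies rely on.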
@@ -118,28 +142,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		   unsigned long *pax)
 {
+	void __user *buf;
+	unsigned int tmpflags;
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)		err |= __get_user(regs->x, &sc->x)
-
-#define COPY_SEG(seg)					\
-	{ unsigned short tmp;				\
-	  err |= __get_user(tmp, &sc->seg);		\
-	  regs->seg = tmp; }
-
-#define COPY_SEG_STRICT(seg)				\
-	{ unsigned short tmp;				\
-	  err |= __get_user(tmp, &sc->seg);		\
-	  regs->seg = tmp|3; }
-
-#define GET_SEG(seg)					\
-	{ unsigned short tmp;				\
-	  err |= __get_user(tmp, &sc->seg);		\
-	  loadsegment(seg, tmp); }
-
 	GET_SEG(gs);
 	COPY_SEG(fs);
 	COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	COPY_SEG_STRICT(cs);
 	COPY_SEG_STRICT(ss);
 
-	{
-		unsigned int tmpflags;
-
-		err |= __get_user(tmpflags, &sc->flags);
-		regs->flags = (regs->flags & ~FIX_EFLAGS) |
-						(tmpflags & FIX_EFLAGS);
-		regs->orig_ax = -1;	/* disable syscall checks */
-	}
+	err |= __get_user(tmpflags, &sc->flags);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+	regs->orig_ax = -1;		/* disable syscall checks */
 
-	{
-		struct _fpstate __user *buf;
-
-		err |= __get_user(buf, &sc->fpstate);
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
-	}
+	err |= __get_user(buf, &sc->fpstate);
+	err |= restore_i387_xstate(buf);
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
 asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
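The open-coded blocks collapse into two idioms: a masked EFLAGS restore and an
unconditional restore_i387_xstate() that now handles the NULL-fpstate case
itself. A self-contained sketch of the flags masking (the FIX_EFLAGS value
here is illustrative, not the kernel's exact whitelist):

	#include <stdio.h>

	#define FIX_EFLAGS	0x40DD5UL	/* illustrative user-restorable bits */

	static unsigned long restore_flags(unsigned long cur, unsigned long saved)
	{
		/* keep privileged bits from cur, take user bits from saved */
		return (cur & ~FIX_EFLAGS) | (saved & FIX_EFLAGS);
	}

	int main(void)
	{
		/* a hostile saved value cannot smuggle in privileged bits */
		printf("%#lx\n", restore_flags(0x246UL, ~0UL));
		return 0;
	}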
@@ -226,9 +212,8 @@ badframe:
 	return 0;
 }
 
-asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
-	struct pt_regs *regs = (struct pt_regs *)&__unused;
 	struct rt_sigframe __user *frame;
 	unsigned long ax;
 	sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
 	return ax;
 
 badframe:
-	force_sig(SIGSEGV, current);
+	signal_fault(regs, frame, "rt_sigreturn");
 	return 0;
 }
 
+asmlinkage int sys_rt_sigreturn(unsigned long __unused)
+{
+	struct pt_regs *regs = (struct pt_regs *)&__unused;
+
+	return do_rt_sigreturn(regs);
+}
+
 /*
  * Set up a signal frame.
  */
 static int
-setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
+setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
 		 struct pt_regs *regs, unsigned long mask)
 {
 	int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
 	err |= __put_user(regs->sp, &sc->sp_at_signal);
 	err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
 
-	tmp = save_i387(fpstate);
+	tmp = save_i387_xstate(fpstate);
 	if (tmp < 0)
 		err = 1;
 	else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
  * Determine which stack to use..
  */
 static inline void __user *
-get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
+get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
+	     void **fpstate)
 {
 	unsigned long sp;
 
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
 			sp = (unsigned long) ka->sa.sa_restorer;
 	}
 
+	if (used_math()) {
+		sp = sp - sig_xstate_size;
+		*fpstate = (struct _fpstate *) sp;
+	}
+
 	sp -= frame_size;
 	/*
 	 * Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
 }
 
 static int
-setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
+__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	    struct pt_regs *regs)
 {
 	struct sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
-	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
+		return -EFAULT;
 
-	usig = current_thread_info()->exec_domain
-		&& current_thread_info()->exec_domain->signal_invmap
-		&& sig < 32
-		? current_thread_info()->exec_domain->signal_invmap[sig]
-		: sig;
+	if (__put_user(sig, &frame->sig))
+		return -EFAULT;
 
-	err = __put_user(usig, &frame->sig);
-	if (err)
-		goto give_sigsegv;
-
-	err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
-	if (err)
-		goto give_sigsegv;
+	if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
+		return -EFAULT;
 
 	if (_NSIG_WORDS > 1) {
-		err = __copy_to_user(&frame->extramask, &set->sig[1],
-				      sizeof(frame->extramask));
-		if (err)
-			goto give_sigsegv;
+		if (__copy_to_user(&frame->extramask, &set->sig[1],
+				   sizeof(frame->extramask)))
+			return -EFAULT;
 	}
 
 	if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			  sigset_t *set, struct pt_regs *regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			    sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
 	void __user *restorer;
 	int err = 0;
-	int usig;
+	void __user *fpstate = NULL;
 
-	frame = get_sigframe(ka, regs, sizeof(*frame));
+	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
+		return -EFAULT;
 
-	usig = current_thread_info()->exec_domain
-		&& current_thread_info()->exec_domain->signal_invmap
-		&& sig < 32
-		? current_thread_info()->exec_domain->signal_invmap[sig]
-		: sig;
-
-	err |= __put_user(usig, &frame->sig);
+	err |= __put_user(sig, &frame->sig);
 	err |= __put_user(&frame->info, &frame->pinfo);
 	err |= __put_user(&frame->uc, &frame->puc);
 	err |= copy_siginfo_to_user(&frame->info, info);
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Create the ucontext. */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
 			  &frame->uc.uc_stack.ss_flags);
 	err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
-	err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate,
+	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up to return from userspace. */
 	restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->sp = (unsigned long)frame;
 	regs->ip = (unsigned long)ka->sa.sa_handler;
-	regs->ax = (unsigned long)usig;
+	regs->ax = (unsigned long)sig;
 	regs->dx = (unsigned long)&frame->info;
 	regs->cx = (unsigned long)&frame->uc;
 
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
 /*
  * OK, we're invoking a handler:
  */
+static int signr_convert(int sig)
+{
+	struct thread_info *info = current_thread_info();
+
+	if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
+		return info->exec_domain->signal_invmap[sig];
+	return sig;
+}
+
+#define is_ia32	1
+#define ia32_setup_frame	__setup_frame
+#define ia32_setup_rt_frame	__setup_rt_frame
+
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+	       sigset_t *set, struct pt_regs *regs)
+{
+	int usig = signr_convert(sig);
+	int ret;
+
+	/* Set up the stack frame */
+	if (is_ia32) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+		else
+			ret = ia32_setup_frame(usig, ka, set, regs);
+	} else
+		ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+	if (ret) {
+		force_sigsegv(sig, current);
+		return -EFAULT;
+	}
+
+	return ret;
+}
+
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	      sigset_t *oldset, struct pt_regs *regs)
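On 32-bit kernels is_ia32 is the constant 1, so the compiler folds the new
setup_rt_frame() down to the SA_SIGINFO choice; the same wrapper shape is used
on 64-bit, where is_ia32 becomes a run-time TIF_IA32 test. It also funnels
every failure through one force_sigsegv() call instead of per-function
give_sigsegv labels. A trimmed, self-contained sketch of the pattern, with
*_stub as hypothetical stand-ins for the real frame builders:

	#include <errno.h>

	/* hypothetical stand-ins for the real frame builders */
	static int ia32_rt_frame_stub(int sig)   { return 0; }
	static int ia32_frame_stub(int sig)      { return 0; }
	static int native_rt_frame_stub(int sig) { return 0; }
	static void force_sigsegv_stub(int sig)  { }

	static int setup_frame_dispatch(int sig, int use_rt, int ia32)
	{
		int ret;

		if (ia32)
			ret = use_rt ? ia32_rt_frame_stub(sig)
				     : ia32_frame_stub(sig);
		else
			ret = native_rt_frame_stub(sig);

		if (ret) {
			force_sigsegv_stub(sig);	/* single SIGSEGV choke point */
			return -EFAULT;
		}
		return 0;
	}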
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	int ret;
 
 	/* Are we from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch (regs->ax) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
 			regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-	/* Set up the stack frame */
-	if (ka->sa.sa_flags & SA_SIGINFO)
-		ret = setup_rt_frame(sig, ka, info, oldset, regs);
-	else
-		ret = setup_frame(sig, ka, oldset, regs);
+	ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
 	if (ret)
 		return ret;
 
+#ifdef CONFIG_X86_64
+	/*
+	 * This has nothing to do with segment registers,
+	 * despite the name. This magic affects uaccess.h
+	 * macros' behavior. Reset it to the normal setting.
+	 */
+	set_fs(USER_DS);
+#endif
+
 	/*
 	 * Clear the direction flag as per the ABI for function entry.
 	 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	 * handler too.
 	 */
 	regs->flags &= ~X86_EFLAGS_TF;
-	if (test_thread_flag(TIF_SINGLESTEP))
-		ptrace_notify(SIGTRAP);
 
 	spin_lock_irq(&current->sighand->siglock);
 	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLESTEP));
+
 	return 0;
 }
 
+#define NR_restart_syscall	__NR_restart_syscall
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* Did we come from a system call? */
-	if ((long)regs->orig_ax >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* Restart the system call - no handlers present */
-		switch (regs->ax) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
 			break;
 
 		case -ERESTART_RESTARTBLOCK:
-			regs->ax = __NR_restart_syscall;
+			regs->ax = NR_restart_syscall;
 			regs->ip -= 2;
 			break;
 		}
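NR_restart_syscall only abstracts the syscall number; the restart mechanism
itself is the two lines above: reload ax and step ip back over the syscall
instruction, which is two bytes for int $0x80, sysenter and syscall alike.
Sketched as plain C over a minimal register struct:

	struct regs_sketch {			/* minimal stand-in for pt_regs */
		unsigned long ax;
		unsigned long ip;
	};

	/* what the -ERESTART_RESTARTBLOCK case above effectively does */
	static void restart_block_sketch(struct regs_sketch *regs, long nr_restart)
	{
		regs->ax = nr_restart;	/* __NR_restart_syscall (or ia32 variant) */
		regs->ip -= 2;		/* re-execute the 2-byte syscall insn */
	}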
@@ -657,9 +679,38 @@ static void do_signal(struct pt_regs *regs)
 void
 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
+	/* notify userspace of pending MCEs */
+	if (thread_info_flags & _TIF_MCE_NOTIFY)
+		mce_notify_user();
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
+
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
 
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+
+#ifdef CONFIG_X86_32
 	clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
+}
+
+void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
+{
+	struct task_struct *me = current;
+
+	if (show_unhandled_signals && printk_ratelimit()) {
+		printk(KERN_INFO
+		       "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+		       me->comm, me->pid, where, frame,
+		       regs->ip, regs->sp, regs->orig_ax);
+		print_vma_addr(" in ", regs->ip);
+		printk(KERN_CONT "\n");
+	}
+
+	force_sig(SIGSEGV, me);
 }
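The 32-bit signal_fault() added above now matches its 64-bit counterpart:
rate-limited, gated on show_unhandled_signals, and emitted as one logical log
record (KERN_CONT closes the line that print_vma_addr() extends). A sketch of
the same reporting discipline with illustrative identifiers:

	/* sketch: rate-limited, single-record diagnostic before raising SIGSEGV */
	static void report_bad_frame(const char *comm, int pid, const char *where,
				     void *frame, unsigned long ip)
	{
		if (!printk_ratelimit())	/* drop floods of repeats */
			return;
		printk(KERN_INFO "%s[%d] bad frame in %s frame:%p ip:%lx",
		       comm, pid, where, frame, ip);
		printk(KERN_CONT "\n");		/* continuation, not a new record */
	}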
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index ca316b5b742..a5c9627f4db 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
 #include <linux/errno.h>
 #include <linux/wait.h>
 #include <linux/ptrace.h>
+#include <linux/tracehook.h>
 #include <linux/unistd.h>
 #include <linux/stddef.h>
 #include <linux/personality.h>
 #include <linux/compiler.h>
+#include <linux/uaccess.h>
+
 #include <asm/processor.h>
 #include <asm/ucontext.h>
-#include <asm/uaccess.h>
 #include <asm/i387.h>
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
 #include <asm/mce.h>
+#include <asm/syscall.h>
+#include <asm/syscalls.h>
 #include "sigframe.h"
 
 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
 # define FIX_EFLAGS	__FIX_EFLAGS
 #endif
 
-int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-		sigset_t *set, struct pt_regs * regs);
-int ia32_setup_frame(int sig, struct k_sigaction *ka,
-		sigset_t *set, struct pt_regs * regs);
-
 asmlinkage long
 sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 		struct pt_regs *regs)
@@ -53,67 +52,14 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
 	return do_sigaltstack(uss, uoss, regs->sp);
 }
 
-/*
- * Signal frame handlers.
- */
-
-static inline int save_i387(struct _fpstate __user *buf)
-{
-	struct task_struct *tsk = current;
-	int err = 0;
-
-	BUILD_BUG_ON(sizeof(struct user_i387_struct) !=
-			sizeof(tsk->thread.xstate->fxsave));
-
-	if ((unsigned long)buf % 16)
-		printk("save_i387: bad fpstate %p\n", buf);
-
-	if (!used_math())
-		return 0;
-	clear_used_math(); /* trigger finit */
-	if (task_thread_info(tsk)->status & TS_USEDFPU) {
-		err = save_i387_checking((struct i387_fxsave_struct __user *)
-					 buf);
-		if (err)
-			return err;
-		task_thread_info(tsk)->status &= ~TS_USEDFPU;
-		stts();
-	} else {
-		if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
-				   sizeof(struct i387_fxsave_struct)))
-			return -1;
-	}
-	return 1;
-}
+#define COPY(x)			{		\
+	err |= __get_user(regs->x, &sc->x);	\
+}
 
-/*
- * This restores directly out of user space. Exceptions are handled.
- */
-static inline int restore_i387(struct _fpstate __user *buf)
-{
-	struct task_struct *tsk = current;
-	int err;
-
-	if (!used_math()) {
-		err = init_fpu(tsk);
-		if (err)
-			return err;
-	}
-
-	if (!(task_thread_info(current)->status & TS_USEDFPU)) {
-		clts();
-		task_thread_info(current)->status |= TS_USEDFPU;
-	}
-	err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf);
-	if (unlikely(err)) {
-		/*
-		 * Encountered an error while doing the restore from the
-		 * user buffer, clear the fpu state.
-		 */
-		clear_fpu(tsk);
-		clear_used_math();
-	}
-	return err;
-}
+#define COPY_SEG_STRICT(seg)	{		\
+	unsigned short tmp;			\
+	err |= __get_user(tmp, &sc->seg);	\
+	regs->seg = tmp | 3;			\
+}
 
 /*
@@ -123,13 +69,13 @@ static int
 restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 		   unsigned long *pax)
 {
+	void __user *buf;
+	unsigned int tmpflags;
 	unsigned int err = 0;
 
 	/* Always make any pending restarted system calls return -EINTR */
 	current_thread_info()->restart_block.fn = do_no_restart_syscall;
 
-#define COPY(x)		err |= __get_user(regs->x, &sc->x)
-
 	COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
 	COPY(dx); COPY(cx); COPY(ip);
 	COPY(r8);
@@ -144,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
 	/* Kernel saves and restores only the CS segment register on signals,
 	 * which is the bare minimum needed to allow mixed 32/64-bit code.
 	 * App's signal handler can save/restore other segments if needed. */
-	{
-		unsigned cs;
-		err |= __get_user(cs, &sc->cs);
-		regs->cs = cs | 3;	/* Force into user mode */
-	}
+	COPY_SEG_STRICT(cs);
 
-	{
-		unsigned int tmpflags;
-		err |= __get_user(tmpflags, &sc->flags);
-		regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
-		regs->orig_ax = -1;	/* disable syscall checks */
-	}
+	err |= __get_user(tmpflags, &sc->flags);
+	regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
+	regs->orig_ax = -1;		/* disable syscall checks */
 
-	{
-		struct _fpstate __user * buf;
-		err |= __get_user(buf, &sc->fpstate);
-
-		if (buf) {
-			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
-				goto badframe;
-			err |= restore_i387(buf);
-		} else {
-			struct task_struct *me = current;
-			if (used_math()) {
-				clear_fpu(me);
-				clear_used_math();
-			}
-		}
-	}
+	err |= __get_user(buf, &sc->fpstate);
+	err |= restore_i387_xstate(buf);
 
 	err |= __get_user(*pax, &sc->ax);
 	return err;
-
-badframe:
-	return 1;
 }
 
-asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+static long do_rt_sigreturn(struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	sigset_t set;
 	unsigned long ax;
+	sigset_t set;
 
 	frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
 	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -198,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	current->blocked = set;
 	recalc_sigpending();
 	spin_unlock_irq(&current->sighand->siglock);
 
 	if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
 		goto badframe;
 
@@ -208,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
 	return ax;
 
 badframe:
-	signal_fault(regs,frame,"sigreturn");
+	signal_fault(regs, frame, "rt_sigreturn");
 	return 0;
 }
+
+asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
+{
+	return do_rt_sigreturn(regs);
+}
 
 /*
  * Set up a signal frame.
  */
 
 static inline int
-setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me)
+setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
+		 unsigned long mask, struct task_struct *me)
 {
 	int err = 0;
 
@@ -269,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
 		sp = current->sas_ss_sp + current->sas_ss_size;
 	}
 
-	return (void __user *)round_down(sp - size, 16);
+	return (void __user *)round_down(sp - size, 64);
 }
 
-static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
-			  sigset_t *set, struct pt_regs * regs)
+static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+			    sigset_t *set, struct pt_regs *regs)
 {
 	struct rt_sigframe __user *frame;
-	struct _fpstate __user *fp = NULL;
+	void __user *fp = NULL;
 	int err = 0;
 	struct task_struct *me = current;
 
 	if (used_math()) {
-		fp = get_stack(ka, regs, sizeof(struct _fpstate));
+		fp = get_stack(ka, regs, sig_xstate_size);
 		frame = (void __user *)round_down(
 			(unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
 
-		if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate)))
-			goto give_sigsegv;
-
-		if (save_i387(fp) < 0)
-			err |= -1;
+		if (save_i387_xstate(fp) < 0)
+			return -EFAULT;
 	} else
 		frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
 
 	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
-		goto give_sigsegv;
+		return -EFAULT;
 
 	if (ka->sa.sa_flags & SA_SIGINFO) {
-		err |= copy_siginfo_to_user(&frame->info, info);
-		if (err)
-			goto give_sigsegv;
+		if (copy_siginfo_to_user(&frame->info, info))
+			return -EFAULT;
 	}
 
 	/* Create the ucontext. */
-	err |= __put_user(0, &frame->uc.uc_flags);
+	if (cpu_has_xsave)
+		err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
+	else
+		err |= __put_user(0, &frame->uc.uc_flags);
 	err |= __put_user(0, &frame->uc.uc_link);
 	err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
 	err |= __put_user(sas_ss_flags(regs->sp),
@@ -311,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
 	err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
 	err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
 	if (sizeof(*set) == 16) {
 		__put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
 		__put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
 	} else
 		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
 
@@ -324,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 		err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
 	} else {
 		/* could use a vstub here */
-		goto give_sigsegv;
+		return -EFAULT;
 	}
 
 	if (err)
-		goto give_sigsegv;
+		return -EFAULT;
 
 	/* Set up registers for signal handler */
 	regs->di = sig;
 	/* In case the signal handler was declared without prototypes */
 	regs->ax = 0;
 
 	/* This also works for non SA_SIGINFO handlers because they expect the
@@ -348,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
 	regs->cs = __USER_CS;
 
 	return 0;
-
-give_sigsegv:
-	force_sigsegv(sig, current);
-	return -EFAULT;
 }
 
 /*
- * Return -1L or the syscall number that @regs is executing.
+ * OK, we're invoking a handler
  */
-static long current_syscall(struct pt_regs *regs)
+static int signr_convert(int sig)
 {
-	/*
-	 * We always sign-extend a -1 value being set here,
-	 * so this is always either -1L or a syscall number.
-	 */
-	return regs->orig_ax;
+	return sig;
 }
 
-/*
- * Return a value that is -EFOO if the system call in @regs->orig_ax
- * returned an error. This only works for @regs from @current.
- */
-static long current_syscall_ret(struct pt_regs *regs)
-{
 #ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32))
-		/*
-		 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
-		 * and will match correctly in comparisons.
-		 */
-		return (int) regs->ax;
+#define is_ia32	test_thread_flag(TIF_IA32)
+#else
+#define is_ia32	0
 #endif
-	return regs->ax;
-}
 
-/*
- * OK, we're invoking a handler
- */
+static int
+setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
+	       sigset_t *set, struct pt_regs *regs)
+{
+	int usig = signr_convert(sig);
+	int ret;
+
+	/* Set up the stack frame */
+	if (is_ia32) {
+		if (ka->sa.sa_flags & SA_SIGINFO)
+			ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
+		else
+			ret = ia32_setup_frame(usig, ka, set, regs);
+	} else
+		ret = __setup_rt_frame(sig, ka, info, set, regs);
+
+	if (ret) {
+		force_sigsegv(sig, current);
+		return -EFAULT;
+	}
+
+	return ret;
+}
 
 static int
 handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -394,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	int ret;
 
 	/* Are we from a system call? */
-	if (current_syscall(regs) >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
-		switch (current_syscall_ret(regs)) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTART_RESTARTBLOCK:
 		case -ERESTARTNOHAND:
 			regs->ax = -EINTR;
@@ -423,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
 	    likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
 		regs->flags &= ~X86_EFLAGS_TF;
 
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_IA32)) {
-		if (ka->sa.sa_flags & SA_SIGINFO)
-			ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
-		else
-			ret = ia32_setup_frame(sig, ka, oldset, regs);
-	} else
-#endif
 	ret = setup_rt_frame(sig, ka, info, oldset, regs);
 
-	if (ret == 0) {
-		/*
-		 * This has nothing to do with segment registers,
-		 * despite the name. This magic affects uaccess.h
-		 * macros' behavior. Reset it to the normal setting.
-		 */
-		set_fs(USER_DS);
+	if (ret)
+		return ret;
 
-		/*
-		 * Clear the direction flag as per the ABI for function entry.
-		 */
-		regs->flags &= ~X86_EFLAGS_DF;
+#ifdef CONFIG_X86_64
+	/*
+	 * This has nothing to do with segment registers,
+	 * despite the name. This magic affects uaccess.h
+	 * macros' behavior. Reset it to the normal setting.
+	 */
+	set_fs(USER_DS);
+#endif
 
-		/*
-		 * Clear TF when entering the signal handler, but
-		 * notify any tracer that was single-stepping it.
-		 * The tracer may want to single-step inside the
-		 * handler too.
-		 */
-		regs->flags &= ~X86_EFLAGS_TF;
-		if (test_thread_flag(TIF_SINGLESTEP))
-			ptrace_notify(SIGTRAP);
-
-		spin_lock_irq(&current->sighand->siglock);
-		sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
-		if (!(ka->sa.sa_flags & SA_NODEFER))
-			sigaddset(&current->blocked,sig);
-		recalc_sigpending();
-		spin_unlock_irq(&current->sighand->siglock);
-	}
+	/*
+	 * Clear the direction flag as per the ABI for function entry.
	 */
+	regs->flags &= ~X86_EFLAGS_DF;
 
-	return ret;
+	/*
+	 * Clear TF when entering the signal handler, but
+	 * notify any tracer that was single-stepping it.
+	 * The tracer may want to single-step inside the
+	 * handler too.
+	 */
+	regs->flags &= ~X86_EFLAGS_TF;
+
+	spin_lock_irq(&current->sighand->siglock);
+	sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
+	if (!(ka->sa.sa_flags & SA_NODEFER))
+		sigaddset(&current->blocked, sig);
+	recalc_sigpending();
+	spin_unlock_irq(&current->sighand->siglock);
+
+	tracehook_signal_handler(sig, info, ka, regs,
+				 test_thread_flag(TIF_SINGLESTEP));
+
+	return 0;
 }
 
+#define NR_restart_syscall	\
+	test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
 /*
  * Note that 'init' is a special process: it doesn't get signals it doesn't
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -496,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
 
 	signr = get_signal_to_deliver(&info, &ka, regs, NULL);
 	if (signr > 0) {
-		/* Re-enable any watchpoints before delivering the
+		/*
+		 * Re-enable any watchpoints before delivering the
 		 * signal to user space. The processor register will
 		 * have been cleared if the watchpoint triggered
 		 * inside the kernel.
@@ -504,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
 		if (current->thread.debugreg7)
 			set_debugreg(current->thread.debugreg7, 7);
 
 		/* Whee! Actually deliver the signal. */
 		if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
 			/*
 			 * A signal was successfully delivered; the saved
@@ -518,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
 	}
 
 	/* Did we come from a system call? */
-	if (current_syscall(regs) >= 0) {
+	if (syscall_get_nr(current, regs) >= 0) {
 		/* Restart the system call - no handlers present */
-		switch (current_syscall_ret(regs)) {
+		switch (syscall_get_error(current, regs)) {
 		case -ERESTARTNOHAND:
 		case -ERESTARTSYS:
 		case -ERESTARTNOINTR:
 			regs->ax = regs->orig_ax;
 			regs->ip -= 2;
 			break;
+
 		case -ERESTART_RESTARTBLOCK:
-			regs->ax = test_thread_flag(TIF_IA32) ?
-					__NR_ia32_restart_syscall :
-					__NR_restart_syscall;
+			regs->ax = NR_restart_syscall;
 			regs->ip -= 2;
 			break;
 		}
@@ -546,29 +472,45 @@ static void do_signal(struct pt_regs *regs)
 	}
 }
 
-void do_notify_resume(struct pt_regs *regs, void *unused,
-		      __u32 thread_info_flags)
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
 {
-#ifdef CONFIG_X86_MCE
+#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
 	/* notify userspace of pending MCEs */
 	if (thread_info_flags & _TIF_MCE_NOTIFY)
 		mce_notify_user();
-#endif /* CONFIG_X86_MCE */
+#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
 
 	/* deal with pending signal delivery */
 	if (thread_info_flags & _TIF_SIGPENDING)
 		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+
+#ifdef CONFIG_X86_32
+	clear_thread_flag(TIF_IRET);
+#endif /* CONFIG_X86_32 */
 }
 
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
+
 	if (show_unhandled_signals && printk_ratelimit()) {
-		printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
-		       me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax);
+		printk(KERN_INFO
+		       "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
+		       me->comm, me->pid, where, frame,
+		       regs->ip, regs->sp, regs->orig_ax);
 		print_vma_addr(" in ", regs->ip);
-		printk("\n");
+		printk(KERN_CONT "\n");
 	}
 
 	force_sig(SIGSEGV, me);
 }
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640..18f9b19f5f8 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 struct smp_ops smp_ops = {
 	.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
 	.smp_prepare_cpus = native_smp_prepare_cpus,
-	.cpu_up = native_cpu_up,
 	.smp_cpus_done = native_smp_cpus_done,
 
 	.smp_send_stop = native_smp_send_stop,
 	.smp_send_reschedule = native_smp_send_reschedule,
 
+	.cpu_up = native_cpu_up,
+	.cpu_die = native_cpu_die,
+	.cpu_disable = native_cpu_disable,
+	.play_dead = native_play_dead,
+
 	.send_call_func_ipi = native_send_call_func_ipi,
 	.send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
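Grouping the CPU lifecycle callbacks (cpu_up, cpu_die, cpu_disable, play_dead)
makes smp_ops read as one hotplug block, which is exactly the set a paravirt
backend overrides. A minimal sketch of the ops-table idiom with illustrative
names:

	struct cpu_ops_sketch {
		int  (*cpu_up)(unsigned int cpu);
		int  (*cpu_disable)(void);
		void (*cpu_die)(unsigned int cpu);
		void (*play_dead)(void);
	};

	static int native_up_sketch(unsigned int cpu) { return 0; }
	static void native_dead_sketch(void) { for (;;) ; /* park the CPU */ }

	static struct cpu_ops_sketch ops_sketch = {
		.cpu_up    = native_up_sketch,
		.play_dead = native_dead_sketch,
		/* a guest kernel would install its own handlers here */
	};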
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7985c5b3f91..7b109339731 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
 #include <asm/desc.h>
 #include <asm/nmi.h>
 #include <asm/irq.h>
+#include <asm/idle.h>
 #include <asm/smp.h>
 #include <asm/trampoline.h>
 #include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
 #define get_idle_for_cpu(x)      (per_cpu(idle_thread_array, x))
 #define set_idle_for_cpu(x, p)   (per_cpu(idle_thread_array, x) = (p))
 #else
-struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
+static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
 #define get_idle_for_cpu(x)      (idle_thread_array[(x)])
 #define set_idle_for_cpu(x, p)   (idle_thread_array[(x)] = (p))
 #endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
 
 static atomic_t init_deasserted;
 
-static int boot_cpu_logical_apicid;
 
 /* representing cpus for which sibling maps can be computed */
 static cpumask_t cpu_sibling_setup_map;
 
 /* Set if we find a B stepping CPU */
-int __cpuinitdata smp_b_stepping;
+static int __cpuinitdata smp_b_stepping;
 
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
 
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
 #endif
 
 #ifdef CONFIG_X86_32
+static int boot_cpu_logical_apicid;
+
 u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
 					{ [0 ... NR_CPUS-1] = BAD_APICID };
 
@@ -210,7 +212,7 @@ static void __cpuinit smp_callin(void)
 	/*
 	 * (This works even if the APIC is not enabled.)
 	 */
-	phys_id = GET_APIC_ID(read_apic_id());
+	phys_id = read_apic_id();
 	cpuid = smp_processor_id();
 	if (cpu_isset(cpuid, cpu_callin_map)) {
 		panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
@@ -257,6 +259,7 @@ static void __cpuinit smp_callin(void)
 	end_local_APIC_setup();
 	map_cpu_to_logical_apicid();
 
+	notify_cpu_starting(cpuid);
 	/*
 	 * Get our bogomips.
 	 *
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void)
 	cpu_set(cpuid, cpu_callin_map);
 }
 
+static int __cpuinitdata unsafe_smp;
+
 /*
  * Activate a secondary processor.
  */
@@ -331,14 +336,17 @@ static void __cpuinit start_secondary(void *unused)
 	 * does not change while we are assigning vectors to cpus. Holding
 	 * this lock ensures we don't half assign or remove an irq from a cpu.
 	 */
-	ipi_call_lock_irq();
+	ipi_call_lock();
 	lock_vector_lock();
 	__setup_vector_irq(smp_processor_id());
 	cpu_set(smp_processor_id(), cpu_online_map);
 	unlock_vector_lock();
-	ipi_call_unlock_irq();
+	ipi_call_unlock();
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 
+	/* enable local interrupts */
+	local_irq_enable();
+
 	setup_secondary_clock();
 
 	wmb();
@@ -391,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
 			goto valid_k7;
 
 		/* If we get here, not a certified SMP capable AMD system. */
-		add_taint(TAINT_UNSAFE_SMP);
+		unsafe_smp = 1;
 	}
 
 valid_k7:
@@ -408,12 +416,10 @@ static void __cpuinit smp_checks(void)
 	 * Don't taint if we are running SMP kernel on a single non-MP
 	 * approved Athlon
 	 */
-	if (tainted & TAINT_UNSAFE_SMP) {
-		if (num_online_cpus())
-			printk(KERN_INFO "WARNING: This combination of AMD"
-				"processors is not suitable for SMP.\n");
-		else
-			tainted &= ~TAINT_UNSAFE_SMP;
+	if (unsafe_smp && num_online_cpus() > 1) {
+		printk(KERN_INFO "WARNING: This combination of AMD "
+			"processors is not suitable for SMP.\n");
+		add_taint(TAINT_UNSAFE_SMP);
 	}
 }
 
@@ -537,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
 	int timeout;
 	u32 status;
 
-	printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid);
+	printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
 
 	for (i = 0; i < ARRAY_SIZE(regs); i++) {
-		printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]);
+		printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
 
 		/*
 		 * Wait for idle.
@@ -550,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
 			printk(KERN_CONT
 			       "a previous APIC delivery may have failed\n");
 
-		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));
-		apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]);
+		apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
 
 		timeout = 0;
 		do {
@@ -583,11 +588,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	int maxlvt;
 
 	/* Target chip */
-	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
-
 	/* Boot on the stack */
 	/* Kick the second */
-	apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL);
+	apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
 
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
@@ -596,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
 	 * Give the other CPU some time to accept the IPI.
 	 */
 	udelay(200);
-	maxlvt = lapic_get_maxlvt();
-	if (maxlvt > 3)			/* Due to the Pentium erratum 3AP. */
-		apic_write(APIC_ESR, 0);
-	accept_status = (apic_read(APIC_ESR) & 0xEF);
+	if (APIC_INTEGRATED(apic_version[phys_apicid])) {
+		maxlvt = lapic_get_maxlvt();
+		if (maxlvt > 3)		/* Due to the Pentium erratum 3AP. */
+			apic_write(APIC_ESR, 0);
+		accept_status = (apic_read(APIC_ESR) & 0xEF);
+	}
 	pr_debug("NMI sent.\n");
 
 	if (send_status)
@@ -640,13 +645,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
 	/*
 	 * Turn INIT on target chip
 	 */
-	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
-
 	/*
 	 * Send IPI
 	 */
-	apic_write(APIC_ICR,
-		   APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT);
+	apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
+		       phys_apicid);
 
 	pr_debug("Waiting for send to finish...\n");
 	send_status = safe_apic_wait_icr_idle();
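Each of these conversions replaces the two-step "write destination into ICR2,
then command into ICR" sequence with apic_icr_write(), giving x2APIC a single
MSR-backed implementation behind the same call. Per the removed lines above,
the legacy xAPIC flavour it wraps looks roughly like:

	/* sketch of the mmio xAPIC path behind apic_icr_write(): the ICR write
	 * is what actually sends the IPI, so the destination must land in
	 * ICR2 first */
	static void xapic_icr_write_sketch(u32 low, u32 dest_apicid)
	{
		apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(dest_apicid));
		apic_write(APIC_ICR, low);
	}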
@@ -656,10 +659,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
656 pr_debug("Deasserting INIT.\n"); 659 pr_debug("Deasserting INIT.\n");
657 660
658 /* Target chip */ 661 /* Target chip */
659 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
660
661 /* Send IPI */ 662 /* Send IPI */
662 apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 663 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
663 664
664 pr_debug("Waiting for send to finish...\n"); 665 pr_debug("Waiting for send to finish...\n");
665 send_status = safe_apic_wait_icr_idle(); 666 send_status = safe_apic_wait_icr_idle();
@@ -702,11 +703,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
702 */ 703 */
703 704
704 /* Target chip */ 705 /* Target chip */
705 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
706
707 /* Boot on the stack */ 706 /* Boot on the stack */
708 /* Kick the second */ 707 /* Kick the second */
709 apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); 708 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
709 phys_apicid);
710 710
711 /* 711 /*
712 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
@@ -874,7 +874,7 @@ do_rest:
874 start_ip = setup_trampoline(); 874 start_ip = setup_trampoline();
875 875
876 /* So we see what's up */ 876 /* So we see what's up */
877 printk(KERN_INFO "Booting processor %d/%d ip %lx\n", 877 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
878 cpu, apicid, start_ip); 878 cpu, apicid, start_ip);
879 879
880 /* 880 /*
@@ -893,9 +893,11 @@ do_rest:
893 smpboot_setup_warm_reset_vector(start_ip); 893 smpboot_setup_warm_reset_vector(start_ip);
894 /* 894 /*
895 * Be paranoid about clearing APIC errors. 895 * Be paranoid about clearing APIC errors.
896 */ 896 */
897 apic_write(APIC_ESR, 0); 897 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
898 apic_read(APIC_ESR); 898 apic_write(APIC_ESR, 0);
899 apic_read(APIC_ESR);
900 }
899 } 901 }
900 902
901 /* 903 /*
@@ -1175,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1175 * Setup boot CPU information 1177 * Setup boot CPU information
1176 */ 1178 */
1177 smp_store_cpu_info(0); /* Final full version of the data */ 1179 smp_store_cpu_info(0); /* Final full version of the data */
1180#ifdef CONFIG_X86_32
1178 boot_cpu_logical_apicid = logical_smp_processor_id(); 1181 boot_cpu_logical_apicid = logical_smp_processor_id();
1182#endif
1179 current_thread_info()->cpu = 0; /* needed? */ 1183 current_thread_info()->cpu = 0; /* needed? */
1180 set_cpu_sibling_map(0); 1184 set_cpu_sibling_map(0);
1181 1185
1186#ifdef CONFIG_X86_64
1187 enable_IR_x2apic();
1188 setup_apic_routing();
1189#endif
1190
1182 if (smp_sanity_check(max_cpus) < 0) { 1191 if (smp_sanity_check(max_cpus) < 0) {
1183 printk(KERN_INFO "SMP disabled\n"); 1192 printk(KERN_INFO "SMP disabled\n");
1184 disable_smp(); 1193 disable_smp();
@@ -1186,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1186 } 1195 }
1187 1196
1188 preempt_disable(); 1197 preempt_disable();
1189 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1198 if (read_apic_id() != boot_cpu_physical_apicid) {
1190 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1199 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1191 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1200 read_apic_id(), boot_cpu_physical_apicid);
1192 /* Or can we switch back to PIC here? */ 1201 /* Or can we switch back to PIC here? */
1193 } 1202 }
1194 preempt_enable(); 1203 preempt_enable();
@@ -1254,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1254 check_nmi_watchdog(); 1263 check_nmi_watchdog();
1255} 1264}
1256 1265
1257#ifdef CONFIG_HOTPLUG_CPU
1258
1259static void remove_siblinginfo(int cpu)
1260{
1261 int sibling;
1262 struct cpuinfo_x86 *c = &cpu_data(cpu);
1263
1264 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1265 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1266 /*
1267 * last thread sibling in this cpu core going down
1268 */
1269 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1270 cpu_data(sibling).booted_cores--;
1271 }
1272
1273 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1274 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1275 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1276 cpus_clear(per_cpu(cpu_core_map, cpu));
1277 c->phys_proc_id = 0;
1278 c->cpu_core_id = 0;
1279 cpu_clear(cpu, cpu_sibling_setup_map);
1280}
1281
1282static int additional_cpus __initdata = -1;
1283
1284static __init int setup_additional_cpus(char *s)
1285{
1286 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1287}
1288early_param("additional_cpus", setup_additional_cpus);
1289
1290/* 1266/*
1291 * cpu_possible_map should be static, it cannot change as CPUs 1267 * cpu_possible_map should be static, it cannot change as CPUs
1292 * are onlined or offlined. The reason is per-cpu data-structures 1268 * are onlined or offlined. The reason is per-cpu data-structures
@@ -1306,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus);
1306 */ 1282 */
1307__init void prefill_possible_map(void) 1283__init void prefill_possible_map(void)
1308{ 1284{
1309 int i; 1285 int i, possible;
1310 int possible;
1311 1286
1312 /* no processor from mptable or madt */ 1287 /* no processor from mptable or madt */
1313 if (!num_processors) 1288 if (!num_processors)
1314 num_processors = 1; 1289 num_processors = 1;
1315 1290
1316#ifdef CONFIG_HOTPLUG_CPU 1291 possible = num_processors + disabled_cpus;
1317 if (additional_cpus == -1) {
1318 if (disabled_cpus > 0)
1319 additional_cpus = disabled_cpus;
1320 else
1321 additional_cpus = 0;
1322 }
1323#else
1324 additional_cpus = 0;
1325#endif
1326 possible = num_processors + additional_cpus;
1327 if (possible > NR_CPUS) 1292 if (possible > NR_CPUS)
1328 possible = NR_CPUS; 1293 possible = NR_CPUS;
1329 1294
@@ -1336,6 +1301,31 @@ __init void prefill_possible_map(void)
1336 nr_cpu_ids = possible; 1301 nr_cpu_ids = possible;
1337} 1302}
1338 1303
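A worked sketch of the simplified sizing above, with hypothetical firmware counts (NR_CPUS is whatever the kernel was configured with); the old additional_cpus= guesswork collapses into one addition plus a clamp:

	/* hypothetical MADT: 2 enabled CPUs, 2 disabled (hotpluggable) */
	num_processors = 2;
	disabled_cpus = 2;

	possible = num_processors + disabled_cpus;	/* 4 */
	if (possible > NR_CPUS)				/* say NR_CPUS == 8 */
		possible = NR_CPUS;			/* stays 4 */
	nr_cpu_ids = possible;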
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306static void remove_siblinginfo(int cpu)
1307{
1308 int sibling;
1309 struct cpuinfo_x86 *c = &cpu_data(cpu);
1310
1311 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1312 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1313 /*
1314 * last thread sibling in this cpu core going down
1315 */
1316 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1317 cpu_data(sibling).booted_cores--;
1318 }
1319
1320 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1321 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1322 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1323 cpus_clear(per_cpu(cpu_core_map, cpu));
1324 c->phys_proc_id = 0;
1325 c->cpu_core_id = 0;
1326 cpu_clear(cpu, cpu_sibling_setup_map);
1327}
1328
1339static void __ref remove_cpu_from_maps(int cpu) 1329static void __ref remove_cpu_from_maps(int cpu)
1340{ 1330{
1341 cpu_clear(cpu, cpu_online_map); 1331 cpu_clear(cpu, cpu_online_map);
@@ -1346,25 +1336,9 @@ static void __ref remove_cpu_from_maps(int cpu)
1346 numa_remove_cpu(cpu); 1336 numa_remove_cpu(cpu);
1347} 1337}
1348 1338
1349int __cpu_disable(void) 1339void cpu_disable_common(void)
1350{ 1340{
1351 int cpu = smp_processor_id(); 1341 int cpu = smp_processor_id();
1352
1353 /*
1354 * Perhaps use cpufreq to drop frequency, but that could go
1355 * into generic code.
1356 *
1357 * We won't take down the boot processor on i386 due to some
1358 * interrupts only being able to be serviced by the BSP.
1359 * Especially so if we're not using an IOAPIC -zwane
1360 */
1361 if (cpu == 0)
1362 return -EBUSY;
1363
1364 if (nmi_watchdog == NMI_LOCAL_APIC)
1365 stop_apic_nmi_watchdog(NULL);
1366 clear_local_APIC();
1367
1368 /* 1342 /*
1369 * HACK: 1343 * HACK:
1370 * Allow any queued timer interrupts to get serviced 1344 * Allow any queued timer interrupts to get serviced
@@ -1382,10 +1356,32 @@ int __cpu_disable(void)
1382 remove_cpu_from_maps(cpu); 1356 remove_cpu_from_maps(cpu);
1383 unlock_vector_lock(); 1357 unlock_vector_lock();
1384 fixup_irqs(cpu_online_map); 1358 fixup_irqs(cpu_online_map);
1359}
1360
1361int native_cpu_disable(void)
1362{
1363 int cpu = smp_processor_id();
1364
1365 /*
1366 * Perhaps use cpufreq to drop frequency, but that could go
1367 * into generic code.
1368 *
1369 * We won't take down the boot processor on i386 due to some
1370 * interrupts only being able to be serviced by the BSP.
1371 * Especially so if we're not using an IOAPIC -zwane
1372 */
1373 if (cpu == 0)
1374 return -EBUSY;
1375
1376 if (nmi_watchdog == NMI_LOCAL_APIC)
1377 stop_apic_nmi_watchdog(NULL);
1378 clear_local_APIC();
1379
1380 cpu_disable_common();
1385 return 0; 1381 return 0;
1386} 1382}
1387 1383
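The split above leaves the shared teardown in cpu_disable_common() so a non-native backend can reuse it without the local-APIC and NMI-watchdog steps. A hedged sketch of such a caller (the function name is illustrative, not taken from this diff):

static int xen_cpu_disable(void)	/* illustrative paravirt reuse */
{
	if (smp_processor_id() == 0)
		return -EBUSY;		/* keep the boot CPU online */

	cpu_disable_common();		/* shared map/IRQ/sibling teardown */
	return 0;
}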
1388void __cpu_die(unsigned int cpu) 1384void native_cpu_die(unsigned int cpu)
1389{ 1385{
1390 /* We don't do anything here: idle task is faking death itself. */ 1386 /* We don't do anything here: idle task is faking death itself. */
1391 unsigned int i; 1387 unsigned int i;
@@ -1402,15 +1398,45 @@ void __cpu_die(unsigned int cpu)
1402 } 1398 }
1403 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1399 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1404} 1400}
1401
1402void play_dead_common(void)
1403{
1404 idle_task_exit();
1405 reset_lazy_tlbstate();
1406 irq_ctx_exit(raw_smp_processor_id());
1407 c1e_remove_cpu(raw_smp_processor_id());
1408
1409 mb();
1410 /* Ack it */
1411 __get_cpu_var(cpu_state) = CPU_DEAD;
1412
1413 /*
1414 * With physical CPU hotplug, we should halt the cpu
1415 */
1416 local_irq_disable();
1417}
1418
1419void native_play_dead(void)
1420{
1421 play_dead_common();
1422 wbinvd_halt();
1423}
1424
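native_play_dead() ends in wbinvd_halt(), whose body is outside this diff. A minimal sketch of the idea, assuming the usual cache-flush-then-halt sequence: flush dirty lines so a physically removed CPU leaves nothing stale behind, then halt with interrupts off (the real macro may differ in detail):

static inline void play_dead_sketch(void)	/* hedged approximation */
{
	asm volatile("cli");		/* no further interrupts */
	asm volatile("wbinvd");		/* write back and invalidate caches */
	for (;;)
		asm volatile("hlt");	/* stay halted until unplugged */
}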
1405#else /* ... !CONFIG_HOTPLUG_CPU */ 1425#else /* ... !CONFIG_HOTPLUG_CPU */
1406int __cpu_disable(void) 1426int native_cpu_disable(void)
1407{ 1427{
1408 return -ENOSYS; 1428 return -ENOSYS;
1409} 1429}
1410 1430
1411void __cpu_die(unsigned int cpu) 1431void native_cpu_die(unsigned int cpu)
1412{ 1432{
1413 /* We said "no" in __cpu_disable */ 1433 /* We said "no" in __cpu_disable */
1414 BUG(); 1434 BUG();
1415} 1435}
1436
1437void native_play_dead(void)
1438{
1439 BUG();
1440}
1441
1416#endif 1442#endif
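The theme of the smpboot.c hunks above is replacing every APIC_ICR2-then-APIC_ICR pair with a single apic_icr_write() call. The helper itself is not shown in this diff; for the native local APIC it is presumably equivalent to the two writes it replaces:

static inline void apic_icr_write_sketch(u32 low, u32 apicid)
{
	apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid));	/* destination */
	apic_write(APIC_ICR, low);	/* command word; this write fires the IPI */
}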
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044b..7b987852e87 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a6..1884a8d12bf 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/unistd.h> 23#include <linux/unistd.h>
24 24
25#include <asm/syscalls.h>
26
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 28 unsigned long prot, unsigned long flags,
27 unsigned long fd, unsigned long pgoff) 29 unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef3381..6bc211accf0 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h>
19 20
20asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
21 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
22{ 24{
23 long error; 25 long error;
24 struct file * file; 26 struct file *file;
25 27
26 error = -EINVAL; 28 error = -EINVAL;
27 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
56 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
57 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
58 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
59 of playground for now. -AK */ 61 of playground for now. -AK */
60 *begin = 0x40000000; 62 *begin = 0x40000000;
61 *end = 0x80000000; 63 *end = 0x80000000;
62 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
63 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
64 if (new_begin) 66 if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
66 } 68 }
67 } else { 69 } else {
68 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
69 *end = TASK_SIZE; 71 *end = TASK_SIZE;
70 } 72 }
71} 73}
72 74
73unsigned long 75unsigned long
74arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
78 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
79 unsigned long start_addr; 81 unsigned long start_addr;
80 unsigned long begin, end; 82 unsigned long begin, end;
81 83
82 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
83 return addr; 85 return addr;
84 86
85 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
86 88
87 if (len > end) 89 if (len > end)
88 return -ENOMEM; 90 return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
96 } 98 }
97 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
98 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
99 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
100 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
101 } 103 }
102 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
103 if (addr < begin) 105 if (addr < begin)
104 addr = begin; 106 addr = begin;
105 start_addr = addr; 107 start_addr = addr;
106 108
107full_search: 109full_search:
@@ -127,7 +129,7 @@ full_search:
127 return addr; 129 return addr;
128 } 130 }
129 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
130 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
131 133
132 addr = vma->vm_end; 134 addr = vma->vm_end;
133 } 135 }
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
177 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
178 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
179 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
180 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
181 } 183 }
182 184
183 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
194 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
195 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
196 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
198 200
199 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
200 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
224} 226}
225 227
226 228
227asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
228{ 230{
229 int err; 231 int err;
230 down_read(&uts_sem); 232 down_read(&uts_sem);
231 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
232 up_read(&uts_sem); 234 up_read(&uts_sem);
233 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
234 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
235 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
236} 238}
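For the find_start_end() hunk above, a worked example of the MAP_32BIT path with PF_RANDOMIZE set (addresses illustrative): the 1 GB legacy window keeps its end fixed while the base may move up by as much as 32 MB.

	begin = 0x40000000;		/* legacy 1 GB window ...        */
	end = 0x80000000;		/* ... for MAP_32BIT or TIF_IA32 */
	new_begin = randomize_range(begin, begin + 0x02000000, 0);
	if (new_begin)
		begin = new_begin;	/* e.g. 0x41a3f000 */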
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c1748..de87d600829 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
8#define __NO_STUBS 8#define __NO_STUBS
9 9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_ 11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h> 12#include <asm/unistd_64.h>
13 13
14#undef __SYSCALL 14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym, 15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_64_UNISTD_H_ 16#undef _ASM_X86_UNISTD_64_H
17 17
18typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
19 19
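The header-guard #undef in syscall_64.c exists so unistd_64.h can be pulled in twice with two different __SYSCALL definitions: once to declare every handler, once to emit designated initializers for the table. A self-contained sketch of the same double-include trick (syscalls.def and its two entries are made up for illustration):

/* syscalls.def — hypothetical list, one line per syscall:
 *	__SYSCALL(0, sys_read)
 *	__SYSCALL(1, sys_write)
 */
#define __SYSCALL(nr, sym) extern void sym(void);
#include "syscalls.def"		/* pass 1: forward declarations */
#undef __SYSCALL

#define __SYSCALL(nr, sym) [nr] = sym,
void (*sys_call_table[])(void) = {
#include "syscalls.def"		/* pass 2: [nr] = sym, table entries */
};
#undef __SYSCALL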
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index ffe3c664afc..77b400f06ea 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h>
39 40
40#include "do_timer.h" 41#include "do_timer.h"
41 42
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
46 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
47 48
48#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
49 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
50 in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else 53#else
54 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
55 55
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
94 94
95 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
96 96
97#ifdef CONFIG_MCA
97 if (MCA_bus) { 98 if (MCA_bus) {
98 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
99 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
107 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
108 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
109 } 110 }
111#endif
110 112
111 return IRQ_HANDLED; 113 return IRQ_HANDLED;
112} 114}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af..cb19d650c21 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF. 35 of flags from PUSHF.
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_init(); 132 tsc_init();
120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
121 vgetcpu_mode = VGETCPU_RDTSCP;
122 else
123 vgetcpu_mode = VGETCPU_LSL;
124 133
125 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
126} 135}
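Both profile_pc() hunks depend on the same layout that the (removed) print_context_stack() expressed as struct stack_frame: with CONFIG_FRAME_POINTER, a function prologue pushes the caller's bp, so the caller's return address sits one word above the saved frame pointer.

struct stack_frame {			/* what regs->bp points at */
	struct stack_frame *next_frame;	/* saved bp: offset 0 */
	unsigned long return_address;	/* caller pc: offset sizeof(long) */
};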
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b..e00534b3353 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
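reset_lazy_tlbstate() is the 32-bit hook that play_dead_common() (added in the smpboot.c hunks above) relies on; a hedged sketch of the calling sequence, mirroring that function (the wrapper name is illustrative):

static void offline_detach_mm(void)	/* illustrative wrapper */
{
	idle_task_exit();		/* drop the mm the idle task borrowed */
	reset_lazy_tlbstate();		/* lazy TLB state back to &init_mm */
}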
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 8b8c0d6640f..04431f34fd1 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
8 */ 8 */
9#include <linux/mc146818rtc.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a30..6bb7b8579e7 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
10#include <asm/ldt.h> 10#include <asm/ldt.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/syscalls.h>
13 14
14#include "tls.h" 15#include "tls.h"
15 16
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 03df8e45e5a..04d242ab016 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
7 */ 7 */
8 8
9/* 9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
11 * state in 'asm.s'.
12 */ 11 */
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/highmem.h>
17#include <linux/kprobes.h> 15#include <linux/kprobes.h>
18#include <linux/uaccess.h> 16#include <linux/uaccess.h>
19#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -32,6 +30,8 @@
32#include <linux/bug.h> 30#include <linux/bug.h>
33#include <linux/nmi.h> 31#include <linux/nmi.h>
34#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
35 35
36#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
37#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -46,21 +46,31 @@
46#include <linux/edac.h> 46#include <linux/edac.h>
47#endif 47#endif
48 48
49#include <asm/arch_hooks.h>
50#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/debugreg.h> 51#include <asm/debugreg.h>
53#include <asm/atomic.h> 52#include <asm/atomic.h>
54#include <asm/system.h> 53#include <asm/system.h>
55#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
58#include <asm/nmi.h> 68#include <asm/nmi.h>
59#include <asm/smp.h> 69#include <asm/smp.h>
60#include <asm/io.h> 70#include <asm/io.h>
61#include <asm/traps.h> 71#include <asm/traps.h>
62 72
63#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
64 74
65DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
66EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -77,418 +87,104 @@ char ignore_fpu_irq;
77 */ 87 */
78gate_desc idt_table[256] 88gate_desc idt_table[256]
79 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
80
81int panic_on_unrecovered_nmi;
82int kstack_depth_to_print = 24;
83static unsigned int code_bytes = 64;
84static int ignore_nmis;
85static int die_counter;
86
87void printk_address(unsigned long address, int reliable)
88{
89#ifdef CONFIG_KALLSYMS
90 unsigned long offset = 0;
91 unsigned long symsize;
92 const char *symname;
93 char *modname;
94 char *delim = ":";
95 char namebuf[KSYM_NAME_LEN];
96 char reliab[4] = "";
97
98 symname = kallsyms_lookup(address, &symsize, &offset,
99 &modname, namebuf);
100 if (!symname) {
101 printk(" [<%08lx>]\n", address);
102 return;
103 }
104 if (!reliable)
105 strcpy(reliab, "? ");
106
107 if (!modname)
108 modname = delim = "";
109 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
110 address, reliab, delim, modname, delim, symname, offset, symsize);
111#else
112 printk(" [<%08lx>]\n", address);
113#endif
114}
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size)
118{
119 void *t = tinfo;
120 return p > t && p <= t + THREAD_SIZE - size;
121}
122
123/* The form of the top of the frame on the stack */
124struct stack_frame {
125 struct stack_frame *next_frame;
126 unsigned long return_address;
127};
128
129static inline unsigned long
130print_context_stack(struct thread_info *tinfo,
131 unsigned long *stack, unsigned long bp,
132 const struct stacktrace_ops *ops, void *data)
133{
134 struct stack_frame *frame = (struct stack_frame *)bp;
135
136 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
137 unsigned long addr;
138
139 addr = *stack;
140 if (__kernel_text_address(addr)) {
141 if ((unsigned long) stack == bp + 4) {
142 ops->address(data, addr, 1);
143 frame = frame->next_frame;
144 bp = (unsigned long) frame;
145 } else {
146 ops->address(data, addr, bp == 0);
147 }
148 }
149 stack++;
150 }
151 return bp;
152}
153
154void dump_trace(struct task_struct *task, struct pt_regs *regs,
155 unsigned long *stack, unsigned long bp,
156 const struct stacktrace_ops *ops, void *data)
157{
158 if (!task)
159 task = current;
160
161 if (!stack) {
162 unsigned long dummy;
163 stack = &dummy;
164 if (task != current)
165 stack = (unsigned long *)task->thread.sp;
166 }
167
168#ifdef CONFIG_FRAME_POINTER
169 if (!bp) {
170 if (task == current) {
171 /* Grab bp right from our regs */
172 asm("movl %%ebp, %0" : "=r" (bp) :);
173 } else {
174 /* bp is the last reg pushed by switch_to */
175 bp = *(unsigned long *) task->thread.sp;
176 }
177 }
178#endif 90#endif
179 91
180 for (;;) { 92static int ignore_nmis;
181 struct thread_info *context;
182
183 context = (struct thread_info *)
184 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
185 bp = print_context_stack(context, stack, bp, ops, data);
186 /*
187 * Should be after the line below, but somewhere
188 * in early boot the context comes out corrupted and we
189 * can't reference it:
190 */
191 if (ops->stack(data, "IRQ") < 0)
192 break;
193 stack = (unsigned long *)context->previous_esp;
194 if (!stack)
195 break;
196 touch_nmi_watchdog();
197 }
198}
199EXPORT_SYMBOL(dump_trace);
200
201static void
202print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
203{
204 printk(data);
205 print_symbol(msg, symbol);
206 printk("\n");
207}
208
209static void print_trace_warning(void *data, char *msg)
210{
211 printk("%s%s\n", (char *)data, msg);
212}
213
214static int print_trace_stack(void *data, char *name)
215{
216 return 0;
217}
218
219/*
220 * Print one address/symbol entry per line.
221 */
222static void print_trace_address(void *data, unsigned long addr, int reliable)
223{
224 printk("%s [<%08lx>] ", (char *)data, addr);
225 if (!reliable)
226 printk("? ");
227 print_symbol("%s\n", addr);
228 touch_nmi_watchdog();
229}
230
231static const struct stacktrace_ops print_trace_ops = {
232 .warning = print_trace_warning,
233 .warning_symbol = print_trace_warning_symbol,
234 .stack = print_trace_stack,
235 .address = print_trace_address,
236};
237 93
238static void 94static inline void conditional_sti(struct pt_regs *regs)
239show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 unsigned long *stack, unsigned long bp, char *log_lvl)
241{ 95{
242 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
243 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
244} 98}
245 99
246void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
247 unsigned long *stack, unsigned long bp)
248{ 101{
249 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
250} 105}
251 106
252static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
253show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
254 unsigned long *sp, unsigned long bp, char *log_lvl)
255{ 108{
256 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
257 int i; 110 local_irq_disable();
258 111 dec_preempt_count();
259 if (sp == NULL) {
260 if (task)
261 sp = (unsigned long *)task->thread.sp;
262 else
263 sp = (unsigned long *)&sp;
264 }
265
266 stack = sp;
267 for (i = 0; i < kstack_depth_to_print; i++) {
268 if (kstack_end(stack))
269 break;
270 if (i && ((i % 8) == 0))
271 printk("\n%s ", log_lvl);
272 printk("%08lx ", *stack++);
273 }
274 printk("\n%sCall Trace:\n", log_lvl);
275
276 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
277} 112}
278 113
279void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
280{ 117{
281 printk(" "); 118 if (!user_mode_vm(regs))
282 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
283} 120}
284 121
285/* 122/*
286 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap into the TSS,
126 * we set the offset field correctly and return 1.
287 */ 127 */
288void dump_stack(void) 128static int lazy_iobitmap_copy(void)
289{ 129{
290 unsigned long bp = 0; 130 struct thread_struct *thread;
291 unsigned long stack; 131 struct tss_struct *tss;
292 132 int cpu;
293#ifdef CONFIG_FRAME_POINTER
294 if (!bp)
295 asm("movl %%ebp, %0" : "=r" (bp):);
296#endif
297
298 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
299 current->pid, current->comm, print_tainted(),
300 init_utsname()->release,
301 (int)strcspn(init_utsname()->version, " "),
302 init_utsname()->version);
303
304 show_trace(current, NULL, &stack, bp);
305}
306
307EXPORT_SYMBOL(dump_stack);
308
309void show_registers(struct pt_regs *regs)
310{
311 int i;
312 133
313 print_modules(); 134 cpu = get_cpu();
314 __show_registers(regs, 0); 135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
315 137
316 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
317 TASK_COMM_LEN, current->comm, task_pid_nr(current), 139 thread->io_bitmap_ptr) {
318 current_thread_info(), current, task_thread_info(current)); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
319 /* 141 thread->io_bitmap_max);
320 * When in-kernel, we also print out the stack and code at the 142 /*
321 * time of the fault.. 143 * If the previously set map was extending to higher ports
322 */ 144 * than the current one, pad extra space with 0xff (no access).
323 if (!user_mode_vm(regs)) { 145 */
324 unsigned int code_prologue = code_bytes * 43 / 64; 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
325 unsigned int code_len = code_bytes; 147 memset((char *) tss->io_bitmap +
326 unsigned char c; 148 thread->io_bitmap_max, 0xff,
327 u8 *ip; 149 tss->io_bitmap_max - thread->io_bitmap_max);
328
329 printk("\n" KERN_EMERG "Stack: ");
330 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
331
332 printk(KERN_EMERG "Code: ");
333
334 ip = (u8 *)regs->ip - code_prologue;
335 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
336 /* try starting at EIP */
337 ip = (u8 *)regs->ip;
338 code_len = code_len - code_prologue + 1;
339 }
340 for (i = 0; i < code_len; i++, ip++) {
341 if (ip < (u8 *)PAGE_OFFSET ||
342 probe_kernel_address(ip, c)) {
343 printk(" Bad EIP value.");
344 break;
345 }
346 if (ip == (u8 *)regs->ip)
347 printk("<%02x> ", c);
348 else
349 printk("%02x ", c);
350 } 150 }
351 } 151 tss->io_bitmap_max = thread->io_bitmap_max;
352 printk("\n"); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
353} 153 tss->io_bitmap_owner = thread;
354 154 put_cpu();
355int is_valid_bugaddr(unsigned long ip)
356{
357 unsigned short ud2;
358
359 if (ip < PAGE_OFFSET)
360 return 0;
361 if (probe_kernel_address((unsigned short *)ip, ud2))
362 return 0;
363
364 return ud2 == 0x0b0f;
365}
366
367static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
368static int die_owner = -1;
369static unsigned int die_nest_count;
370
371unsigned __kprobes long oops_begin(void)
372{
373 unsigned long flags;
374
375 oops_enter();
376
377 if (die_owner != raw_smp_processor_id()) {
378 console_verbose();
379 raw_local_irq_save(flags);
380 __raw_spin_lock(&die_lock);
381 die_owner = smp_processor_id();
382 die_nest_count = 0;
383 bust_spinlocks(1);
384 } else {
385 raw_local_irq_save(flags);
386 }
387 die_nest_count++;
388 return flags;
389}
390
391void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
392{
393 bust_spinlocks(0);
394 die_owner = -1;
395 add_taint(TAINT_DIE);
396 __raw_spin_unlock(&die_lock);
397 raw_local_irq_restore(flags);
398
399 if (!regs)
400 return;
401
402 if (kexec_should_crash(current))
403 crash_kexec(regs);
404
405 if (in_interrupt())
406 panic("Fatal exception in interrupt");
407
408 if (panic_on_oops)
409 panic("Fatal exception");
410
411 oops_exit();
412 do_exit(signr);
413}
414
415int __kprobes __die(const char *str, struct pt_regs *regs, long err)
416{
417 unsigned short ss;
418 unsigned long sp;
419 155
420 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
421#ifdef CONFIG_PREEMPT
422 printk("PREEMPT ");
423#endif
424#ifdef CONFIG_SMP
425 printk("SMP ");
426#endif
427#ifdef CONFIG_DEBUG_PAGEALLOC
428 printk("DEBUG_PAGEALLOC");
429#endif
430 printk("\n");
431 if (notify_die(DIE_OOPS, str, regs, err,
432 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
433 return 1; 156 return 1;
434
435 show_registers(regs);
436 /* Executive summary in case the oops scrolled away */
437 sp = (unsigned long) (&regs->sp);
438 savesegment(ss, ss);
439 if (user_mode(regs)) {
440 sp = regs->sp;
441 ss = regs->ss & 0xffff;
442 } 157 }
443 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 158 put_cpu();
444 print_symbol("%s", regs->ip);
445 printk(" SS:ESP %04x:%08lx\n", ss, sp);
446 return 0;
447}
448
449/*
450 * This is gone through when something in the kernel has done something bad
451 * and is about to be terminated:
452 */
453void die(const char *str, struct pt_regs *regs, long err)
454{
455 unsigned long flags = oops_begin();
456
457 if (die_nest_count < 3) {
458 report_bug(regs->ip, regs);
459
460 if (__die(str, regs, err))
461 regs = NULL;
462 } else {
463 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
464 }
465
466 oops_end(flags, regs, SIGSEGV);
467}
468 159
469static inline void 160 return 0;
470die_if_kernel(const char *str, struct pt_regs *regs, long err)
471{
472 if (!user_mode_vm(regs))
473 die(str, regs, err);
474} 161}
162#endif
475 163
476static void __kprobes 164static void __kprobes
477do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
478 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
479{ 167{
480 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
481 169
170#ifdef CONFIG_X86_32
482 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
483 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
484 goto vm86_trap; 177 goto vm86_trap;
485 goto trap_signal; 178 goto trap_signal;
486 } 179 }
180#endif
487 181
488 if (!user_mode(regs)) 182 if (!user_mode(regs))
489 goto kernel_trap; 183 goto kernel_trap;
490 184
185#ifdef CONFIG_X86_32
491trap_signal: 186trap_signal:
187#endif
492 /* 188 /*
493 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
494 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -501,6 +197,18 @@ trap_signal:
501 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
502 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
503 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
504 if (info) 212 if (info)
505 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
506 else 214 else
@@ -515,120 +223,98 @@ kernel_trap:
515 } 223 }
516 return; 224 return;
517 225
226#ifdef CONFIG_X86_32
518vm86_trap: 227vm86_trap:
519 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
520 error_code, trapnr)) 229 error_code, trapnr))
521 goto trap_signal; 230 goto trap_signal;
522 return; 231 return;
232#endif
523} 233}
524 234
525#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
526void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
527{ \
528 trace_hardirqs_fixup(); \
529 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
530 == NOTIFY_STOP) \
531 return; \
532 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
533}
534
535#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
536void do_##name(struct pt_regs *regs, long error_code) \
537{ \ 237{ \
538 siginfo_t info; \
539 if (irq) \
540 local_irq_enable(); \
541 info.si_signo = signr; \
542 info.si_errno = 0; \
543 info.si_code = sicode; \
544 info.si_addr = (void __user *)siaddr; \
545 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
546 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
547 return; \ 240 return; \
548 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
549} 243}
550 244
551#define DO_VM86_ERROR(trapnr, signr, str, name) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
552void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
553{ \
554 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
555 == NOTIFY_STOP) \
556 return; \
557 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \
558}
559
560#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
561void do_##name(struct pt_regs *regs, long error_code) \
562{ \ 247{ \
563 siginfo_t info; \ 248 siginfo_t info; \
564 info.si_signo = signr; \ 249 info.si_signo = signr; \
565 info.si_errno = 0; \ 250 info.si_errno = 0; \
566 info.si_code = sicode; \ 251 info.si_code = sicode; \
567 info.si_addr = (void __user *)siaddr; \ 252 info.si_addr = (void __user *)siaddr; \
568 trace_hardirqs_fixup(); \
569 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
570 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
571 return; \ 255 return; \
572 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
573} 258}
574 259
575DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
576#ifndef CONFIG_KPROBES 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
577DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
578#endif 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
579DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
580DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
581DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
582DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
583DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
584DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
267#ifdef CONFIG_X86_32
585DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
586DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 269#endif
587DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
271
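Hand-expanding one instantiation shows what the unified macro now generates; this is DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) written out (modulo whitespace):

dotraplinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
{
	if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code,
			10, SIGSEGV) == NOTIFY_STOP)
		return;
	conditional_sti(regs);
	do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}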
272#ifdef CONFIG_X86_64
273/* Runs on IST stack */
274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
275{
276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
277 12, SIGBUS) == NOTIFY_STOP)
278 return;
279 preempt_conditional_sti(regs);
280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
281 preempt_conditional_cli(regs);
282}
588 283
589void __kprobes 284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
285{
286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
289 /* Return not checked because double fault cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
291
292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
300#endif
301
302dotraplinkage void __kprobes
590do_general_protection(struct pt_regs *regs, long error_code) 303do_general_protection(struct pt_regs *regs, long error_code)
591{ 304{
592 struct task_struct *tsk; 305 struct task_struct *tsk;
593 struct thread_struct *thread;
594 struct tss_struct *tss;
595 int cpu;
596
597 cpu = get_cpu();
598 tss = &per_cpu(init_tss, cpu);
599 thread = &current->thread;
600 306
601 /* 307 conditional_sti(regs);
602 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
603 * invalid offset set (the LAZY one) and the faulting thread has
604 * a valid I/O bitmap pointer, we copy the I/O bitmap into the TSS
605 * and we set the offset field correctly. Then we let the CPU
606 * restart the faulting instruction.
607 */
608 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
609 thread->io_bitmap_ptr) {
610 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
611 thread->io_bitmap_max);
612 /*
613 * If the previously set map was extending to higher ports
614 * than the current one, pad extra space with 0xff (no access).
615 */
616 if (thread->io_bitmap_max < tss->io_bitmap_max) {
617 memset((char *) tss->io_bitmap +
618 thread->io_bitmap_max, 0xff,
619 tss->io_bitmap_max - thread->io_bitmap_max);
620 }
621 tss->io_bitmap_max = thread->io_bitmap_max;
622 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
623 tss->io_bitmap_owner = thread;
624 put_cpu();
625 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
626 return; 312 return;
627 } 313 }
628 put_cpu();
629 314
630 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
631 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
632 318
633 tsk = current; 319 tsk = current;
634 if (!user_mode(regs)) 320 if (!user_mode(regs))
@@ -650,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
650 force_sig(SIGSEGV, tsk); 336 force_sig(SIGSEGV, tsk);
651 return; 337 return;
652 338
339#ifdef CONFIG_X86_32
653gp_in_vm86: 340gp_in_vm86:
654 local_irq_enable(); 341 local_irq_enable();
655 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
656 return; 343 return;
344#endif
657 345
658gp_in_kernel: 346gp_in_kernel:
659 if (fixup_exception(regs)) 347 if (fixup_exception(regs))
@@ -690,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
690 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
691 379
692 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
693 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
694} 383}
695 384
696static notrace __kprobes void 385static notrace __kprobes void
@@ -716,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
716static notrace __kprobes void 405static notrace __kprobes void
717unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
718{ 407{
719 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
720 return; 410 return;
721#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
722 /* 412 /*
@@ -739,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
739 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
740} 430}
741 431
742static DEFINE_SPINLOCK(nmi_print_lock);
743
744void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
745{
746 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
747 return;
748
749 spin_lock(&nmi_print_lock);
750 /*
751 * We are in trouble anyway, let's at least try
752 * to get a message out:
753 */
754 bust_spinlocks(1);
755 printk(KERN_EMERG "%s", str);
756 printk(" on CPU%d, ip %08lx, registers:\n",
757 smp_processor_id(), regs->ip);
758 show_registers(regs);
759 if (do_panic)
760 panic("Non maskable interrupt");
761 console_silent();
762 spin_unlock(&nmi_print_lock);
763 bust_spinlocks(0);
764
765 /*
766 * If we are in the kernel we are probably nested up pretty bad
767 * and might as well get out now while we still can:
768 */
769 if (!user_mode_vm(regs)) {
770 current->thread.trap_no = 2;
771 crash_kexec(regs);
772 }
773
774 do_exit(SIGSEGV);
775}
776
777static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
778{ 433{
779 unsigned char reason = 0; 434 unsigned char reason = 0;
@@ -812,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
812 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
813 if (reason & 0x40) 468 if (reason & 0x40)
814 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
815 /* 471 /*
816 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
817 * as it's edge-triggered: 473 * as it's edge-triggered:
818 */ 474 */
819 reassert_nmi(); 475 reassert_nmi();
476#endif
820} 477}
821 478
822notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
823{ 481{
824 int cpu;
825
826 nmi_enter(); 482 nmi_enter();
827 483
828 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
829 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
830 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
831 489
832 if (!ignore_nmis) 490 if (!ignore_nmis)
833 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -847,21 +505,44 @@ void restart_nmi(void)
847 acpi_nmi_enable(); 505 acpi_nmi_enable();
848} 506}
849 507
850#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
851void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
852{ 510{
853 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
854
855 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
856 == NOTIFY_STOP) 513 == NOTIFY_STOP)
857 return; 514 return;
858 /* 515#else
859 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
860 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
861 */ 518 return;
862 restore_interrupts(regs); 519#endif
863 520
864 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
525
526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
533 /* Did already sync */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
865} 546}
866#endif 547#endif
867 548
@@ -886,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
886 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
887 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
888 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
889 */ 572 */
890void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
891{ 574{
892 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
893 unsigned int condition; 576 unsigned long condition;
894 577 int si_code;
895 trace_hardirqs_fixup();
896 578
897 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
898 580
@@ -905,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
905 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
906 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
907 return; 589 return;
590
908 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
909 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
910 local_irq_enable();
911 593
912 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
913 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -915,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
915 goto clear_dr7; 597 goto clear_dr7;
916 } 598 }
917 599
600#ifdef CONFIG_X86_32
918 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
919 goto debug_vm86; 602 goto debug_vm86;
603#endif
920 604
921 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
922 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -926,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
926 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
927 */ 611 */
928 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
929 /*
930 * We already checked v86 mode above, so we can
931 * check for kernel mode by just checking the CPL
932 * of CS.
933 */
934 if (!user_mode(regs)) 613 if (!user_mode(regs))
935 goto clear_TF_reenable; 614 goto clear_TF_reenable;
936 } 615 }
937 616
617 si_code = get_si_code(condition);
938 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
939 send_sigtrap(tsk, regs, error_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
940 620
941 /* 621 /*
942 * Disable additional traps. They'll be re-enabled when 622 * Disable additional traps. They'll be re-enabled when
@@ -944,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
944 */ 624 */
945clear_dr7: 625clear_dr7:
946 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
947 return; 628 return;
948 629
630#ifdef CONFIG_X86_32
949debug_vm86: 631debug_vm86:
950 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
951 return; 634 return;
635#endif
952 636
953clear_TF_reenable: 637clear_TF_reenable:
954 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
955 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
956 return; 641 return;
957} 642}
958 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
959/* 658/*
960 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
961 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -992,7 +691,9 @@ void math_error(void __user *ip)
992 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
993 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
994 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
995 return; 695 return;
696#endif
996 default: /* Multiple exceptions */ 697 default: /* Multiple exceptions */
997 break; 698 break;
998 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
@@ -1020,9 +721,18 @@ void math_error(void __user *ip)
1020 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1021} 722}
1022 723
1023void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1024{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1025 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1026 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1027} 737}
1028 738
@@ -1074,8 +784,12 @@ static void simd_math_error(void __user *ip)
1074 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1075} 785}
1076 786
1077void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1078{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1079 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1080 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1081 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1094,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1094 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1095 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1096 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1097} 817}
1098 818
1099void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1100{ 821{
822 conditional_sti(regs);
1101#if 0 823#if 0
1102 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1103 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1104#endif 826#endif
1105} 827}
1106 828
829#ifdef CONFIG_X86_32
1107unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1108{ 831{
1109 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1122,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1122 845
1123 return new_kesp; 846 return new_kesp;
1124} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1125 857
1126/* 858/*
1127 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1154,14 +886,24 @@ asmlinkage void math_state_restore(void)
1154 } 886 }
1155 887
1156 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1157 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
 893 * Paranoid restore: send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1158 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1159 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1160} 903}
1161EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1162 905
1163#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1164
1165asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1166{ 908{
1167 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1170,12 +912,46 @@ asmlinkage void math_emulate(long arg)
1170 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1171 schedule(); 913 schedule();
1172} 914}
1173
1174#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1175 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
932
933#ifdef CONFIG_X86_32
934dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
935{
936 siginfo_t info;
937 local_irq_enable();
938
939 info.si_signo = SIGILL;
940 info.si_errno = 0;
941 info.si_code = ILL_BADSTK;
942 info.si_addr = 0;
943 if (notify_die(DIE_TRAP, "iret exception",
944 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
945 return;
946 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
947}
948#endif
949
1176void __init trap_init(void) 950void __init trap_init(void)
1177{ 951{
952#ifdef CONFIG_X86_32
1178 int i; 953 int i;
954#endif
1179 955
1180#ifdef CONFIG_EISA 956#ifdef CONFIG_EISA
1181 void __iomem *p = early_ioremap(0x0FFFD9, 4); 957 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1185,29 +961,40 @@ void __init trap_init(void)
1185 early_iounmap(p, 4); 961 early_iounmap(p, 4);
1186#endif 962#endif
1187 963
1188 set_trap_gate(0, &divide_error); 964 set_intr_gate(0, &divide_error);
1189 set_intr_gate(1, &debug); 965 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1190 set_intr_gate(2, &nmi); 966 set_intr_gate_ist(2, &nmi, NMI_STACK);
1191 set_system_intr_gate(3, &int3); /* int3 can be called from all */ 967 /* int3 can be called from all */
1192 set_system_gate(4, &overflow); /* int4 can be called from all */ 968 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
1193 set_trap_gate(5, &bounds); 969 /* int4 can be called from all */
1194 set_trap_gate(6, &invalid_op); 970 set_system_intr_gate(4, &overflow);
1195 set_trap_gate(7, &device_not_available); 971 set_intr_gate(5, &bounds);
972 set_intr_gate(6, &invalid_op);
973 set_intr_gate(7, &device_not_available);
974#ifdef CONFIG_X86_32
1196 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 975 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1197 set_trap_gate(9, &coprocessor_segment_overrun); 976#else
1198 set_trap_gate(10, &invalid_TSS); 977 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1199 set_trap_gate(11, &segment_not_present); 978#endif
1200 set_trap_gate(12, &stack_segment); 979 set_intr_gate(9, &coprocessor_segment_overrun);
1201 set_trap_gate(13, &general_protection); 980 set_intr_gate(10, &invalid_TSS);
981 set_intr_gate(11, &segment_not_present);
982 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
983 set_intr_gate(13, &general_protection);
1202 set_intr_gate(14, &page_fault); 984 set_intr_gate(14, &page_fault);
1203 set_trap_gate(15, &spurious_interrupt_bug); 985 set_intr_gate(15, &spurious_interrupt_bug);
1204 set_trap_gate(16, &coprocessor_error); 986 set_intr_gate(16, &coprocessor_error);
1205 set_trap_gate(17, &alignment_check); 987 set_intr_gate(17, &alignment_check);
1206#ifdef CONFIG_X86_MCE 988#ifdef CONFIG_X86_MCE
1207 set_trap_gate(18, &machine_check); 989 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1208#endif 990#endif
1209 set_trap_gate(19, &simd_coprocessor_error); 991 set_intr_gate(19, &simd_coprocessor_error);
1210 992
993#ifdef CONFIG_IA32_EMULATION
994 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
995#endif
996
997#ifdef CONFIG_X86_32
1211 if (cpu_has_fxsr) { 998 if (cpu_has_fxsr) {
1212 printk(KERN_INFO "Enabling fast FPU save and restore... "); 999 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1213 set_in_cr4(X86_CR4_OSFXSR); 1000 set_in_cr4(X86_CR4_OSFXSR);
@@ -1220,37 +1007,20 @@ void __init trap_init(void)
1220 printk("done.\n"); 1007 printk("done.\n");
1221 } 1008 }
1222 1009
1223 set_system_gate(SYSCALL_VECTOR, &system_call); 1010 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1224 1011
1225 /* Reserve all the builtin and the syscall vector: */ 1012 /* Reserve all the builtin and the syscall vector: */
1226 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1013 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1227 set_bit(i, used_vectors); 1014 set_bit(i, used_vectors);
1228 1015
1229 set_bit(SYSCALL_VECTOR, used_vectors); 1016 set_bit(SYSCALL_VECTOR, used_vectors);
1230 1017#endif
1231 init_thread_xstate();
1232 /* 1018 /*
1233 * Should be a barrier for any external CPU state: 1019 * Should be a barrier for any external CPU state:
1234 */ 1020 */
1235 cpu_init(); 1021 cpu_init();
1236 1022
1023#ifdef CONFIG_X86_32
1237 trap_init_hook(); 1024 trap_init_hook();
1025#endif
1238} 1026}
1239
1240static int __init kstack_setup(char *s)
1241{
1242 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1243
1244 return 1;
1245}
1246__setup("kstack=", kstack_setup);
1247
1248static int __init code_bytes_setup(char *s)
1249{
1250 code_bytes = simple_strtoul(s, NULL, 0);
1251 if (code_bytes > 8192)
1252 code_bytes = 8192;
1253
1254 return 1;
1255}
1256__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 513caaca711..00000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1212 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
27#include <linux/errno.h>
28#include <linux/kexec.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/init.h>
32#include <linux/bug.h>
33#include <linux/nmi.h>
34#include <linux/mm.h>
35
36#if defined(CONFIG_EDAC)
37#include <linux/edac.h>
38#endif
39
40#include <asm/stacktrace.h>
41#include <asm/processor.h>
42#include <asm/debugreg.h>
43#include <asm/atomic.h>
44#include <asm/system.h>
45#include <asm/unwind.h>
46#include <asm/desc.h>
47#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h>
52#include <asm/proto.h>
53#include <asm/pda.h>
54#include <asm/traps.h>
55
56#include <mach_traps.h>
57
58int panic_on_unrecovered_nmi;
59int kstack_depth_to_print = 12;
60static unsigned int code_bytes = 64;
61static int ignore_nmis;
62static int die_counter;
63
64static inline void conditional_sti(struct pt_regs *regs)
65{
66 if (regs->flags & X86_EFLAGS_IF)
67 local_irq_enable();
68}
69
70static inline void preempt_conditional_sti(struct pt_regs *regs)
71{
72 inc_preempt_count();
73 if (regs->flags & X86_EFLAGS_IF)
74 local_irq_enable();
75}
76
77static inline void preempt_conditional_cli(struct pt_regs *regs)
78{
79 if (regs->flags & X86_EFLAGS_IF)
80 local_irq_disable();
81 /* Make sure to not schedule here because we could be running
82 on an exception stack. */
83 dec_preempt_count();
84}
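These three helpers mirror the interrupted context's IF flag and pin the preempt count so a handler running on an IST stack can never be scheduled away. A minimal userspace model of the pairing rule (local_irq_enable()/disable() and inc/dec_preempt_count() are stubbed with plain variables, so this demonstrates only the invariant, not real interrupt control):

#include <assert.h>
#include <stdio.h>

#define X86_EFLAGS_IF 0x200

struct fake_regs { unsigned long flags; };

static int irqs_on, preempt_count;

static void preempt_conditional_sti(struct fake_regs *regs)
{
	preempt_count++;			/* inc_preempt_count() */
	if (regs->flags & X86_EFLAGS_IF)
		irqs_on = 1;			/* local_irq_enable() */
}

static void preempt_conditional_cli(struct fake_regs *regs)
{
	if (regs->flags & X86_EFLAGS_IF)
		irqs_on = 0;			/* local_irq_disable() */
	preempt_count--;			/* dec_preempt_count() */
}

int main(void)
{
	struct fake_regs regs = { .flags = X86_EFLAGS_IF };

	preempt_conditional_sti(&regs);
	/* trap-handler body would run here, with preemption pinned */
	preempt_conditional_cli(&regs);
	assert(irqs_on == 0 && preempt_count == 0);
	printf("sti/cli pair restored the interrupted state\n");
	return 0;
}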
85
86void printk_address(unsigned long address, int reliable)
87{
88 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
89}
90
91static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
92 unsigned *usedp, char **idp)
93{
94 static char ids[][8] = {
95 [DEBUG_STACK - 1] = "#DB",
96 [NMI_STACK - 1] = "NMI",
97 [DOUBLEFAULT_STACK - 1] = "#DF",
98 [STACKFAULT_STACK - 1] = "#SS",
99 [MCE_STACK - 1] = "#MC",
100#if DEBUG_STKSZ > EXCEPTION_STKSZ
101 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
102#endif
103 };
104 unsigned k;
105
106 /*
107 * Iterate over all exception stacks, and figure out whether
108 * 'stack' is in one of them:
109 */
110 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
111 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
112 /*
113 * Is 'stack' above this exception frame's end?
114 * If yes then skip to the next frame.
115 */
116 if (stack >= end)
117 continue;
118 /*
119 * Is 'stack' above this exception frame's start address?
120 * If yes then we found the right frame.
121 */
122 if (stack >= end - EXCEPTION_STKSZ) {
123 /*
124 * Make sure we only iterate through an exception
125 * stack once. If it comes up for the second time
126 * then there's something wrong going on - just
127 * break out and return NULL:
128 */
129 if (*usedp & (1U << k))
130 break;
131 *usedp |= 1U << k;
132 *idp = ids[k];
133 return (unsigned long *)end;
134 }
135 /*
136 * If this is a debug stack, and if it has a larger size than
137 * the usual exception stacks, then 'stack' might still
138 * be within the lower portion of the debug stack:
139 */
140#if DEBUG_STKSZ > EXCEPTION_STKSZ
141 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
142 unsigned j = N_EXCEPTION_STACKS - 1;
143
144 /*
145 * Black magic. A large debug stack is composed of
146 * multiple exception stack entries, which we
 147 * iterate through now. Don't look:
148 */
149 do {
150 ++j;
151 end -= EXCEPTION_STKSZ;
152 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
153 } while (stack < end - EXCEPTION_STKSZ);
154 if (*usedp & (1U << j))
155 break;
156 *usedp |= 1U << j;
157 *idp = ids[j];
158 return (unsigned long *)end;
159 }
160#endif
161 }
162 return NULL;
163}
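The heart of in_exception_stack() is a half-open range test: a stack pointer belongs to an exception stack when it lies within EXCEPTION_STKSZ bytes below that stack's recorded end address. A standalone sketch of just that test, with a placeholder stack size and made-up addresses:

#include <stdio.h>

#define EXCEPTION_STKSZ 4096	/* placeholder; the real size is per-arch */

static int on_exception_stack(unsigned long sp, unsigned long end)
{
	return sp < end && sp >= end - EXCEPTION_STKSZ;
}

int main(void)
{
	unsigned long end = 0x10000;	/* invented stack end address */

	printf("inside: %d, outside: %d\n",
	       on_exception_stack(0xf800, end),
	       on_exception_stack(0x1f800, end));
	return 0;
}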
164
165/*
166 * x86-64 can have up to three kernel stacks:
167 * process stack
168 * interrupt stack
169 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
170 */
171
172static inline int valid_stack_ptr(struct thread_info *tinfo,
173 void *p, unsigned int size, void *end)
174{
175 void *t = tinfo;
176 if (end) {
177 if (p < end && p >= (end-THREAD_SIZE))
178 return 1;
179 else
180 return 0;
181 }
182 return p > t && p < t + THREAD_SIZE - size;
183}
184
185/* The form of the top of the frame on the stack */
186struct stack_frame {
187 struct stack_frame *next_frame;
188 unsigned long return_address;
189};
190
191static inline unsigned long
192print_context_stack(struct thread_info *tinfo,
193 unsigned long *stack, unsigned long bp,
194 const struct stacktrace_ops *ops, void *data,
195 unsigned long *end)
196{
197 struct stack_frame *frame = (struct stack_frame *)bp;
198
199 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
200 unsigned long addr;
201
202 addr = *stack;
203 if (__kernel_text_address(addr)) {
204 if ((unsigned long) stack == bp + 8) {
205 ops->address(data, addr, 1);
206 frame = frame->next_frame;
207 bp = (unsigned long) frame;
208 } else {
209 ops->address(data, addr, bp == 0);
210 }
211 }
212 stack++;
213 }
214 return bp;
215}
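The walk above is driven by a linked list of struct stack_frame records: each saved frame pointer holds the caller's frame pointer plus the return address, and an address is marked reliable only when it sits exactly where the current frame says a return address should be. A hypothetical userspace model, with a hand-built chain standing in for a real call stack:

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

int main(void)
{
	/* invented return addresses; a real chain lives on the stack */
	struct stack_frame f3 = { NULL, 0x1000 };
	struct stack_frame f2 = { &f3,  0x2000 };
	struct stack_frame f1 = { &f2,  0x3000 };
	struct stack_frame *f;

	for (f = &f1; f; f = f->next_frame)
		printf(" [<%016lx>]\n", f->return_address);
	return 0;
}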
216
217void dump_trace(struct task_struct *task, struct pt_regs *regs,
218 unsigned long *stack, unsigned long bp,
219 const struct stacktrace_ops *ops, void *data)
220{
221 const unsigned cpu = get_cpu();
222 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
223 unsigned used = 0;
224 struct thread_info *tinfo;
225
226 if (!task)
227 task = current;
228
229 if (!stack) {
230 unsigned long dummy;
231 stack = &dummy;
232 if (task && task != current)
233 stack = (unsigned long *)task->thread.sp;
234 }
235
236#ifdef CONFIG_FRAME_POINTER
237 if (!bp) {
238 if (task == current) {
239 /* Grab bp right from our regs */
240 asm("movq %%rbp, %0" : "=r" (bp) :);
241 } else {
242 /* bp is the last reg pushed by switch_to */
243 bp = *(unsigned long *) task->thread.sp;
244 }
245 }
246#endif
247
248 /*
249 * Print function call entries in all stacks, starting at the
 250 * current stack address. If the stacks consist of nested
 251 * exceptions, follow them down to the process stack.
252 */
253 tinfo = task_thread_info(task);
254 for (;;) {
255 char *id;
256 unsigned long *estack_end;
257 estack_end = in_exception_stack(cpu, (unsigned long)stack,
258 &used, &id);
259
260 if (estack_end) {
261 if (ops->stack(data, id) < 0)
262 break;
263
264 bp = print_context_stack(tinfo, stack, bp, ops,
265 data, estack_end);
266 ops->stack(data, "<EOE>");
267 /*
268 * We link to the next stack via the
269 * second-to-last pointer (index -2 to end) in the
270 * exception stack:
271 */
272 stack = (unsigned long *) estack_end[-2];
273 continue;
274 }
275 if (irqstack_end) {
276 unsigned long *irqstack;
277 irqstack = irqstack_end -
278 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
279
280 if (stack >= irqstack && stack < irqstack_end) {
281 if (ops->stack(data, "IRQ") < 0)
282 break;
283 bp = print_context_stack(tinfo, stack, bp,
284 ops, data, irqstack_end);
285 /*
286 * We link to the next stack (which would be
287 * the process stack normally) the last
288 * pointer (index -1 to end) in the IRQ stack:
289 */
290 stack = (unsigned long *) (irqstack_end[-1]);
291 irqstack_end = NULL;
292 ops->stack(data, "EOI");
293 continue;
294 }
295 }
296 break;
297 }
298
299 /*
300 * This handles the process stack:
301 */
302 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
303 put_cpu();
304}
305EXPORT_SYMBOL(dump_trace);
306
307static void
308print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
309{
310 print_symbol(msg, symbol);
311 printk("\n");
312}
313
314static void print_trace_warning(void *data, char *msg)
315{
316 printk("%s\n", msg);
317}
318
319static int print_trace_stack(void *data, char *name)
320{
321 printk(" <%s> ", name);
322 return 0;
323}
324
325static void print_trace_address(void *data, unsigned long addr, int reliable)
326{
327 touch_nmi_watchdog();
328 printk_address(addr, reliable);
329}
330
331static const struct stacktrace_ops print_trace_ops = {
332 .warning = print_trace_warning,
333 .warning_symbol = print_trace_warning_symbol,
334 .stack = print_trace_stack,
335 .address = print_trace_address,
336};
337
338static void
339show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
340 unsigned long *stack, unsigned long bp, char *log_lvl)
341{
342 printk("\nCall Trace:\n");
343 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
344 printk("\n");
345}
346
347void show_trace(struct task_struct *task, struct pt_regs *regs,
348 unsigned long *stack, unsigned long bp)
349{
350 show_trace_log_lvl(task, regs, stack, bp, "");
351}
352
353static void
354show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
355 unsigned long *sp, unsigned long bp, char *log_lvl)
356{
357 unsigned long *stack;
358 int i;
359 const int cpu = smp_processor_id();
360 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
361 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
362
363 // debugging aid: "show_stack(NULL, NULL);" prints the
364 // back trace for this cpu.
365
366 if (sp == NULL) {
367 if (task)
368 sp = (unsigned long *)task->thread.sp;
369 else
370 sp = (unsigned long *)&sp;
371 }
372
373 stack = sp;
374 for (i = 0; i < kstack_depth_to_print; i++) {
375 if (stack >= irqstack && stack <= irqstack_end) {
376 if (stack == irqstack_end) {
377 stack = (unsigned long *) (irqstack_end[-1]);
378 printk(" <EOI> ");
379 }
380 } else {
381 if (((long) stack & (THREAD_SIZE-1)) == 0)
382 break;
383 }
384 if (i && ((i % 4) == 0))
385 printk("\n");
386 printk(" %016lx", *stack++);
387 touch_nmi_watchdog();
388 }
389 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
390}
391
392void show_stack(struct task_struct *task, unsigned long *sp)
393{
394 show_stack_log_lvl(task, NULL, sp, 0, "");
395}
396
397/*
398 * The architecture-independent dump_stack generator
399 */
400void dump_stack(void)
401{
402 unsigned long bp = 0;
403 unsigned long stack;
404
405#ifdef CONFIG_FRAME_POINTER
406 if (!bp)
407 asm("movq %%rbp, %0" : "=r" (bp):);
408#endif
409
410 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
411 current->pid, current->comm, print_tainted(),
412 init_utsname()->release,
413 (int)strcspn(init_utsname()->version, " "),
414 init_utsname()->version);
415 show_trace(NULL, NULL, &stack, bp);
416}
417
418EXPORT_SYMBOL(dump_stack);
419
420void show_registers(struct pt_regs *regs)
421{
422 int i;
423 unsigned long sp;
424 const int cpu = smp_processor_id();
425 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
426
427 sp = regs->sp;
428 printk("CPU %d ", cpu);
429 __show_regs(regs);
430 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
431 cur->comm, cur->pid, task_thread_info(cur), cur);
432
433 /*
434 * When in-kernel, we also print out the stack and code at the
435 * time of the fault..
436 */
437 if (!user_mode(regs)) {
438 unsigned int code_prologue = code_bytes * 43 / 64;
439 unsigned int code_len = code_bytes;
440 unsigned char c;
441 u8 *ip;
442
443 printk("Stack: ");
444 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
445 regs->bp, "");
446 printk("\n");
447
448 printk(KERN_EMERG "Code: ");
449
450 ip = (u8 *)regs->ip - code_prologue;
451 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
452 /* try starting at RIP */
453 ip = (u8 *)regs->ip;
454 code_len = code_len - code_prologue + 1;
455 }
456 for (i = 0; i < code_len; i++, ip++) {
457 if (ip < (u8 *)PAGE_OFFSET ||
458 probe_kernel_address(ip, c)) {
459 printk(" Bad RIP value.");
460 break;
461 }
462 if (ip == (u8 *)regs->ip)
463 printk("<%02x> ", c);
464 else
465 printk("%02x ", c);
466 }
467 }
468 printk("\n");
469}
470
471int is_valid_bugaddr(unsigned long ip)
472{
473 unsigned short ud2;
474
475 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
476 return 0;
477
478 return ud2 == 0x0b0f;
479}
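is_valid_bugaddr() works because BUG() plants the two-byte ud2 opcode 0x0f 0x0b, which a 16-bit load reads back as 0x0b0f on little-endian x86. A tiny sketch of the same comparison (memcpy stands in for __copy_from_user(), and the expected result assumes a little-endian host):

#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char text[] = { 0x0f, 0x0b };	/* ud2 instruction bytes */
	unsigned short ud2;

	memcpy(&ud2, text, sizeof(ud2));
	printf("valid bugaddr: %s\n", ud2 == 0x0b0f ? "yes" : "no");
	return 0;
}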
480
481static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
482static int die_owner = -1;
483static unsigned int die_nest_count;
484
485unsigned __kprobes long oops_begin(void)
486{
487 int cpu;
488 unsigned long flags;
489
490 oops_enter();
491
492 /* racy, but better than risking deadlock. */
493 raw_local_irq_save(flags);
494 cpu = smp_processor_id();
495 if (!__raw_spin_trylock(&die_lock)) {
496 if (cpu == die_owner)
497 /* nested oops. should stop eventually */;
498 else
499 __raw_spin_lock(&die_lock);
500 }
501 die_nest_count++;
502 die_owner = cpu;
503 console_verbose();
504 bust_spinlocks(1);
505 return flags;
506}
507
508void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
509{
510 die_owner = -1;
511 bust_spinlocks(0);
512 die_nest_count--;
513 if (!die_nest_count)
514 /* Nest count reaches zero, release the lock. */
515 __raw_spin_unlock(&die_lock);
516 raw_local_irq_restore(flags);
517 if (!regs) {
518 oops_exit();
519 return;
520 }
521 if (panic_on_oops)
522 panic("Fatal exception");
523 oops_exit();
524 do_exit(signr);
525}
526
527int __kprobes __die(const char *str, struct pt_regs *regs, long err)
528{
529 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
530#ifdef CONFIG_PREEMPT
531 printk("PREEMPT ");
532#endif
533#ifdef CONFIG_SMP
534 printk("SMP ");
535#endif
536#ifdef CONFIG_DEBUG_PAGEALLOC
537 printk("DEBUG_PAGEALLOC");
538#endif
539 printk("\n");
540 if (notify_die(DIE_OOPS, str, regs, err,
541 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
542 return 1;
543
544 show_registers(regs);
545 add_taint(TAINT_DIE);
546 /* Executive summary in case the oops scrolled away */
547 printk(KERN_ALERT "RIP ");
548 printk_address(regs->ip, 1);
549 printk(" RSP <%016lx>\n", regs->sp);
550 if (kexec_should_crash(current))
551 crash_kexec(regs);
552 return 0;
553}
554
555void die(const char *str, struct pt_regs *regs, long err)
556{
557 unsigned long flags = oops_begin();
558
559 if (!user_mode(regs))
560 report_bug(regs->ip, regs);
561
562 if (__die(str, regs, err))
563 regs = NULL;
564 oops_end(flags, regs, SIGSEGV);
565}
566
567notrace __kprobes void
568die_nmi(char *str, struct pt_regs *regs, int do_panic)
569{
570 unsigned long flags;
571
572 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
573 return;
574
575 flags = oops_begin();
576 /*
 577 * We are in trouble anyway, let's at least try
578 * to get a message out.
579 */
580 printk(KERN_EMERG "%s", str);
581 printk(" on CPU%d, ip %08lx, registers:\n",
582 smp_processor_id(), regs->ip);
583 show_registers(regs);
584 if (kexec_should_crash(current))
585 crash_kexec(regs);
586 if (do_panic || panic_on_oops)
587 panic("Non maskable interrupt");
588 oops_end(flags, NULL, SIGBUS);
589 nmi_exit();
590 local_irq_enable();
591 do_exit(SIGBUS);
592}
593
594static void __kprobes
595do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
596 long error_code, siginfo_t *info)
597{
598 struct task_struct *tsk = current;
599
600 if (!user_mode(regs))
601 goto kernel_trap;
602
603 /*
604 * We want error_code and trap_no set for userspace faults and
605 * kernelspace faults which result in die(), but not
606 * kernelspace faults which are fixed up. die() gives the
607 * process no chance to handle the signal and notice the
608 * kernel fault information, so that won't result in polluting
609 * the information about previously queued, but not yet
610 * delivered, faults. See also do_general_protection below.
611 */
612 tsk->thread.error_code = error_code;
613 tsk->thread.trap_no = trapnr;
614
615 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
616 printk_ratelimit()) {
617 printk(KERN_INFO
618 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
619 tsk->comm, tsk->pid, str,
620 regs->ip, regs->sp, error_code);
621 print_vma_addr(" in ", regs->ip);
622 printk("\n");
623 }
624
625 if (info)
626 force_sig_info(signr, info, tsk);
627 else
628 force_sig(signr, tsk);
629 return;
630
631kernel_trap:
632 if (!fixup_exception(regs)) {
633 tsk->thread.error_code = error_code;
634 tsk->thread.trap_no = trapnr;
635 die(str, regs, error_code);
636 }
637 return;
638}
639
640#define DO_ERROR(trapnr, signr, str, name) \
641asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
642{ \
643 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
644 == NOTIFY_STOP) \
645 return; \
646 conditional_sti(regs); \
647 do_trap(trapnr, signr, str, regs, error_code, NULL); \
648}
649
650#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
651asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
652{ \
653 siginfo_t info; \
654 info.si_signo = signr; \
655 info.si_errno = 0; \
656 info.si_code = sicode; \
657 info.si_addr = (void __user *)siaddr; \
658 trace_hardirqs_fixup(); \
659 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
660 == NOTIFY_STOP) \
661 return; \
662 conditional_sti(regs); \
663 do_trap(trapnr, signr, str, regs, error_code, &info); \
664}
665
666DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
667DO_ERROR(4, SIGSEGV, "overflow", overflow)
668DO_ERROR(5, SIGSEGV, "bounds", bounds)
669DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
670DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
671DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
672DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
673DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
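For reference, here is roughly what the preprocessor produces for DO_ERROR(4, SIGSEGV, "overflow", overflow) above, whitespace tidied - a sketch of the expansion, not compiler output:

asmlinkage void do_overflow(struct pt_regs *regs, long error_code)
{
	if (notify_die(DIE_TRAP, "overflow", regs, error_code, 4, SIGSEGV)
			== NOTIFY_STOP)
		return;
	conditional_sti(regs);
	do_trap(4, SIGSEGV, "overflow", regs, error_code, NULL);
}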
674
675/* Runs on IST stack */
676asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
677{
678 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
679 12, SIGBUS) == NOTIFY_STOP)
680 return;
681 preempt_conditional_sti(regs);
682 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
683 preempt_conditional_cli(regs);
684}
685
686asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
687{
688 static const char str[] = "double fault";
689 struct task_struct *tsk = current;
690
 691 /* Return not checked because a double fault cannot be ignored */
692 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
693
694 tsk->thread.error_code = error_code;
695 tsk->thread.trap_no = 8;
696
697 /* This is always a kernel trap and never fixable (and thus must
698 never return). */
699 for (;;)
700 die(str, regs, error_code);
701}
702
703asmlinkage void __kprobes
704do_general_protection(struct pt_regs *regs, long error_code)
705{
706 struct task_struct *tsk;
707
708 conditional_sti(regs);
709
710 tsk = current;
711 if (!user_mode(regs))
712 goto gp_in_kernel;
713
714 tsk->thread.error_code = error_code;
715 tsk->thread.trap_no = 13;
716
717 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
718 printk_ratelimit()) {
719 printk(KERN_INFO
720 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
721 tsk->comm, tsk->pid,
722 regs->ip, regs->sp, error_code);
723 print_vma_addr(" in ", regs->ip);
724 printk("\n");
725 }
726
727 force_sig(SIGSEGV, tsk);
728 return;
729
730gp_in_kernel:
731 if (fixup_exception(regs))
732 return;
733
734 tsk->thread.error_code = error_code;
735 tsk->thread.trap_no = 13;
736 if (notify_die(DIE_GPF, "general protection fault", regs,
737 error_code, 13, SIGSEGV) == NOTIFY_STOP)
738 return;
739 die("general protection fault", regs, error_code);
740}
741
742static notrace __kprobes void
743mem_parity_error(unsigned char reason, struct pt_regs *regs)
744{
745 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
746 reason);
747 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
748
749#if defined(CONFIG_EDAC)
750 if (edac_handler_set()) {
751 edac_atomic_assert_error();
752 return;
753 }
754#endif
755
756 if (panic_on_unrecovered_nmi)
757 panic("NMI: Not continuing");
758
759 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
760
761 /* Clear and disable the memory parity error line. */
762 reason = (reason & 0xf) | 4;
763 outb(reason, 0x61);
764}
765
766static notrace __kprobes void
767io_check_error(unsigned char reason, struct pt_regs *regs)
768{
769 printk("NMI: IOCK error (debug interrupt?)\n");
770 show_registers(regs);
771
772 /* Re-enable the IOCK line, wait for a few seconds */
773 reason = (reason & 0xf) | 8;
774 outb(reason, 0x61);
775 mdelay(2000);
776 reason &= ~8;
777 outb(reason, 0x61);
778}
779
780static notrace __kprobes void
781unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
782{
783 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
784 return;
785 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
786 reason);
787 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
788
789 if (panic_on_unrecovered_nmi)
790 panic("NMI: Not continuing");
791
792 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
793}
794
795/* Runs on IST stack. This code must keep interrupts off all the time.
796 Nested NMIs are prevented by the CPU. */
797asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
798{
799 unsigned char reason = 0;
800 int cpu;
801
802 cpu = smp_processor_id();
803
804 /* Only the BSP gets external NMIs from the system. */
805 if (!cpu)
806 reason = get_nmi_reason();
807
808 if (!(reason & 0xc0)) {
809 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
810 == NOTIFY_STOP)
811 return;
812 /*
813 * Ok, so this is none of the documented NMI sources,
814 * so it must be the NMI watchdog.
815 */
816 if (nmi_watchdog_tick(regs, reason))
817 return;
818 if (!do_nmi_callback(regs, cpu))
819 unknown_nmi_error(reason, regs);
820
821 return;
822 }
823 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
824 return;
825
826 /* AK: following checks seem to be broken on modern chipsets. FIXME */
827 if (reason & 0x80)
828 mem_parity_error(reason, regs);
829 if (reason & 0x40)
830 io_check_error(reason, regs);
831}
832
833asmlinkage notrace __kprobes void
834do_nmi(struct pt_regs *regs, long error_code)
835{
836 nmi_enter();
837
838 add_pda(__nmi_count, 1);
839
840 if (!ignore_nmis)
841 default_do_nmi(regs);
842
843 nmi_exit();
844}
845
846void stop_nmi(void)
847{
848 acpi_nmi_disable();
849 ignore_nmis++;
850}
851
852void restart_nmi(void)
853{
854 ignore_nmis--;
855 acpi_nmi_enable();
856}
857
858/* runs on IST stack. */
859asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
860{
861 trace_hardirqs_fixup();
862
863 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
864 == NOTIFY_STOP)
865 return;
866
867 preempt_conditional_sti(regs);
868 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
869 preempt_conditional_cli(regs);
870}
871
872/* Help handler running on IST stack to switch back to user stack
873 for scheduling or signal handling. The actual stack switch is done in
874 entry.S */
875asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
876{
877 struct pt_regs *regs = eregs;
 878 /* Already synced */
879 if (eregs == (struct pt_regs *)eregs->sp)
880 ;
881 /* Exception from user space */
882 else if (user_mode(eregs))
883 regs = task_pt_regs(current);
884 /* Exception from kernel and interrupts are enabled. Move to
885 kernel process stack. */
886 else if (eregs->flags & X86_EFLAGS_IF)
887 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
888 if (eregs != regs)
889 *regs = *eregs;
890 return regs;
891}
892
893/* runs on IST stack. */
894asmlinkage void __kprobes do_debug(struct pt_regs * regs,
895 unsigned long error_code)
896{
897 struct task_struct *tsk = current;
898 unsigned long condition;
899 siginfo_t info;
900
901 trace_hardirqs_fixup();
902
903 get_debugreg(condition, 6);
904
905 /*
906 * The processor cleared BTF, so don't mark that we need it set.
907 */
908 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
909 tsk->thread.debugctlmsr = 0;
910
911 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
912 SIGTRAP) == NOTIFY_STOP)
913 return;
914
915 preempt_conditional_sti(regs);
916
917 /* Mask out spurious debug traps due to lazy DR7 setting */
918 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
919 if (!tsk->thread.debugreg7)
920 goto clear_dr7;
921 }
922
923 tsk->thread.debugreg6 = condition;
924
925 /*
926 * Single-stepping through TF: make sure we ignore any events in
927 * kernel space (but re-enable TF when returning to user mode).
928 */
929 if (condition & DR_STEP) {
930 if (!user_mode(regs))
931 goto clear_TF_reenable;
932 }
933
934 /* Ok, finally something we can handle */
935 tsk->thread.trap_no = 1;
936 tsk->thread.error_code = error_code;
937 info.si_signo = SIGTRAP;
938 info.si_errno = 0;
939 info.si_code = TRAP_BRKPT;
940 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
941 force_sig_info(SIGTRAP, &info, tsk);
942
943clear_dr7:
944 set_debugreg(0, 7);
945 preempt_conditional_cli(regs);
946 return;
947
948clear_TF_reenable:
949 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
950 regs->flags &= ~X86_EFLAGS_TF;
951 preempt_conditional_cli(regs);
952 return;
953}
954
955static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
956{
957 if (fixup_exception(regs))
958 return 1;
959
960 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
961 /* Illegal floating point operation in the kernel */
962 current->thread.trap_no = trapnr;
963 die(str, regs, 0);
964 return 0;
965}
966
967/*
968 * Note that we play around with the 'TS' bit in an attempt to get
969 * the correct behaviour even in the presence of the asynchronous
970 * IRQ13 behaviour
971 */
972asmlinkage void do_coprocessor_error(struct pt_regs *regs)
973{
974 void __user *ip = (void __user *)(regs->ip);
975 struct task_struct *task;
976 siginfo_t info;
977 unsigned short cwd, swd;
978
979 conditional_sti(regs);
980 if (!user_mode(regs) &&
981 kernel_math_error(regs, "kernel x87 math error", 16))
982 return;
983
984 /*
985 * Save the info for the exception handler and clear the error.
986 */
987 task = current;
988 save_init_fpu(task);
989 task->thread.trap_no = 16;
990 task->thread.error_code = 0;
991 info.si_signo = SIGFPE;
992 info.si_errno = 0;
993 info.si_code = __SI_FAULT;
994 info.si_addr = ip;
995 /*
996 * (~cwd & swd) will mask out exceptions that are not set to unmasked
997 * status. 0x3f is the exception bits in these regs, 0x200 is the
998 * C1 reg you need in case of a stack fault, 0x040 is the stack
999 * fault bit. We should only be taking one exception at a time,
1000 * so if this combination doesn't produce any single exception,
1001 * then we have a bad program that isn't synchronizing its FPU usage
1002 * and it will suffer the consequences since we won't be able to
1003 * fully reproduce the context of the exception.
1004 */
1005 cwd = get_fpu_cwd(task);
1006 swd = get_fpu_swd(task);
1007 switch (swd & ~cwd & 0x3f) {
1008 case 0x000: /* No unmasked exception */
1009 default: /* Multiple exceptions */
1010 break;
1011 case 0x001: /* Invalid Op */
1012 /*
1013 * swd & 0x240 == 0x040: Stack Underflow
1014 * swd & 0x240 == 0x240: Stack Overflow
1015 * User must clear the SF bit (0x40) if set
1016 */
1017 info.si_code = FPE_FLTINV;
1018 break;
1019 case 0x002: /* Denormalize */
1020 case 0x010: /* Underflow */
1021 info.si_code = FPE_FLTUND;
1022 break;
1023 case 0x004: /* Zero Divide */
1024 info.si_code = FPE_FLTDIV;
1025 break;
1026 case 0x008: /* Overflow */
1027 info.si_code = FPE_FLTOVF;
1028 break;
1029 case 0x020: /* Precision */
1030 info.si_code = FPE_FLTRES;
1031 break;
1032 }
1033 force_sig_info(SIGFPE, &info, task);
1034}
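The status/control-word dance above reduces to swd & ~cwd & 0x3f: the status word records every exception that fired, the control word masks the ones the program opted out of, and only an unmasked bit selects a specific SIGFPE si_code. A self-contained sketch of that mapping (0x037f is the x87 reset control word; the sample inputs are made up):

#include <stdio.h>

static const char *fpe_name(unsigned short cwd, unsigned short swd)
{
	switch (swd & ~cwd & 0x3f) {
	case 0x001: return "FPE_FLTINV";	/* Invalid Op */
	case 0x002:				/* Denormalize */
	case 0x010: return "FPE_FLTUND";	/* Underflow */
	case 0x004: return "FPE_FLTDIV";	/* Zero Divide */
	case 0x008: return "FPE_FLTOVF";	/* Overflow */
	case 0x020: return "FPE_FLTRES";	/* Precision */
	default:    return "__SI_FAULT";	/* none, or multiple */
	}
}

int main(void)
{
	/* default control word with the zero-divide mask (bit 2) cleared */
	unsigned short cwd = 0x037f & ~0x004;
	unsigned short swd = 0x004;		/* zero-divide flag set */

	printf("%s\n", fpe_name(cwd, swd));	/* prints FPE_FLTDIV */
	return 0;
}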
1035
1036asmlinkage void bad_intr(void)
1037{
1038 printk("bad interrupt");
1039}
1040
1041asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1042{
1043 void __user *ip = (void __user *)(regs->ip);
1044 struct task_struct *task;
1045 siginfo_t info;
1046 unsigned short mxcsr;
1047
1048 conditional_sti(regs);
1049 if (!user_mode(regs) &&
1050 kernel_math_error(regs, "kernel simd math error", 19))
1051 return;
1052
1053 /*
1054 * Save the info for the exception handler and clear the error.
1055 */
1056 task = current;
1057 save_init_fpu(task);
1058 task->thread.trap_no = 19;
1059 task->thread.error_code = 0;
1060 info.si_signo = SIGFPE;
1061 info.si_errno = 0;
1062 info.si_code = __SI_FAULT;
1063 info.si_addr = ip;
1064 /*
1065 * The SIMD FPU exceptions are handled a little differently, as there
1066 * is only a single status/control register. Thus, to determine which
1067 * unmasked exception was caught we must mask the exception mask bits
1068 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1069 */
1070 mxcsr = get_fpu_mxcsr(task);
1071 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1072 case 0x000:
1073 default:
1074 break;
1075 case 0x001: /* Invalid Op */
1076 info.si_code = FPE_FLTINV;
1077 break;
1078 case 0x002: /* Denormalize */
1079 case 0x010: /* Underflow */
1080 info.si_code = FPE_FLTUND;
1081 break;
1082 case 0x004: /* Zero Divide */
1083 info.si_code = FPE_FLTDIV;
1084 break;
1085 case 0x008: /* Overflow */
1086 info.si_code = FPE_FLTOVF;
1087 break;
1088 case 0x020: /* Precision */
1089 info.si_code = FPE_FLTRES;
1090 break;
1091 }
1092 force_sig_info(SIGFPE, &info, task);
1093}
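The SIMD variant packs everything into one register: MXCSR keeps the exception flags in bits 0-5 and their mask bits in bits 7-12, so shifting the mask field right by 7 aligns it with the flags before selecting the unmasked ones. A small sketch with an invented MXCSR value (all exceptions masked except zero-divide, whose flag is set):

#include <stdio.h>

int main(void)
{
	unsigned short mxcsr = 0x1d84;	/* 0x1f80 default, ZM cleared, ZE set */
	unsigned short unmasked = ~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f);

	printf("unmasked SIMD exceptions: %#x\n", unmasked);	/* 0x4 */
	return 0;
}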
1094
1095asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1096{
1097}
1098
1099asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1100{
1101}
1102
1103asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1104{
1105}
1106
1107/*
1108 * 'math_state_restore()' saves the current math information in the
1109 * old math state array, and gets the new ones from the current task
1110 *
1111 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1112 * Don't touch unless you *really* know how it works.
1113 */
1114asmlinkage void math_state_restore(void)
1115{
1116 struct task_struct *me = current;
1117
1118 if (!used_math()) {
1119 local_irq_enable();
1120 /*
1121 * does a slab alloc which can sleep
1122 */
1123 if (init_fpu(me)) {
1124 /*
1125 * ran out of memory!
1126 */
1127 do_group_exit(SIGKILL);
1128 return;
1129 }
1130 local_irq_disable();
1131 }
1132
1133 clts(); /* Allow maths ops (or we recurse) */
1134 /*
1135 * Paranoid restore: send a SIGSEGV if we fail to restore the state.
1136 */
1137 if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) {
1138 stts();
1139 force_sig(SIGSEGV, me);
1140 return;
1141 }
1142 task_thread_info(me)->status |= TS_USEDFPU;
1143 me->fpu_counter++;
1144}
1145EXPORT_SYMBOL_GPL(math_state_restore);
1146
1147void __init trap_init(void)
1148{
1149 set_intr_gate(0, &divide_error);
1150 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1151 set_intr_gate_ist(2, &nmi, NMI_STACK);
1152 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
1153 set_system_gate(4, &overflow); /* int4 can be called from all */
1154 set_intr_gate(5, &bounds);
1155 set_intr_gate(6, &invalid_op);
1156 set_intr_gate(7, &device_not_available);
1157 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1158 set_intr_gate(9, &coprocessor_segment_overrun);
1159 set_intr_gate(10, &invalid_TSS);
1160 set_intr_gate(11, &segment_not_present);
1161 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1162 set_intr_gate(13, &general_protection);
1163 set_intr_gate(14, &page_fault);
1164 set_intr_gate(15, &spurious_interrupt_bug);
1165 set_intr_gate(16, &coprocessor_error);
1166 set_intr_gate(17, &alignment_check);
1167#ifdef CONFIG_X86_MCE
1168 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1169#endif
1170 set_intr_gate(19, &simd_coprocessor_error);
1171
1172#ifdef CONFIG_IA32_EMULATION
1173 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1174#endif
1175 /*
1176 * initialize the per thread extended state:
1177 */
1178 init_thread_xstate();
1179 /*
1180 * Should be a barrier for any external CPU state:
1181 */
1182 cpu_init();
1183}
1184
1185static int __init oops_setup(char *s)
1186{
1187 if (!s)
1188 return -EINVAL;
1189 if (!strcmp(s, "panic"))
1190 panic_on_oops = 1;
1191 return 0;
1192}
1193early_param("oops", oops_setup);
1194
1195static int __init kstack_setup(char *s)
1196{
1197 if (!s)
1198 return -EINVAL;
1199 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1200 return 0;
1201}
1202early_param("kstack", kstack_setup);
1203
1204static int __init code_bytes_setup(char *s)
1205{
1206 code_bytes = simple_strtoul(s, NULL, 0);
1207 if (code_bytes > 8192)
1208 code_bytes = 8192;
1209
1210 return 1;
1211}
1212__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 8f98e9de1b8..161bb850fc4 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -123,13 +123,59 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet)
123} 123}
124 124
125/* 125/*
126 * Calculate the TSC frequency from HPET reference
127 */
128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{
130 u64 tmp;
131
132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
148
149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
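Both helpers follow the same scheme: convert the reference-counter delta into elapsed nanoseconds, then divide deltatsc (the TSC delta pre-scaled by 1e6 in the caller) by it, which lands directly in kHz. A hypothetical worked example for the PM-timer path (the tick rate is the standard ACPI PM timer frequency; both deltas are invented to model ~10ms on a ~2 GHz TSC):

#include <stdio.h>

#define PMTMR_TICKS_PER_SEC 3579545ULL

int main(void)
{
	unsigned long long tsc_delta = 20000000ULL;	/* ~10ms of TSC ticks */
	unsigned long long pm_delta  = 35795ULL;	/* ~10ms of PM ticks */
	unsigned long long deltatsc  = tsc_delta * 1000000ULL;
	unsigned long long elapsed_ns = pm_delta * 1000000000ULL
						  / PMTMR_TICKS_PER_SEC;

	printf("TSC ~ %llu kHz\n", deltatsc / elapsed_ns);	/* ~2000000 */
	return 0;
}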
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
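Plugging in the i8254's standard 1193182 Hz input clock shows what these latches come out to - a quick check of the arithmetic, assuming that value for CLOCK_TICK_RATE:

#include <stdio.h>

#define CLOCK_TICK_RATE 1193182

int main(void)
{
	printf("10ms latch: %d\n", CLOCK_TICK_RATE / (1000 / 10)); /* 11931 */
	printf("50ms latch: %d\n", CLOCK_TICK_RATE / (1000 / 50)); /* 59659 */
	return 0;
}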
169
170
171/*
126 * Try to calibrate the TSC against the Programmable 172 * Try to calibrate the TSC against the Programmable
127 * Interrupt Timer and return the frequency of the TSC 173 * Interrupt Timer and return the frequency of the TSC
128 * in kHz. 174 * in kHz.
129 * 175 *
130 * Return ULONG_MAX on failure to calibrate. 176 * Return ULONG_MAX on failure to calibrate.
131 */ 177 */
132static unsigned long pit_calibrate_tsc(void) 178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
133{ 179{
134 u64 tsc, t1, t2, delta; 180 u64 tsc, t1, t2, delta;
135 unsigned long tscmin, tscmax; 181 unsigned long tscmin, tscmax;
@@ -144,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void)
144 * (LSB then MSB) to begin countdown. 190 * (LSB then MSB) to begin countdown.
145 */ 191 */
146 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
147 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
148 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
149 195
150 tsc = t1 = t2 = get_cycles(); 196 tsc = t1 = t2 = get_cycles();
151 197
@@ -166,31 +212,154 @@ static unsigned long pit_calibrate_tsc(void)
166 /* 212 /*
167 * Sanity checks: 213 * Sanity checks:
168 * 214 *
169 * If we were not able to read the PIT more than 5000 215 * If we were not able to read the PIT more than loopmin
170 * times, then we have been hit by a massive SMI 216 * times, then we have been hit by a massive SMI
171 * 217 *
172 * If the maximum is 10 times larger than the minimum, 218 * If the maximum is 10 times larger than the minimum,
173 * then we got hit by an SMI as well. 219 * then we got hit by an SMI as well.
174 */ 220 */
175 if (pitcnt < 5000 || tscmax > 10 * tscmin) 221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
176 return ULONG_MAX; 222 return ULONG_MAX;
177 223
178 /* Calculate the PIT value */ 224 /* Calculate the PIT value */
179 delta = t2 - t1; 225 delta = t2 - t1;
180 do_div(delta, 50); 226 do_div(delta, ms);
181 return delta; 227 return delta;
182} 228}
183 229
230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
 241 * _slightly_ slower (i.e. we allow up to a 2us read+counter
 242 * update - anything else implies an unacceptably slow CPU
 243 * or PIT for the fast calibration to work).
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
 256 * will then consider it a failure when they don't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
 263 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
 280 * 15ms calibration, which, assuming a 2us counter read
 281 * error, should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
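A worked instance of the closing formula, assuming the standard PIT_TICK_RATE of 1193182 Hz and a hypothetical ~2 GHz TSC (tsc_delta is invented to match the ~14.8ms that 69 iterations * 256 PIT ticks take):

#include <stdio.h>

#define PIT_TICK_RATE		1193182
#define QUICK_PIT_MS		15
#define QUICK_PIT_ITERATIONS	(QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)

int main(void)
{
	unsigned long long tsc_delta = 29608422ULL;	/* ~14.8ms at 2 GHz */
	unsigned long long khz = tsc_delta * PIT_TICK_RATE
			/ (QUICK_PIT_ITERATIONS * 256ULL * 1000);

	printf("QPI=%d, TSC ~ %llu kHz\n", QUICK_PIT_ITERATIONS, khz);
	return 0;
}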
184 347
185/** 348/**
186 * native_calibrate_tsc - calibrate the tsc on boot 349 * native_calibrate_tsc - calibrate the tsc on boot
187 */ 350 */
188unsigned long native_calibrate_tsc(void) 351unsigned long native_calibrate_tsc(void)
189{ 352{
190 u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; 353 u64 tsc1, tsc2, delta, ref1, ref2;
191 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; 354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
192 unsigned long flags; 355 unsigned long flags, latch, ms, fast_calibrate;
193 int hpet = is_hpet_enabled(), i; 356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
194 363
195 /* 364 /*
196 * Run 5 calibration loops to get the lowest frequency value 365 * Run 5 calibration loops to get the lowest frequency value
@@ -216,7 +385,13 @@ unsigned long native_calibrate_tsc(void)
216 * calibration delay loop as we have to wait for a certain 385 * calibration delay loop as we have to wait for a certain
217 * amount of time anyway. 386 * amount of time anyway.
218 */ 387 */
219 for (i = 0; i < 5; i++) { 388
389 /* Preset PIT loop values */
390 latch = CAL_LATCH;
391 ms = CAL_MS;
392 loopmin = CAL_PIT_LOOPS;
393
394 for (i = 0; i < 3; i++) {
220 unsigned long tsc_pit_khz; 395 unsigned long tsc_pit_khz;
221 396
222 /* 397 /*
@@ -226,16 +401,16 @@ unsigned long native_calibrate_tsc(void)
226 * read the end value. 401 * read the end value.
227 */ 402 */
228 local_irq_save(flags); 403 local_irq_save(flags);
229 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 404 tsc1 = tsc_read_refs(&ref1, hpet);
230 tsc_pit_khz = pit_calibrate_tsc(); 405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
231 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 406 tsc2 = tsc_read_refs(&ref2, hpet);
232 local_irq_restore(flags); 407 local_irq_restore(flags);
233 408
234 /* Pick the lowest PIT TSC calibration so far */ 409 /* Pick the lowest PIT TSC calibration so far */
235 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
236 411
237 /* hpet or pmtimer available ? */ 412 /* hpet or pmtimer available ? */
238 if (!hpet && !pm1 && !pm2) 413 if (!hpet && !ref1 && !ref2)
239 continue; 414 continue;
240 415
241 /* Check, whether the sampling was disturbed by an SMI */ 416 /* Check, whether the sampling was disturbed by an SMI */
@@ -243,23 +418,41 @@ unsigned long native_calibrate_tsc(void)
243 continue; 418 continue;
244 419
245 tsc2 = (tsc2 - tsc1) * 1000000LL; 420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
246 425
247 if (hpet) { 426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
248 if (hpet2 < hpet1) 427
249 hpet2 += 0x100000000ULL; 428 /* Check the reference deviation */
250 hpet2 -= hpet1; 429 delta = ((u64) tsc_pit_min) * 100;
251 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); 430 do_div(delta, tsc_ref_min);
252 do_div(tsc1, 1000000); 431
253 } else { 432 /*
254 if (pm2 < pm1) 433 * If both calibration results are inside a 10% window
 255 pm2 += (u64)ACPI_PM_OVRRUN; 434 * then we can be sure that the calibration
256 pm2 -= pm1; 435 * succeeded. We break out of the loop right away. We
257 tsc1 = pm2 * 1000000000LL; 436 * use the reference value, as it is more precise.
258 do_div(tsc1, PMTMR_TICKS_PER_SEC); 437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
259 } 443 }
260 444
261 do_div(tsc2, tsc1); 445 /*
262 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); 446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
263 } 456 }
264 457
265 /* 458 /*
@@ -270,7 +463,7 @@ unsigned long native_calibrate_tsc(void)
270 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); 463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
271 464
272 /* We don't have an alternative source, disable TSC */ 465 /* We don't have an alternative source, disable TSC */
273 if (!hpet && !pm1 && !pm2) { 466 if (!hpet && !ref1 && !ref2) {
274 printk("TSC: No reference (HPET/PMTIMER) available\n"); 467 printk("TSC: No reference (HPET/PMTIMER) available\n");
275 return 0; 468 return 0;
276 } 469 }
@@ -278,7 +471,7 @@ unsigned long native_calibrate_tsc(void)
278 /* The alternative source failed as well, disable TSC */ 471 /* The alternative source failed as well, disable TSC */
279 if (tsc_ref_min == ULONG_MAX) { 472 if (tsc_ref_min == ULONG_MAX) {
280 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " 473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
281 "failed due to SMI disturbance.\n"); 474 "failed.\n");
282 return 0; 475 return 0;
283 } 476 }
284 477
@@ -290,44 +483,25 @@ unsigned long native_calibrate_tsc(void)
290 } 483 }
291 484
292 /* We don't have an alternative source, use the PIT calibration value */ 485 /* We don't have an alternative source, use the PIT calibration value */
293 if (!hpet && !pm1 && !pm2) { 486 if (!hpet && !ref1 && !ref2) {
294 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
295 return tsc_pit_min; 488 return tsc_pit_min;
296 } 489 }
297 490
298 /* The alternative source failed, use the PIT calibration value */ 491 /* The alternative source failed, use the PIT calibration value */
299 if (tsc_ref_min == ULONG_MAX) { 492 if (tsc_ref_min == ULONG_MAX) {
300 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " 493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
301 "to SMI disturbance. Using PIT calibration\n"); 494 "Using PIT calibration\n");
302 return tsc_pit_min; 495 return tsc_pit_min;
303 } 496 }
304 497
305 /* Check the reference deviation */
306 delta = ((u64) tsc_pit_min) * 100;
307 do_div(delta, tsc_ref_min);
308
309 /*
310 * If both calibration results are inside a 5% window, the we
311 * use the lower frequency of those as it is probably the
312 * closest estimate.
313 */
314 if (delta >= 95 && delta <= 105) {
315 printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n",
316 hpet ? "HPET" : "PMTIMER");
317 printk(KERN_INFO "TSC: using %s calibration value\n",
318 tsc_pit_min <= tsc_ref_min ? "PIT" :
319 hpet ? "HPET" : "PMTIMER");
320 return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min;
321 }
322
323 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
324 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
325
326 /* 498 /*
327 * The calibration values differ too much. In doubt, we use 499 * The calibration values differ too much. In doubt, we use
328 * the PIT value as we know that there are PMTIMERs around 500 * the PIT value as we know that there are PMTIMERs around
329 * running at double speed. 501 * running at double speed. At least we let the user know:
330 */ 502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
331 printk(KERN_INFO "TSC: Using PIT calibration value\n"); 505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
332 return tsc_pit_min; 506 return tsc_pit_min;
333} 507}
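
The reworked loop above accepts a reference calibration only when the PIT result agrees with it to within 10%, computed by expressing tsc_pit_min as a percentage of tsc_ref_min. A minimal user-space sketch of that acceptance test (the helper name and sample values are hypothetical; the kernel uses do_div() because the dividend is a u64 on 32-bit):

#include <stdio.h>

/*
 * Sketch of the 10% acceptance window used by native_calibrate_tsc():
 * delta is the PIT result as a percentage of the HPET/PMTIMER result,
 * so 90..110 means the two calibrations agree to within 10%.
 */
static int calibration_matches(unsigned long tsc_pit_min,
                               unsigned long tsc_ref_min)
{
        unsigned long delta = (tsc_pit_min * 100UL) / tsc_ref_min;

        return delta >= 90 && delta <= 110;
}

int main(void)
{
        /* e.g. 997887 kHz from the PIT vs. 1000000 kHz from HPET: accept */
        printf("match: %d\n", calibration_matches(997887UL, 1000000UL));
        return 0;
}
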
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 00000000000..aeef529917e
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/irq.h>
13
14#include <asm/apic.h>
15#include <asm/uv/uv_irq.h>
16
17static void uv_noop(unsigned int irq)
18{
19}
20
21static unsigned int uv_noop_ret(unsigned int irq)
22{
23 return 0;
24}
25
26static void uv_ack_apic(unsigned int irq)
27{
28 ack_APIC_irq();
29}
30
31struct irq_chip uv_irq_chip = {
32 .name = "UV-CORE",
33 .startup = uv_noop_ret,
34 .shutdown = uv_noop,
35 .enable = uv_noop,
36 .disable = uv_noop,
37 .ack = uv_noop,
38 .mask = uv_noop,
39 .unmask = uv_noop,
40 .eoi = uv_ack_apic,
41 .end = uv_noop,
42};
43
44/*
45 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised.
48 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset)
51{
52 int irq;
53 int ret;
54
55 irq = create_irq();
56 if (irq <= 0)
57 return -EBUSY;
58
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
60 if (ret != irq)
61 destroy_irq(irq);
62
63 return ret;
64}
65EXPORT_SYMBOL_GPL(uv_setup_irq);
66
67/*
68 * Tear down a mapping of an irq and vector, and disable the specified MMR that
69 * defined the MSI that was to be sent to the specified CPU when an interrupt
70 * was raised.
71 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
75{
76 arch_disable_uv_irq(mmr_blade, mmr_offset);
77 destroy_irq(irq);
78}
79EXPORT_SYMBOL_GPL(uv_teardown_irq);
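
The two exports above form a bracket-style API for UV drivers. A hedged usage sketch follows; MMR_BLADE and MMR_OFFSET are hypothetical placeholders for values a real driver would derive from its hardware resources:

/*
 * Illustrative caller of uv_setup_irq()/uv_teardown_irq() (sketch only).
 */
static int example_uv_attach(void)
{
        int irq = uv_setup_irq("uv-example", 0 /* cpu */,
                               MMR_BLADE, MMR_OFFSET);

        if (irq < 0)
                return irq;     /* -EBUSY when no irq/vector was free */

        /* ... request_irq(irq, ...) and run the device ... */

        /* pass the same blade/offset back, as the comment above asks */
        uv_teardown_irq(irq, MMR_BLADE, MMR_OFFSET);
        return 0;
}
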
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 00000000000..67f9b9dbf80
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24
25struct kobject *sgi_uv_kobj;
26
27static ssize_t partition_id_show(struct kobject *kobj,
28 struct kobj_attribute *attr, char *buf)
29{
30 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
31}
32
33static ssize_t coherence_id_show(struct kobject *kobj,
34 struct kobj_attribute *attr, char *buf)
35{
36 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
37}
38
39static struct kobj_attribute partition_id_attr =
40 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
41
42static struct kobj_attribute coherence_id_attr =
43 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
44
45
46static int __init sgi_uv_sysfs_init(void)
47{
 48 int ret;
49
50 if (!sgi_uv_kobj)
51 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
52 if (!sgi_uv_kobj) {
 53 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
54 return -EINVAL;
55 }
56
57 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
58 if (ret) {
 59 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
60 return ret;
61 }
62
63 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
64 if (ret) {
 65 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
66 return ret;
67 }
68
69 return 0;
70}
71
72device_initcall(sgi_uv_sysfs_init);
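
Both attributes above follow the stock read-only kobj_attribute pattern under firmware_kobj; for reference, a condensed, hypothetical instance of the same idiom (the "example" names are not part of the patch):

static ssize_t example_show(struct kobject *kobj,
                            struct kobj_attribute *attr, char *buf)
{
        return snprintf(buf, PAGE_SIZE, "%d\n", 42);
}

static struct kobj_attribute example_attr =
        __ATTR(example, S_IRUGO, example_show, NULL);

static int __init example_sysfs_init(void)
{
        struct kobject *kobj = kobject_create_and_add("example",
                                                      firmware_kobj);

        if (!kobj)
                return -ENOMEM;
        /* appears as /sys/firmware/example/example, world-readable */
        return sysfs_create_file(kobj, &example_attr.attr);
}
device_initcall(example_sysfs_init);
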
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index 594ef47f0a6..0c9667f0752 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
@@ -498,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
498static unsigned int startup_cobalt_irq(unsigned int irq) 484static unsigned int startup_cobalt_irq(unsigned int irq)
499{ 485{
500 unsigned long flags; 486 unsigned long flags;
487 struct irq_desc *desc = irq_to_desc(irq);
501 488
502 spin_lock_irqsave(&cobalt_lock, flags); 489 spin_lock_irqsave(&cobalt_lock, flags);
503 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 490 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
504 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); 491 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
505 enable_cobalt_irq(irq); 492 enable_cobalt_irq(irq);
506 spin_unlock_irqrestore(&cobalt_lock, flags); 493 spin_unlock_irqrestore(&cobalt_lock, flags);
507 return 0; 494 return 0;
@@ -520,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
520static void end_cobalt_irq(unsigned int irq) 507static void end_cobalt_irq(unsigned int irq)
521{ 508{
522 unsigned long flags; 509 unsigned long flags;
510 struct irq_desc *desc = irq_to_desc(irq);
523 511
524 spin_lock_irqsave(&cobalt_lock, flags); 512 spin_lock_irqsave(&cobalt_lock, flags);
525 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) 513 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
526 enable_cobalt_irq(irq); 514 enable_cobalt_irq(irq);
527 spin_unlock_irqrestore(&cobalt_lock, flags); 515 spin_unlock_irqrestore(&cobalt_lock, flags);
528} 516}
@@ -640,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
640 628
641 spin_unlock_irqrestore(&i8259A_lock, flags); 629 spin_unlock_irqrestore(&i8259A_lock, flags);
642 630
643 desc = irq_desc + realirq; 631 desc = irq_to_desc(realirq);
644 632
645 /* 633 /*
646 * handle this 'virtual interrupt' as a Cobalt one now. 634 * handle this 'virtual interrupt' as a Cobalt one now.
647 */ 635 */
648 kstat_cpu(smp_processor_id()).irqs[realirq]++; 636 kstat_incr_irqs_this_cpu(realirq, desc);
649 637
650 if (likely(desc->action != NULL)) 638 if (likely(desc->action != NULL))
651 handle_IRQ_event(realirq, desc->action); 639 handle_IRQ_event(realirq, desc->action);
@@ -676,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
676 int i; 664 int i;
677 665
678 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 666 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
679 irq_desc[i].status = IRQ_DISABLED; 667 struct irq_desc *desc = irq_to_desc(i);
680 irq_desc[i].action = 0; 668
681 irq_desc[i].depth = 1; 669 desc->status = IRQ_DISABLED;
670 desc->action = 0;
671 desc->depth = 1;
682 672
683 if (i == 0) { 673 if (i == 0) {
684 irq_desc[i].chip = &cobalt_irq_type; 674 desc->chip = &cobalt_irq_type;
685 } 675 }
686 else if (i == CO_IRQ_IDE0) { 676 else if (i == CO_IRQ_IDE0) {
687 irq_desc[i].chip = &cobalt_irq_type; 677 desc->chip = &cobalt_irq_type;
688 } 678 }
689 else if (i == CO_IRQ_IDE1) { 679 else if (i == CO_IRQ_IDE1) {
690 irq_desc[i].chip = &cobalt_irq_type; 680 desc->chip = &cobalt_irq_type;
691 } 681 }
692 else if (i == CO_IRQ_8259) { 682 else if (i == CO_IRQ_8259) {
693 irq_desc[i].chip = &piix4_master_irq_type; 683 desc->chip = &piix4_master_irq_type;
694 } 684 }
695 else if (i < CO_IRQ_APIC0) { 685 else if (i < CO_IRQ_APIC0) {
696 irq_desc[i].chip = &piix4_virtual_irq_type; 686 desc->chip = &piix4_virtual_irq_type;
697 } 687 }
698 else if (IS_CO_APIC(i)) { 688 else if (IS_CO_APIC(i)) {
699 irq_desc[i].chip = &cobalt_irq_type; 689 desc->chip = &cobalt_irq_type;
700 } 690 }
701 } 691 }
702 692
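
The hunks above apply one mechanical conversion: direct irq_desc[irq] array indexing becomes an irq_to_desc(irq) lookup, which keeps working once descriptors are allocated sparsely. The shape of the converted code, as a sketch (my_chip is a placeholder):

static void example_init_one_irq(unsigned int irq, struct irq_chip *my_chip)
{
        /* was: irq_desc[irq].status = ...; irq_desc[irq].chip = ...; */
        struct irq_desc *desc = irq_to_desc(irq);

        desc->status = IRQ_DISABLED;
        desc->action = NULL;
        desc->depth = 1;
        desc->chip = my_chip;
}
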
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d..4eeb5cf9720 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
46#include <asm/io.h> 46#include <asm/io.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/irq.h> 48#include <asm/irq.h>
49#include <asm/syscalls.h>
49 50
50/* 51/*
51 * Known problems: 52 * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index 6ca515d6db5..8b6c393ab9f 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
235 const void *desc) 235 const void *desc)
236{ 236{
237 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
238 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
239} 239}
240 240
241static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -393,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
393} 393}
394#endif 394#endif
395 395
396static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
397{ 397{
398 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
400} 400}
401 401
402static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
403{ 403{
404 /* 404 /*
405 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -410,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
411} 411}
412 412
413static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
414{ 414{
415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
416 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
418} 418}
419 419
420static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
421{ 421{
422 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
424} 424}
425 425
426static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
427{ 427{
428 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -905,8 +905,8 @@ static inline int __init activate_vmi(void)
905#endif 905#endif
906 906
907#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
908 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
909 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe28..254ee07f863 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
235 235
236void __init vmi_time_init(void) 236void __init vmi_time_init(void)
237{ 237{
238 unsigned int cpu;
238 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic ticks. */ 239 /* Disable PIT: BIOSes start PIT CH0 with 18.2Hz periodic ticks. */
239 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240 241
241 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
244 for_each_possible_cpu(cpu)
245 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
243} 246}
244 247
245#ifdef CONFIG_X86_LOCAL_APIC 248#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index af5bdad8460..a9b8560adbc 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e8..46e05447405 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
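
The renamed .x86_cpu_dev.init section in both linker scripts implements a link-time registry: a registration macro drops a pointer into the section and boot code walks the __x86_cpu_dev_start/__x86_cpu_dev_end pair. A generic sketch of the idiom (not the exact 2.6.28 macros):

struct cpu_dev_sketch {
        const char *c_vendor;
        void (*c_init)(void);
};

/* emit one pointer into the section collected by the linker script */
#define cpu_dev_register_sketch(dev)                                    \
        static const struct cpu_dev_sketch *const __ptr_##dev           \
        __attribute__((used, section(".x86_cpu_dev.init"))) = &(dev)

extern const struct cpu_dev_sketch *const __x86_cpu_dev_start[];
extern const struct cpu_dev_sketch *const __x86_cpu_dev_end[];

static void walk_cpu_dev_registry(void)
{
        const struct cpu_dev_sketch *const *p;

        for (p = __x86_cpu_dev_start; p < __x86_cpu_dev_end; p++)
                (*p)->c_init();
}
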
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c..7766d36983f 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 00000000000..b13acb75e82
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
61 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy
 63 * fpstate layout without copying the extended state information
64 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
 88 printk(KERN_ERR "save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
 137 * For legacy compatibility, we always set the FP/SSE bits in the bit
 138 * vector while saving the state to the user context. This will
 139 * enable us to capture any changes (during sigreturn) to
 140 * the FP/SSE bits by legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
 247 * pointed to by the fpstate pointer in the sigcontext.
 248 * This will be saved whenever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251static void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
327 * for now OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
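
xsave_cntxt_init() above sizes everything from CPUID leaf 0xd. A small user-space sketch of that probe, assuming an XSAVE-capable x86 CPU and a GCC/Clang toolchain (the names here are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>

/* CPUID.0xD, subleaf 0: EDX:EAX = supported xstate feature bits,
 * EBX = save-area size needed for the currently enabled features. */
static void cpuid_count_sketch(uint32_t leaf, uint32_t sub, uint32_t *a,
                               uint32_t *b, uint32_t *c, uint32_t *d)
{
        __asm__ volatile("cpuid"
                         : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
                         : "a"(leaf), "c"(sub));
}

int main(void)
{
        uint32_t eax, ebx, ecx, edx;
        uint64_t mask;

        cpuid_count_sketch(0xd, 0, &eax, &ebx, &ecx, &edx);
        mask = eax | ((uint64_t)edx << 32);    /* pcntxt_mask analogue */
        printf("xstate features 0x%llx, ctx size %u bytes\n",
               (unsigned long long)mask, ebx);
        return 0;
}
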
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index d0e940bb6f4..c02343594b4 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -3,10 +3,13 @@
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o) 6 coalesced_mmio.o irq_comm.o)
7ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
9endif 9endif
10ifeq ($(CONFIG_DMAR),y)
11common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
12endif
10 13
11EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
12 15
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index c0f7872a912..11c6725fb79 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -200,13 +200,14 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps)
200 200
201 if (!atomic_inc_and_test(&pt->pending)) 201 if (!atomic_inc_and_test(&pt->pending))
202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
203 if (vcpu0 && waitqueue_active(&vcpu0->wq)) { 203
204 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; 204 if (vcpu0 && waitqueue_active(&vcpu0->wq))
205 wake_up_interruptible(&vcpu0->wq); 205 wake_up_interruptible(&vcpu0->wq);
206 }
207 206
208 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 207 hrtimer_add_expires_ns(&pt->timer, pt->period);
209 pt->scheduled = ktime_to_ns(pt->timer.expires); 208 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
209 if (pt->period)
210 ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
210 211
211 return (pt->period == 0 ? 0 : 1); 212 return (pt->period == 0 ? 0 : 1);
212} 213}
@@ -215,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
215{ 216{
216 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 217 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
217 218
218 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) 219 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
219 return atomic_read(&pit->pit_state.pit_timer.pending); 220 return atomic_read(&pit->pit_state.pit_timer.pending);
220
221 return 0; 221 return 0;
222} 222}
223 223
224static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
225{
226 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
227 irq_ack_notifier);
228 spin_lock(&ps->inject_lock);
229 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
230 atomic_inc(&ps->pit_timer.pending);
231 ps->irq_ack = 1;
232 spin_unlock(&ps->inject_lock);
233}
234
224static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 235static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
225{ 236{
226 struct kvm_kpit_state *ps; 237 struct kvm_kpit_state *ps;
@@ -246,7 +257,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
246 257
247 timer = &pit->pit_state.pit_timer.timer; 258 timer = &pit->pit_state.pit_timer.timer;
248 if (hrtimer_cancel(timer)) 259 if (hrtimer_cancel(timer))
249 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 260 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
250} 261}
251 262
252static void destroy_pit_timer(struct kvm_kpit_timer *pt) 263static void destroy_pit_timer(struct kvm_kpit_timer *pt)
@@ -255,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
255 hrtimer_cancel(&pt->timer); 266 hrtimer_cancel(&pt->timer);
256} 267}
257 268
258static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) 269static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
259{ 270{
271 struct kvm_kpit_timer *pt = &ps->pit_timer;
260 s64 interval; 272 s64 interval;
261 273
262 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 274 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -268,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
268 pt->period = (is_period == 0) ? 0 : interval; 280 pt->period = (is_period == 0) ? 0 : interval;
269 pt->timer.function = pit_timer_fn; 281 pt->timer.function = pit_timer_fn;
270 atomic_set(&pt->pending, 0); 282 atomic_set(&pt->pending, 0);
283 ps->irq_ack = 1;
271 284
272 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 285 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
273 HRTIMER_MODE_ABS); 286 HRTIMER_MODE_ABS);
@@ -302,11 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
302 case 1: 315 case 1:
303 /* FIXME: enhance mode 4 precision */ 316 /* FIXME: enhance mode 4 precision */
304 case 4: 317 case 4:
305 create_pit_timer(&ps->pit_timer, val, 0); 318 create_pit_timer(ps, val, 0);
306 break; 319 break;
307 case 2: 320 case 2:
308 case 3: 321 case 3:
309 create_pit_timer(&ps->pit_timer, val, 1); 322 create_pit_timer(ps, val, 1);
310 break; 323 break;
311 default: 324 default:
312 destroy_pit_timer(&ps->pit_timer); 325 destroy_pit_timer(&ps->pit_timer);
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
520 mutex_unlock(&pit->pit_state.lock); 533 mutex_unlock(&pit->pit_state.lock);
521 534
522 atomic_set(&pit->pit_state.pit_timer.pending, 0); 535 atomic_set(&pit->pit_state.pit_timer.pending, 0);
523 pit->pit_state.inject_pending = 1; 536 pit->pit_state.irq_ack = 1;
524} 537}
525 538
526struct kvm_pit *kvm_create_pit(struct kvm *kvm) 539struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
534 547
535 mutex_init(&pit->pit_state.lock); 548 mutex_init(&pit->pit_state.lock);
536 mutex_lock(&pit->pit_state.lock); 549 mutex_lock(&pit->pit_state.lock);
550 spin_lock_init(&pit->pit_state.inject_lock);
537 551
538 /* Initialize PIO device */ 552 /* Initialize PIO device */
539 pit->dev.read = pit_ioport_read; 553 pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
555 pit_state->pit = pit; 569 pit_state->pit = pit;
556 hrtimer_init(&pit_state->pit_timer.timer, 570 hrtimer_init(&pit_state->pit_timer.timer,
557 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 571 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
572 pit_state->irq_ack_notifier.gsi = 0;
573 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
574 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
558 mutex_unlock(&pit->pit_state.lock); 575 mutex_unlock(&pit->pit_state.lock);
559 576
560 kvm_pit_reset(pit); 577 kvm_pit_reset(pit);
@@ -578,10 +595,8 @@ void kvm_free_pit(struct kvm *kvm)
578static void __inject_pit_timer_intr(struct kvm *kvm) 595static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 596{
580 mutex_lock(&kvm->lock); 597 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 598 kvm_set_irq(kvm, 0, 1);
582 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); 599 kvm_set_irq(kvm, 0, 0);
583 kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
584 kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
585 mutex_unlock(&kvm->lock); 600 mutex_unlock(&kvm->lock);
586} 601}
587 602
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
592 struct kvm_kpit_state *ps; 607 struct kvm_kpit_state *ps;
593 608
594 if (vcpu && pit) { 609 if (vcpu && pit) {
610 int inject = 0;
595 ps = &pit->pit_state; 611 ps = &pit->pit_state;
596 612
597 /* Try to inject pending interrupts when: 613 /* Try to inject pending interrupts when
598 * 1. Pending exists 614 * the last one has been acked.
599 * 2. Last interrupt was accepted or waited for too long time*/ 615 */
600 if (atomic_read(&ps->pit_timer.pending) && 616 spin_lock(&ps->inject_lock);
601 (ps->inject_pending || 617 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
602 (jiffies - ps->last_injected_time 618 ps->irq_ack = 0;
603 >= KVM_MAX_PIT_INTR_INTERVAL))) { 619 inject = 1;
604 ps->inject_pending = 0;
605 __inject_pit_timer_intr(kvm);
606 ps->last_injected_time = jiffies;
607 }
608 }
609}
610
611void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
612{
613 struct kvm_arch *arch = &vcpu->kvm->arch;
614 struct kvm_kpit_state *ps;
615
616 if (vcpu && arch->vpit) {
617 ps = &arch->vpit->pit_state;
618 if (atomic_read(&ps->pit_timer.pending) &&
619 (((arch->vpic->pics[0].imr & 1) == 0 &&
620 arch->vpic->pics[0].irq_base == vec) ||
621 (arch->vioapic->redirtbl[0].fields.vector == vec &&
622 arch->vioapic->redirtbl[0].fields.mask != 1))) {
623 ps->inject_pending = 1;
624 atomic_dec(&ps->pit_timer.pending);
625 ps->channels[0].count_load_time = ktime_get();
626 } 620 }
621 spin_unlock(&ps->inject_lock);
622 if (inject)
623 __inject_pit_timer_intr(kvm);
627 } 624 }
628} 625}
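
The rewrite above replaces the old "accepted or waited too long" heuristic with an explicit handshake: a tick is injected only when the previous one has been acked, and the ack notifier consumes one pending count and re-arms irq_ack. Stripped to its essentials (locking and the atomic dance omitted, types reduced for illustration):

struct pit_handshake_sketch {
        int pending;    /* undelivered timer ticks */
        int irq_ack;    /* 1 once the guest acked the last tick */
};

static int try_inject(struct pit_handshake_sketch *s)
{
        if (s->pending && s->irq_ack) {
                s->irq_ack = 0;
                return 1;       /* raise, then lower, IRQ0 to the guest */
        }
        return 0;               /* wait for the ack notifier */
}

static void on_ack(struct pit_handshake_sketch *s)
{
        if (s->pending > 0)     /* atomic_dec/inc pair in kvm_pit_ack_irq */
                s->pending--;
        s->irq_ack = 1;         /* allow the next injection */
}
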
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c..e436d4983aa 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
8 int irq; 8 int irq;
9 s64 period; /* unit: ns */ 9 s64 period; /* unit: ns */
10 s64 scheduled; 10 s64 scheduled;
11 ktime_t last_update;
12 atomic_t pending; 11 atomic_t pending;
13}; 12};
14 13
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
34 u32 speaker_data_on; 33 u32 speaker_data_on;
35 struct mutex lock; 34 struct mutex lock;
36 struct kvm_pit *pit; 35 struct kvm_pit *pit;
37 bool inject_pending; /* if inject pending interrupts */ 36 spinlock_t inject_lock;
38 unsigned long last_injected_time; 37 unsigned long irq_ack;
38 struct kvm_irq_ack_notifier irq_ack_notifier;
39}; 39};
40 40
41struct kvm_pit { 41struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
54#define KVM_PIT_CHANNEL_MASK 0x3 54#define KVM_PIT_CHANNEL_MASK 0x3
55 55
56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
57void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
58void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 57void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
59struct kvm_pit *kvm_create_pit(struct kvm *kvm); 58struct kvm_pit *kvm_create_pit(struct kvm *kvm);
60void kvm_free_pit(struct kvm *kvm); 59void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index c31164e8aa4..17e41e165f1 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
30 30
31#include <linux/kvm_host.h> 31#include <linux/kvm_host.h>
32 32
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{
35 s->isr &= ~(1 << irq);
36 s->isr_ack |= (1 << irq);
37}
38
39void kvm_pic_clear_isr_ack(struct kvm *kvm)
40{
41 struct kvm_pic *s = pic_irqchip(kvm);
42 s->pics[0].isr_ack = 0xff;
43 s->pics[1].isr_ack = 0xff;
44}
45
33/* 46/*
34 * set irq level. If an edge is detected, then the IRR is set to 1 47 * set irq level. If an edge is detected, then the IRR is set to 1
35 */ 48 */
@@ -141,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
141 */ 154 */
142static inline void pic_intack(struct kvm_kpic_state *s, int irq) 155static inline void pic_intack(struct kvm_kpic_state *s, int irq)
143{ 156{
157 s->isr |= 1 << irq;
144 if (s->auto_eoi) { 158 if (s->auto_eoi) {
145 if (s->rotate_on_auto_eoi) 159 if (s->rotate_on_auto_eoi)
146 s->priority_add = (irq + 1) & 7; 160 s->priority_add = (irq + 1) & 7;
147 } else 161 pic_clear_isr(s, irq);
148 s->isr |= (1 << irq); 162 }
149 /* 163 /*
150 * We don't clear a level sensitive interrupt here 164 * We don't clear a level sensitive interrupt here
151 */ 165 */
@@ -153,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
153 s->irr &= ~(1 << irq); 167 s->irr &= ~(1 << irq);
154} 168}
155 169
156int kvm_pic_read_irq(struct kvm_pic *s) 170int kvm_pic_read_irq(struct kvm *kvm)
157{ 171{
158 int irq, irq2, intno; 172 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm);
159 174
160 irq = pic_get_irq(&s->pics[0]); 175 irq = pic_get_irq(&s->pics[0]);
161 if (irq >= 0) { 176 if (irq >= 0) {
@@ -181,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
181 intno = s->pics[0].irq_base + irq; 196 intno = s->pics[0].irq_base + irq;
182 } 197 }
183 pic_update_irq(s); 198 pic_update_irq(s);
199 kvm_notify_acked_irq(kvm, irq);
184 200
185 return intno; 201 return intno;
186} 202}
187 203
188void kvm_pic_reset(struct kvm_kpic_state *s) 204void kvm_pic_reset(struct kvm_kpic_state *s)
189{ 205{
206 int irq, irqbase;
207 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209
210 if (s == &s->pics_state->pics[0])
211 irqbase = 0;
212 else
213 irqbase = 8;
214
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq))
218 kvm_notify_acked_irq(kvm, irq+irqbase);
219 }
190 s->last_irr = 0; 220 s->last_irr = 0;
191 s->irr = 0; 221 s->irr = 0;
192 s->imr = 0; 222 s->imr = 0;
193 s->isr = 0; 223 s->isr = 0;
224 s->isr_ack = 0xff;
194 s->priority_add = 0; 225 s->priority_add = 0;
195 s->irq_base = 0; 226 s->irq_base = 0;
196 s->read_reg_select = 0; 227 s->read_reg_select = 0;
@@ -243,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
243 priority = get_priority(s, s->isr); 274 priority = get_priority(s, s->isr);
244 if (priority != 8) { 275 if (priority != 8) {
245 irq = (priority + s->priority_add) & 7; 276 irq = (priority + s->priority_add) & 7;
246 s->isr &= ~(1 << irq); 277 pic_clear_isr(s, irq);
247 if (cmd == 5) 278 if (cmd == 5)
248 s->priority_add = (irq + 1) & 7; 279 s->priority_add = (irq + 1) & 7;
249 pic_update_irq(s->pics_state); 280 pic_update_irq(s->pics_state);
@@ -251,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
251 break; 282 break;
252 case 3: 283 case 3:
253 irq = val & 7; 284 irq = val & 7;
254 s->isr &= ~(1 << irq); 285 pic_clear_isr(s, irq);
255 pic_update_irq(s->pics_state); 286 pic_update_irq(s->pics_state);
256 break; 287 break;
257 case 6: 288 case 6:
@@ -260,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
260 break; 291 break;
261 case 7: 292 case 7:
262 irq = val & 7; 293 irq = val & 7;
263 s->isr &= ~(1 << irq);
264 s->priority_add = (irq + 1) & 7; 294 s->priority_add = (irq + 1) & 7;
295 pic_clear_isr(s, irq);
265 pic_update_irq(s->pics_state); 296 pic_update_irq(s->pics_state);
266 break; 297 break;
267 default: 298 default:
@@ -303,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
303 s->pics_state->pics[0].irr &= ~(1 << 2); 334 s->pics_state->pics[0].irr &= ~(1 << 2);
304 } 335 }
305 s->irr &= ~(1 << ret); 336 s->irr &= ~(1 << ret);
306 s->isr &= ~(1 << ret); 337 pic_clear_isr(s, ret);
307 if (addr1 >> 7 || ret != 2) 338 if (addr1 >> 7 || ret != 2)
308 pic_update_irq(s->pics_state); 339 pic_update_irq(s->pics_state);
309 } else { 340 } else {
@@ -422,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
422{ 453{
423 struct kvm *kvm = opaque; 454 struct kvm *kvm = opaque;
424 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 455 struct kvm_vcpu *vcpu = kvm->vcpus[0];
456 struct kvm_pic *s = pic_irqchip(kvm);
457 int irq = pic_get_irq(&s->pics[0]);
425 458
426 pic_irqchip(kvm)->output = level; 459 s->output = level;
427 if (vcpu) 460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq);
428 kvm_vcpu_kick(vcpu); 462 kvm_vcpu_kick(vcpu);
463 }
429} 464}
430 465
431struct kvm_pic *kvm_create_pic(struct kvm *kvm) 466struct kvm_pic *kvm_create_pic(struct kvm *kvm)
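
The pic_irq_request() change above adds a per-irq throttle: the vCPU is kicked only if the isr_ack bit is still set, and the bit is only restored when the guest EOIs via pic_clear_isr(). A reduced sketch of that gate (not the actual KVM structures):

struct pic_gate_sketch {
        unsigned char isr_ack;  /* bit set = a kick for this irq is allowed */
};

static int should_kick_vcpu(struct pic_gate_sketch *s, int irq, int level)
{
        if (level && (s->isr_ack & (1 << irq))) {
                s->isr_ack &= (unsigned char)~(1 << irq);
                return 1;       /* kvm_vcpu_kick() in the real code */
        }
        return 0;               /* already kicked and not yet EOIed */
}

static void on_eoi(struct pic_gate_sketch *s, int irq)
{
        s->isr_ack |= (unsigned char)(1 << irq);  /* pic_clear_isr() */
}
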
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f66..c019b8edcdb 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
72 if (kvm_apic_accept_pic_intr(v)) { 72 if (kvm_apic_accept_pic_intr(v)) {
73 s = pic_irqchip(v->kvm); 73 s = pic_irqchip(v->kvm);
74 s->output = 0; /* PIC */ 74 s->output = 0; /* PIC */
75 vector = kvm_pic_read_irq(s); 75 vector = kvm_pic_read_irq(v->kvm);
76 } 76 }
77 } 77 }
78 return vector; 78 return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
91{ 91{
92 kvm_apic_timer_intr_post(vcpu, vec); 92 kvm_apic_timer_intr_post(vcpu, vec);
93 kvm_pit_timer_intr_post(vcpu, vec);
94 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
95} 94}
96EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 95EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7ca47cbb48b..f17c8f5bbf3 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -42,6 +42,7 @@ struct kvm_kpic_state {
42 u8 irr; /* interrupt request register */ 42 u8 irr; /* interrupt request register */
43 u8 imr; /* interrupt mask register */ 43 u8 imr; /* interrupt mask register */
44 u8 isr; /* interrupt service register */ 44 u8 isr; /* interrupt service register */
45 u8 isr_ack; /* interrupt ack detection */
45 u8 priority_add; /* highest irq priority */ 46 u8 priority_add; /* highest irq priority */
46 u8 irq_base; 47 u8 irq_base;
47 u8 read_reg_select; 48 u8 read_reg_select;
@@ -63,12 +64,13 @@ struct kvm_pic {
63 void *irq_request_opaque; 64 void *irq_request_opaque;
64 int output; /* intr from master PIC */ 65 int output; /* intr from master PIC */
65 struct kvm_io_device dev; 66 struct kvm_io_device dev;
67 void (*ack_notifier)(void *opaque, int irq);
66}; 68};
67 69
68struct kvm_pic *kvm_create_pic(struct kvm *kvm); 70struct kvm_pic *kvm_create_pic(struct kvm *kvm);
69void kvm_pic_set_irq(void *opaque, int irq, int level); 71int kvm_pic_read_irq(struct kvm *kvm);
70int kvm_pic_read_irq(struct kvm_pic *s);
71void kvm_pic_update_irq(struct kvm_pic *s); 72void kvm_pic_update_irq(struct kvm_pic *s);
73void kvm_pic_clear_isr_ack(struct kvm *kvm);
72 74
73static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 75static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
74{ 76{
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 00000000000..1ff819dce7d
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H
3
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg)
6{
7 if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
8 kvm_x86_ops->cache_reg(vcpu, reg);
9
10 return vcpu->arch.regs[reg];
11}
12
13static inline void kvm_register_write(struct kvm_vcpu *vcpu,
14 enum kvm_reg reg,
15 unsigned long val)
16{
17 vcpu->arch.regs[reg] = val;
18 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
19 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
20}
21
22static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
23{
24 return kvm_register_read(vcpu, VCPU_REGS_RIP);
25}
26
27static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
28{
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30}
31
32#endif
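
The accessors above implement lazy register caching: a read pulls the value from the VMCS/VMCB only if its regs_avail bit is clear, and a write sets regs_dirty so the value is flushed before the next guest entry. A hedged usage sketch, e.g. for skipping an emulated instruction (the function name and insn_len parameter are illustrative):

static void skip_emulated_insn_sketch(struct kvm_vcpu *vcpu,
                                      unsigned long insn_len)
{
        /* first read may call kvm_x86_ops->cache_reg(); later reads hit
         * the vcpu->arch.regs[] cache */
        unsigned long rip = kvm_rip_read(vcpu);

        /* marks RIP both dirty and available; written back on vmentry */
        kvm_rip_write(vcpu, rip + insn_len);
}
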
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 73f43de69f6..0fc3cab4894 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include "kvm_cache_regs.h"
35#include "irq.h" 36#include "irq.h"
36 37
37#define PRId64 "d" 38#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
338 } else 339 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR); 340 apic_clear_vector(vector, apic->regs + APIC_TMR);
340 341
341 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 342 kvm_vcpu_kick(vcpu);
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348 343
349 result = (orig_irr == 0); 344 result = (orig_irr == 0);
350 break; 345 break;
@@ -370,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
370 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 365 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
371 kvm_vcpu_kick(vcpu); 366 kvm_vcpu_kick(vcpu);
372 } else { 367 } else {
373 printk(KERN_DEBUG 368 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
374 "Ignoring de-assert INIT to vcpu %d\n", 369 vcpu->vcpu_id);
375 vcpu->vcpu_id);
376 } 370 }
377
378 break; 371 break;
379 372
380 case APIC_DM_STARTUP: 373 case APIC_DM_STARTUP:
381 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 374 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
382 vcpu->vcpu_id, vector); 375 vcpu->vcpu_id, vector);
383 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 376 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
384 vcpu->arch.sipi_vector = vector; 377 vcpu->arch.sipi_vector = vector;
385 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 378 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
386 if (waitqueue_active(&vcpu->wq)) 379 kvm_vcpu_kick(vcpu);
387 wake_up_interruptible(&vcpu->wq);
388 } 380 }
389 break; 381 break;
390 382
@@ -438,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
438static void apic_set_eoi(struct kvm_lapic *apic) 430static void apic_set_eoi(struct kvm_lapic *apic)
439{ 431{
440 int vector = apic_find_highest_isr(apic); 432 int vector = apic_find_highest_isr(apic);
441 433 int trigger_mode;
442 /* 434 /*
443 * Not every EOI write will have a corresponding ISR, 435 * Not every EOI write will have a corresponding ISR,
444 * one example is when the kernel checks the timer on setup_IO_APIC 436 * one example is when the kernel checks the timer on setup_IO_APIC
@@ -450,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
450 apic_update_ppr(apic); 442 apic_update_ppr(apic);
451 443
452 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
453 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); 445 trigger_mode = IOAPIC_LEVEL_TRIG;
446 else
447 trigger_mode = IOAPIC_EDGE_TRIG;
448 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
454} 449}
455 450
456static void apic_send_ipi(struct kvm_lapic *apic) 451static void apic_send_ipi(struct kvm_lapic *apic)
@@ -558,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
558 struct kvm_run *run = vcpu->run; 553 struct kvm_run *run = vcpu->run;
559 554
560 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 555 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
561 kvm_x86_ops->cache_regs(vcpu); 556 run->tpr_access.rip = kvm_rip_read(vcpu);
562 run->tpr_access.rip = vcpu->arch.rip;
563 run->tpr_access.is_write = write; 557 run->tpr_access.is_write = write;
564} 558}
565 559
@@ -683,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
683 * Refer SDM 8.4.1 677 * Refer SDM 8.4.1
684 */ 678 */
685 if (len != 4 || alignment) { 679 if (len != 4 || alignment) {
686 if (printk_ratelimit()) 680 /* Don't shout loud, $infamous_os would cause only noise. */
687 printk(KERN_ERR "apic write: bad size=%d %lx\n", 681 apic_debug("apic write: bad size=%d %lx\n",
688 len, (long)address); 682 len, (long)address);
689 return; 683 return;
690 } 684 }
691 685
@@ -947,15 +941,12 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
947 941
948 if(!atomic_inc_and_test(&apic->timer.pending)) 942 if(!atomic_inc_and_test(&apic->timer.pending))
949 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
950 if (waitqueue_active(q)) { 944 if (waitqueue_active(q))
951 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
952 wake_up_interruptible(q); 945 wake_up_interruptible(q);
953 } 946
954 if (apic_lvtt_period(apic)) { 947 if (apic_lvtt_period(apic)) {
955 result = 1; 948 result = 1;
956 apic->timer.dev.expires = ktime_add_ns( 949 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
957 apic->timer.dev.expires,
958 apic->timer.period);
959 } 950 }
960 return result; 951 return result;
961} 952}
@@ -1124,7 +1115,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1124 1115
1125 timer = &apic->timer.dev; 1116 timer = &apic->timer.dev;
1126 if (hrtimer_cancel(timer)) 1117 if (hrtimer_cancel(timer))
1127 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 1118 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1128} 1119}
1129 1120
1130void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1121void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
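
The timer hunks above track the 2.6.28 range-hrtimer rework, after which timer->expires is no longer poked directly; the expires accessor helpers are used instead. The before/after shape, as a sketch (period_ns is a placeholder):

static void rearm_periodic_sketch(struct hrtimer *timer, u64 period_ns)
{
        /* was: timer->expires = ktime_add_ns(timer->expires, period_ns); */
        hrtimer_add_expires_ns(timer, period_ns);
}

static void migrate_timer_sketch(struct hrtimer *timer)
{
        /* was: hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); */
        if (hrtimer_cancel(timer))
                hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
}
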
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3da2508eb22..99c239c5c0a 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -70,6 +70,9 @@ static int dbg = 0;
70module_param(dbg, bool, 0644); 70module_param(dbg, bool, 0644);
71#endif 71#endif
72 72
73static int oos_shadow = 1;
74module_param(oos_shadow, bool, 0644);
75
73#ifndef MMU_DEBUG 76#ifndef MMU_DEBUG
74#define ASSERT(x) do { } while (0) 77#define ASSERT(x) do { } while (0)
75#else 78#else
@@ -135,18 +138,24 @@ module_param(dbg, bool, 0644);
135#define ACC_USER_MASK PT_USER_MASK 138#define ACC_USER_MASK PT_USER_MASK
136#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 139#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
137 140
138struct kvm_pv_mmu_op_buffer { 141#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
139 void *ptr;
140 unsigned len;
141 unsigned processed;
142 char buf[512] __aligned(sizeof(long));
143};
144 142
145struct kvm_rmap_desc { 143struct kvm_rmap_desc {
146 u64 *shadow_ptes[RMAP_EXT]; 144 u64 *shadow_ptes[RMAP_EXT];
147 struct kvm_rmap_desc *more; 145 struct kvm_rmap_desc *more;
148}; 146};
149 147
148struct kvm_shadow_walk {
149 int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
150 u64 addr, u64 *spte, int level);
151};
152
153struct kvm_unsync_walk {
154 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
155};
156
157typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
158
150static struct kmem_cache *pte_chain_cache; 159static struct kmem_cache *pte_chain_cache;
151static struct kmem_cache *rmap_desc_cache; 160static struct kmem_cache *rmap_desc_cache;
152static struct kmem_cache *mmu_page_header_cache; 161static struct kmem_cache *mmu_page_header_cache;
@@ -405,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
405{ 414{
406 struct vm_area_struct *vma; 415 struct vm_area_struct *vma;
407 unsigned long addr; 416 unsigned long addr;
417 int ret = 0;
408 418
409 addr = gfn_to_hva(kvm, gfn); 419 addr = gfn_to_hva(kvm, gfn);
410 if (kvm_is_error_hva(addr)) 420 if (kvm_is_error_hva(addr))
411 return 0; 421 return ret;
412 422
423 down_read(&current->mm->mmap_sem);
413 vma = find_vma(current->mm, addr); 424 vma = find_vma(current->mm, addr);
414 if (vma && is_vm_hugetlb_page(vma)) 425 if (vma && is_vm_hugetlb_page(vma))
415 return 1; 426 ret = 1;
427 up_read(&current->mm->mmap_sem);
416 428
417 return 0; 429 return ret;
418} 430}
419 431
420static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 432static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -649,8 +661,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
649 661
650 if (write_protected) 662 if (write_protected)
651 kvm_flush_remote_tlbs(kvm); 663 kvm_flush_remote_tlbs(kvm);
652
653 account_shadowed(kvm, gfn);
654} 664}
655 665
656static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) 666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
@@ -859,6 +869,77 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
859 BUG(); 869 BUG();
860} 870}
861 871
872
873static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
874 mmu_parent_walk_fn fn)
875{
876 struct kvm_pte_chain *pte_chain;
877 struct hlist_node *node;
878 struct kvm_mmu_page *parent_sp;
879 int i;
880
881 if (!sp->multimapped && sp->parent_pte) {
882 parent_sp = page_header(__pa(sp->parent_pte));
883 fn(vcpu, parent_sp);
884 mmu_parent_walk(vcpu, parent_sp, fn);
885 return;
886 }
887 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
888 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
889 if (!pte_chain->parent_ptes[i])
890 break;
891 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
892 fn(vcpu, parent_sp);
893 mmu_parent_walk(vcpu, parent_sp, fn);
894 }
895}
896
897static void kvm_mmu_update_unsync_bitmap(u64 *spte)
898{
899 unsigned int index;
900 struct kvm_mmu_page *sp = page_header(__pa(spte));
901
902 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap);
904 sp->unsync_children = 1;
905}
906
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
908{
909 struct kvm_pte_chain *pte_chain;
910 struct hlist_node *node;
911 int i;
912
913 if (!sp->parent_pte)
914 return;
915
916 if (!sp->multimapped) {
917 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
918 return;
919 }
920
921 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
922 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
923 if (!pte_chain->parent_ptes[i])
924 break;
925 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
926 }
927}
928
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp);
933 return 1;
934}
935
936static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
937 struct kvm_mmu_page *sp)
938{
939 mmu_parent_walk(vcpu, sp, unsync_walk_fn);
940 kvm_mmu_update_parents_unsync(sp);
941}
942
862static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, 943static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
863 struct kvm_mmu_page *sp) 944 struct kvm_mmu_page *sp)
864{ 945{
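
NOTE: mmu_parent_walk() above recurses from a shadow page up through every parent — either the single parent_pte or each pte_chain entry — applying the callback at every level; unsync_walk_fn then uses it to set unsync_children all the way to the root. A self-contained sketch of the single-parent case, using a toy struct rather than the kernel's kvm_mmu_page:

    #include <stdio.h>

    struct page {
            struct page *parent;    /* single-parent case only, for brevity */
            int unsync_children;
    };

    typedef int (*walk_fn)(struct page *p);

    /* Models mmu_parent_walk(): visit every ancestor, bottom up. */
    static void parent_walk(struct page *p, walk_fn fn)
    {
            if (!p->parent)
                    return;
            fn(p->parent);
            parent_walk(p->parent, fn);
    }

    static int mark_unsync(struct page *p)
    {
            p->unsync_children = 1;
            return 1;
    }

    int main(void)
    {
            struct page root = { NULL, 0 };
            struct page mid  = { &root, 0 };
            struct page leaf = { &mid, 0 };

            parent_walk(&leaf, mark_unsync);
            printf("root=%d mid=%d\n", root.unsync_children, mid.unsync_children);
            return 0;
    }
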
@@ -868,6 +949,58 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
868 sp->spt[i] = shadow_trap_nonpresent_pte; 949 sp->spt[i] = shadow_trap_nonpresent_pte;
869} 950}
870 951
952static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
953 struct kvm_mmu_page *sp)
954{
955 return 1;
956}
957
958static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{
960}
961
962#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1))
966
967static int mmu_unsync_walk(struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker)
969{
970 int i, ret;
971
972 if (!sp->unsync_children)
973 return 0;
974
975 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i];
977
978 if (is_shadow_present_pte(ent)) {
979 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK);
981
982 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker);
984 if (ret)
985 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 }
988
989 if (child->unsync) {
990 ret = walker->entry(child, walker);
991 __clear_bit(i, sp->unsync_child_bitmap);
992 if (ret)
993 return ret;
994 }
995 }
996 }
997
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0;
1000
1001 return 0;
1002}
1003
871static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
872{ 1005{
873 unsigned index; 1006 unsigned index;
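
NOTE: mmu_unsync_walk() above turns the 512-bit unsync_child_bitmap into a pruned depth-first search — bits are cleared as children are resolved, and the summary flag drops once the bitmap empties. The same clear-as-you-go idea on a single 64-bit word, as a hedged sketch (64 entries instead of 512):

    #include <stdint.h>
    #include <stdio.h>

    /* Models for_each_unsync_children(): visit each set bit, clear it
     * once the child is handled, drop the summary flag when empty. */
    static void visit_unsync(uint64_t *bitmap, int *unsync_children)
    {
            while (*bitmap) {
                    int idx = __builtin_ctzll(*bitmap); /* lowest set bit */

                    printf("handling child %d\n", idx);
                    *bitmap &= ~(1ULL << idx);          /* __clear_bit() */
            }
            *unsync_children = 0;           /* bitmap is now empty */
    }

    int main(void)
    {
            uint64_t bm = (1ULL << 3) | (1ULL << 17);
            int unsync = 1;

            visit_unsync(&bm, &unsync);
            return unsync;
    }
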
@@ -888,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
888 return NULL; 1021 return NULL;
889} 1022}
890 1023
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{
1026 WARN_ON(!sp->unsync);
1027 sp->unsync = 0;
1028 --kvm->stat.mmu_unsync;
1029}
1030
1031static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1032
1033static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1034{
1035 if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1036 kvm_mmu_zap_page(vcpu->kvm, sp);
1037 return 1;
1038 }
1039
1040 rmap_write_protect(vcpu->kvm, sp->gfn);
1041 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1042 kvm_mmu_zap_page(vcpu->kvm, sp);
1043 return 1;
1044 }
1045
1046 kvm_mmu_flush_tlb(vcpu);
1047 kvm_unlink_unsync_page(vcpu->kvm, sp);
1048 return 0;
1049}
1050
1051struct sync_walker {
1052 struct kvm_vcpu *vcpu;
1053 struct kvm_unsync_walk walker;
1054};
1055
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1057{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061
1062 kvm_sync_page(vcpu, sp);
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1064}
1065
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1067{
1068 struct sync_walker walker = {
1069 .walker = { .entry = mmu_sync_fn, },
1070 .vcpu = vcpu,
1071 };
1072
1073 while (mmu_unsync_walk(sp, &walker.walker))
1074 cond_resched_lock(&vcpu->kvm->mmu_lock);
1075}
1076
891static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
892 gfn_t gfn, 1078 gfn_t gfn,
893 gva_t gaddr, 1079 gva_t gaddr,
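
NOTE: sync_walker above embeds the generic kvm_unsync_walk header and recovers its private state inside the callback via container_of(); mmu_sync_fn also returns whether the walk should pause so mmu_sync_children() can drop and retake mmu_lock. The embedding trick in isolation, as an illustrative sketch:

    #include <stddef.h>
    #include <stdio.h>

    #define container_of(ptr, type, member) \
            ((type *)((char *)(ptr) - offsetof(type, member)))

    struct walker { int (*entry)(struct walker *w); };

    /* Private state wrapped around the generic callback header. */
    struct sync_walker {
            int synced;
            struct walker walker;
    };

    static int sync_fn(struct walker *w)
    {
            struct sync_walker *sw = container_of(w, struct sync_walker, walker);

            sw->synced++;       /* reach the outer struct from the inner */
            return 0;
    }

    int main(void)
    {
            struct sync_walker sw = { .synced = 0, .walker = { .entry = sync_fn } };

            sw.walker.entry(&sw.walker);
            printf("synced=%d\n", sw.synced);
            return 0;
    }
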
@@ -901,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
901 unsigned quadrant; 1087 unsigned quadrant;
902 struct hlist_head *bucket; 1088 struct hlist_head *bucket;
903 struct kvm_mmu_page *sp; 1089 struct kvm_mmu_page *sp;
904 struct hlist_node *node; 1090 struct hlist_node *node, *tmp;
905 1091
906 role.word = 0; 1092 role.word = 0;
907 role.glevels = vcpu->arch.mmu.root_level; 1093 role.glevels = vcpu->arch.mmu.root_level;
@@ -917,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
917 gfn, role.word); 1103 gfn, role.word);
918 index = kvm_page_table_hashfn(gfn); 1104 index = kvm_page_table_hashfn(gfn);
919 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1105 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
920 hlist_for_each_entry(sp, node, bucket, hash_link) 1106 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
921 if (sp->gfn == gfn && sp->role.word == role.word) { 1107 if (sp->gfn == gfn) {
1108 if (sp->unsync)
1109 if (kvm_sync_page(vcpu, sp))
1110 continue;
1111
1112 if (sp->role.word != role.word)
1113 continue;
1114
922 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1115 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1116 if (sp->unsync_children) {
1117 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1118 kvm_mmu_mark_parents_unsync(vcpu, sp);
1119 }
923 pgprintk("%s: found\n", __func__); 1120 pgprintk("%s: found\n", __func__);
924 return sp; 1121 return sp;
925 } 1122 }
@@ -931,8 +1128,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
931 sp->gfn = gfn; 1128 sp->gfn = gfn;
932 sp->role = role; 1129 sp->role = role;
933 hlist_add_head(&sp->hash_link, bucket); 1130 hlist_add_head(&sp->hash_link, bucket);
934 if (!metaphysical) 1131 if (!metaphysical) {
935 rmap_write_protect(vcpu->kvm, gfn); 1132 rmap_write_protect(vcpu->kvm, gfn);
1133 account_shadowed(vcpu->kvm, gfn);
1134 }
936 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) 1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
937 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1136 vcpu->arch.mmu.prefetch_page(vcpu, sp);
938 else 1137 else
@@ -940,6 +1139,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
940 return sp; 1139 return sp;
941} 1140}
942 1141
1142static int walk_shadow(struct kvm_shadow_walk *walker,
1143 struct kvm_vcpu *vcpu, u64 addr)
1144{
1145 hpa_t shadow_addr;
1146 int level;
1147 int r;
1148 u64 *sptep;
1149 unsigned index;
1150
1151 shadow_addr = vcpu->arch.mmu.root_hpa;
1152 level = vcpu->arch.mmu.shadow_root_level;
1153 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK;
1156 --level;
1157 }
1158
1159 while (level >= PT_PAGE_TABLE_LEVEL) {
1160 index = SHADOW_PT_INDEX(addr, level);
1161 sptep = ((u64 *)__va(shadow_addr)) + index;
1162 r = walker->entry(walker, vcpu, addr, sptep, level);
1163 if (r)
1164 return r;
1165 shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1166 --level;
1167 }
1168 return 0;
1169}
1170
943static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1171static void kvm_mmu_page_unlink_children(struct kvm *kvm,
944 struct kvm_mmu_page *sp) 1172 struct kvm_mmu_page *sp)
945{ 1173{
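
NOTE: walk_shadow() above factors the level-by-level page-table descent out of fetch()/__direct_map(): each visited entry is handed to a callback as (addr, sptep, level), and a non-zero return stops the walk. A compact model of that contract over a toy two-level table (the index math and sizes here are made up for the example):

    #include <stdint.h>
    #include <stdio.h>

    #define LEVELS  2
    #define ENTRIES 4

    typedef int (*entry_fn)(uint64_t addr, uint64_t *pte, int level);

    /* Models walk_shadow(): descend one level per step; a non-zero
     * callback return ends the walk early. */
    static int walk(uint64_t table[LEVELS][ENTRIES], uint64_t addr, entry_fn fn)
    {
            for (int level = LEVELS; level >= 1; level--) {
                    int index = (addr >> (2 * (level - 1))) & (ENTRIES - 1);
                    int r = fn(addr, &table[LEVELS - level][index], level);

                    if (r)
                            return r;
            }
            return 0;
    }

    static int print_entry(uint64_t addr, uint64_t *pte, int level)
    {
            printf("addr %#llx level %d pte %llu\n",
                   (unsigned long long)addr, level, (unsigned long long)*pte);
            return 0;   /* keep walking */
    }

    int main(void)
    {
            uint64_t table[LEVELS][ENTRIES] = { { 1, 2, 3, 4 }, { 5, 6, 7, 8 } };

            return walk(table, 0x6, print_entry);
    }
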
@@ -955,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
955 rmap_remove(kvm, &pt[i]); 1183 rmap_remove(kvm, &pt[i]);
956 pt[i] = shadow_trap_nonpresent_pte; 1184 pt[i] = shadow_trap_nonpresent_pte;
957 } 1185 }
958 kvm_flush_remote_tlbs(kvm);
959 return; 1186 return;
960 } 1187 }
961 1188
@@ -974,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
974 } 1201 }
975 pt[i] = shadow_trap_nonpresent_pte; 1202 pt[i] = shadow_trap_nonpresent_pte;
976 } 1203 }
977 kvm_flush_remote_tlbs(kvm);
978} 1204}
979 1205
980static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1206static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -991,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
991 kvm->vcpus[i]->arch.last_pte_updated = NULL; 1217 kvm->vcpus[i]->arch.last_pte_updated = NULL;
992} 1218}
993 1219
994static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1220static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
995{ 1221{
996 u64 *parent_pte; 1222 u64 *parent_pte;
997 1223
998 ++kvm->stat.mmu_shadow_zapped;
999 while (sp->multimapped || sp->parent_pte) { 1224 while (sp->multimapped || sp->parent_pte) {
1000 if (!sp->multimapped) 1225 if (!sp->multimapped)
1001 parent_pte = sp->parent_pte; 1226 parent_pte = sp->parent_pte;
@@ -1010,21 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1010 kvm_mmu_put_page(sp, parent_pte); 1235 kvm_mmu_put_page(sp, parent_pte);
1011 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1236 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
1012 } 1237 }
1238}
1239
1240struct zap_walker {
1241 struct kvm_unsync_walk walker;
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1249 walker);
1250 kvm_mmu_zap_page(zap_walk->kvm, sp);
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0;
1265 mmu_unsync_walk(sp, &walker.walker);
1266 return walker.zapped;
1267}
1268
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1270{
1271 int ret;
1272 ++kvm->stat.mmu_shadow_zapped;
1273 ret = mmu_zap_unsync_children(kvm, sp);
1013 kvm_mmu_page_unlink_children(kvm, sp); 1274 kvm_mmu_page_unlink_children(kvm, sp);
1275 kvm_mmu_unlink_parents(kvm, sp);
1276 kvm_flush_remote_tlbs(kvm);
1277 if (!sp->role.invalid && !sp->role.metaphysical)
1278 unaccount_shadowed(kvm, sp->gfn);
1279 if (sp->unsync)
1280 kvm_unlink_unsync_page(kvm, sp);
1014 if (!sp->root_count) { 1281 if (!sp->root_count) {
1015 if (!sp->role.metaphysical && !sp->role.invalid)
1016 unaccount_shadowed(kvm, sp->gfn);
1017 hlist_del(&sp->hash_link); 1282 hlist_del(&sp->hash_link);
1018 kvm_mmu_free_page(kvm, sp); 1283 kvm_mmu_free_page(kvm, sp);
1019 } else { 1284 } else {
1020 int invalid = sp->role.invalid;
1021 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1022 sp->role.invalid = 1; 1285 sp->role.invalid = 1;
1286 list_move(&sp->link, &kvm->arch.active_mmu_pages);
1023 kvm_reload_remote_mmus(kvm); 1287 kvm_reload_remote_mmus(kvm);
1024 if (!sp->role.metaphysical && !invalid)
1025 unaccount_shadowed(kvm, sp->gfn);
1026 } 1288 }
1027 kvm_mmu_reset_last_pte_updated(kvm); 1289 kvm_mmu_reset_last_pte_updated(kvm);
1290 return ret;
1028} 1291}
1029 1292
1030/* 1293/*
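
NOTE: kvm_mmu_zap_page() now reports whether it zapped anything beyond the page it was handed (it zaps unsync children first via mmu_zap_unsync_children()), which is why later hunks restart bucket scans from bucket->first on a positive return — a freed neighbour invalidates hlist cursors. The restart idiom on a plain singly linked list, sketched with invented names:

    #include <stdlib.h>

    struct node { struct node *next; int gfn; };

    /* After any deletion the cursor may be stale, so rescan from head. */
    static void zap_gfn(struct node **head, int gfn)
    {
    restart:
            for (struct node **pp = head; *pp; pp = &(*pp)->next) {
                    struct node *n = *pp;

                    if (n->gfn == gfn) {
                            *pp = n->next;      /* unlink */
                            free(n);
                            goto restart;       /* n = bucket->first */
                    }
            }
    }

    int main(void)
    {
            struct node *head = NULL;

            for (int i = 0; i < 3; i++) {
                    struct node *n = malloc(sizeof(*n));

                    if (!n)
                            return 1;
                    n->gfn = i % 2;             /* list becomes 0,1,0 head-first */
                    n->next = head;
                    head = n;
            }
            zap_gfn(&head, 0);
            return head && head->gfn == 1 && !head->next ? 0 : 1;
    }
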
@@ -1077,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1077 if (sp->gfn == gfn && !sp->role.metaphysical) { 1340 if (sp->gfn == gfn && !sp->role.metaphysical) {
1078 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1341 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
1079 sp->role.word); 1342 sp->role.word);
1080 kvm_mmu_zap_page(kvm, sp);
1081 r = 1; 1343 r = 1;
1344 if (kvm_mmu_zap_page(kvm, sp))
1345 n = bucket->first;
1082 } 1346 }
1083 return r; 1347 return r;
1084} 1348}
@@ -1101,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1101 __set_bit(slot, &sp->slot_bitmap); 1365 __set_bit(slot, &sp->slot_bitmap);
1102} 1366}
1103 1367
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1369{
1370 int i;
1371 u64 *pt = sp->spt;
1372
1373 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1374 return;
1375
1376 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1377 if (pt[i] == shadow_notrap_nonpresent_pte)
1378 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1379 }
1380}
1381
1104struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) 1382struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1105{ 1383{
1106 struct page *page; 1384 struct page *page;
@@ -1110,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1110 if (gpa == UNMAPPED_GVA) 1388 if (gpa == UNMAPPED_GVA)
1111 return NULL; 1389 return NULL;
1112 1390
1113 down_read(&current->mm->mmap_sem);
1114 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1391 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1115 up_read(&current->mm->mmap_sem);
1116 1392
1117 return page; 1393 return page;
1118} 1394}
1119 1395
1120static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1121 unsigned pt_access, unsigned pte_access,
1122 int user_fault, int write_fault, int dirty,
1123 int *ptwrite, int largepage, gfn_t gfn,
1124 pfn_t pfn, bool speculative)
1125{ 1397{
1126 u64 spte; 1398 unsigned index;
1127 int was_rmapped = 0; 1399 struct hlist_head *bucket;
1128 int was_writeble = is_writeble_pte(*shadow_pte); 1400 struct kvm_mmu_page *s;
1401 struct hlist_node *node, *n;
1129 1402
1130 pgprintk("%s: spte %llx access %x write_fault %d" 1403 index = kvm_page_table_hashfn(sp->gfn);
1131 " user_fault %d gfn %lx\n", 1404 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1132 __func__, *shadow_pte, pt_access, 1405 /* don't unsync if pagetable is shadowed with multiple roles */
1133 write_fault, user_fault, gfn); 1406 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1407 if (s->gfn != sp->gfn || s->role.metaphysical)
1408 continue;
1409 if (s->role.word != sp->role.word)
1410 return 1;
1411 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1;
1415 mmu_convert_notrap(sp);
1416 return 0;
1417}
1134 1418
1135 if (is_rmap_pte(*shadow_pte)) { 1419static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1136 /* 1420 bool can_unsync)
1137 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1421{
1138 * the parent of the now unreachable PTE. 1422 struct kvm_mmu_page *shadow;
1139 */
1140 if (largepage && !is_large_pte(*shadow_pte)) {
1141 struct kvm_mmu_page *child;
1142 u64 pte = *shadow_pte;
1143 1423
1144 child = page_header(pte & PT64_BASE_ADDR_MASK); 1424 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1145 mmu_page_remove_parent_pte(child, shadow_pte); 1425 if (shadow) {
1146 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1426 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1147 pgprintk("hfn old %lx new %lx\n", 1427 return 1;
1148 spte_to_pfn(*shadow_pte), pfn); 1428 if (shadow->unsync)
1149 rmap_remove(vcpu->kvm, shadow_pte); 1429 return 0;
1150 } else { 1430 if (can_unsync && oos_shadow)
1151 if (largepage) 1431 return kvm_unsync_page(vcpu, shadow);
1152 was_rmapped = is_large_pte(*shadow_pte); 1432 return 1;
1153 else
1154 was_rmapped = 1;
1155 }
1156 } 1433 }
1434 return 0;
1435}
1157 1436
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync)
1442{
1443 u64 spte;
1444 int ret = 0;
1158 /* 1445 /*
1159 * We don't set the accessed bit, since we sometimes want to see 1446 * We don't set the accessed bit, since we sometimes want to see
1160 * whether the guest actually used the pte (in order to detect 1447 * whether the guest actually used the pte (in order to detect
@@ -1162,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1162 */ 1449 */
1163 spte = shadow_base_present_pte | shadow_dirty_mask; 1450 spte = shadow_base_present_pte | shadow_dirty_mask;
1164 if (!speculative) 1451 if (!speculative)
1165 pte_access |= PT_ACCESSED_MASK; 1452 spte |= shadow_accessed_mask;
1166 if (!dirty) 1453 if (!dirty)
1167 pte_access &= ~ACC_WRITE_MASK; 1454 pte_access &= ~ACC_WRITE_MASK;
1168 if (pte_access & ACC_EXEC_MASK) 1455 if (pte_access & ACC_EXEC_MASK)
@@ -1178,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1178 1465
1179 if ((pte_access & ACC_WRITE_MASK) 1466 if ((pte_access & ACC_WRITE_MASK)
1180 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1467 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1181 struct kvm_mmu_page *shadow; 1468
1469 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1470 ret = 1;
1471 spte = shadow_trap_nonpresent_pte;
1472 goto set_pte;
1473 }
1182 1474
1183 spte |= PT_WRITABLE_MASK; 1475 spte |= PT_WRITABLE_MASK;
1184 1476
1185 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1186 if (shadow ||
1187 (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1188 pgprintk("%s: found shadow page for %lx, marking ro\n", 1478 pgprintk("%s: found shadow page for %lx, marking ro\n",
1189 __func__, gfn); 1479 __func__, gfn);
1480 ret = 1;
1190 pte_access &= ~ACC_WRITE_MASK; 1481 pte_access &= ~ACC_WRITE_MASK;
1191 if (is_writeble_pte(spte)) { 1482 if (is_writeble_pte(spte))
1192 spte &= ~PT_WRITABLE_MASK; 1483 spte &= ~PT_WRITABLE_MASK;
1193 kvm_x86_ops->tlb_flush(vcpu);
1194 }
1195 if (write_fault)
1196 *ptwrite = 1;
1197 } 1484 }
1198 } 1485 }
1199 1486
1200 if (pte_access & ACC_WRITE_MASK) 1487 if (pte_access & ACC_WRITE_MASK)
1201 mark_page_dirty(vcpu->kvm, gfn); 1488 mark_page_dirty(vcpu->kvm, gfn);
1202 1489
1203 pgprintk("%s: setting spte %llx\n", __func__, spte); 1490set_pte:
1204 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1205 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1206 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1207 set_shadow_pte(shadow_pte, spte); 1491 set_shadow_pte(shadow_pte, spte);
1208 if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) 1492 return ret;
1209 && (spte & PT_PRESENT_MASK)) 1493}
1494
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn,
1499 pfn_t pfn, bool speculative)
1500{
1501 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte);
1503
1504 pgprintk("%s: spte %llx access %x write_fault %d"
1505 " user_fault %d gfn %lx\n",
1506 __func__, *shadow_pte, pt_access,
1507 write_fault, user_fault, gfn);
1508
1509 if (is_rmap_pte(*shadow_pte)) {
1510 /*
1511 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1512 * the parent of the now unreachable PTE.
1513 */
1514 if (largepage && !is_large_pte(*shadow_pte)) {
1515 struct kvm_mmu_page *child;
1516 u64 pte = *shadow_pte;
1517
1518 child = page_header(pte & PT64_BASE_ADDR_MASK);
1519 mmu_page_remove_parent_pte(child, shadow_pte);
1520 } else if (pfn != spte_to_pfn(*shadow_pte)) {
1521 pgprintk("hfn old %lx new %lx\n",
1522 spte_to_pfn(*shadow_pte), pfn);
1523 rmap_remove(vcpu->kvm, shadow_pte);
1524 } else {
1525 if (largepage)
1526 was_rmapped = is_large_pte(*shadow_pte);
1527 else
1528 was_rmapped = 1;
1529 }
1530 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) {
1533 if (write_fault)
1534 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu);
1536 }
1537
1538 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1539 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1540 is_large_pte(*shadow_pte)? "2MB" : "4kB",
1541 is_present_pte(*shadow_pte)?"RW":"R", gfn,
1542 *shadow_pte, shadow_pte);
1543 if (!was_rmapped && is_large_pte(*shadow_pte))
1210 ++vcpu->kvm->stat.lpages; 1544 ++vcpu->kvm->stat.lpages;
1211 1545
1212 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1546 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
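
NOTE: the write-protection decision is now isolated in mmu_need_write_protect()/set_spte(), and set_spte() returns non-zero when the requested writable mapping had to be demoted, leaving mmu_set_spte() alone to flush the TLB and raise *ptwrite. Roughly this split, as a sketch under invented bit names (the real return value also covers the large-page conflict case):

    #include <stdbool.h>
    #include <stdint.h>

    #define PTE_PRESENT (1ULL << 0)
    #define PTE_WRITE   (1ULL << 1)

    /* Builder: computes the final bits, reports a demotion to the caller. */
    static bool build_pte(uint64_t *pte, bool want_write, bool must_protect)
    {
            uint64_t v = PTE_PRESENT;
            bool demoted = false;

            if (want_write) {
                    if (must_protect)
                            demoted = true;     /* install ro, not rw */
                    else
                            v |= PTE_WRITE;
            }
            *pte = v;
            return demoted;
    }

    /* Caller: reacts to the report (flush + ptwrite in the real code). */
    static void install_pte(uint64_t *pte, bool want_write, bool must_protect,
                            int *ptwrite)
    {
            if (build_pte(pte, want_write, must_protect))
                    *ptwrite = 1;
    }

    int main(void)
    {
            uint64_t pte;
            int ptwrite = 0;

            install_pte(&pte, true, true, &ptwrite);
            return (ptwrite == 1 && !(pte & PTE_WRITE)) ? 0 : 1;
    }
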
@@ -1230,54 +1564,67 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1230{ 1564{
1231} 1565}
1232 1566
1233static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1567struct direct_shadow_walk {
1234 int largepage, gfn_t gfn, pfn_t pfn, 1568 struct kvm_shadow_walk walker;
1235 int level) 1569 pfn_t pfn;
1236{ 1570 int write;
1237 hpa_t table_addr = vcpu->arch.mmu.root_hpa; 1571 int largepage;
1238 int pt_write = 0; 1572 int pt_write;
1239 1573};
1240 for (; ; level--) {
1241 u32 index = PT64_INDEX(v, level);
1242 u64 *table;
1243
1244 ASSERT(VALID_PAGE(table_addr));
1245 table = __va(table_addr);
1246 1574
1247 if (level == 1) { 1575static int direct_map_entry(struct kvm_shadow_walk *_walk,
1248 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1576 struct kvm_vcpu *vcpu,
1249 0, write, 1, &pt_write, 0, gfn, pfn, false); 1577 u64 addr, u64 *sptep, int level)
1250 return pt_write; 1578{
1251 } 1579 struct direct_shadow_walk *walk =
1580 container_of(_walk, struct direct_shadow_walk, walker);
1581 struct kvm_mmu_page *sp;
1582 gfn_t pseudo_gfn;
1583 gfn_t gfn = addr >> PAGE_SHIFT;
1584
1585 if (level == PT_PAGE_TABLE_LEVEL
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed;
1591 return 1;
1592 }
1252 1593
1253 if (largepage && level == 2) { 1594 if (*sptep == shadow_trap_nonpresent_pte) {
1254 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1595 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1255 0, write, 1, &pt_write, 1, gfn, pfn, false); 1596 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1256 return pt_write; 1597 1, ACC_ALL, sptep);
1598 if (!sp) {
1599 pgprintk("nonpaging_map: ENOMEM\n");
1600 kvm_release_pfn_clean(walk->pfn);
1601 return -ENOMEM;
1257 } 1602 }
1258 1603
1259 if (table[index] == shadow_trap_nonpresent_pte) { 1604 set_shadow_pte(sptep,
1260 struct kvm_mmu_page *new_table; 1605 __pa(sp->spt)
1261 gfn_t pseudo_gfn; 1606 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1262 1607 | shadow_user_mask | shadow_x_mask);
1263 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1264 >> PAGE_SHIFT;
1265 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1266 v, level - 1,
1267 1, ACC_ALL, &table[index]);
1268 if (!new_table) {
1269 pgprintk("nonpaging_map: ENOMEM\n");
1270 kvm_release_pfn_clean(pfn);
1271 return -ENOMEM;
1272 }
1273
1274 set_shadow_pte(&table[index],
1275 __pa(new_table->spt)
1276 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1277 | shadow_user_mask | shadow_x_mask);
1278 }
1279 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1280 } 1608 }
1609 return 0;
1610}
1611
1612static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1613 int largepage, gfn_t gfn, pfn_t pfn)
1614{
1615 int r;
1616 struct direct_shadow_walk walker = {
1617 .walker = { .entry = direct_map_entry, },
1618 .pfn = pfn,
1619 .largepage = largepage,
1620 .write = write,
1621 .pt_write = 0,
1622 };
1623
1624 r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1625 if (r < 0)
1626 return r;
1627 return walker.pt_write;
1281} 1628}
1282 1629
1283static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1630static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1287,16 +1634,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1287 pfn_t pfn; 1634 pfn_t pfn;
1288 unsigned long mmu_seq; 1635 unsigned long mmu_seq;
1289 1636
1290 down_read(&current->mm->mmap_sem);
1291 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1637 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1292 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1638 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1293 largepage = 1; 1639 largepage = 1;
1294 } 1640 }
1295 1641
1296 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1642 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1297 /* implicit mb(), we'll read before PT lock is unlocked */ 1643 smp_rmb();
1298 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1644 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1299 up_read(&current->mm->mmap_sem);
1300 1645
1301 /* mmio */ 1646 /* mmio */
1302 if (is_error_pfn(pfn)) { 1647 if (is_error_pfn(pfn)) {
@@ -1308,8 +1653,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1308 if (mmu_notifier_retry(vcpu, mmu_seq)) 1653 if (mmu_notifier_retry(vcpu, mmu_seq))
1309 goto out_unlock; 1654 goto out_unlock;
1310 kvm_mmu_free_some_pages(vcpu); 1655 kvm_mmu_free_some_pages(vcpu);
1311 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1656 r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1312 PT32E_ROOT_LEVEL);
1313 spin_unlock(&vcpu->kvm->mmu_lock); 1657 spin_unlock(&vcpu->kvm->mmu_lock);
1314 1658
1315 1659
@@ -1405,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1405 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1749 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1406} 1750}
1407 1751
1752static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1753{
1754 int i;
1755 struct kvm_mmu_page *sp;
1756
1757 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1758 return;
1759 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1760 hpa_t root = vcpu->arch.mmu.root_hpa;
1761 sp = page_header(root);
1762 mmu_sync_children(vcpu, sp);
1763 return;
1764 }
1765 for (i = 0; i < 4; ++i) {
1766 hpa_t root = vcpu->arch.mmu.pae_root[i];
1767
1768 if (root) {
1769 root &= PT64_BASE_ADDR_MASK;
1770 sp = page_header(root);
1771 mmu_sync_children(vcpu, sp);
1772 }
1773 }
1774}
1775
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{
1778 spin_lock(&vcpu->kvm->mmu_lock);
1779 mmu_sync_roots(vcpu);
1780 spin_unlock(&vcpu->kvm->mmu_lock);
1781}
1782
1408static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1409{ 1784{
1410 return vaddr; 1785 return vaddr;
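
NOTE: mmu_sync_roots() above handles the two root layouts — one 64-bit root versus four PAE roots, any of which may be absent — and kvm_mmu_sync_roots() is just the mmu_lock wrapper around it. The shape of that dispatch, sketched with plain integers standing in for hpa_t roots:

    #include <stdio.h>

    #define PAE_ROOTS 4

    static void sync_root(unsigned long root)
    {
            printf("sync children of root %#lx\n", root);
    }

    /* One long-mode root, or up to four PAE roots (0 = not present). */
    static void sync_roots(int long_mode, unsigned long root,
                           const unsigned long pae_root[PAE_ROOTS])
    {
            if (long_mode) {
                    sync_root(root);
                    return;
            }
            for (int i = 0; i < PAE_ROOTS; i++)
                    if (pae_root[i])
                            sync_root(pae_root[i]);
    }

    int main(void)
    {
            unsigned long pae[PAE_ROOTS] = { 0x1000, 0, 0x3000, 0 };

            sync_roots(0, 0, pae);
            return 0;
    }
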
@@ -1446,15 +1821,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1446 if (r) 1821 if (r)
1447 return r; 1822 return r;
1448 1823
1449 down_read(&current->mm->mmap_sem);
1450 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1824 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1451 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1825 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1452 largepage = 1; 1826 largepage = 1;
1453 } 1827 }
1454 mmu_seq = vcpu->kvm->mmu_notifier_seq; 1828 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1455 /* implicit mb(), we'll read before PT lock is unlocked */ 1829 smp_rmb();
1456 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1830 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1457 up_read(&current->mm->mmap_sem);
1458 if (is_error_pfn(pfn)) { 1831 if (is_error_pfn(pfn)) {
1459 kvm_release_pfn_clean(pfn); 1832 kvm_release_pfn_clean(pfn);
1460 return 1; 1833 return 1;
@@ -1464,7 +1837,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1464 goto out_unlock; 1837 goto out_unlock;
1465 kvm_mmu_free_some_pages(vcpu); 1838 kvm_mmu_free_some_pages(vcpu);
1466 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1839 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1467 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1840 largepage, gfn, pfn);
1468 spin_unlock(&vcpu->kvm->mmu_lock); 1841 spin_unlock(&vcpu->kvm->mmu_lock);
1469 1842
1470 return r; 1843 return r;
@@ -1489,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1489 context->gva_to_gpa = nonpaging_gva_to_gpa; 1862 context->gva_to_gpa = nonpaging_gva_to_gpa;
1490 context->free = nonpaging_free; 1863 context->free = nonpaging_free;
1491 context->prefetch_page = nonpaging_prefetch_page; 1864 context->prefetch_page = nonpaging_prefetch_page;
1865 context->sync_page = nonpaging_sync_page;
1866 context->invlpg = nonpaging_invlpg;
1492 context->root_level = 0; 1867 context->root_level = 0;
1493 context->shadow_root_level = PT32E_ROOT_LEVEL; 1868 context->shadow_root_level = PT32E_ROOT_LEVEL;
1494 context->root_hpa = INVALID_PAGE; 1869 context->root_hpa = INVALID_PAGE;
@@ -1536,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1536 context->page_fault = paging64_page_fault; 1911 context->page_fault = paging64_page_fault;
1537 context->gva_to_gpa = paging64_gva_to_gpa; 1912 context->gva_to_gpa = paging64_gva_to_gpa;
1538 context->prefetch_page = paging64_prefetch_page; 1913 context->prefetch_page = paging64_prefetch_page;
1914 context->sync_page = paging64_sync_page;
1915 context->invlpg = paging64_invlpg;
1539 context->free = paging_free; 1916 context->free = paging_free;
1540 context->root_level = level; 1917 context->root_level = level;
1541 context->shadow_root_level = level; 1918 context->shadow_root_level = level;
@@ -1557,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1557 context->gva_to_gpa = paging32_gva_to_gpa; 1934 context->gva_to_gpa = paging32_gva_to_gpa;
1558 context->free = paging_free; 1935 context->free = paging_free;
1559 context->prefetch_page = paging32_prefetch_page; 1936 context->prefetch_page = paging32_prefetch_page;
1937 context->sync_page = paging32_sync_page;
1938 context->invlpg = paging32_invlpg;
1560 context->root_level = PT32_ROOT_LEVEL; 1939 context->root_level = PT32_ROOT_LEVEL;
1561 context->shadow_root_level = PT32E_ROOT_LEVEL; 1940 context->shadow_root_level = PT32E_ROOT_LEVEL;
1562 context->root_hpa = INVALID_PAGE; 1941 context->root_hpa = INVALID_PAGE;
@@ -1576,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1576 context->page_fault = tdp_page_fault; 1955 context->page_fault = tdp_page_fault;
1577 context->free = nonpaging_free; 1956 context->free = nonpaging_free;
1578 context->prefetch_page = nonpaging_prefetch_page; 1957 context->prefetch_page = nonpaging_prefetch_page;
1958 context->sync_page = nonpaging_sync_page;
1959 context->invlpg = nonpaging_invlpg;
1579 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 1960 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1580 context->root_hpa = INVALID_PAGE; 1961 context->root_hpa = INVALID_PAGE;
1581 1962
@@ -1647,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
1647 spin_lock(&vcpu->kvm->mmu_lock); 2028 spin_lock(&vcpu->kvm->mmu_lock);
1648 kvm_mmu_free_some_pages(vcpu); 2029 kvm_mmu_free_some_pages(vcpu);
1649 mmu_alloc_roots(vcpu); 2030 mmu_alloc_roots(vcpu);
2031 mmu_sync_roots(vcpu);
1650 spin_unlock(&vcpu->kvm->mmu_lock); 2032 spin_unlock(&vcpu->kvm->mmu_lock);
1651 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2033 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1652 kvm_mmu_flush_tlb(vcpu); 2034 kvm_mmu_flush_tlb(vcpu);
@@ -1767,15 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1767 return; 2149 return;
1768 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2150 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1769 2151
1770 down_read(&current->mm->mmap_sem);
1771 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { 2152 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1772 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2153 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1773 vcpu->arch.update_pte.largepage = 1; 2154 vcpu->arch.update_pte.largepage = 1;
1774 } 2155 }
1775 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; 2156 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
1776 /* implicit mb(), we'll read before PT lock is unlocked */ 2157 smp_rmb();
1777 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2158 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1778 up_read(&current->mm->mmap_sem);
1779 2159
1780 if (is_error_pfn(pfn)) { 2160 if (is_error_pfn(pfn)) {
1781 kvm_release_pfn_clean(pfn); 2161 kvm_release_pfn_clean(pfn);
@@ -1837,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1837 index = kvm_page_table_hashfn(gfn); 2217 index = kvm_page_table_hashfn(gfn);
1838 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1839 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2219 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1840 if (sp->gfn != gfn || sp->role.metaphysical) 2220 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
1841 continue; 2221 continue;
1842 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2222 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1843 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2223 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1855,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1855 */ 2235 */
1856 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2236 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1857 gpa, bytes, sp->role.word); 2237 gpa, bytes, sp->role.word);
1858 kvm_mmu_zap_page(vcpu->kvm, sp); 2238 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239 n = bucket->first;
1859 ++vcpu->kvm->stat.mmu_flooded; 2240 ++vcpu->kvm->stat.mmu_flooded;
1860 continue; 2241 continue;
1861 } 2242 }
@@ -1969,6 +2350,16 @@ out:
1969} 2350}
1970EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1971 2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
1972void kvm_enable_tdp(void) 2363void kvm_enable_tdp(void)
1973{ 2364{
1974 tdp_enabled = true; 2365 tdp_enabled = true;
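
NOTE: kvm_mmu_invlpg() dispatches through the per-mode context->invlpg hook installed by the init_context functions above, then flushes the TLB and bumps the stat. The ops-table pattern in miniature, with hypothetical names rather than the kernel's:

    #include <stdio.h>

    struct mmu_ops {
            void (*invlpg)(unsigned long gva);
    };

    static void nonpaging_invlpg(unsigned long gva)
    {
            (void)gva;          /* nothing shadowed, nothing to drop */
    }

    static void paging64_invlpg(unsigned long gva)
    {
            printf("drop shadow pte for %#lx\n", gva);
    }

    /* Generic entry point: dispatch, then (in the real code) flush + stat. */
    static void generic_invlpg(struct mmu_ops *ops, unsigned long gva)
    {
            ops->invlpg(gva);
    }

    int main(void)
    {
            struct mmu_ops mmu = { .invlpg = paging64_invlpg };

            generic_invlpg(&mmu, 0x1000);
            mmu.invlpg = nonpaging_invlpg;
            generic_invlpg(&mmu, 0x1000);
            return 0;
    }
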
@@ -2055,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2055{ 2446{
2056 struct kvm_mmu_page *sp; 2447 struct kvm_mmu_page *sp;
2057 2448
2449 spin_lock(&kvm->mmu_lock);
2058 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2450 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
2059 int i; 2451 int i;
2060 u64 *pt; 2452 u64 *pt;
@@ -2068,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
2068 if (pt[i] & PT_WRITABLE_MASK) 2460 if (pt[i] & PT_WRITABLE_MASK)
2069 pt[i] &= ~PT_WRITABLE_MASK; 2461 pt[i] &= ~PT_WRITABLE_MASK;
2070 } 2462 }
2463 kvm_flush_remote_tlbs(kvm);
2464 spin_unlock(&kvm->mmu_lock);
2071} 2465}
2072 2466
2073void kvm_mmu_zap_all(struct kvm *kvm) 2467void kvm_mmu_zap_all(struct kvm *kvm)
@@ -2076,7 +2470,9 @@ void kvm_mmu_zap_all(struct kvm *kvm)
2076 2470
2077 spin_lock(&kvm->mmu_lock); 2471 spin_lock(&kvm->mmu_lock);
2078 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2472 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
2079 kvm_mmu_zap_page(kvm, sp); 2473 if (kvm_mmu_zap_page(kvm, sp))
2474 node = container_of(kvm->arch.active_mmu_pages.next,
2475 struct kvm_mmu_page, link);
2080 spin_unlock(&kvm->mmu_lock); 2476 spin_unlock(&kvm->mmu_lock);
2081 2477
2082 kvm_flush_remote_tlbs(kvm); 2478 kvm_flush_remote_tlbs(kvm);
@@ -2291,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2291 gpa_t addr, unsigned long *ret) 2687 gpa_t addr, unsigned long *ret)
2292{ 2688{
2293 int r; 2689 int r;
2294 struct kvm_pv_mmu_op_buffer buffer; 2690 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2295 2691
2296 buffer.ptr = buffer.buf; 2692 buffer->ptr = buffer->buf;
2297 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2693 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2298 buffer.processed = 0; 2694 buffer->processed = 0;
2299 2695
2300 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2696 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2301 if (r) 2697 if (r)
2302 goto out; 2698 goto out;
2303 2699
2304 while (buffer.len) { 2700 while (buffer->len) {
2305 r = kvm_pv_mmu_op_one(vcpu, &buffer); 2701 r = kvm_pv_mmu_op_one(vcpu, buffer);
2306 if (r < 0) 2702 if (r < 0)
2307 goto out; 2703 goto out;
2308 if (r == 0) 2704 if (r == 0)
@@ -2311,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2311 2707
2312 r = 1; 2708 r = 1;
2313out: 2709out:
2314 *ret = buffer.processed; 2710 *ret = buffer->processed;
2315 return r; 2711 return r;
2316} 2712}
2317 2713
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 4a814bff21f..613ec9aa674 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
28 #define FNAME(name) paging##64_##name 29 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
42#elif PTTYPE == 32 42#elif PTTYPE == 32
43 #define pt_element_t u32 43 #define pt_element_t u32
44 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
45 #define FNAME(name) paging##32_##name 46 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2 52 #define PT_MAX_FULL_LEVELS 2
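
NOTE: the new shadow_walker define follows paging_tmpl.h's existing convention — the file is included once per PTTYPE, and token pasting stamps out a 32-bit and a 64-bit copy of every FNAME() function. The mechanism, reduced to a standalone file (the duplicated block stands in for re-including the template):

    #include <stdio.h>

    /* First "inclusion": PTTYPE == 32. */
    #define PTTYPE 32
    #define FNAME(name) paging##32_##name
    static int FNAME(lookup)(void) { return PTTYPE; }
    #undef FNAME
    #undef PTTYPE

    /* Second "inclusion": PTTYPE == 64. */
    #define PTTYPE 64
    #define FNAME(name) paging##64_##name
    static int FNAME(lookup)(void) { return PTTYPE; }
    #undef FNAME
    #undef PTTYPE

    int main(void)
    {
            printf("%d %d\n", paging32_lookup(), paging64_lookup());
            return 0;
    }
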
@@ -73,6 +73,17 @@ struct guest_walker {
73 u32 error_code; 73 u32 error_code;
74}; 74};
75 75
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85};
86
76static gfn_t gpte_to_gfn(pt_element_t gpte) 87static gfn_t gpte_to_gfn(pt_element_t gpte)
77{ 88{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 89 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 102 pt_element_t *table;
92 struct page *page; 103 struct page *page;
93 104
94 down_read(&current->mm->mmap_sem);
95 page = gfn_to_page(kvm, table_gfn); 105 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97 106
98 table = kmap_atomic(page, KM_USER0); 107 table = kmap_atomic(page, KM_USER0);
99
100 ret = CMPXCHG(&table[index], orig_pte, new_pte); 108 ret = CMPXCHG(&table[index], orig_pte, new_pte);
101
102 kunmap_atomic(table, KM_USER0); 109 kunmap_atomic(table, KM_USER0);
103 110
104 kvm_release_page_dirty(page); 111 kvm_release_page_dirty(page);
@@ -274,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
274/* 281/*
275 * Fetch a shadow pte for a specific level in the paging hierarchy. 282 * Fetch a shadow pte for a specific level in the paging hierarchy.
276 */ 283 */
277static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
278 struct guest_walker *walker, 285 struct kvm_vcpu *vcpu, u64 addr,
279 int user_fault, int write_fault, int largepage, 286 u64 *sptep, int level)
280 int *ptwrite, pfn_t pfn)
281{ 287{
282 hpa_t shadow_addr; 288 struct shadow_walker *sw =
283 int level; 289 container_of(_sw, struct shadow_walker, walker);
284 u64 *shadow_ent; 290 struct guest_walker *gw = sw->guest_walker;
285 unsigned access = walker->pt_access; 291 unsigned access = gw->pt_access;
286 292 struct kvm_mmu_page *shadow_page;
287 if (!is_present_pte(walker->ptes[walker->level - 1])) 293 u64 spte;
288 return NULL; 294 int metaphysical;
289 295 gfn_t table_gfn;
290 shadow_addr = vcpu->arch.mmu.root_hpa; 296 int r;
291 level = vcpu->arch.mmu.shadow_root_level; 297 pt_element_t curr_pte;
292 if (level == PT32E_ROOT_LEVEL) { 298
293 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 299 if (level == PT_PAGE_TABLE_LEVEL
294 shadow_addr &= PT64_BASE_ADDR_MASK; 300 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
295 --level; 301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
305 false);
306 sw->sptep = sptep;
307 return 1;
296 } 308 }
297 309
298 for (; ; level--) { 310 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
299 u32 index = SHADOW_PT_INDEX(addr, level); 311 return 0;
300 struct kvm_mmu_page *shadow_page;
301 u64 shadow_pte;
302 int metaphysical;
303 gfn_t table_gfn;
304
305 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
306 if (level == PT_PAGE_TABLE_LEVEL)
307 break;
308
309 if (largepage && level == PT_DIRECTORY_LEVEL)
310 break;
311 312
312 if (is_shadow_present_pte(*shadow_ent) 313 if (is_large_pte(*sptep)) {
313 && !is_large_pte(*shadow_ent)) { 314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
314 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 315 kvm_flush_remote_tlbs(vcpu->kvm);
315 continue; 316 rmap_remove(vcpu->kvm, sptep);
316 } 317 }
317 318
318 if (is_large_pte(*shadow_ent)) 319 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
319 rmap_remove(vcpu->kvm, shadow_ent); 320 metaphysical = 1;
320 321 if (!is_dirty_pte(gw->ptes[level - 1]))
321 if (level - 1 == PT_PAGE_TABLE_LEVEL 322 access &= ~ACC_WRITE_MASK;
322 && walker->level == PT_DIRECTORY_LEVEL) { 323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
323 metaphysical = 1; 324 } else {
324 if (!is_dirty_pte(walker->ptes[level - 1])) 325 metaphysical = 0;
325 access &= ~ACC_WRITE_MASK; 326 table_gfn = gw->table_gfn[level - 2];
326 table_gfn = gpte_to_gfn(walker->ptes[level - 1]); 327 }
327 } else { 328 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
328 metaphysical = 0; 329 metaphysical, access, sptep);
329 table_gfn = walker->table_gfn[level - 2]; 330 if (!metaphysical) {
330 } 331 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
331 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 332 &curr_pte, sizeof(curr_pte));
332 metaphysical, access, 333 if (r || curr_pte != gw->ptes[level - 2]) {
333 shadow_ent); 334 kvm_release_pfn_clean(sw->pfn);
334 if (!metaphysical) { 335 sw->sptep = NULL;
335 int r; 336 return 1;
336 pt_element_t curr_pte;
337 r = kvm_read_guest_atomic(vcpu->kvm,
338 walker->pte_gpa[level - 2],
339 &curr_pte, sizeof(curr_pte));
340 if (r || curr_pte != walker->ptes[level - 2]) {
341 kvm_release_pfn_clean(pfn);
342 return NULL;
343 }
344 } 337 }
345 shadow_addr = __pa(shadow_page->spt);
346 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
347 | PT_WRITABLE_MASK | PT_USER_MASK;
348 set_shadow_pte(shadow_ent, shadow_pte);
349 } 338 }
350 339
351 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 340 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
352 user_fault, write_fault, 341 | PT_WRITABLE_MASK | PT_USER_MASK;
353 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 342 *sptep = spte;
354 ptwrite, largepage, walker->gfn, pfn, false); 343 return 0;
344}
345
346static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
347 struct guest_walker *guest_walker,
348 int user_fault, int write_fault, int largepage,
349 int *ptwrite, pfn_t pfn)
350{
351 struct shadow_walker walker = {
352 .walker = { .entry = FNAME(shadow_walk_entry), },
353 .guest_walker = guest_walker,
354 .user_fault = user_fault,
355 .write_fault = write_fault,
356 .largepage = largepage,
357 .ptwrite = ptwrite,
358 .pfn = pfn,
359 };
360
361 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
362 return NULL;
363
364 walk_shadow(&walker.walker, vcpu, addr);
355 365
356 return shadow_ent; 366 return walker.sptep;
357} 367}
358 368
359/* 369/*
@@ -407,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
407 return 0; 417 return 0;
408 } 418 }
409 419
410 down_read(&current->mm->mmap_sem);
411 if (walker.level == PT_DIRECTORY_LEVEL) { 420 if (walker.level == PT_DIRECTORY_LEVEL) {
412 gfn_t large_gfn; 421 gfn_t large_gfn;
413 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 422 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -417,9 +426,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
417 } 426 }
418 } 427 }
419 mmu_seq = vcpu->kvm->mmu_notifier_seq; 428 mmu_seq = vcpu->kvm->mmu_notifier_seq;
420 /* implicit mb(), we'll read before PT lock is unlocked */ 429 smp_rmb();
421 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
422 up_read(&current->mm->mmap_sem);
423 431
424 /* mmio */ 432 /* mmio */
425 if (is_error_pfn(pfn)) { 433 if (is_error_pfn(pfn)) {
@@ -453,6 +461,31 @@ out_unlock:
453 return 0; 461 return 0;
454} 462}
455 463
464static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
465 struct kvm_vcpu *vcpu, u64 addr,
466 u64 *sptep, int level)
467{
468
469 if (level == PT_PAGE_TABLE_LEVEL) {
470 if (is_shadow_present_pte(*sptep))
471 rmap_remove(vcpu->kvm, sptep);
472 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
473 return 1;
474 }
475 if (!is_shadow_present_pte(*sptep))
476 return 1;
477 return 0;
478}
479
480static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
481{
482 struct shadow_walker walker = {
483 .walker = { .entry = FNAME(shadow_invlpg_entry), },
484 };
485
486 walk_shadow(&walker.walker, vcpu, gva);
487}
488
456static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 489static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
457{ 490{
458 struct guest_walker walker; 491 struct guest_walker walker;
@@ -499,12 +532,66 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
499 } 532 }
500} 533}
501 534
535/*
536 * Using the cached information from sp->gfns is safe because:
537 * - The spte has a reference to the struct page, so the pfn for a given gfn
538 * can't change unless all sptes pointing to it are nuked first.
539 * - Alias changes zap the entire shadow cache.
540 */
541static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
542{
543 int i, offset, nr_present;
544
545 offset = nr_present = 0;
546
547 if (PTTYPE == 32)
548 offset = sp->role.quadrant << PT64_LEVEL_BITS;
549
550 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
551 unsigned pte_access;
552 pt_element_t gpte;
553 gpa_t pte_gpa;
554 gfn_t gfn = sp->gfns[i];
555
556 if (!is_shadow_present_pte(sp->spt[i]))
557 continue;
558
559 pte_gpa = gfn_to_gpa(sp->gfn);
560 pte_gpa += (i+offset) * sizeof(pt_element_t);
561
562 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
563 sizeof(pt_element_t)))
564 return -EINVAL;
565
566 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
567 !(gpte & PT_ACCESSED_MASK)) {
568 u64 nonpresent;
569
570 rmap_remove(vcpu->kvm, &sp->spt[i]);
571 if (is_present_pte(gpte))
572 nonpresent = shadow_trap_nonpresent_pte;
573 else
574 nonpresent = shadow_notrap_nonpresent_pte;
575 set_shadow_pte(&sp->spt[i], nonpresent);
576 continue;
577 }
578
579 nr_present++;
580 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
581 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
582 is_dirty_pte(gpte), 0, gfn,
583 spte_to_pfn(sp->spt[i]), true, false);
584 }
585
586 return !nr_present;
587}
588
502#undef pt_element_t 589#undef pt_element_t
503#undef guest_walker 590#undef guest_walker
591#undef shadow_walker
504#undef FNAME 592#undef FNAME
505#undef PT_BASE_ADDR_MASK 593#undef PT_BASE_ADDR_MASK
506#undef PT_INDEX 594#undef PT_INDEX
507#undef SHADOW_PT_INDEX
508#undef PT_LEVEL_MASK 595#undef PT_LEVEL_MASK
509#undef PT_DIR_BASE_ADDR_MASK 596#undef PT_DIR_BASE_ADDR_MASK
510#undef PT_LEVEL_BITS 597#undef PT_LEVEL_BITS
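
NOTE: FNAME(sync_page) above walks every cached spte, re-reads the corresponding guest pte, and invalidates any entry whose gfn, present bit, or accessed bit no longer matches; its return value tells kvm_sync_page() whether anything stayed resident. The reconcile-and-invalidate loop in miniature, assuming toy integer "ptes":

    #include <stdio.h>

    #define N 4

    /* Re-check every cached translation against the authoritative table;
     * drop stale ones instead of rebuilding them on the spot. */
    static int resync(const int guest[N], int shadow[N])
    {
            int live = 0;

            for (int i = 0; i < N; i++) {
                    if (!shadow[i])
                            continue;           /* nothing cached */
                    if (guest[i] != shadow[i]) {
                            shadow[i] = 0;      /* stale: invalidate */
                            continue;
                    }
                    live++;
            }
            return !live;       /* non-zero: nothing left in sync */
    }

    int main(void)
    {
            int guest[N]  = { 1, 2, 0, 4 };
            int shadow[N] = { 1, 9, 0, 4 };

            printf("empty=%d slot1=%d\n", resync(guest, shadow), shadow[1]);
            return 0;
    }
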
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8233b86c778..9c4ce657d96 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
18#include "kvm_svm.h" 18#include "kvm_svm.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -35,10 +36,6 @@ MODULE_LICENSE("GPL");
35#define IOPM_ALLOC_ORDER 2 36#define IOPM_ALLOC_ORDER 2
36#define MSRPM_ALLOC_ORDER 1 37#define MSRPM_ALLOC_ORDER 1
37 38
38#define DB_VECTOR 1
39#define UD_VECTOR 6
40#define GP_VECTOR 13
41
42#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
43#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
44 41
@@ -47,7 +44,7 @@ MODULE_LICENSE("GPL");
47 44
48#define SVM_FEATURE_NPT (1 << 0) 45#define SVM_FEATURE_NPT (1 << 0)
49#define SVM_FEATURE_LBRV (1 << 1) 46#define SVM_FEATURE_LBRV (1 << 1)
50#define SVM_DEATURE_SVML (1 << 2) 47#define SVM_FEATURE_SVML (1 << 2)
51 48
52#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
53 50
@@ -236,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
236 printk(KERN_DEBUG "%s: NOP\n", __func__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
237 return; 234 return;
238 } 235 }
239 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
240 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
241 __func__, 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
242 svm->vmcb->save.rip,
243 svm->next_rip);
244 239
245 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; 240 kvm_rip_write(vcpu, svm->next_rip);
246 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
247 242
248 vcpu->arch.interrupt_window_open = 1; 243 vcpu->arch.interrupt_window_open = 1;
@@ -530,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
530 (1ULL << INTERCEPT_CPUID) | 525 (1ULL << INTERCEPT_CPUID) |
531 (1ULL << INTERCEPT_INVD) | 526 (1ULL << INTERCEPT_INVD) |
532 (1ULL << INTERCEPT_HLT) | 527 (1ULL << INTERCEPT_HLT) |
528 (1ULL << INTERCEPT_INVLPG) |
533 (1ULL << INTERCEPT_INVLPGA) | 529 (1ULL << INTERCEPT_INVLPGA) |
534 (1ULL << INTERCEPT_IOIO_PROT) | 530 (1ULL << INTERCEPT_IOIO_PROT) |
535 (1ULL << INTERCEPT_MSR_PROT) | 531 (1ULL << INTERCEPT_MSR_PROT) |
@@ -581,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
581 save->dr7 = 0x400; 577 save->dr7 = 0x400;
582 save->rflags = 2; 578 save->rflags = 2;
583 save->rip = 0x0000fff0; 579 save->rip = 0x0000fff0;
580 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
584 581
585 /* 582 /*
586 * cr0 val on cpu init should be 0x60000010, we enable cpu 583 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -593,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
593 if (npt_enabled) { 590 if (npt_enabled) {
594 /* Setup VMCB for Nested Paging */ 591 /* Setup VMCB for Nested Paging */
595 control->nested_ctl = 1; 592 control->nested_ctl = 1;
596 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); 593 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
594 (1ULL << INTERCEPT_INVLPG));
597 control->intercept_exceptions &= ~(1 << PF_VECTOR); 595 control->intercept_exceptions &= ~(1 << PF_VECTOR);
598 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 596 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
599 INTERCEPT_CR3_MASK); 597 INTERCEPT_CR3_MASK);
@@ -615,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
615 init_vmcb(svm); 613 init_vmcb(svm);
616 614
617 if (vcpu->vcpu_id != 0) { 615 if (vcpu->vcpu_id != 0) {
618 svm->vmcb->save.rip = 0; 616 kvm_rip_write(vcpu, 0);
619 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 617 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
620 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 618 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
621 } 619 }
620 vcpu->arch.regs_avail = ~0;
621 vcpu->arch.regs_dirty = ~0;
622 622
623 return 0; 623 return 0;
624} 624}
@@ -721,23 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
721 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
722} 722}
723 723
724static void svm_cache_regs(struct kvm_vcpu *vcpu)
725{
726 struct vcpu_svm *svm = to_svm(vcpu);
727
728 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
729 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
730 vcpu->arch.rip = svm->vmcb->save.rip;
731}
732
733static void svm_decache_regs(struct kvm_vcpu *vcpu)
734{
735 struct vcpu_svm *svm = to_svm(vcpu);
736 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
737 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
738 svm->vmcb->save.rip = vcpu->arch.rip;
739}
740
741static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 724static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
742{ 725{
743 return to_svm(vcpu)->vmcb->save.rflags; 726 return to_svm(vcpu)->vmcb->save.rflags;
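
NOTE: dropping svm_cache_regs()/svm_decache_regs() moves SVM onto the generic kvm_rip_read()/kvm_rip_write() register cache — reads fill the cache lazily from the VMCB, writes mark it dirty for write-back before the next vmrun (see the save.rax/rsp/rip stores added to svm_vcpu_run below). A sketch of that avail/dirty caching, with invented field names:

    #include <stdint.h>
    #include <stdio.h>

    #define REG_RIP 0

    struct vcpu {
            uint64_t regs[1];
            uint32_t avail, dirty;
            uint64_t vmcb_rip;  /* stand-in for svm->vmcb->save.rip */
    };

    static uint64_t rip_read(struct vcpu *v)
    {
            if (!(v->avail & (1u << REG_RIP))) {
                    v->regs[REG_RIP] = v->vmcb_rip;     /* lazy fill */
                    v->avail |= 1u << REG_RIP;
            }
            return v->regs[REG_RIP];
    }

    static void rip_write(struct vcpu *v, uint64_t val)
    {
            v->regs[REG_RIP] = val;
            v->avail |= 1u << REG_RIP;
            v->dirty |= 1u << REG_RIP;  /* write back before vmrun */
    }

    int main(void)
    {
            struct vcpu v = { .vmcb_rip = 0xfff0 };

            rip_write(&v, rip_read(&v) + 2);    /* skip a 2-byte insn */
            printf("rip=%#llx dirty=%#x\n",
                   (unsigned long long)v.regs[REG_RIP], v.dirty);
            return 0;
    }
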
@@ -1040,7 +1023,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1040 if (npt_enabled) 1023 if (npt_enabled)
1041 svm_flush_tlb(&svm->vcpu); 1024 svm_flush_tlb(&svm->vcpu);
1042 1025
1043 if (event_injection) 1026 if (!npt_enabled && event_injection)
1044 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1027 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1045 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1028 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1046} 1029}
@@ -1139,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1139 1122
1140static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1141{ 1124{
1142 svm->next_rip = svm->vmcb->save.rip + 1; 1125 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1143 skip_emulated_instruction(&svm->vcpu); 1126 skip_emulated_instruction(&svm->vcpu);
1144 return kvm_emulate_halt(&svm->vcpu); 1127 return kvm_emulate_halt(&svm->vcpu);
1145} 1128}
1146 1129
1147static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1130static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1148{ 1131{
1149 svm->next_rip = svm->vmcb->save.rip + 3; 1132 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1150 skip_emulated_instruction(&svm->vcpu); 1133 skip_emulated_instruction(&svm->vcpu);
1151 kvm_emulate_hypercall(&svm->vcpu); 1134 kvm_emulate_hypercall(&svm->vcpu);
1152 return 1; 1135 return 1;
@@ -1178,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
1178 1161
1179static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1162static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1180{ 1163{
1181 svm->next_rip = svm->vmcb->save.rip + 2; 1164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1182 kvm_emulate_cpuid(&svm->vcpu); 1165 kvm_emulate_cpuid(&svm->vcpu);
1183 return 1; 1166 return 1;
1184} 1167}
1185 1168
1169static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1170{
1171 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1172 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1173 return 1;
1174}
1175
1186static int emulate_on_interception(struct vcpu_svm *svm, 1176static int emulate_on_interception(struct vcpu_svm *svm,
1187 struct kvm_run *kvm_run) 1177 struct kvm_run *kvm_run)
1188{ 1178{
@@ -1273,9 +1263,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1273 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, 1263 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1274 (u32)(data >> 32), handler); 1264 (u32)(data >> 32), handler);
1275 1265
1276 svm->vmcb->save.rax = data & 0xffffffff; 1266 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1277 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1267 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1278 svm->next_rip = svm->vmcb->save.rip + 2; 1268 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1279 skip_emulated_instruction(&svm->vcpu); 1269 skip_emulated_instruction(&svm->vcpu);
1280 } 1270 }
1281 return 1; 1271 return 1;
@@ -1359,13 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1359static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1349static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1360{ 1350{
1361 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1351 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1362 u64 data = (svm->vmcb->save.rax & -1u) 1352 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
1363 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1353 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1364 1354
1365 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), 1355 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1366 handler); 1356 handler);
1367 1357
1368 svm->next_rip = svm->vmcb->save.rip + 2; 1358 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1369 if (svm_set_msr(&svm->vcpu, ecx, data)) 1359 if (svm_set_msr(&svm->vcpu, ecx, data))
1370 kvm_inject_gp(&svm->vcpu, 0); 1360 kvm_inject_gp(&svm->vcpu, 0);
1371 else 1361 else
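
The recurring pattern in these svm.c hunks is that svm->vmcb->save.rip is no longer read directly: RIP now lives in the generic per-vcpu register cache and is reached through kvm_rip_read()/kvm_rip_write(). A plausible sketch of those accessors, assuming the kvm_cache_regs.h layout this series introduces:

	static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
						      enum kvm_reg reg)
	{
		/* Pull the register from hardware state on first use. */
		if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
			kvm_x86_ops->cache_reg(vcpu, reg);
		return vcpu->arch.regs[reg];
	}

	static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
	{
		return kvm_register_read(vcpu, VCPU_REGS_RIP);
	}

On SVM the cost is unchanged (rax/rsp/rip are copied out of the VMCB right after VMRUN, see svm_vcpu_run below), but the callers become vendor-neutral.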
@@ -1436,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1436 [SVM_EXIT_CPUID] = cpuid_interception, 1426 [SVM_EXIT_CPUID] = cpuid_interception,
1437 [SVM_EXIT_INVD] = emulate_on_interception, 1427 [SVM_EXIT_INVD] = emulate_on_interception,
1438 [SVM_EXIT_HLT] = halt_interception, 1428 [SVM_EXIT_HLT] = halt_interception,
1439 [SVM_EXIT_INVLPG] = emulate_on_interception, 1429 [SVM_EXIT_INVLPG] = invlpg_interception,
1440 [SVM_EXIT_INVLPGA] = invalid_op_interception, 1430 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1441 [SVM_EXIT_IOIO] = io_interception, 1431 [SVM_EXIT_IOIO] = io_interception,
1442 [SVM_EXIT_MSR] = msr_interception, 1432 [SVM_EXIT_MSR] = msr_interception,
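
INVLPG exits now get their own handler instead of the generic emulate_on_interception so the shadow MMU can be told exactly which linear address to drop; since SVM (without decode assists) does not report the operand address, invlpg_interception still has to go through the emulator to discover it. For orientation, exits are dispatched through this table by exit code; a condensed sketch of the existing handle_exit() path, which this hunk does not touch:

	u32 exit_code = svm->vmcb->control.exit_code;

	if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
	    || !svm_exit_handlers[exit_code]) {
		kvm_run->exit_reason = KVM_EXIT_UNKNOWN;
		return 0;
	}
	return svm_exit_handlers[exit_code](svm, kvm_run);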
@@ -1538,6 +1528,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1538 1528
1539 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); 1529 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1540 1530
1531 ++svm->vcpu.stat.irq_injections;
1541 control = &svm->vmcb->control; 1532 control = &svm->vmcb->control;
1542 control->int_vector = irq; 1533 control->int_vector = irq;
1543 control->int_ctl &= ~V_INTR_PRIO_MASK; 1534 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1716,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1716 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 1707 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1717} 1708}
1718 1709
1710#ifdef CONFIG_X86_64
1711#define R "r"
1712#else
1713#define R "e"
1714#endif
1715
1719static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1716static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1720{ 1717{
1721 struct vcpu_svm *svm = to_svm(vcpu); 1718 struct vcpu_svm *svm = to_svm(vcpu);
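
The R macro leans on C string-literal concatenation inside asm(): "mov %%"R"bx" pastes to "mov %%rbx" on 64-bit and "mov %%ebx" on 32-bit, which is what lets a single asm body replace the duplicated #ifdef CONFIG_X86_64 blocks deleted below. A standalone illustration of the same trick, with a hypothetical helper:

	#ifdef CONFIG_X86_64
	#define R "r"
	#else
	#define R "e"
	#endif

	static inline unsigned long current_frame_pointer(void)
	{
		unsigned long bp;

		/* Expands to "mov %%rbp, %0" or "mov %%ebp, %0". */
		asm("mov %%"R"bp, %0" : "=r"(bp));
		return bp;
	}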
@@ -1723,6 +1720,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1723 u16 gs_selector; 1720 u16 gs_selector;
1724 u16 ldt_selector; 1721 u16 ldt_selector;
1725 1722
1723 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
1724 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
1725 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
1726
1726 pre_svm_run(svm); 1727 pre_svm_run(svm);
1727 1728
1728 sync_lapic_to_cr8(vcpu); 1729 sync_lapic_to_cr8(vcpu);
@@ -1750,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1750 local_irq_enable(); 1751 local_irq_enable();
1751 1752
1752 asm volatile ( 1753 asm volatile (
1754 "push %%"R"bp; \n\t"
1755 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
1756 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
1757 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
1758 "mov %c[rsi](%[svm]), %%"R"si \n\t"
1759 "mov %c[rdi](%[svm]), %%"R"di \n\t"
1760 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
1753#ifdef CONFIG_X86_64 1761#ifdef CONFIG_X86_64
1754 "push %%rbp; \n\t"
1755#else
1756 "push %%ebp; \n\t"
1757#endif
1758
1759#ifdef CONFIG_X86_64
1760 "mov %c[rbx](%[svm]), %%rbx \n\t"
1761 "mov %c[rcx](%[svm]), %%rcx \n\t"
1762 "mov %c[rdx](%[svm]), %%rdx \n\t"
1763 "mov %c[rsi](%[svm]), %%rsi \n\t"
1764 "mov %c[rdi](%[svm]), %%rdi \n\t"
1765 "mov %c[rbp](%[svm]), %%rbp \n\t"
1766 "mov %c[r8](%[svm]), %%r8 \n\t" 1762 "mov %c[r8](%[svm]), %%r8 \n\t"
1767 "mov %c[r9](%[svm]), %%r9 \n\t" 1763 "mov %c[r9](%[svm]), %%r9 \n\t"
1768 "mov %c[r10](%[svm]), %%r10 \n\t" 1764 "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1771,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1771 "mov %c[r13](%[svm]), %%r13 \n\t" 1767 "mov %c[r13](%[svm]), %%r13 \n\t"
1772 "mov %c[r14](%[svm]), %%r14 \n\t" 1768 "mov %c[r14](%[svm]), %%r14 \n\t"
1773 "mov %c[r15](%[svm]), %%r15 \n\t" 1769 "mov %c[r15](%[svm]), %%r15 \n\t"
1774#else
1775 "mov %c[rbx](%[svm]), %%ebx \n\t"
1776 "mov %c[rcx](%[svm]), %%ecx \n\t"
1777 "mov %c[rdx](%[svm]), %%edx \n\t"
1778 "mov %c[rsi](%[svm]), %%esi \n\t"
1779 "mov %c[rdi](%[svm]), %%edi \n\t"
1780 "mov %c[rbp](%[svm]), %%ebp \n\t"
1781#endif 1770#endif
1782 1771
1783#ifdef CONFIG_X86_64
1784 /* Enter guest mode */
1785 "push %%rax \n\t"
1786 "mov %c[vmcb](%[svm]), %%rax \n\t"
1787 __ex(SVM_VMLOAD) "\n\t"
1788 __ex(SVM_VMRUN) "\n\t"
1789 __ex(SVM_VMSAVE) "\n\t"
1790 "pop %%rax \n\t"
1791#else
1792 /* Enter guest mode */ 1772 /* Enter guest mode */
1793 "push %%eax \n\t" 1773 "push %%"R"ax \n\t"
1794 "mov %c[vmcb](%[svm]), %%eax \n\t" 1774 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
1795 __ex(SVM_VMLOAD) "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1796 __ex(SVM_VMRUN) "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1797 __ex(SVM_VMSAVE) "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1798 "pop %%eax \n\t" 1778 "pop %%"R"ax \n\t"
1799#endif
1800 1779
1801 /* Save guest registers, load host registers */ 1780 /* Save guest registers, load host registers */
1781 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
1782 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
1783 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
1784 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
1785 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
1786 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
1802#ifdef CONFIG_X86_64 1787#ifdef CONFIG_X86_64
1803 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1804 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1805 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1806 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1807 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1808 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1809 "mov %%r8, %c[r8](%[svm]) \n\t" 1788 "mov %%r8, %c[r8](%[svm]) \n\t"
1810 "mov %%r9, %c[r9](%[svm]) \n\t" 1789 "mov %%r9, %c[r9](%[svm]) \n\t"
1811 "mov %%r10, %c[r10](%[svm]) \n\t" 1790 "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1814,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1814 "mov %%r13, %c[r13](%[svm]) \n\t" 1793 "mov %%r13, %c[r13](%[svm]) \n\t"
1815 "mov %%r14, %c[r14](%[svm]) \n\t" 1794 "mov %%r14, %c[r14](%[svm]) \n\t"
1816 "mov %%r15, %c[r15](%[svm]) \n\t" 1795 "mov %%r15, %c[r15](%[svm]) \n\t"
1817
1818 "pop %%rbp; \n\t"
1819#else
1820 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1821 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1822 "mov %%edx, %c[rdx](%[svm]) \n\t"
1823 "mov %%esi, %c[rsi](%[svm]) \n\t"
1824 "mov %%edi, %c[rdi](%[svm]) \n\t"
1825 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1826
1827 "pop %%ebp; \n\t"
1828#endif 1796#endif
1797 "pop %%"R"bp"
1829 : 1798 :
1830 : [svm]"a"(svm), 1799 : [svm]"a"(svm),
1831 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1800 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1846,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1846 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 1815 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1847#endif 1816#endif
1848 : "cc", "memory" 1817 : "cc", "memory"
1818 , R"bx", R"cx", R"dx", R"si", R"di"
1849#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1850 , "rbx", "rcx", "rdx", "rsi", "rdi"
1851 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 1820 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1852#else
1853 , "ebx", "ecx", "edx" , "esi", "edi"
1854#endif 1821#endif
1855 ); 1822 );
1856 1823
@@ -1858,6 +1825,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1858 load_db_regs(svm->host_db_regs); 1825 load_db_regs(svm->host_db_regs);
1859 1826
1860 vcpu->arch.cr2 = svm->vmcb->save.cr2; 1827 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1828 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1829 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1830 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1861 1831
1862 write_dr6(svm->host_dr6); 1832 write_dr6(svm->host_dr6);
1863 write_dr7(svm->host_dr7); 1833 write_dr7(svm->host_dr7);
@@ -1879,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1879 svm->next_rip = 0; 1849 svm->next_rip = 0;
1880} 1850}
1881 1851
1852#undef R
1853
1882static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1854static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1883{ 1855{
1884 struct vcpu_svm *svm = to_svm(vcpu); 1856 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1977,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1977 .set_gdt = svm_set_gdt, 1949 .set_gdt = svm_set_gdt,
1978 .get_dr = svm_get_dr, 1950 .get_dr = svm_get_dr,
1979 .set_dr = svm_set_dr, 1951 .set_dr = svm_set_dr,
1980 .cache_regs = svm_cache_regs,
1981 .decache_regs = svm_decache_regs,
1982 .get_rflags = svm_get_rflags, 1952 .get_rflags = svm_get_rflags,
1983 .set_rflags = svm_set_rflags, 1953 .set_rflags = svm_set_rflags,
1984 1954
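
With the blanket cache_regs()/decache_regs() hooks gone, arch-neutral code that used to bracket register access with those calls goes through kvm_register_read()/kvm_register_write() instead, and the vendor interface presumably shrinks to a single demand-fault hook, along the lines of:

	struct kvm_x86_ops {
		...
		/* Pull one register from hardware state into vcpu->arch.regs[]. */
		void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
		...
	};

VMX wires this up to vmx_cache_reg() (added in the next file); SVM copies rax/rsp/rip eagerly around VMRUN, as seen above, so it has no lazy path to speak of.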
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7041cc52b56..2643b430d83 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,8 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
29#include "kvm_cache_regs.h"
30#include "x86.h"
29 31
30#include <asm/io.h> 32#include <asm/io.h>
31#include <asm/desc.h> 33#include <asm/desc.h>
@@ -47,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
47static int enable_ept = 1; 49static int enable_ept = 1;
48module_param(enable_ept, bool, 0); 50module_param(enable_ept, bool, 0);
49 51
52static int emulate_invalid_guest_state = 0;
53module_param(emulate_invalid_guest_state, bool, 0);
54
50struct vmcs { 55struct vmcs {
51 u32 revision_id; 56 u32 revision_id;
52 u32 abort; 57 u32 abort;
@@ -56,6 +61,7 @@ struct vmcs {
56struct vcpu_vmx { 61struct vcpu_vmx {
57 struct kvm_vcpu vcpu; 62 struct kvm_vcpu vcpu;
58 struct list_head local_vcpus_link; 63 struct list_head local_vcpus_link;
64 unsigned long host_rsp;
59 int launched; 65 int launched;
60 u8 fail; 66 u8 fail;
61 u32 idt_vectoring_info; 67 u32 idt_vectoring_info;
@@ -83,6 +89,7 @@ struct vcpu_vmx {
83 } irq; 89 } irq;
84 } rmode; 90 } rmode;
85 int vpid; 91 int vpid;
92 bool emulation_required;
86}; 93};
87 94
88static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -468,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
468 if (!vcpu->fpu_active) 475 if (!vcpu->fpu_active)
469 eb |= 1u << NM_VECTOR; 476 eb |= 1u << NM_VECTOR;
470 if (vcpu->guest_debug.enabled) 477 if (vcpu->guest_debug.enabled)
471 eb |= 1u << 1; 478 eb |= 1u << DB_VECTOR;
472 if (vcpu->arch.rmode.active) 479 if (vcpu->arch.rmode.active)
473 eb = ~0; 480 eb = ~0;
474 if (vm_need_ept()) 481 if (vm_need_ept())
@@ -715,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
715 unsigned long rip; 722 unsigned long rip;
716 u32 interruptibility; 723 u32 interruptibility;
717 724
718 rip = vmcs_readl(GUEST_RIP); 725 rip = kvm_rip_read(vcpu);
719 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 726 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
720 vmcs_writel(GUEST_RIP, rip); 727 kvm_rip_write(vcpu, rip);
721 728
722 /* 729 /*
723 * We emulated an instruction, so temporary interrupt blocking 730 * We emulated an instruction, so temporary interrupt blocking
@@ -733,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
733static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
734 bool has_error_code, u32 error_code) 741 bool has_error_code, u32 error_code)
735{ 742{
743 struct vcpu_vmx *vmx = to_vmx(vcpu);
744
745 if (has_error_code)
746 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
747
748 if (vcpu->arch.rmode.active) {
749 vmx->rmode.irq.pending = true;
750 vmx->rmode.irq.vector = nr;
751 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
752 if (nr == BP_VECTOR)
753 vmx->rmode.irq.rip++;
754 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
755 nr | INTR_TYPE_SOFT_INTR
756 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
757 | INTR_INFO_VALID_MASK);
758 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
759 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
760 return;
761 }
762
736 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 763 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
737 nr | INTR_TYPE_EXCEPTION 764 nr | INTR_TYPE_EXCEPTION
738 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 765 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
739 | INTR_INFO_VALID_MASK); 766 | INTR_INFO_VALID_MASK);
740 if (has_error_code)
741 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
742} 767}
743 768
744static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
745{ 770{
746 struct vcpu_vmx *vmx = to_vmx(vcpu); 771 return false;
747
748 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
749} 772}
750 773
751/* 774/*
@@ -947,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
947 return ret; 970 return ret;
948} 971}
949 972
950/* 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
951 * Sync the rsp and rip registers into the vcpu structure. This allows
952 * registers to be accessed by indexing vcpu->arch.regs.
953 */
954static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
955{
956 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
957 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
958}
959
960/*
961 * Syncs rsp and rip back into the vmcs. Should be called after possible
962 * modification.
963 */
964static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
965{ 974{
966 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 975 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
967 vmcs_writel(GUEST_RIP, vcpu->arch.rip); 976 switch (reg) {
977 case VCPU_REGS_RSP:
978 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
979 break;
980 case VCPU_REGS_RIP:
981 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
982 break;
983 default:
984 break;
985 }
968} 986}
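
vmx_cache_reg() is the demand-read half of the scheme; the write side would mark a register both available and dirty so the next vmentry knows to flush it back to the VMCS. A sketch, assuming the kvm_cache_regs.h helpers from the same series:

	static inline void kvm_register_write(struct kvm_vcpu *vcpu,
					      enum kvm_reg reg,
					      unsigned long val)
	{
		vcpu->arch.regs[reg] = val;
		__set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
		__set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
	}

Only RSP and RIP take this path on VMX (hence the default: case above does nothing); the remaining GPRs are saved and restored by the vmx_vcpu_run() asm and are therefore always available.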
969 987
970static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -1007,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
1007 1025
1008static int vmx_get_irq(struct kvm_vcpu *vcpu) 1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
1009{ 1027{
1010 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 if (!vcpu->arch.interrupt.pending)
1011 u32 idtv_info_field; 1029 return -1;
1012 1030 return vcpu->arch.interrupt.nr;
1013 idtv_info_field = vmx->idt_vectoring_info;
1014 if (idtv_info_field & INTR_INFO_VALID_MASK) {
1015 if (is_external_interrupt(idtv_info_field))
1016 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
1017 else
1018 printk(KERN_DEBUG "pending exception: not handled yet\n");
1019 }
1020 return -1;
1021} 1031}
1022 1032
1023static __init int cpu_has_kvm_support(void) 1033static __init int cpu_has_kvm_support(void)
@@ -1031,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
1031 u64 msr; 1041 u64 msr;
1032 1042
1033 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1043 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1034 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1044 return (msr & (FEATURE_CONTROL_LOCKED |
1035 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1045 FEATURE_CONTROL_VMXON_ENABLED))
1036 == MSR_IA32_FEATURE_CONTROL_LOCKED; 1046 == FEATURE_CONTROL_LOCKED;
1037 /* locked but not enabled */ 1047 /* locked but not enabled */
1038} 1048}
1039 1049
@@ -1045,14 +1055,14 @@ static void hardware_enable(void *garbage)
1045 1055
1046 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1056 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1047 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1057 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1048 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1058 if ((old & (FEATURE_CONTROL_LOCKED |
1049 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1059 FEATURE_CONTROL_VMXON_ENABLED))
1050 != (MSR_IA32_FEATURE_CONTROL_LOCKED | 1060 != (FEATURE_CONTROL_LOCKED |
1051 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1061 FEATURE_CONTROL_VMXON_ENABLED))
1052 /* enable and lock */ 1062 /* enable and lock */
1053 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1063 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1054 MSR_IA32_FEATURE_CONTROL_LOCKED | 1064 FEATURE_CONTROL_LOCKED |
1055 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1065 FEATURE_CONTROL_VMXON_ENABLED);
1056 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1066 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1057 asm volatile (ASM_VMX_VMXON_RAX 1067 asm volatile (ASM_VMX_VMXON_RAX
1058 : : "a"(&phys_addr), "m"(phys_addr) 1068 : : "a"(&phys_addr), "m"(phys_addr)
@@ -1120,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1120 CPU_BASED_CR3_STORE_EXITING | 1130 CPU_BASED_CR3_STORE_EXITING |
1121 CPU_BASED_USE_IO_BITMAPS | 1131 CPU_BASED_USE_IO_BITMAPS |
1122 CPU_BASED_MOV_DR_EXITING | 1132 CPU_BASED_MOV_DR_EXITING |
1123 CPU_BASED_USE_TSC_OFFSETING; 1133 CPU_BASED_USE_TSC_OFFSETING |
1134 CPU_BASED_INVLPG_EXITING;
1124 opt = CPU_BASED_TPR_SHADOW | 1135 opt = CPU_BASED_TPR_SHADOW |
1125 CPU_BASED_USE_MSR_BITMAPS | 1136 CPU_BASED_USE_MSR_BITMAPS |
1126 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1137 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1149,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1149 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 1160 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1150#endif 1161#endif
1151 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1162 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1152 /* CR3 accesses don't need to cause VM Exits when EPT enabled */ 1163 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164 enabled */
1153 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1165 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1154 CPU_BASED_CR3_STORE_EXITING); 1166 CPU_BASED_CR3_STORE_EXITING |
1167 CPU_BASED_INVLPG_EXITING);
1155 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1168 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1156 &_cpu_based_exec_control) < 0) 1169 &_cpu_based_exec_control) < 0)
1157 return -EIO; 1170 return -EIO;
@@ -1288,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1288static void enter_pmode(struct kvm_vcpu *vcpu) 1301static void enter_pmode(struct kvm_vcpu *vcpu)
1289{ 1302{
1290 unsigned long flags; 1303 unsigned long flags;
1304 struct vcpu_vmx *vmx = to_vmx(vcpu);
1291 1305
1306 vmx->emulation_required = 1;
1292 vcpu->arch.rmode.active = 0; 1307 vcpu->arch.rmode.active = 0;
1293 1308
1294 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1309 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1305,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1305 1320
1306 update_exception_bitmap(vcpu); 1321 update_exception_bitmap(vcpu);
1307 1322
1323 if (emulate_invalid_guest_state)
1324 return;
1325
1308 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1326 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1309 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1327 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1310 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1328 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1345,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1345static void enter_rmode(struct kvm_vcpu *vcpu) 1363static void enter_rmode(struct kvm_vcpu *vcpu)
1346{ 1364{
1347 unsigned long flags; 1365 unsigned long flags;
1366 struct vcpu_vmx *vmx = to_vmx(vcpu);
1348 1367
1368 vmx->emulation_required = 1;
1349 vcpu->arch.rmode.active = 1; 1369 vcpu->arch.rmode.active = 1;
1350 1370
1351 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1371 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1367,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1367 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1387 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1368 update_exception_bitmap(vcpu); 1388 update_exception_bitmap(vcpu);
1369 1389
1390 if (emulate_invalid_guest_state)
1391 goto continue_rmode;
1392
1370 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1393 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1371 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 1394 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1372 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 1395 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1382,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1382 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1405 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1383 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1406 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1384 1407
1408continue_rmode:
1385 kvm_mmu_reset_context(vcpu); 1409 kvm_mmu_reset_context(vcpu);
1386 init_rmode(vcpu->kvm); 1410 init_rmode(vcpu->kvm);
1387} 1411}
@@ -1715,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1715 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1739 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1716} 1740}
1717 1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744 struct kvm_segment var;
1745 u32 ar;
1746
1747 vmx_get_segment(vcpu, &var, seg);
1748 ar = vmx_segment_access_rights(&var);
1749
1750 if (var.base != (var.selector << 4))
1751 return false;
1752 if (var.limit != 0xffff)
1753 return false;
1754 if (ar != 0xf3)
1755 return false;
1756
1757 return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762 struct kvm_segment cs;
1763 unsigned int cs_rpl;
1764
1765 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769 return false;
1770 if (!cs.s)
1771 return false;
1772 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773 if (cs.dpl > cs_rpl)
1774 return false;
1775 } else if (cs.type & AR_TYPE_CODE_MASK) {
1776 if (cs.dpl != cs_rpl)
1777 return false;
1778 }
1779 if (!cs.present)
1780 return false;
1781
1782 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783 return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788 struct kvm_segment ss;
1789 unsigned int ss_rpl;
1790
1791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
 1794 if ((ss.type != 3) && (ss.type != 7))
1795 return false;
1796 if (!ss.s)
1797 return false;
1798 if (ss.dpl != ss_rpl) /* DPL != RPL */
1799 return false;
1800 if (!ss.present)
1801 return false;
1802
1803 return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808 struct kvm_segment var;
1809 unsigned int rpl;
1810
1811 vmx_get_segment(vcpu, &var, seg);
1812 rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814 if (!var.s)
1815 return false;
1816 if (!var.present)
1817 return false;
1818 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819 if (var.dpl < rpl) /* DPL < RPL */
1820 return false;
1821 }
1822
1823 /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824 * rights flags
1825 */
1826 return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831 struct kvm_segment tr;
1832
1833 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1836 return false;
 1837 if ((tr.type != 3) && (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */
1838 return false;
1839 if (!tr.present)
1840 return false;
1841
1842 return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847 struct kvm_segment ldtr;
1848
1849 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1852 return false;
1853 if (ldtr.type != 2)
1854 return false;
1855 if (!ldtr.present)
1856 return false;
1857
1858 return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863 struct kvm_segment cs, ss;
1864
1865 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868 return ((cs.selector & SELECTOR_RPL_MASK) ==
1869 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879 /* real mode guest state checks */
1880 if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882 return false;
1883 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884 return false;
1885 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886 return false;
1887 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888 return false;
1889 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890 return false;
1891 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892 return false;
1893 } else {
1894 /* protected mode guest state checks */
1895 if (!cs_ss_rpl_check(vcpu))
1896 return false;
1897 if (!code_segment_valid(vcpu))
1898 return false;
1899 if (!stack_segment_valid(vcpu))
1900 return false;
1901 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902 return false;
1903 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904 return false;
1905 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906 return false;
1907 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908 return false;
1909 if (!tr_valid(vcpu))
1910 return false;
1911 if (!ldtr_valid(vcpu))
1912 return false;
1913 }
1914 /* TODO:
1915 * - Add checks on RIP
1916 * - Add checks on RFLAGS
1917 */
1918
1919 return true;
1920}
1921
1718static int init_rmode_tss(struct kvm *kvm) 1922static int init_rmode_tss(struct kvm *kvm)
1719{ 1923{
1720 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1924 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
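
rmode_segment_valid() encodes the vm86 consistency rules: base must equal selector << 4, limit must be 0xffff, and the access-rights byte must be exactly 0xf3 (present, DPL 3, data, read/write, accessed) - which is also why seg_setup() below switches its default ar_bytes from 0x93 (the same type at DPL 0) to 0xf3. A worked example of a segment these checks accept:

	struct kvm_segment var = {
		.selector = 0x1234,
		.base     = 0x1234 << 4,	/* 0x12340 */
		.limit    = 0xffff,
		/* ar = 0xf3: P=1, DPL=3, S=1, type=3 (data, r/w, accessed) */
	};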
@@ -1726,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
1726 if (r < 0) 1930 if (r < 0)
1727 goto out; 1931 goto out;
1728 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1932 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1729 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); 1933 r = kvm_write_guest_page(kvm, fn++, &data,
1934 TSS_IOPB_BASE_OFFSET, sizeof(u16));
1730 if (r < 0) 1935 if (r < 0)
1731 goto out; 1936 goto out;
1732 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 1937 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1789,7 +1994,7 @@ static void seg_setup(int seg)
1789 vmcs_write16(sf->selector, 0); 1994 vmcs_write16(sf->selector, 0);
1790 vmcs_writel(sf->base, 0); 1995 vmcs_writel(sf->base, 0);
1791 vmcs_write32(sf->limit, 0xffff); 1996 vmcs_write32(sf->limit, 0xffff);
1792 vmcs_write32(sf->ar_bytes, 0x93); 1997 vmcs_write32(sf->ar_bytes, 0xf3);
1793} 1998}
1794 1999
1795static int alloc_apic_access_page(struct kvm *kvm) 2000static int alloc_apic_access_page(struct kvm *kvm)
@@ -1808,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1808 if (r) 2013 if (r)
1809 goto out; 2014 goto out;
1810 2015
1811 down_read(&current->mm->mmap_sem);
1812 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2016 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1813 up_read(&current->mm->mmap_sem);
1814out: 2017out:
1815 up_write(&kvm->slots_lock); 2018 up_write(&kvm->slots_lock);
1816 return r; 2019 return r;
@@ -1832,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
1832 if (r) 2035 if (r)
1833 goto out; 2036 goto out;
1834 2037
1835 down_read(&current->mm->mmap_sem);
1836 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2038 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
1837 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2039 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
1838 up_read(&current->mm->mmap_sem);
1839out: 2040out:
1840 up_write(&kvm->slots_lock); 2041 up_write(&kvm->slots_lock);
1841 return r; 2042 return r;
@@ -1917,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1917 } 2118 }
1918 if (!vm_need_ept()) 2119 if (!vm_need_ept())
1919 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2120 exec_control |= CPU_BASED_CR3_STORE_EXITING |
1920 CPU_BASED_CR3_LOAD_EXITING; 2121 CPU_BASED_CR3_LOAD_EXITING |
2122 CPU_BASED_INVLPG_EXITING;
1921 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2123 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1922 2124
1923 if (cpu_has_secondary_exec_ctrls()) { 2125 if (cpu_has_secondary_exec_ctrls()) {
@@ -2019,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2019 u64 msr; 2221 u64 msr;
2020 int ret; 2222 int ret;
2021 2223
2224 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2022 down_read(&vcpu->kvm->slots_lock); 2225 down_read(&vcpu->kvm->slots_lock);
2023 if (!init_rmode(vmx->vcpu.kvm)) { 2226 if (!init_rmode(vmx->vcpu.kvm)) {
2024 ret = -ENOMEM; 2227 ret = -ENOMEM;
@@ -2036,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2036 2239
2037 fx_init(&vmx->vcpu); 2240 fx_init(&vmx->vcpu);
2038 2241
2242 seg_setup(VCPU_SREG_CS);
2039 /* 2243 /*
2040 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2244 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2041 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2245 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2047,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2047 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 2251 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2048 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 2252 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2049 } 2253 }
2050 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2051 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2052 2254
2053 seg_setup(VCPU_SREG_DS); 2255 seg_setup(VCPU_SREG_DS);
2054 seg_setup(VCPU_SREG_ES); 2256 seg_setup(VCPU_SREG_ES);
@@ -2072,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2072 2274
2073 vmcs_writel(GUEST_RFLAGS, 0x02); 2275 vmcs_writel(GUEST_RFLAGS, 0x02);
2074 if (vmx->vcpu.vcpu_id == 0) 2276 if (vmx->vcpu.vcpu_id == 0)
2075 vmcs_writel(GUEST_RIP, 0xfff0); 2277 kvm_rip_write(vcpu, 0xfff0);
2076 else 2278 else
2077 vmcs_writel(GUEST_RIP, 0); 2279 kvm_rip_write(vcpu, 0);
2078 vmcs_writel(GUEST_RSP, 0); 2280 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2079 2281
2080 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ 2282 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2081 vmcs_writel(GUEST_DR7, 0x400); 2283 vmcs_writel(GUEST_DR7, 0x400);
@@ -2125,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2125 2327
2126 ret = 0; 2328 ret = 0;
2127 2329
2330 /* HACK: Don't enable emulation on guest boot/reset */
2331 vmx->emulation_required = 0;
2332
2128out: 2333out:
2129 up_read(&vcpu->kvm->slots_lock); 2334 up_read(&vcpu->kvm->slots_lock);
2130 return ret; 2335 return ret;
@@ -2136,14 +2341,15 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2136 2341
2137 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2342 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2138 2343
2344 ++vcpu->stat.irq_injections;
2139 if (vcpu->arch.rmode.active) { 2345 if (vcpu->arch.rmode.active) {
2140 vmx->rmode.irq.pending = true; 2346 vmx->rmode.irq.pending = true;
2141 vmx->rmode.irq.vector = irq; 2347 vmx->rmode.irq.vector = irq;
2142 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); 2348 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2143 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2144 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2350 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2145 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2351 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2146 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); 2352 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2147 return; 2353 return;
2148 } 2354 }
2149 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2355 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
@@ -2154,7 +2360,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2154{ 2360{
2155 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2156 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2157 vcpu->arch.nmi_pending = 0;
2158} 2363}
2159 2364
2160static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
@@ -2166,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2166 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2371 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2167 if (!vcpu->arch.irq_pending[word_index]) 2372 if (!vcpu->arch.irq_pending[word_index])
2168 clear_bit(word_index, &vcpu->arch.irq_summary); 2373 clear_bit(word_index, &vcpu->arch.irq_summary);
2169 vmx_inject_irq(vcpu, irq); 2374 kvm_queue_interrupt(vcpu, irq);
2170} 2375}
2171 2376
2172 2377
@@ -2180,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2180 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2181 2386
2182 if (vcpu->arch.interrupt_window_open && 2387 if (vcpu->arch.interrupt_window_open &&
2183 vcpu->arch.irq_summary && 2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2184 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2185 /*
2186 * If interrupts enabled, and not blocked by sti or mov ss. Good.
2187 */
2188 kvm_do_inject_irq(vcpu); 2389 kvm_do_inject_irq(vcpu);
2189 2390
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2190 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2191 if (!vcpu->arch.interrupt_window_open && 2395 if (!vcpu->arch.interrupt_window_open &&
2192 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2237,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2237static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2238 int vec, u32 err_code) 2442 int vec, u32 err_code)
2239{ 2443{
2240 if (!vcpu->arch.rmode.active)
2241 return 0;
2242
2243 /* 2444 /*
2244 * Instruction with address size override prefix opcode 0x67 2445 * Instruction with address size override prefix opcode 0x67
2245 * Cause the #SS fault with 0 error code in VM86 mode. 2446 * Cause the #SS fault with 0 error code in VM86 mode.
@@ -2247,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2247 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2448 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2248 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2449 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2249 return 1; 2450 return 1;
2451 /*
2452 * Forward all other exceptions that are valid in real mode.
2453 * FIXME: Breaks guest debugging in real mode, needs to be fixed with
2454 * the required debugging infrastructure rework.
2455 */
2456 switch (vec) {
2457 case DE_VECTOR:
2458 case DB_VECTOR:
2459 case BP_VECTOR:
2460 case OF_VECTOR:
2461 case BR_VECTOR:
2462 case UD_VECTOR:
2463 case DF_VECTOR:
2464 case SS_VECTOR:
2465 case GP_VECTOR:
2466 case MF_VECTOR:
2467 kvm_queue_exception(vcpu, vec);
2468 return 1;
2469 }
2250 return 0; 2470 return 0;
2251} 2471}
2252 2472
@@ -2288,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2288 } 2508 }
2289 2509
2290 error_code = 0; 2510 error_code = 0;
2291 rip = vmcs_readl(GUEST_RIP); 2511 rip = kvm_rip_read(vcpu);
2292 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 2512 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2293 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2513 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2294 if (is_page_fault(intr_info)) { 2514 if (is_page_fault(intr_info)) {
@@ -2298,7 +2518,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2298 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2518 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2299 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2519 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2300 (u32)((u64)cr2 >> 32), handler); 2520 (u32)((u64)cr2 >> 32), handler);
2301 if (vect_info & VECTORING_INFO_VALID_MASK) 2521 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2302 kvm_mmu_unprotect_page_virt(vcpu, cr2); 2522 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2303 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2523 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2304 } 2524 }
@@ -2386,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2386 reg = (exit_qualification >> 8) & 15; 2606 reg = (exit_qualification >> 8) & 15;
2387 switch ((exit_qualification >> 4) & 3) { 2607 switch ((exit_qualification >> 4) & 3) {
2388 case 0: /* mov to cr */ 2608 case 0: /* mov to cr */
2389 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], 2609 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2390 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); 2610 (u32)kvm_register_read(vcpu, reg),
2611 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612 handler);
2391 switch (cr) { 2613 switch (cr) {
2392 case 0: 2614 case 0:
2393 vcpu_load_rsp_rip(vcpu); 2615 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2394 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
2395 skip_emulated_instruction(vcpu); 2616 skip_emulated_instruction(vcpu);
2396 return 1; 2617 return 1;
2397 case 3: 2618 case 3:
2398 vcpu_load_rsp_rip(vcpu); 2619 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2399 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
2400 skip_emulated_instruction(vcpu); 2620 skip_emulated_instruction(vcpu);
2401 return 1; 2621 return 1;
2402 case 4: 2622 case 4:
2403 vcpu_load_rsp_rip(vcpu); 2623 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2404 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
2405 skip_emulated_instruction(vcpu); 2624 skip_emulated_instruction(vcpu);
2406 return 1; 2625 return 1;
2407 case 8: 2626 case 8:
2408 vcpu_load_rsp_rip(vcpu); 2627 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2409 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
2410 skip_emulated_instruction(vcpu); 2628 skip_emulated_instruction(vcpu);
2411 if (irqchip_in_kernel(vcpu->kvm)) 2629 if (irqchip_in_kernel(vcpu->kvm))
2412 return 1; 2630 return 1;
@@ -2415,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2415 }; 2633 };
2416 break; 2634 break;
2417 case 2: /* clts */ 2635 case 2: /* clts */
2418 vcpu_load_rsp_rip(vcpu);
2419 vmx_fpu_deactivate(vcpu); 2636 vmx_fpu_deactivate(vcpu);
2420 vcpu->arch.cr0 &= ~X86_CR0_TS; 2637 vcpu->arch.cr0 &= ~X86_CR0_TS;
2421 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2638 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2426,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2426 case 1: /*mov from cr*/ 2643 case 1: /*mov from cr*/
2427 switch (cr) { 2644 switch (cr) {
2428 case 3: 2645 case 3:
2429 vcpu_load_rsp_rip(vcpu); 2646 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2430 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2431 vcpu_put_rsp_rip(vcpu);
2432 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2647 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2433 (u32)vcpu->arch.regs[reg], 2648 (u32)kvm_register_read(vcpu, reg),
2434 (u32)((u64)vcpu->arch.regs[reg] >> 32), 2649 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2435 handler); 2650 handler);
2436 skip_emulated_instruction(vcpu); 2651 skip_emulated_instruction(vcpu);
2437 return 1; 2652 return 1;
2438 case 8: 2653 case 8:
2439 vcpu_load_rsp_rip(vcpu); 2654 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2440 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2441 vcpu_put_rsp_rip(vcpu);
2442 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2655 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2443 (u32)vcpu->arch.regs[reg], handler); 2656 (u32)kvm_register_read(vcpu, reg), handler);
2444 skip_emulated_instruction(vcpu); 2657 skip_emulated_instruction(vcpu);
2445 return 1; 2658 return 1;
2446 } 2659 }
@@ -2472,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2472 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2473 dr = exit_qualification & 7; 2686 dr = exit_qualification & 7;
2474 reg = (exit_qualification >> 8) & 15; 2687 reg = (exit_qualification >> 8) & 15;
2475 vcpu_load_rsp_rip(vcpu);
2476 if (exit_qualification & 16) { 2688 if (exit_qualification & 16) {
2477 /* mov from dr */ 2689 /* mov from dr */
2478 switch (dr) { 2690 switch (dr) {
@@ -2485,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2485 default: 2697 default:
2486 val = 0; 2698 val = 0;
2487 } 2699 }
2488 vcpu->arch.regs[reg] = val; 2700 kvm_register_write(vcpu, reg, val);
2489 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2701 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2490 } else { 2702 } else {
2491 /* mov to dr */ 2703 /* mov to dr */
2492 } 2704 }
2493 vcpu_put_rsp_rip(vcpu);
2494 skip_emulated_instruction(vcpu); 2705 skip_emulated_instruction(vcpu);
2495 return 1; 2706 return 1;
2496} 2707}
@@ -2583,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2583 return 1; 2794 return 1;
2584} 2795}
2585 2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801 kvm_mmu_invlpg(vcpu, exit_qualification);
2802 skip_emulated_instruction(vcpu);
2803 return 1;
2804}
2805
2586static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2806static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2587{ 2807{
2588 skip_emulated_instruction(vcpu); 2808 skip_emulated_instruction(vcpu);
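
Unlike the SVM side, which has to run the emulator to learn the target address, the VMX exit qualification carries the invlpg operand's linear address directly, so handle_invlpg() can call straight into the MMU. kvm_mmu_invlpg() comes from the companion MMU patch; plausibly it reduces to:

	/* Sketch, assuming the mmu.invlpg callback added alongside this handler. */
	void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
	{
		vcpu->arch.mmu.invlpg(vcpu, gva);	/* drop the shadow pte for gva */
		kvm_mmu_flush_tlb(vcpu);
		++vcpu->stat.invlpg;
	}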
@@ -2695,6 +2915,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2695 return 1; 2915 return 1;
2696} 2916}
2697 2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919 struct kvm_run *kvm_run)
2920{
2921 struct vcpu_vmx *vmx = to_vmx(vcpu);
2922 int err;
2923
2924 preempt_enable();
2925 local_irq_enable();
2926
2927 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930 switch (err) {
2931 case EMULATE_DONE:
2932 break;
2933 case EMULATE_DO_MMIO:
2934 kvm_report_emulation_failure(vcpu, "mmio");
2935 /* TODO: Handle MMIO */
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 }
2941
2942 if (signal_pending(current))
2943 break;
2944 if (need_resched())
2945 schedule();
2946 }
2947
2948 local_irq_disable();
2949 preempt_disable();
2950
 2951 /* Guest state should be valid now; no further emulation should be needed */
2952 vmx->emulation_required = 0;
2953}
2954
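handle_invalid_guest_state() deliberately drops back to a preemptible, irqs-on context: instruction emulation can fault in guest pages and sleep, which is illegal in the interrupts-disabled window vmx_vcpu_run() normally executes in. The loop is gated from the entry path, roughly:

	/* Sketch of the consumer side, mirroring the hunk in vmx_vcpu_run() below. */
	if (vmx->emulation_required && emulate_invalid_guest_state) {
		handle_invalid_guest_state(vcpu, kvm_run);
		return;		/* no vmentry this round; emulate instead */
	}

Enabling it is opt-in via the emulate_invalid_guest_state module parameter declared near the top of the file; with it off (the default), the pre-existing fix_pmode_dataseg()/fix_rmode_seg() fixups stay in charge.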
2698/* 2955/*
2699 * The exit handlers return 1 if the exit was handled fully and guest execution 2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2700 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2957 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2714,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2714 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 2971 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2715 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2972 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2716 [EXIT_REASON_HLT] = handle_halt, 2973 [EXIT_REASON_HLT] = handle_halt,
2974 [EXIT_REASON_INVLPG] = handle_invlpg,
2717 [EXIT_REASON_VMCALL] = handle_vmcall, 2975 [EXIT_REASON_VMCALL] = handle_vmcall,
2718 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2976 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2719 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2977 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2735,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2735 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 struct vcpu_vmx *vmx = to_vmx(vcpu);
2736 u32 vectoring_info = vmx->idt_vectoring_info; 2994 u32 vectoring_info = vmx->idt_vectoring_info;
2737 2995
2738 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), 2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2739 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); 2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2740 2998
 2741 /* Accessing CR3 doesn't cause a VMExit in paging mode, so we need 2999
 2742 * to sync with the guest's real CR3. */ 3000
@@ -2829,88 +3087,92 @@ static void enable_intr_window(struct kvm_vcpu *vcpu)
2829 enable_irq_window(vcpu); 3087 enable_irq_window(vcpu);
2830} 3088}
2831 3089
2832static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
2833{ 3091{
2834 struct vcpu_vmx *vmx = to_vmx(vcpu); 3092 u32 exit_intr_info;
2835 u32 idtv_info_field, intr_info_field, exit_intr_info_field; 3093 u32 idt_vectoring_info;
2836 int vector; 3094 bool unblock_nmi;
3095 u8 vector;
3096 int type;
3097 bool idtv_info_valid;
3098 u32 error;
2837 3099
2838 update_tpr_threshold(vcpu); 3100 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2839 3101 if (cpu_has_virtual_nmis()) {
2840 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); 3102 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
2841 exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); 3103 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
2842 idtv_info_field = vmx->idt_vectoring_info; 3104 /*
2843 if (intr_info_field & INTR_INFO_VALID_MASK) { 3105 * SDM 3: 25.7.1.2
2844 if (idtv_info_field & INTR_INFO_VALID_MASK) { 3106 * Re-set bit "block by NMI" before VM entry if vmexit caused by
2845 /* TODO: fault when IDT_Vectoring */ 3107 * a guest IRET fault.
2846 if (printk_ratelimit()) 3108 */
2847 printk(KERN_ERR "Fault when IDT_Vectoring\n"); 3109 if (unblock_nmi && vector != DF_VECTOR)
2848 } 3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2849 enable_intr_window(vcpu); 3111 GUEST_INTR_STATE_NMI);
2850 return;
2851 } 3112 }
2852 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2853 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2854 == INTR_TYPE_EXT_INTR
2855 && vcpu->arch.rmode.active) {
2856 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2857
2858 vmx_inject_irq(vcpu, vect);
2859 enable_intr_window(vcpu);
2860 return;
2861 }
2862
2863 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler);
2864 3113
3114 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118 if (vmx->vcpu.arch.nmi_injected) {
2865 /* 3119 /*
2866 * SDM 3: 25.7.1.2 3120 * SDM 3: 25.7.1.2
2867 * Clear bit "block by NMI" before VM entry if a NMI delivery 3121 * Clear bit "block by NMI" before VM entry if a NMI delivery
2868 * faulted. 3122 * faulted.
2869 */ 3123 */
2870 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) 3124 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
2871 == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) 3125 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2872 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3126 GUEST_INTR_STATE_NMI);
2873 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3127 else
2874 ~GUEST_INTR_STATE_NMI); 3128 vmx->vcpu.arch.nmi_injected = false;
2875 3129 }
2876 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field 3130 kvm_clear_exception_queue(&vmx->vcpu);
2877 & ~INTR_INFO_RESVD_BITS_MASK); 3131 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
2878 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 3132 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
2879 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 3133 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
2880 3134 kvm_queue_exception_e(&vmx->vcpu, vector, error);
2881 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 3135 } else
2882 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 3136 kvm_queue_exception(&vmx->vcpu, vector);
2883 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 3137 vmx->idt_vectoring_info = 0;
2884 enable_intr_window(vcpu);
2885 return;
2886 } 3138 }
3139 kvm_clear_interrupt_queue(&vmx->vcpu);
3140 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141 kvm_queue_interrupt(&vmx->vcpu, vector);
3142 vmx->idt_vectoring_info = 0;
3143 }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148 update_tpr_threshold(vcpu);
3149
2887 if (cpu_has_virtual_nmis()) { 3150 if (cpu_has_virtual_nmis()) {
2888 /* 3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
2889 * SDM 3: 25.7.1.2 3152 if (vmx_nmi_enabled(vcpu)) {
2890 * Re-set bit "block by NMI" before VM entry if vmexit caused by 3153 vcpu->arch.nmi_pending = false;
2891 * a guest IRET fault. 3154 vcpu->arch.nmi_injected = true;
2892 */ 3155 } else {
2893 if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && 3156 enable_intr_window(vcpu);
2894 (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) 3157 return;
2895 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 3158 }
2896 vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | 3159 }
2897 GUEST_INTR_STATE_NMI); 3160 if (vcpu->arch.nmi_injected) {
2898 else if (vcpu->arch.nmi_pending) { 3161 vmx_inject_nmi(vcpu);
2899 if (vmx_nmi_enabled(vcpu))
2900 vmx_inject_nmi(vcpu);
2901 enable_intr_window(vcpu); 3162 enable_intr_window(vcpu);
2902 return; 3163 return;
2903 } 3164 }
2904
2905 } 3165 }
2906 if (!kvm_cpu_has_interrupt(vcpu)) 3166 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
2907 return; 3167 if (vmx_irq_enabled(vcpu))
2908 if (vmx_irq_enabled(vcpu)) { 3168 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
2909 vector = kvm_cpu_get_interrupt(vcpu); 3169 else
2910 vmx_inject_irq(vcpu, vector); 3170 enable_irq_window(vcpu);
2911 kvm_timer_intr_post(vcpu, vector); 3171 }
2912 } else 3172 if (vcpu->arch.interrupt.pending) {
2913 enable_irq_window(vcpu); 3173 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
3174 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
3175 }
2914} 3176}
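
vmx_complete_interrupts() replaces the old replay-through-VM_ENTRY_INTR_INFO_FIELD scheme: pending events recovered from IDT_VECTORING_INFO are parked in the vcpu's generic exception/interrupt queues and re-injected by vmx_intr_assist() on the next entry. A sketch of the queue helpers this leans on, assuming the x86.h additions from the same series:

	static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
	{
		vcpu->arch.interrupt.pending = true;
		vcpu->arch.interrupt.nr = vector;
	}

	static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
	{
		vcpu->arch.interrupt.pending = false;
	}

Because the queues persist across the exit, the old printk_ratelimit() "Fault when IDT_Vectoring" path disappears: an event that faulted during delivery is simply re-queued.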
2915 3177
2916/* 3178/*
@@ -2922,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2922static void fixup_rmode_irq(struct vcpu_vmx *vmx) 3184static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2923{ 3185{
2924 vmx->rmode.irq.pending = 0; 3186 vmx->rmode.irq.pending = 0;
2925 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) 3187 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
2926 return; 3188 return;
2927 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); 3189 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
2928 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3190 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2929 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3191 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2930 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3192 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2936,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2936 | vmx->rmode.irq.vector; 3198 | vmx->rmode.irq.vector;
2937} 3199}
2938 3200
3201#ifdef CONFIG_X86_64
3202#define R "r"
3203#define Q "q"
3204#else
3205#define R "e"
3206#define Q "l"
3207#endif
3208
2939static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3209static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2940{ 3210{
2941 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 struct vcpu_vmx *vmx = to_vmx(vcpu);
2942 u32 intr_info; 3212 u32 intr_info;
2943 3213
3214 /* Handle invalid guest state instead of entering VMX */
3215 if (vmx->emulation_required && emulate_invalid_guest_state) {
3216 handle_invalid_guest_state(vcpu, kvm_run);
3217 return;
3218 }
3219
3220 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3221 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3222 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3223 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3224
2944 /* 3225 /*
2945 * Loading guest fpu may have cleared host cr0.ts 3226 * Loading guest fpu may have cleared host cr0.ts
2946 */ 3227 */
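
Alongside R, the VMX version needs Q for operand-size suffixes: "push"Q" (%%"R"sp)" pastes to pushq (%%rsp) on 64-bit and pushl (%%esp) on 32-bit. The other addition here is host_rsp caching: HOST_RSP only needs rewriting when the stack pointer has actually moved, so the asm compares first and skips the VMWRITE when it can. In C terms the fast path is roughly:

	/* Sketch of what the cmp/je/mov/VMWRITE sequence implements. */
	if (vmx->host_rsp != current_rsp) {
		vmx->host_rsp = current_rsp;
		vmcs_writel(HOST_RSP, current_rsp);	/* via ASM_VMX_VMWRITE_RSP_RDX */
	}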
@@ -2948,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2948 3229
2949 asm( 3230 asm(
2950 /* Store host registers */ 3231 /* Store host registers */
2951#ifdef CONFIG_X86_64 3232 "push %%"R"dx; push %%"R"bp;"
2952 "push %%rdx; push %%rbp;" 3233 "push %%"R"cx \n\t"
2953 "push %%rcx \n\t" 3234 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
2954#else 3235 "je 1f \n\t"
2955 "push %%edx; push %%ebp;" 3236 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
2956 "push %%ecx \n\t"
2957#endif
2958 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" 3237 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
3238 "1: \n\t"
2959 /* Check if vmlaunch of vmresume is needed */ 3239 /* Check if vmlaunch of vmresume is needed */
2960 "cmpl $0, %c[launched](%0) \n\t" 3240 "cmpl $0, %c[launched](%0) \n\t"
2961 /* Load guest registers. Don't clobber flags. */ 3241 /* Load guest registers. Don't clobber flags. */
3242 "mov %c[cr2](%0), %%"R"ax \n\t"
3243 "mov %%"R"ax, %%cr2 \n\t"
3244 "mov %c[rax](%0), %%"R"ax \n\t"
3245 "mov %c[rbx](%0), %%"R"bx \n\t"
3246 "mov %c[rdx](%0), %%"R"dx \n\t"
3247 "mov %c[rsi](%0), %%"R"si \n\t"
3248 "mov %c[rdi](%0), %%"R"di \n\t"
3249 "mov %c[rbp](%0), %%"R"bp \n\t"
2962#ifdef CONFIG_X86_64 3250#ifdef CONFIG_X86_64
2963 "mov %c[cr2](%0), %%rax \n\t"
2964 "mov %%rax, %%cr2 \n\t"
2965 "mov %c[rax](%0), %%rax \n\t"
2966 "mov %c[rbx](%0), %%rbx \n\t"
2967 "mov %c[rdx](%0), %%rdx \n\t"
2968 "mov %c[rsi](%0), %%rsi \n\t"
2969 "mov %c[rdi](%0), %%rdi \n\t"
2970 "mov %c[rbp](%0), %%rbp \n\t"
2971 "mov %c[r8](%0), %%r8 \n\t" 3251 "mov %c[r8](%0), %%r8 \n\t"
2972 "mov %c[r9](%0), %%r9 \n\t" 3252 "mov %c[r9](%0), %%r9 \n\t"
2973 "mov %c[r10](%0), %%r10 \n\t" 3253 "mov %c[r10](%0), %%r10 \n\t"
@@ -2976,18 +3256,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2976 "mov %c[r13](%0), %%r13 \n\t" 3256 "mov %c[r13](%0), %%r13 \n\t"
2977 "mov %c[r14](%0), %%r14 \n\t" 3257 "mov %c[r14](%0), %%r14 \n\t"
2978 "mov %c[r15](%0), %%r15 \n\t" 3258 "mov %c[r15](%0), %%r15 \n\t"
2979 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2980#else
2981 "mov %c[cr2](%0), %%eax \n\t"
2982 "mov %%eax, %%cr2 \n\t"
2983 "mov %c[rax](%0), %%eax \n\t"
2984 "mov %c[rbx](%0), %%ebx \n\t"
2985 "mov %c[rdx](%0), %%edx \n\t"
2986 "mov %c[rsi](%0), %%esi \n\t"
2987 "mov %c[rdi](%0), %%edi \n\t"
2988 "mov %c[rbp](%0), %%ebp \n\t"
2989 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2990#endif 3259#endif
3260 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3261
2991 /* Enter guest mode */ 3262 /* Enter guest mode */
2992 "jne .Llaunched \n\t" 3263 "jne .Llaunched \n\t"
2993 __ex(ASM_VMX_VMLAUNCH) "\n\t" 3264 __ex(ASM_VMX_VMLAUNCH) "\n\t"
@@ -2995,15 +3266,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2995 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 3266 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2996 ".Lkvm_vmx_return: " 3267 ".Lkvm_vmx_return: "
2997 /* Save guest registers, load host registers, keep flags */ 3268 /* Save guest registers, load host registers, keep flags */
3269 "xchg %0, (%%"R"sp) \n\t"
3270 "mov %%"R"ax, %c[rax](%0) \n\t"
3271 "mov %%"R"bx, %c[rbx](%0) \n\t"
3272 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3273 "mov %%"R"dx, %c[rdx](%0) \n\t"
3274 "mov %%"R"si, %c[rsi](%0) \n\t"
3275 "mov %%"R"di, %c[rdi](%0) \n\t"
3276 "mov %%"R"bp, %c[rbp](%0) \n\t"
2998#ifdef CONFIG_X86_64 3277#ifdef CONFIG_X86_64
2999 "xchg %0, (%%rsp) \n\t"
3000 "mov %%rax, %c[rax](%0) \n\t"
3001 "mov %%rbx, %c[rbx](%0) \n\t"
3002 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
3003 "mov %%rdx, %c[rdx](%0) \n\t"
3004 "mov %%rsi, %c[rsi](%0) \n\t"
3005 "mov %%rdi, %c[rdi](%0) \n\t"
3006 "mov %%rbp, %c[rbp](%0) \n\t"
3007 "mov %%r8, %c[r8](%0) \n\t" 3278 "mov %%r8, %c[r8](%0) \n\t"
3008 "mov %%r9, %c[r9](%0) \n\t" 3279 "mov %%r9, %c[r9](%0) \n\t"
3009 "mov %%r10, %c[r10](%0) \n\t" 3280 "mov %%r10, %c[r10](%0) \n\t"
@@ -3012,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3012 "mov %%r13, %c[r13](%0) \n\t" 3283 "mov %%r13, %c[r13](%0) \n\t"
3013 "mov %%r14, %c[r14](%0) \n\t" 3284 "mov %%r14, %c[r14](%0) \n\t"
3014 "mov %%r15, %c[r15](%0) \n\t" 3285 "mov %%r15, %c[r15](%0) \n\t"
3015 "mov %%cr2, %%rax \n\t"
3016 "mov %%rax, %c[cr2](%0) \n\t"
3017
3018 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
3019#else
3020 "xchg %0, (%%esp) \n\t"
3021 "mov %%eax, %c[rax](%0) \n\t"
3022 "mov %%ebx, %c[rbx](%0) \n\t"
3023 "pushl (%%esp); popl %c[rcx](%0) \n\t"
3024 "mov %%edx, %c[rdx](%0) \n\t"
3025 "mov %%esi, %c[rsi](%0) \n\t"
3026 "mov %%edi, %c[rdi](%0) \n\t"
3027 "mov %%ebp, %c[rbp](%0) \n\t"
3028 "mov %%cr2, %%eax \n\t"
3029 "mov %%eax, %c[cr2](%0) \n\t"
3030
3031 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
3032#endif 3286#endif
3287 "mov %%cr2, %%"R"ax \n\t"
3288 "mov %%"R"ax, %c[cr2](%0) \n\t"
3289
3290 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
3033 "setbe %c[fail](%0) \n\t" 3291 "setbe %c[fail](%0) \n\t"
3034 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 3292 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
3035 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 3293 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
3036 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 3294 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3295 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
3037 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 3296 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
3038 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 3297 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
3039 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 3298 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -3053,14 +3312,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3053#endif 3312#endif
3054 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 3313 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
3055 : "cc", "memory" 3314 : "cc", "memory"
3315 , R"bx", R"di", R"si"
3056#ifdef CONFIG_X86_64 3316#ifdef CONFIG_X86_64
3057 , "rbx", "rdi", "rsi"
3058 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 3317 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
3059#else
3060 , "ebx", "edi", "rsi"
3061#endif 3318#endif
3062 ); 3319 );
3063 3320
3321 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3322 vcpu->arch.regs_dirty = 0;
3323
3064 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3324 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3065 if (vmx->rmode.irq.pending) 3325 if (vmx->rmode.irq.pending)
3066 fixup_rmode_irq(vmx); 3326 fixup_rmode_irq(vmx);
@@ -3080,8 +3340,13 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3080 KVMTRACE_0D(NMI, vcpu, handler); 3340 KVMTRACE_0D(NMI, vcpu, handler);
3081 asm("int $2"); 3341 asm("int $2");
3082 } 3342 }
3343
3344 vmx_complete_interrupts(vmx);
3083} 3345}
3084 3346
3347#undef R
3348#undef Q
3349
3085static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 3350static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
3086{ 3351{
3087 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3224,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3224 .set_idt = vmx_set_idt, 3489 .set_idt = vmx_set_idt,
3225 .get_gdt = vmx_get_gdt, 3490 .get_gdt = vmx_get_gdt,
3226 .set_gdt = vmx_set_gdt, 3491 .set_gdt = vmx_set_gdt,
3227 .cache_regs = vcpu_load_rsp_rip, 3492 .cache_reg = vmx_cache_reg,
3228 .decache_regs = vcpu_put_rsp_rip,
3229 .get_rflags = vmx_get_rflags, 3493 .get_rflags = vmx_get_rflags,
3230 .set_rflags = vmx_set_rflags, 3494 .set_rflags = vmx_set_rflags,
3231 3495
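The vmx_vcpu_run() rework above hinges on two things: the R/Q string-pasting macros that let one asm body serve both 32- and 64-bit builds, and the cached host_rsp, whose cmp/je pair skips the HOST_RSP vmwrite whenever the stack pointer has not moved since the last entry. A minimal sketch of the width-macro idiom, assuming GCC inline asm; demo_read_sp() is an invented helper, not kernel code:

    #ifdef CONFIG_X86_64
    #define R "r"   /* register prefix: %rax, %rsp, ... */
    #define Q "q"   /* operand-size suffix: pushq/popq  */
    #else
    #define R "e"   /* %eax, %esp, ... */
    #define Q "l"   /* pushl/popl      */
    #endif

    /* "%%"R"sp" is pasted into %%esp or %%rsp by the preprocessor, so a
     * single asm string compiles for either word size. */
    static inline unsigned long demo_read_sp(void)
    {
            unsigned long sp;
            asm("mov %%"R"sp, %0" : "=r"(sp));
            return sp;
    }

    #undef R
    #undef Q

The #undef R/#undef Q pair after vmx_vcpu_run() matters for the same reason it appears here: single-letter macros would otherwise leak into everything compiled after this file's headers.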
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 23e8373507a..3e010d21fdd 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -331,24 +331,6 @@ enum vmcs_field {
331 331
332#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
333 333
334#define MSR_IA32_VMX_BASIC 0x480
335#define MSR_IA32_VMX_PINBASED_CTLS 0x481
336#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
337#define MSR_IA32_VMX_EXIT_CTLS 0x483
338#define MSR_IA32_VMX_ENTRY_CTLS 0x484
339#define MSR_IA32_VMX_MISC 0x485
340#define MSR_IA32_VMX_CR0_FIXED0 0x486
341#define MSR_IA32_VMX_CR0_FIXED1 0x487
342#define MSR_IA32_VMX_CR4_FIXED0 0x488
343#define MSR_IA32_VMX_CR4_FIXED1 0x489
344#define MSR_IA32_VMX_VMCS_ENUM 0x48a
345#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
346#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
347
348#define MSR_IA32_FEATURE_CONTROL 0x3a
349#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
350#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
351
352#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
353#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
354 336
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0d682fc6aeb..4f0677d1eae 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
7 * 9 *
8 * Authors: 10 * Authors:
9 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
11 * 15 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See 16 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
19#include "mmu.h" 23#include "mmu.h"
20#include "i8254.h" 24#include "i8254.h"
21#include "tss.h" 25#include "tss.h"
26#include "kvm_cache_regs.h"
27#include "x86.h"
22 28
23#include <linux/clocksource.h> 29#include <linux/clocksource.h>
30#include <linux/interrupt.h>
24#include <linux/kvm.h> 31#include <linux/kvm.h>
25#include <linux/fs.h> 32#include <linux/fs.h>
26#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
27#include <linux/module.h> 34#include <linux/module.h>
28#include <linux/mman.h> 35#include <linux/mman.h>
29#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/intel-iommu.h>
30 38
31#include <asm/uaccess.h> 39#include <asm/uaccess.h>
32#include <asm/msr.h> 40#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61 struct kvm_cpuid_entry2 __user *entries); 69 struct kvm_cpuid_entry2 __user *entries);
62 70
63struct kvm_x86_ops *kvm_x86_ops; 71struct kvm_x86_ops *kvm_x86_ops;
72EXPORT_SYMBOL_GPL(kvm_x86_ops);
64 73
65struct kvm_stats_debugfs_item debugfs_entries[] = { 74struct kvm_stats_debugfs_item debugfs_entries[] = {
66 { "pf_fixed", VCPU_STAT(pf_fixed) }, 75 { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -83,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
83 { "fpu_reload", VCPU_STAT(fpu_reload) }, 92 { "fpu_reload", VCPU_STAT(fpu_reload) },
84 { "insn_emulation", VCPU_STAT(insn_emulation) }, 93 { "insn_emulation", VCPU_STAT(insn_emulation) },
85 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) },
86 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
87 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 97 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
88 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -90,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
90 { "mmu_flooded", VM_STAT(mmu_flooded) }, 100 { "mmu_flooded", VM_STAT(mmu_flooded) },
91 { "mmu_recycled", VM_STAT(mmu_recycled) }, 101 { "mmu_recycled", VM_STAT(mmu_recycled) },
92 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) },
93 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
94 { "largepages", VM_STAT(lpages) }, 105 { "largepages", VM_STAT(lpages) },
95 { NULL } 106 { NULL }
96}; 107};
97 108
98
99unsigned long segment_base(u16 selector) 109unsigned long segment_base(u16 selector)
100{ 110{
101 struct descriptor_table gdt; 111 struct descriptor_table gdt;
@@ -352,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
352void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 362void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
353{ 363{
354 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
365 kvm_mmu_sync_roots(vcpu);
355 kvm_mmu_flush_tlb(vcpu); 366 kvm_mmu_flush_tlb(vcpu);
356 return; 367 return;
357 } 368 }
@@ -564,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
564 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 575 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
565 576
566 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 577 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
567 __FUNCTION__, tsc_khz, hv_clock->tsc_shift, 578 __func__, tsc_khz, hv_clock->tsc_shift,
568 hv_clock->tsc_to_system_mul); 579 hv_clock->tsc_to_system_mul);
569} 580}
570 581
@@ -662,6 +673,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
662 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
663 __func__, data); 674 __func__, data);
664 break; 675 break;
676 case MSR_IA32_DEBUGCTLMSR:
677 if (!data) {
678 /* We support the non-activated case already */
679 break;
680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
681 /* Values other than LBR and BTF are vendor-specific,
 682 * thus reserved and should throw a #GP */
683 return 1;
684 }
685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
686 __func__, data);
687 break;
665 case MSR_IA32_UCODE_REV: 688 case MSR_IA32_UCODE_REV:
666 case MSR_IA32_UCODE_WRITE: 689 case MSR_IA32_UCODE_WRITE:
667 break; 690 break;
@@ -692,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
692 /* ...but clean it before doing the actual write */ 715 /* ...but clean it before doing the actual write */
693 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
694 717
695 down_read(&current->mm->mmap_sem);
696 vcpu->arch.time_page = 718 vcpu->arch.time_page =
697 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
698 up_read(&current->mm->mmap_sem);
699 720
700 if (is_error_page(vcpu->arch.time_page)) { 721 if (is_error_page(vcpu->arch.time_page)) {
701 kvm_release_page_clean(vcpu->arch.time_page); 722 kvm_release_page_clean(vcpu->arch.time_page);
@@ -752,8 +773,14 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
752 case MSR_IA32_MC0_MISC+8: 773 case MSR_IA32_MC0_MISC+8:
753 case MSR_IA32_MC0_MISC+12: 774 case MSR_IA32_MC0_MISC+12:
754 case MSR_IA32_MC0_MISC+16: 775 case MSR_IA32_MC0_MISC+16:
776 case MSR_IA32_MC0_MISC+20:
755 case MSR_IA32_UCODE_REV: 777 case MSR_IA32_UCODE_REV:
756 case MSR_IA32_EBL_CR_POWERON: 778 case MSR_IA32_EBL_CR_POWERON:
779 case MSR_IA32_DEBUGCTLMSR:
780 case MSR_IA32_LASTBRANCHFROMIP:
781 case MSR_IA32_LASTBRANCHTOIP:
782 case MSR_IA32_LASTINTFROMIP:
783 case MSR_IA32_LASTINTTOIP:
757 data = 0; 784 data = 0;
758 break; 785 break;
759 case MSR_MTRRcap: 786 case MSR_MTRRcap:
@@ -901,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
901 case KVM_CAP_PV_MMU: 928 case KVM_CAP_PV_MMU:
902 r = !tdp_enabled; 929 r = !tdp_enabled;
903 break; 930 break;
931 case KVM_CAP_IOMMU:
932 r = intel_iommu_found();
933 break;
904 default: 934 default:
905 r = 0; 935 r = 0;
906 break; 936 break;
@@ -1303,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1303 struct kvm_vcpu *vcpu = filp->private_data; 1333 struct kvm_vcpu *vcpu = filp->private_data;
1304 void __user *argp = (void __user *)arg; 1334 void __user *argp = (void __user *)arg;
1305 int r; 1335 int r;
1336 struct kvm_lapic_state *lapic = NULL;
1306 1337
1307 switch (ioctl) { 1338 switch (ioctl) {
1308 case KVM_GET_LAPIC: { 1339 case KVM_GET_LAPIC: {
1309 struct kvm_lapic_state lapic; 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1310 1341
1311 memset(&lapic, 0, sizeof lapic); 1342 r = -ENOMEM;
1312 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1343 if (!lapic)
1344 goto out;
1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1313 if (r) 1346 if (r)
1314 goto out; 1347 goto out;
1315 r = -EFAULT; 1348 r = -EFAULT;
1316 if (copy_to_user(argp, &lapic, sizeof lapic)) 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1317 goto out; 1350 goto out;
1318 r = 0; 1351 r = 0;
1319 break; 1352 break;
1320 } 1353 }
1321 case KVM_SET_LAPIC: { 1354 case KVM_SET_LAPIC: {
1322 struct kvm_lapic_state lapic; 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1323 1356 r = -ENOMEM;
1357 if (!lapic)
1358 goto out;
1324 r = -EFAULT; 1359 r = -EFAULT;
1325 if (copy_from_user(&lapic, argp, sizeof lapic)) 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1326 goto out; 1361 goto out;
1327 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1328 if (r) 1363 if (r)
1329 goto out; 1364 goto out;
1330 r = 0; 1365 r = 0;
@@ -1422,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1422 r = -EINVAL; 1457 r = -EINVAL;
1423 } 1458 }
1424out: 1459out:
1460 if (lapic)
1461 kfree(lapic);
1425 return r; 1462 return r;
1426} 1463}
1427 1464
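The KVM_GET/SET_LAPIC rework above is a kernel-stack fix rather than a behavioral change: struct kvm_lapic_state is about a kilobyte, too large to sit on the kernel stack, so it moves to the heap for the duration of the ioctl. A sketch of the pattern with invented names (struct big_state, demo_ioctl; a kernel-context fragment needing <linux/slab.h> and <linux/uaccess.h>):

    struct big_state { unsigned char regs[1024]; }; /* stand-in for kvm_lapic_state */

    static long demo_ioctl(void __user *argp)
    {
            struct big_state *s;
            long r;

            s = kmalloc(sizeof(*s), GFP_KERNEL);    /* was: a stack variable */
            if (!s)
                    return -ENOMEM;
            r = -EFAULT;
            if (copy_from_user(s, argp, sizeof(*s)))
                    goto out;
            r = 0;                                  /* ... operate on *s ... */
    out:
            kfree(s);
            return r;
    }

Since kfree(NULL) is a no-op, the "if (lapic)" guard in the shared out: path above is belt-and-braces; a bare kfree would do.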
@@ -1630,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
1630 struct kvm *kvm = filp->private_data; 1667 struct kvm *kvm = filp->private_data;
1631 void __user *argp = (void __user *)arg; 1668 void __user *argp = (void __user *)arg;
1632 int r = -EINVAL; 1669 int r = -EINVAL;
1670 /*
1671 * This union makes it completely explicit to gcc-3.x
1672 * that these two variables' stack usage should be
1673 * combined, not added together.
1674 */
1675 union {
1676 struct kvm_pit_state ps;
1677 struct kvm_memory_alias alias;
1678 } u;
1633 1679
1634 switch (ioctl) { 1680 switch (ioctl) {
1635 case KVM_SET_TSS_ADDR: 1681 case KVM_SET_TSS_ADDR:
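The union above is purely a frame-size hint: the two payloads (ps and alias) are never live at the same time, and overlaying them makes the frame pay max(sizeof(ps), sizeof(alias)) rather than the sum, even on gcc-3.x, which did not reliably share stack slots between block-scoped locals. A standalone illustration with invented types:

    #include <stdio.h>

    struct big_a { char buf[512]; };
    struct big_b { char buf[768]; };

    static long dispatch(int cmd)
    {
            union {         /* frame cost: max(512, 768), not 1280 */
                    struct big_a a;
                    struct big_b b;
            } u;

            switch (cmd) {
            case 0: u.a.buf[0] = 'A'; return u.a.buf[0];
            case 1: u.b.buf[0] = 'B'; return u.b.buf[0];
            default: return -1;
            }
    }

    int main(void)
    {
            printf("%ld %ld\n", dispatch(0), dispatch(1));
            return 0;
    }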
@@ -1661,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
1661 case KVM_GET_NR_MMU_PAGES: 1707 case KVM_GET_NR_MMU_PAGES:
1662 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1663 break; 1709 break;
1664 case KVM_SET_MEMORY_ALIAS: { 1710 case KVM_SET_MEMORY_ALIAS:
1665 struct kvm_memory_alias alias;
1666
1667 r = -EFAULT; 1711 r = -EFAULT;
1668 if (copy_from_user(&alias, argp, sizeof alias)) 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1669 goto out; 1713 goto out;
1670 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1671 if (r) 1715 if (r)
1672 goto out; 1716 goto out;
1673 break; 1717 break;
1674 }
1675 case KVM_CREATE_IRQCHIP: 1718 case KVM_CREATE_IRQCHIP:
1676 r = -ENOMEM; 1719 r = -ENOMEM;
1677 kvm->arch.vpic = kvm_create_pic(kvm); 1720 kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1699,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
1699 goto out; 1742 goto out;
1700 if (irqchip_in_kernel(kvm)) { 1743 if (irqchip_in_kernel(kvm)) {
1701 mutex_lock(&kvm->lock); 1744 mutex_lock(&kvm->lock);
1702 if (irq_event.irq < 16) 1745 kvm_set_irq(kvm, irq_event.irq, irq_event.level);
1703 kvm_pic_set_irq(pic_irqchip(kvm),
1704 irq_event.irq,
1705 irq_event.level);
1706 kvm_ioapic_set_irq(kvm->arch.vioapic,
1707 irq_event.irq,
1708 irq_event.level);
1709 mutex_unlock(&kvm->lock); 1746 mutex_unlock(&kvm->lock);
1710 r = 0; 1747 r = 0;
1711 } 1748 }
@@ -1713,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
1713 } 1750 }
1714 case KVM_GET_IRQCHIP: { 1751 case KVM_GET_IRQCHIP: {
1715 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1752 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1716 struct kvm_irqchip chip; 1753 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1717 1754
1718 r = -EFAULT; 1755 r = -ENOMEM;
1719 if (copy_from_user(&chip, argp, sizeof chip)) 1756 if (!chip)
1720 goto out; 1757 goto out;
1758 r = -EFAULT;
1759 if (copy_from_user(chip, argp, sizeof *chip))
1760 goto get_irqchip_out;
1721 r = -ENXIO; 1761 r = -ENXIO;
1722 if (!irqchip_in_kernel(kvm)) 1762 if (!irqchip_in_kernel(kvm))
1723 goto out; 1763 goto get_irqchip_out;
1724 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1764 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1725 if (r) 1765 if (r)
1726 goto out; 1766 goto get_irqchip_out;
1727 r = -EFAULT; 1767 r = -EFAULT;
1728 if (copy_to_user(argp, &chip, sizeof chip)) 1768 if (copy_to_user(argp, chip, sizeof *chip))
1729 goto out; 1769 goto get_irqchip_out;
1730 r = 0; 1770 r = 0;
1771 get_irqchip_out:
1772 kfree(chip);
1773 if (r)
1774 goto out;
1731 break; 1775 break;
1732 } 1776 }
1733 case KVM_SET_IRQCHIP: { 1777 case KVM_SET_IRQCHIP: {
1734 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1778 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1735 struct kvm_irqchip chip; 1779 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1736 1780
1737 r = -EFAULT; 1781 r = -ENOMEM;
1738 if (copy_from_user(&chip, argp, sizeof chip)) 1782 if (!chip)
1739 goto out; 1783 goto out;
1784 r = -EFAULT;
1785 if (copy_from_user(chip, argp, sizeof *chip))
1786 goto set_irqchip_out;
1740 r = -ENXIO; 1787 r = -ENXIO;
1741 if (!irqchip_in_kernel(kvm)) 1788 if (!irqchip_in_kernel(kvm))
1742 goto out; 1789 goto set_irqchip_out;
1743 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1790 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1744 if (r) 1791 if (r)
1745 goto out; 1792 goto set_irqchip_out;
1746 r = 0; 1793 r = 0;
1794 set_irqchip_out:
1795 kfree(chip);
1796 if (r)
1797 goto out;
1747 break; 1798 break;
1748 } 1799 }
1749 case KVM_GET_PIT: { 1800 case KVM_GET_PIT: {
1750 struct kvm_pit_state ps;
1751 r = -EFAULT; 1801 r = -EFAULT;
1752 if (copy_from_user(&ps, argp, sizeof ps)) 1802 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1753 goto out; 1803 goto out;
1754 r = -ENXIO; 1804 r = -ENXIO;
1755 if (!kvm->arch.vpit) 1805 if (!kvm->arch.vpit)
1756 goto out; 1806 goto out;
1757 r = kvm_vm_ioctl_get_pit(kvm, &ps); 1807 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1758 if (r) 1808 if (r)
1759 goto out; 1809 goto out;
1760 r = -EFAULT; 1810 r = -EFAULT;
1761 if (copy_to_user(argp, &ps, sizeof ps)) 1811 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1762 goto out; 1812 goto out;
1763 r = 0; 1813 r = 0;
1764 break; 1814 break;
1765 } 1815 }
1766 case KVM_SET_PIT: { 1816 case KVM_SET_PIT: {
1767 struct kvm_pit_state ps;
1768 r = -EFAULT; 1817 r = -EFAULT;
1769 if (copy_from_user(&ps, argp, sizeof ps)) 1818 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1770 goto out; 1819 goto out;
1771 r = -ENXIO; 1820 r = -ENXIO;
1772 if (!kvm->arch.vpit) 1821 if (!kvm->arch.vpit)
1773 goto out; 1822 goto out;
1774 r = kvm_vm_ioctl_set_pit(kvm, &ps); 1823 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1775 if (r) 1824 if (r)
1776 goto out; 1825 goto out;
1777 r = 0; 1826 r = 0;
@@ -2018,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2018 2067
2019 val = *(u64 *)new; 2068 val = *(u64 *)new;
2020 2069
2021 down_read(&current->mm->mmap_sem);
2022 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2070 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
2023 up_read(&current->mm->mmap_sem);
2024 2071
2025 kaddr = kmap_atomic(page, KM_USER0); 2072 kaddr = kmap_atomic(page, KM_USER0);
2026 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2073 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2040,6 +2087,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2040 2087
2041int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2088int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2042{ 2089{
2090 kvm_mmu_invlpg(vcpu, address);
2043 return X86EMUL_CONTINUE; 2091 return X86EMUL_CONTINUE;
2044} 2092}
2045 2093
@@ -2080,7 +2128,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2080void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2128void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2081{ 2129{
2082 u8 opcodes[4]; 2130 u8 opcodes[4];
2083 unsigned long rip = vcpu->arch.rip; 2131 unsigned long rip = kvm_rip_read(vcpu);
2084 unsigned long rip_linear; 2132 unsigned long rip_linear;
2085 2133
2086 if (!printk_ratelimit()) 2134 if (!printk_ratelimit())
@@ -2102,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
2102 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2150 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2103}; 2151};
2104 2152
2153static void cache_all_regs(struct kvm_vcpu *vcpu)
2154{
2155 kvm_register_read(vcpu, VCPU_REGS_RAX);
2156 kvm_register_read(vcpu, VCPU_REGS_RSP);
2157 kvm_register_read(vcpu, VCPU_REGS_RIP);
2158 vcpu->arch.regs_dirty = ~0;
2159}
2160
2105int emulate_instruction(struct kvm_vcpu *vcpu, 2161int emulate_instruction(struct kvm_vcpu *vcpu,
2106 struct kvm_run *run, 2162 struct kvm_run *run,
2107 unsigned long cr2, 2163 unsigned long cr2,
@@ -2111,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2111 int r; 2167 int r;
2112 struct decode_cache *c; 2168 struct decode_cache *c;
2113 2169
2170 kvm_clear_exception_queue(vcpu);
2114 vcpu->arch.mmio_fault_cr2 = cr2; 2171 vcpu->arch.mmio_fault_cr2 = cr2;
2115 kvm_x86_ops->cache_regs(vcpu); 2172 /*
2173 * TODO: fix x86_emulate.c to use guest_read/write_register
 2174 * instead of direct ->regs accesses, can save hundreds of cycles
 2175 * on Intel for instructions that don't read/change RSP,
 2176 * for example.
2177 */
2178 cache_all_regs(vcpu);
2116 2179
2117 vcpu->mmio_is_write = 0; 2180 vcpu->mmio_is_write = 0;
2118 vcpu->arch.pio.string = 0; 2181 vcpu->arch.pio.string = 0;
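From here on, x86.c reads and writes guest registers one at a time through kvm_register_read()/kvm_register_write() instead of the old bulk cache_regs/decache_regs hooks. The accessors themselves live in kvm_cache_regs.h, which this diff only includes; the sketch below reconstructs their likely shape from the fields the patch does use (regs_avail, regs_dirty, the ->cache_reg hook) and is not quoted from that header:

    static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
                                                  enum kvm_reg reg)
    {
            /* lazily fault the register in from hardware, once per exit */
            if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
                    kvm_x86_ops->cache_reg(vcpu, reg);
            return vcpu->arch.regs[reg];
    }

    static inline void kvm_register_write(struct kvm_vcpu *vcpu,
                                          enum kvm_reg reg,
                                          unsigned long val)
    {
            vcpu->arch.regs[reg] = val;
            __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
            __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); /* flush before entry */
    }

This is also what cache_all_regs() above leans on: after a VMX exit only RSP and RIP are left unread (vmx_vcpu_run resets regs_avail to everything except those two), so touching them makes the whole array valid, and regs_dirty = ~0 forces a full write-back once the emulator has poked ->regs directly.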
@@ -2172,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2172 return EMULATE_DO_MMIO; 2235 return EMULATE_DO_MMIO;
2173 } 2236 }
2174 2237
2175 kvm_x86_ops->decache_regs(vcpu);
2176 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2238 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2177 2239
2178 if (vcpu->mmio_is_write) { 2240 if (vcpu->mmio_is_write) {
@@ -2225,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
2225 struct kvm_pio_request *io = &vcpu->arch.pio; 2287 struct kvm_pio_request *io = &vcpu->arch.pio;
2226 long delta; 2288 long delta;
2227 int r; 2289 int r;
2228 2290 unsigned long val;
2229 kvm_x86_ops->cache_regs(vcpu);
2230 2291
2231 if (!io->string) { 2292 if (!io->string) {
2232 if (io->in) 2293 if (io->in) {
2233 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2294 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2234 io->size); 2295 memcpy(&val, vcpu->arch.pio_data, io->size);
2296 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2297 }
2235 } else { 2298 } else {
2236 if (io->in) { 2299 if (io->in) {
2237 r = pio_copy_data(vcpu); 2300 r = pio_copy_data(vcpu);
2238 if (r) { 2301 if (r)
2239 kvm_x86_ops->cache_regs(vcpu);
2240 return r; 2302 return r;
2241 }
2242 } 2303 }
2243 2304
2244 delta = 1; 2305 delta = 1;
@@ -2248,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
2248 * The size of the register should really depend on 2309 * The size of the register should really depend on
2249 * current address size. 2310 * current address size.
2250 */ 2311 */
2251 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2312 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2313 val -= delta;
2314 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2252 } 2315 }
2253 if (io->down) 2316 if (io->down)
2254 delta = -delta; 2317 delta = -delta;
2255 delta *= io->size; 2318 delta *= io->size;
2256 if (io->in) 2319 if (io->in) {
2257 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2320 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2258 else 2321 val += delta;
2259 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2322 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2323 } else {
2324 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2325 val += delta;
2326 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2327 }
2260 } 2328 }
2261 2329
2262 kvm_x86_ops->decache_regs(vcpu);
2263
2264 io->count -= io->cur_count; 2330 io->count -= io->cur_count;
2265 io->cur_count = 0; 2331 io->cur_count = 0;
2266 2332
@@ -2313,6 +2379,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2313 int size, unsigned port) 2379 int size, unsigned port)
2314{ 2380{
2315 struct kvm_io_device *pio_dev; 2381 struct kvm_io_device *pio_dev;
2382 unsigned long val;
2316 2383
2317 vcpu->run->exit_reason = KVM_EXIT_IO; 2384 vcpu->run->exit_reason = KVM_EXIT_IO;
2318 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2385 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2333,8 +2400,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2333 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2400 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2334 handler); 2401 handler);
2335 2402
2336 kvm_x86_ops->cache_regs(vcpu); 2403 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2337 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2404 memcpy(vcpu->arch.pio_data, &val, 4);
2338 2405
2339 kvm_x86_ops->skip_emulated_instruction(vcpu); 2406 kvm_x86_ops->skip_emulated_instruction(vcpu);
2340 2407
@@ -2492,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2492 KVMTRACE_0D(HLT, vcpu, handler); 2559 KVMTRACE_0D(HLT, vcpu, handler);
2493 if (irqchip_in_kernel(vcpu->kvm)) { 2560 if (irqchip_in_kernel(vcpu->kvm)) {
2494 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2561 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2495 up_read(&vcpu->kvm->slots_lock);
2496 kvm_vcpu_block(vcpu);
2497 down_read(&vcpu->kvm->slots_lock);
2498 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2499 return -EINTR;
2500 return 1; 2562 return 1;
2501 } else { 2563 } else {
2502 vcpu->run->exit_reason = KVM_EXIT_HLT; 2564 vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2519,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2519 unsigned long nr, a0, a1, a2, a3, ret; 2581 unsigned long nr, a0, a1, a2, a3, ret;
2520 int r = 1; 2582 int r = 1;
2521 2583
2522 kvm_x86_ops->cache_regs(vcpu); 2584 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2523 2585 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2524 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2586 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2525 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2587 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2526 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2588 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2527 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2528 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2529 2589
2530 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2590 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2531 2591
@@ -2548,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2548 ret = -KVM_ENOSYS; 2608 ret = -KVM_ENOSYS;
2549 break; 2609 break;
2550 } 2610 }
2551 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2611 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2552 kvm_x86_ops->decache_regs(vcpu);
2553 ++vcpu->stat.hypercalls; 2612 ++vcpu->stat.hypercalls;
2554 return r; 2613 return r;
2555} 2614}
@@ -2559,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2559{ 2618{
2560 char instruction[3]; 2619 char instruction[3];
2561 int ret = 0; 2620 int ret = 0;
2621 unsigned long rip = kvm_rip_read(vcpu);
2562 2622
2563 2623
2564 /* 2624 /*
@@ -2568,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2568 */ 2628 */
2569 kvm_mmu_zap_all(vcpu->kvm); 2629 kvm_mmu_zap_all(vcpu->kvm);
2570 2630
2571 kvm_x86_ops->cache_regs(vcpu);
2572 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2631 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2573 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2632 if (emulator_write_emulated(rip, instruction, 3, vcpu)
2574 != X86EMUL_CONTINUE) 2633 != X86EMUL_CONTINUE)
2575 ret = -EFAULT; 2634 ret = -EFAULT;
2576 2635
@@ -2700,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2700 u32 function, index; 2759 u32 function, index;
2701 struct kvm_cpuid_entry2 *e, *best; 2760 struct kvm_cpuid_entry2 *e, *best;
2702 2761
2703 kvm_x86_ops->cache_regs(vcpu); 2762 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2704 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2763 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2705 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2764 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2706 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2765 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2707 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2766 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2708 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2767 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2709 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2710 best = NULL; 2768 best = NULL;
2711 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2769 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2712 e = &vcpu->arch.cpuid_entries[i]; 2770 e = &vcpu->arch.cpuid_entries[i];
@@ -2724,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2724 best = e; 2782 best = e;
2725 } 2783 }
2726 if (best) { 2784 if (best) {
2727 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2785 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2728 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2786 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2729 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2787 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2730 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2788 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2731 } 2789 }
2732 kvm_x86_ops->decache_regs(vcpu);
2733 kvm_x86_ops->skip_emulated_instruction(vcpu); 2790 kvm_x86_ops->skip_emulated_instruction(vcpu);
2734 KVMTRACE_5D(CPUID, vcpu, function, 2791 KVMTRACE_5D(CPUID, vcpu, function,
2735 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2792 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2736 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2793 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2737 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2794 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2738 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2795 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2739} 2796}
2740EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2797EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2741 2798
@@ -2776,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2776 if (!apic || !apic->vapic_addr) 2833 if (!apic || !apic->vapic_addr)
2777 return; 2834 return;
2778 2835
2779 down_read(&current->mm->mmap_sem);
2780 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2836 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2781 up_read(&current->mm->mmap_sem);
2782 2837
2783 vcpu->arch.apic->vapic_page = page; 2838 vcpu->arch.apic->vapic_page = page;
2784} 2839}
@@ -2796,28 +2851,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2796 up_read(&vcpu->kvm->slots_lock); 2851 up_read(&vcpu->kvm->slots_lock);
2797} 2852}
2798 2853
2799static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2854static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2800{ 2855{
2801 int r; 2856 int r;
2802 2857
2803 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2804 pr_debug("vcpu %d received sipi with vector # %x\n",
2805 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2806 kvm_lapic_reset(vcpu);
2807 r = kvm_x86_ops->vcpu_reset(vcpu);
2808 if (r)
2809 return r;
2810 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2811 }
2812
2813 down_read(&vcpu->kvm->slots_lock);
2814 vapic_enter(vcpu);
2815
2816preempted:
2817 if (vcpu->guest_debug.enabled)
2818 kvm_x86_ops->guest_debug_pre(vcpu);
2819
2820again:
2821 if (vcpu->requests) 2858 if (vcpu->requests)
2822 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2859 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2823 kvm_mmu_unload(vcpu); 2860 kvm_mmu_unload(vcpu);
@@ -2829,6 +2866,8 @@ again:
2829 if (vcpu->requests) { 2866 if (vcpu->requests) {
2830 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2867 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2831 __kvm_migrate_timers(vcpu); 2868 __kvm_migrate_timers(vcpu);
2869 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2870 kvm_mmu_sync_roots(vcpu);
2832 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2871 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2833 kvm_x86_ops->tlb_flush(vcpu); 2872 kvm_x86_ops->tlb_flush(vcpu);
2834 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2873 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2854,21 +2893,15 @@ again:
2854 2893
2855 local_irq_disable(); 2894 local_irq_disable();
2856 2895
2857 if (vcpu->requests || need_resched()) { 2896 if (vcpu->requests || need_resched() || signal_pending(current)) {
2858 local_irq_enable(); 2897 local_irq_enable();
2859 preempt_enable(); 2898 preempt_enable();
2860 r = 1; 2899 r = 1;
2861 goto out; 2900 goto out;
2862 } 2901 }
2863 2902
2864 if (signal_pending(current)) { 2903 if (vcpu->guest_debug.enabled)
2865 local_irq_enable(); 2904 kvm_x86_ops->guest_debug_pre(vcpu);
2866 preempt_enable();
2867 r = -EINTR;
2868 kvm_run->exit_reason = KVM_EXIT_INTR;
2869 ++vcpu->stat.signal_exits;
2870 goto out;
2871 }
2872 2905
2873 vcpu->guest_mode = 1; 2906 vcpu->guest_mode = 1;
2874 /* 2907 /*
@@ -2917,8 +2950,8 @@ again:
2917 * Profile KVM exit RIPs: 2950 * Profile KVM exit RIPs:
2918 */ 2951 */
2919 if (unlikely(prof_on == KVM_PROFILING)) { 2952 if (unlikely(prof_on == KVM_PROFILING)) {
2920 kvm_x86_ops->cache_regs(vcpu); 2953 unsigned long rip = kvm_rip_read(vcpu);
2921 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2954 profile_hit(KVM_PROFILING, (void *)rip);
2922 } 2955 }
2923 2956
2924 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2957 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2927,26 +2960,63 @@ again:
2927 kvm_lapic_sync_from_vapic(vcpu); 2960 kvm_lapic_sync_from_vapic(vcpu);
2928 2961
2929 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2962 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2963out:
2964 return r;
2965}
2930 2966
2931 if (r > 0) { 2967static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2932 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2968{
2933 r = -EINTR; 2969 int r;
2934 kvm_run->exit_reason = KVM_EXIT_INTR; 2970
2935 ++vcpu->stat.request_irq_exits; 2971 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2936 goto out; 2972 pr_debug("vcpu %d received sipi with vector # %x\n",
2937 } 2973 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2938 if (!need_resched()) 2974 kvm_lapic_reset(vcpu);
2939 goto again; 2975 r = kvm_x86_ops->vcpu_reset(vcpu);
2976 if (r)
2977 return r;
2978 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2940 } 2979 }
2941 2980
2942out: 2981 down_read(&vcpu->kvm->slots_lock);
2943 up_read(&vcpu->kvm->slots_lock); 2982 vapic_enter(vcpu);
2944 if (r > 0) { 2983
2945 kvm_resched(vcpu); 2984 r = 1;
2946 down_read(&vcpu->kvm->slots_lock); 2985 while (r > 0) {
2947 goto preempted; 2986 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
2987 r = vcpu_enter_guest(vcpu, kvm_run);
2988 else {
2989 up_read(&vcpu->kvm->slots_lock);
2990 kvm_vcpu_block(vcpu);
2991 down_read(&vcpu->kvm->slots_lock);
2992 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
2993 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
2994 vcpu->arch.mp_state =
2995 KVM_MP_STATE_RUNNABLE;
2996 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2997 r = -EINTR;
2998 }
2999
3000 if (r > 0) {
3001 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3002 r = -EINTR;
3003 kvm_run->exit_reason = KVM_EXIT_INTR;
3004 ++vcpu->stat.request_irq_exits;
3005 }
3006 if (signal_pending(current)) {
3007 r = -EINTR;
3008 kvm_run->exit_reason = KVM_EXIT_INTR;
3009 ++vcpu->stat.signal_exits;
3010 }
3011 if (need_resched()) {
3012 up_read(&vcpu->kvm->slots_lock);
3013 kvm_resched(vcpu);
3014 down_read(&vcpu->kvm->slots_lock);
3015 }
3016 }
2948 } 3017 }
2949 3018
3019 up_read(&vcpu->kvm->slots_lock);
2950 post_kvm_run_save(vcpu, kvm_run); 3020 post_kvm_run_save(vcpu, kvm_run);
2951 3021
2952 vapic_exit(vcpu); 3022 vapic_exit(vcpu);
@@ -2966,6 +3036,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2966 3036
2967 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3037 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2968 kvm_vcpu_block(vcpu); 3038 kvm_vcpu_block(vcpu);
3039 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
2969 r = -EAGAIN; 3040 r = -EAGAIN;
2970 goto out; 3041 goto out;
2971 } 3042 }
@@ -2999,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2999 } 3070 }
3000 } 3071 }
3001#endif 3072#endif
3002 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 3073 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3003 kvm_x86_ops->cache_regs(vcpu); 3074 kvm_register_write(vcpu, VCPU_REGS_RAX,
3004 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 3075 kvm_run->hypercall.ret);
3005 kvm_x86_ops->decache_regs(vcpu);
3006 }
3007 3076
3008 r = __vcpu_run(vcpu, kvm_run); 3077 r = __vcpu_run(vcpu, kvm_run);
3009 3078
@@ -3019,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3019{ 3088{
3020 vcpu_load(vcpu); 3089 vcpu_load(vcpu);
3021 3090
3022 kvm_x86_ops->cache_regs(vcpu); 3091 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3023 3092 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3024 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3093 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3025 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 3094 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3026 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 3095 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3027 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 3096 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3028 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 3097 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3029 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 3098 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3030 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3031 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3032#ifdef CONFIG_X86_64 3099#ifdef CONFIG_X86_64
3033 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 3100 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3034 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 3101 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3035 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 3102 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3036 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 3103 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3037 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 3104 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3038 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 3105 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3039 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 3106 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3040 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 3107 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3041#endif 3108#endif
3042 3109
3043 regs->rip = vcpu->arch.rip; 3110 regs->rip = kvm_rip_read(vcpu);
3044 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3111 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3045 3112
3046 /* 3113 /*
@@ -3058,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3058{ 3125{
3059 vcpu_load(vcpu); 3126 vcpu_load(vcpu);
3060 3127
3061 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3128 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3062 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3129 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3063 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3130 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3064 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3131 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3065 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3132 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3066 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3133 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3067 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3134 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3068 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3135 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3069#ifdef CONFIG_X86_64 3136#ifdef CONFIG_X86_64
3070 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3137 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3071 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3138 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3072 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3139 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3073 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3140 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3074 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3141 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3075 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3142 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3076 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3143 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3077 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3144 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3145
3078#endif 3146#endif
3079 3147
3080 vcpu->arch.rip = regs->rip; 3148 kvm_rip_write(vcpu, regs->rip);
3081 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3149 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3082 3150
3083 kvm_x86_ops->decache_regs(vcpu);
3084 3151
3085 vcpu->arch.exception.pending = false; 3152 vcpu->arch.exception.pending = false;
3086 3153
@@ -3294,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3294 return 0; 3361 return 0;
3295} 3362}
3296 3363
3364static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3365{
3366 struct kvm_segment segvar = {
3367 .base = selector << 4,
3368 .limit = 0xffff,
3369 .selector = selector,
3370 .type = 3,
3371 .present = 1,
3372 .dpl = 3,
3373 .db = 0,
3374 .s = 1,
3375 .l = 0,
3376 .g = 0,
3377 .avl = 0,
3378 .unusable = 0,
3379 };
3380 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3381 return 0;
3382}
3383
3297int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3384int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3298 int type_bits, int seg) 3385 int type_bits, int seg)
3299{ 3386{
3300 struct kvm_segment kvm_seg; 3387 struct kvm_segment kvm_seg;
3301 3388
3389 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3390 return kvm_load_realmode_segment(vcpu, selector, seg);
3302 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3391 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3303 return 1; 3392 return 1;
3304 kvm_seg.type |= type_bits; 3393 kvm_seg.type |= type_bits;
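kvm_load_realmode_segment() encodes plain 8086 segmentation: no descriptor-table walk, just base = selector << 4 with a 64 KiB limit and a present, DPL-3, type-3 (read/write data) segment. A worked example of the arithmetic, with illustrative values:

    #include <stdio.h>

    int main(void)
    {
            unsigned short selector = 0xf000;
            unsigned short offset   = 0xfff0;
            unsigned int   base     = (unsigned int)selector << 4;

            /* 0xf0000 + 0xfff0 = 0xffff0: the classic real-mode reset vector */
            printf("linear = 0x%05x\n", base + offset);
            return 0;
    }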
@@ -3316,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3316 struct tss_segment_32 *tss) 3405 struct tss_segment_32 *tss)
3317{ 3406{
3318 tss->cr3 = vcpu->arch.cr3; 3407 tss->cr3 = vcpu->arch.cr3;
3319 tss->eip = vcpu->arch.rip; 3408 tss->eip = kvm_rip_read(vcpu);
3320 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3409 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3321 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3410 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3322 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3411 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3323 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3412 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3324 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3413 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3325 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3414 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3326 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3415 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3327 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3416 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3328 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3417 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3329
3330 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3418 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3331 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3419 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3332 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3420 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3342,17 +3430,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3342{ 3430{
3343 kvm_set_cr3(vcpu, tss->cr3); 3431 kvm_set_cr3(vcpu, tss->cr3);
3344 3432
3345 vcpu->arch.rip = tss->eip; 3433 kvm_rip_write(vcpu, tss->eip);
3346 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3434 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3347 3435
3348 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3349 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3437 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3350 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3438 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3351 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3439 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3352 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3440 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3353 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3441 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3354 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3442 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3355 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3443 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3356 3444
3357 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3445 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3358 return 1; 3446 return 1;
@@ -3380,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3380static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3468static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3381 struct tss_segment_16 *tss) 3469 struct tss_segment_16 *tss)
3382{ 3470{
3383 tss->ip = vcpu->arch.rip; 3471 tss->ip = kvm_rip_read(vcpu);
3384 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3472 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3385 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3473 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3386 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3474 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3387 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3475 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3388 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3476 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3389 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3477 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3390 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3478 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3391 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3479 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3392 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3480 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3393 3481
3394 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3482 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3395 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3483 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3402,16 +3490,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3402static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3490static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3403 struct tss_segment_16 *tss) 3491 struct tss_segment_16 *tss)
3404{ 3492{
3405 vcpu->arch.rip = tss->ip; 3493 kvm_rip_write(vcpu, tss->ip);
3406 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3494 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3407 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3495 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3408 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3496 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3409 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3497 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3410 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3498 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3411 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3499 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3412 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3500 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3413 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3501 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3414 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3502 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3415 3503
3416 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3504 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3417 return 1; 3505 return 1;
@@ -3534,7 +3622,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3534 } 3622 }
3535 3623
3536 kvm_x86_ops->skip_emulated_instruction(vcpu); 3624 kvm_x86_ops->skip_emulated_instruction(vcpu);
3537 kvm_x86_ops->cache_regs(vcpu);
3538 3625
3539 if (nseg_desc.type & 8) 3626 if (nseg_desc.type & 8)
3540 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, 3627 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
@@ -3559,7 +3646,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3559 tr_seg.type = 11; 3646 tr_seg.type = 11;
3560 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3647 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3561out: 3648out:
3562 kvm_x86_ops->decache_regs(vcpu);
3563 return ret; 3649 return ret;
3564} 3650}
3565EXPORT_SYMBOL_GPL(kvm_task_switch); 3651EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3622,6 +3708,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3622 pr_debug("Set back pending irq %d\n", 3708 pr_debug("Set back pending irq %d\n",
3623 pending_vec); 3709 pending_vec);
3624 } 3710 }
3711 kvm_pic_clear_isr_ack(vcpu->kvm);
3625 } 3712 }
3626 3713
3627 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3714 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -3634,6 +3721,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3634 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3721 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3635 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3722 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3636 3723
3724 /* Older userspace won't unhalt the vcpu on reset. */
3725 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3726 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3727 !(vcpu->arch.cr0 & X86_CR0_PE))
3728 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3729
3637 vcpu_put(vcpu); 3730 vcpu_put(vcpu);
3638 3731
3639 return 0; 3732 return 0;
@@ -3918,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
3918 return ERR_PTR(-ENOMEM); 4011 return ERR_PTR(-ENOMEM);
3919 4012
3920 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4013 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4014 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
3921 4015
3922 return kvm; 4016 return kvm;
3923} 4017}
@@ -3950,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
3950 4044
3951void kvm_arch_destroy_vm(struct kvm *kvm) 4045void kvm_arch_destroy_vm(struct kvm *kvm)
3952{ 4046{
4047 kvm_iommu_unmap_guest(kvm);
4048 kvm_free_all_assigned_devices(kvm);
3953 kvm_free_pit(kvm); 4049 kvm_free_pit(kvm);
3954 kfree(kvm->arch.vpic); 4050 kfree(kvm->arch.vpic);
3955 kfree(kvm->arch.vioapic); 4051 kfree(kvm->arch.vioapic);
@@ -3981,7 +4077,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3981 userspace_addr = do_mmap(NULL, 0, 4077 userspace_addr = do_mmap(NULL, 0,
3982 npages * PAGE_SIZE, 4078 npages * PAGE_SIZE,
3983 PROT_READ | PROT_WRITE, 4079 PROT_READ | PROT_WRITE,
3984 MAP_SHARED | MAP_ANONYMOUS, 4080 MAP_PRIVATE | MAP_ANONYMOUS,
3985 0); 4081 0);
3986 up_write(&current->mm->mmap_sem); 4082 up_write(&current->mm->mmap_sem);
3987 4083
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 00000000000..6a4be78a738
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
1#ifndef ARCH_X86_KVM_X86_H
2#define ARCH_X86_KVM_X86_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{
8 vcpu->arch.exception.pending = false;
9}
10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
12{
13 vcpu->arch.interrupt.pending = true;
14 vcpu->arch.interrupt.nr = vector;
15}
16
17static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
18{
19 vcpu->arch.interrupt.pending = false;
20}
21
22#endif
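The new header gives every caller a single owner for pending-event bookkeeping. emulate_instruction() above calls kvm_clear_exception_queue() first so an exception left pending by the previous exit cannot leak into the freshly emulated instruction; injection paths pair that with kvm_queue_interrupt(). A usage sketch with an invented caller, not code from this patch:

    static void demo_inject(struct kvm_vcpu *vcpu, u8 vector)
    {
            kvm_clear_exception_queue(vcpu);   /* drop stale state first */
            kvm_queue_interrupt(vcpu, vector); /* latch vector for entry */
    }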
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index f2f90468f8b..ea051173b0d 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
26#define DPRINTF(_f, _a ...) printf(_f , ## _a) 26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else 27#else
28#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
29#include "kvm_cache_regs.h"
29#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
30#endif 31#endif
31#include <linux/module.h> 32#include <linux/module.h>
@@ -46,25 +47,26 @@
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */ 48#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */ 49#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1) 50#define DstAcc (4<<1) /* Destination Accumulator */
51#define DstMask (7<<1)
50/* Source operand type. */ 52/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */ 53#define SrcNone (0<<4) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ 54#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */ 55#define SrcReg (1<<4) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */ 56#define SrcMem (2<<4) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ 57#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3) 61#define SrcMask (7<<4)
60/* Generic ModRM decode. */ 62/* Generic ModRM decode. */
61#define ModRM (1<<6) 63#define ModRM (1<<7)
62/* Destination is only written; never read. */ 64/* Destination is only written; never read. */
63#define Mov (1<<7) 65#define Mov (1<<8)
64#define BitOp (1<<8) 66#define BitOp (1<<9)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 67#define MemAbs (1<<10) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 68#define String (1<<12) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 69#define Stack (1<<13) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */ 72#define GroupMask 0xff /* Group number stored in bits 0:7 */
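The renumbering above widens the destination field from two bits to three so DstAcc fits, which in turn moves the source field to bits 4-6 and shifts ModRM, Mov, BitOp, MemAbs, String and Stack upward. Field extraction stays a mask-and-shift over the combined flags word; a sketch with invented helper names, using the masks as defined above:

    /* After this patch: DstMask == 7<<1, SrcMask == 7<<4. */
    static inline unsigned int decode_dst(unsigned int flags)
    {
            return (flags & DstMask) >> 1;  /* 1..4: ImplicitOps..DstAcc */
    }

    static inline unsigned int decode_src(unsigned int flags)
    {
            return (flags & SrcMask) >> 4;  /* 0..6: SrcNone..SrcImmByte */
    }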
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
 /* 0x20 - 0x27 */
 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-SrcImmByte, SrcImm, 0, 0,
+DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
 /* 0x28 - 0x2F */
 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
 /* 0x38 - 0x3F */
 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
-0, 0, 0, 0,
+ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
+0, 0,
 /* 0x40 - 0x47 */
 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
 /* 0x48 - 0x4F */
@@ -153,9 +156,16 @@ static u16 opcode_table[256] = {
 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
 ByteOp | ImplicitOps | String, ImplicitOps | String,
-/* 0xB0 - 0xBF */
-0, 0, 0, 0, 0, 0, 0, 0,
-DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0,
+/* 0xB0 - 0xB7 */
+ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
+/* 0xB8 - 0xBF */
+DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
+DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
 /* 0xC0 - 0xC7 */
 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
 0, ImplicitOps | Stack, 0, 0,
@@ -169,17 +179,20 @@ static u16 opcode_table[256] = {
 /* 0xD8 - 0xDF */
 0, 0, 0, 0, 0, 0, 0, 0,
 /* 0xE0 - 0xE7 */
-0, 0, 0, 0, 0, 0, 0, 0,
+0, 0, 0, 0,
+SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 /* 0xE8 - 0xEF */
 ImplicitOps | Stack, SrcImm | ImplicitOps,
 ImplicitOps, SrcImmByte | ImplicitOps,
-0, 0, 0, 0,
+SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
+SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
 /* 0xF0 - 0xF7 */
 0, 0, 0, 0,
 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
 /* 0xF8 - 0xFF */
 ImplicitOps, 0, ImplicitOps, ImplicitOps,
-0, 0, Group | Group4, Group | Group5,
+ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
 };
 
 static u16 twobyte_table[256] = {
@@ -268,15 +281,16 @@ static u16 group_table[] = {
 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 0, 0, 0, 0,
 [Group3*8] =
-DstMem | SrcImm | ModRM | SrcImm, 0,
-DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
+DstMem | SrcImm | ModRM, 0,
+DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
 0, 0, 0, 0,
 [Group4*8] =
 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
 0, 0, 0, 0, 0, 0,
 [Group5*8] =
-DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0,
-SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0,
+DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
+SrcMem | ModRM | Stack, 0,
+SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
 [Group7*8] =
 0, 0, ModRM | SrcMem, ModRM | SrcMem,
 SrcNone | ModRM | DstMem | Mov, 0,
@@ -839,7 +853,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 /* Shadow copy of register state. Committed on successful emulation. */
 
 memset(c, 0, sizeof(struct decode_cache));
-c->eip = ctxt->vcpu->arch.rip;
+c->eip = kvm_rip_read(ctxt->vcpu);
 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
 
@@ -1048,6 +1062,23 @@ done_prefixes:
 }
 c->dst.type = OP_MEM;
 break;
+case DstAcc:
+c->dst.type = OP_REG;
+c->dst.bytes = c->op_bytes;
+c->dst.ptr = &c->regs[VCPU_REGS_RAX];
+switch (c->op_bytes) {
+case 1:
+c->dst.val = *(u8 *)c->dst.ptr;
+break;
+case 2:
+c->dst.val = *(u16 *)c->dst.ptr;
+break;
+case 4:
+c->dst.val = *(u32 *)c->dst.ptr;
+break;
+}
+c->dst.orig_val = c->dst.val;
+break;
 }
 
 if (c->rip_relative)
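The DstAcc decode case added above reads only the low op_bytes of the RAX slot through a narrowing cast; that works because x86 is little-endian, so AL/AX/EAX occupy the first bytes of the register in memory. A freestanding illustration of the same trick (memcpy used here to sidestep aliasing rules; the emulator casts directly):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	uint64_t rax = 0x1122334455667788ull;  /* stand-in for c->regs[RAX] */
	uint8_t b; uint16_t w; uint32_t l;

	memcpy(&b, &rax, 1);  /* AL  = 0x88 */
	memcpy(&w, &rax, 2);  /* AX  = 0x7788 */
	memcpy(&l, &rax, 4);  /* EAX = 0x55667788 */
	printf("AL=%#x AX=%#x EAX=%#x\n", b, w, l);
	return 0;
}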
@@ -1151,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
 case 1: /* dec */
 emulate_1op("dec", c->dst, ctxt->eflags);
 break;
+case 2: /* call near abs */ {
+long int old_eip;
+old_eip = c->eip;
+c->eip = c->src.val;
+c->src.val = old_eip;
+emulate_push(ctxt);
+break;
+}
 case 4: /* jmp abs */
 c->eip = c->src.val;
 break;
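The new Group5 /2 case follows the architectural sequence for an absolute near CALL: c->eip has already been advanced past the instruction, so that value is the return address; it is pushed (by swapping it into c->src.val for emulate_push()) and execution resumes at the operand. A toy model of the same sequence, with an array standing in for the guest stack:

#include <stdio.h>

static unsigned long stack[16];
static int sp = 16;                      /* x86 stacks grow downward */

static void push(unsigned long v) { stack[--sp] = v; }

int main(void)
{
	unsigned long eip = 0x1004;      /* already past the CALL */
	unsigned long target = 0x2000;   /* decoded operand */

	push(eip);                       /* save return address */
	eip = target;                    /* transfer control */
	printf("eip=%#lx ret=%#lx\n", eip, stack[sp]);
	return 0;
}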
@@ -1251,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 u64 msr_data;
 unsigned long saved_eip = 0;
 struct decode_cache *c = &ctxt->decode;
+unsigned int port;
+int io_dir_in;
 int rc = 0;
 
 /* Shadow copy of register state. Committed on successful emulation.
@@ -1267,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 if (c->rep_prefix && (c->d & String)) {
 /* All REP prefixes have the same first termination condition */
 if (c->regs[VCPU_REGS_RCX] == 0) {
-ctxt->vcpu->arch.rip = c->eip;
+kvm_rip_write(ctxt->vcpu, c->eip);
 goto done;
 }
 /* The second termination condition only applies for REPE
@@ -1281,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
 (c->b == 0xae) || (c->b == 0xaf)) {
 if ((c->rep_prefix == REPE_PREFIX) &&
 ((ctxt->eflags & EFLG_ZF) == 0)) {
-ctxt->vcpu->arch.rip = c->eip;
+kvm_rip_write(ctxt->vcpu, c->eip);
 goto done;
 }
 if ((c->rep_prefix == REPNE_PREFIX) &&
 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
-ctxt->vcpu->arch.rip = c->eip;
+kvm_rip_write(ctxt->vcpu, c->eip);
 goto done;
 }
 }
 c->regs[VCPU_REGS_RCX]--;
-c->eip = ctxt->vcpu->arch.rip;
+c->eip = kvm_rip_read(ctxt->vcpu);
 }
 
 if (c->src.type == OP_MEM) {
@@ -1351,27 +1392,10 @@ special_insn:
 sbb: /* sbb */
 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
 break;
-case 0x20 ... 0x23:
+case 0x20 ... 0x25:
 and: /* and */
 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
 break;
-case 0x24: /* and al imm8 */
-c->dst.type = OP_REG;
-c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-c->dst.val = *(u8 *)c->dst.ptr;
-c->dst.bytes = 1;
-c->dst.orig_val = c->dst.val;
-goto and;
-case 0x25: /* and ax imm16, or eax imm32 */
-c->dst.type = OP_REG;
-c->dst.bytes = c->op_bytes;
-c->dst.ptr = &c->regs[VCPU_REGS_RAX];
-if (c->op_bytes == 2)
-c->dst.val = *(u16 *)c->dst.ptr;
-else
-c->dst.val = *(u32 *)c->dst.ptr;
-c->dst.orig_val = c->dst.val;
-goto and;
 case 0x28 ... 0x2d:
 sub: /* sub */
 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1659,7 +1683,7 @@ special_insn:
 case 0xae ... 0xaf: /* scas */
 DPRINTF("Urk! I don't handle SCAS.\n");
 goto cannot_emulate;
-case 0xb8: /* mov r, imm */
+case 0xb0 ... 0xbf: /* mov r, imm */
 goto mov;
 case 0xc0 ... 0xc1:
 emulate_grp2(ctxt);
@@ -1679,6 +1703,16 @@ special_insn:
 c->src.val = c->regs[VCPU_REGS_RCX];
 emulate_grp2(ctxt);
 break;
+case 0xe4: /* inb */
+case 0xe5: /* in */
+port = insn_fetch(u8, 1, c->eip);
+io_dir_in = 1;
+goto do_io;
+case 0xe6: /* outb */
+case 0xe7: /* out */
+port = insn_fetch(u8, 1, c->eip);
+io_dir_in = 0;
+goto do_io;
 case 0xe8: /* call (near) */ {
 long int rel;
 switch (c->op_bytes) {
@@ -1729,6 +1763,22 @@ special_insn:
 jmp_rel(c, c->src.val);
 c->dst.type = OP_NONE; /* Disable writeback. */
 break;
+case 0xec: /* in al,dx */
+case 0xed: /* in (e/r)ax,dx */
+port = c->regs[VCPU_REGS_RDX];
+io_dir_in = 1;
+goto do_io;
+case 0xee: /* out al,dx */
+case 0xef: /* out (e/r)ax,dx */
+port = c->regs[VCPU_REGS_RDX];
+io_dir_in = 0;
+do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
+(c->d & ByteOp) ? 1 : c->op_bytes,
+port) != 0) {
+c->eip = saved_eip;
+goto cannot_emulate;
+}
+return 0;
 case 0xf4: /* hlt */
 ctxt->vcpu->arch.halt_request = 1;
 break;
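All eight port-I/O opcodes added above funnel into one do_io label: 0xE4-0xE7 fetch the port from an imm8, 0xEC-0xEF take it from DX; even opcodes are byte-sized, odd ones use op_bytes; and in vs. out is just io_dir_in. A small classifier making those decode rules explicit (illustration only; the emulator expresses them as fall-through cases):

#include <stdio.h>

static void classify(unsigned char op)
{
	int in   = (op == 0xe4 || op == 0xe5 || op == 0xec || op == 0xed);
	int imm8 = (op >= 0xe4 && op <= 0xe7);   /* else port in DX */
	int byte = !(op & 1);                    /* even opcode => 8-bit */

	printf("%#x: %s, port=%s, %s\n", op, in ? "in" : "out",
	       imm8 ? "imm8" : "DX", byte ? "byte" : "word/long");
}

int main(void)
{
	unsigned char ops[] = { 0xe4, 0xe5, 0xe6, 0xe7,
				0xec, 0xed, 0xee, 0xef };
	for (unsigned i = 0; i < sizeof(ops); i++)
		classify(ops[i]);
	return 0;
}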
@@ -1754,6 +1804,14 @@ special_insn:
 ctxt->eflags |= X86_EFLAGS_IF;
 c->dst.type = OP_NONE; /* Disable writeback. */
 break;
+case 0xfc: /* cld */
+ctxt->eflags &= ~EFLG_DF;
+c->dst.type = OP_NONE; /* Disable writeback. */
+break;
+case 0xfd: /* std */
+ctxt->eflags |= EFLG_DF;
+c->dst.type = OP_NONE; /* Disable writeback. */
+break;
 case 0xfe ... 0xff: /* Grp4/Grp5 */
 rc = emulate_grp45(ctxt, ops);
 if (rc != 0)
@@ -1768,7 +1826,7 @@ writeback:
 
 /* Commit shadow register state. */
 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
-ctxt->vcpu->arch.rip = c->eip;
+kvm_rip_write(ctxt->vcpu, c->eip);
 
 done:
 if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1793,7 +1851,7 @@ twobyte_insn:
 goto done;
 
 /* Let the processor re-execute the fixed hypercall */
-c->eip = ctxt->vcpu->arch.rip;
+c->eip = kvm_rip_read(ctxt->vcpu);
 /* Disable writeback. */
 c->dst.type = OP_NONE;
 break;
@@ -1889,7 +1947,7 @@ twobyte_insn:
 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
 if (rc) {
 kvm_inject_gp(ctxt->vcpu, 0);
-c->eip = ctxt->vcpu->arch.rip;
+c->eip = kvm_rip_read(ctxt->vcpu);
 }
 rc = X86EMUL_CONTINUE;
 c->dst.type = OP_NONE;
@@ -1899,7 +1957,7 @@ twobyte_insn:
 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
 if (rc) {
 kvm_inject_gp(ctxt->vcpu, 0);
-c->eip = ctxt->vcpu->arch.rip;
+c->eip = kvm_rip_read(ctxt->vcpu);
 } else {
 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index d9249a882aa..48ee4f9435f 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
 #include <linux/lguest_launcher.h>
 #include <linux/virtio_console.h>
 #include <linux/pm.h>
+#include <asm/apic.h>
 #include <asm/lguest.h>
 #include <asm/paravirt.h>
 #include <asm/param.h>
@@ -581,7 +582,7 @@ static void __init lguest_init_IRQ(void)
 for (i = 0; i < LGUEST_IRQS; i++) {
 int vector = FIRST_EXTERNAL_VECTOR + i;
 if (vector != SYSCALL_VECTOR) {
-set_intr_gate(vector, interrupt[i]);
+set_intr_gate(vector, interrupt[vector]);
 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
 handle_level_irq,
 "level");
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
 * code qualifies for Advanced. It will also never interrupt anything. It
 * does, however, allow us to get through the Linux boot code. */
 #ifdef CONFIG_X86_LOCAL_APIC
-static void lguest_apic_write(unsigned long reg, u32 v)
+static void lguest_apic_write(u32 reg, u32 v)
 {
 }
 
-static u32 lguest_apic_read(unsigned long reg)
+static u32 lguest_apic_read(u32 reg)
 {
 return 0;
 }
+
+static u64 lguest_apic_icr_read(void)
+{
+return 0;
+}
+
+static void lguest_apic_icr_write(u32 low, u32 id)
+{
+/* Warn so we see if there are any stray references */
+WARN_ON(1);
+}
+
+static void lguest_apic_wait_icr_idle(void)
+{
+return;
+}
+
+static u32 lguest_apic_safe_wait_icr_idle(void)
+{
+return 0;
+}
+
+static struct apic_ops lguest_basic_apic_ops = {
+.read = lguest_apic_read,
+.write = lguest_apic_write,
+.icr_read = lguest_apic_icr_read,
+.icr_write = lguest_apic_icr_write,
+.wait_icr_idle = lguest_apic_wait_icr_idle,
+.safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
+};
 #endif
 
 /* STOP! Until an interrupt comes in. */
@@ -990,8 +1021,7 @@ __init void lguest_init(void)
 
 #ifdef CONFIG_X86_LOCAL_APIC
 /* apic read/write intercepts */
-pv_apic_ops.apic_write = lguest_apic_write;
-pv_apic_ops.apic_read = lguest_apic_read;
+apic_ops = &lguest_basic_apic_ops;
 #endif
 
 /* time operations */
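Worth noting about the lguest hunks above: rather than patching two separate pv_apic_ops hooks, the guest now swaps in a whole table of no-op APIC operations with a single pointer assignment. A stripped-down sketch of that ops-table pattern (struct and names are illustrative, not the kernel's apic_ops):

#include <stdio.h>

struct toy_apic_ops {
	unsigned (*read)(unsigned reg);
	void (*write)(unsigned reg, unsigned val);
};

static unsigned noop_read(unsigned reg) { (void)reg; return 0; }
static void noop_write(unsigned reg, unsigned val) { (void)reg; (void)val; }

static const struct toy_apic_ops noop_ops = { noop_read, noop_write };
static const struct toy_apic_ops *apic = &noop_ops;  /* the one swap point */

int main(void)
{
	apic->write(0x300, 0x4500);   /* silently ignored, as in lguest */
	printf("read back %u\n", apic->read(0x300));
	return 0;
}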
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa411942..55e11aa6d66 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
 else
 obj-y += io_64.o iomap_copy_64.o
-
-CFLAGS_csum-partial_64.o := -funroll-loops
-
 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
 lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index 01b868ba82f..321cf720dbb 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info)
 rdmsr(rv->msr_no, rv->l, rv->h);
 }
 
-static void __rdmsr_safe_on_cpu(void *info)
+static void __wrmsr_on_cpu(void *info)
 {
 struct msr_info *rv = info;
 
-rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
+wrmsr(rv->msr_no, rv->l, rv->h);
 }
 
-static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe)
+int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 {
-int err = 0;
+int err;
 struct msr_info rv;
 
 rv.msr_no = msr_no;
-if (safe) {
-err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu,
-&rv, 1);
-err = err ? err : rv.err;
-} else {
-err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
-}
+err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
 *l = rv.l;
 *h = rv.h;
 
 return err;
 }
 
-static void __wrmsr_on_cpu(void *info)
+int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
+{
+int err;
+struct msr_info rv;
+
+rv.msr_no = msr_no;
+rv.l = l;
+rv.h = h;
+err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
+
+return err;
+}
+
+/* These "safe" variants are slower and should be used when the target MSR
+ may not actually exist. */
+static void __rdmsr_safe_on_cpu(void *info)
 {
 struct msr_info *rv = info;
 
-wrmsr(rv->msr_no, rv->l, rv->h);
+rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
 }
 
 static void __wrmsr_safe_on_cpu(void *info)
@@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
 }
 
-static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe)
+int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
 {
-int err = 0;
+int err;
 struct msr_info rv;
 
 rv.msr_no = msr_no;
-rv.l = l;
-rv.h = h;
-if (safe) {
-err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu,
-&rv, 1);
-err = err ? err : rv.err;
-} else {
-err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
-}
-
-return err;
-}
+err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
+*l = rv.l;
+*h = rv.h;
 
-int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
-{
-return _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
+return err ? err : rv.err;
 }
 
-int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-return _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
-}
-
-/* These "safe" variants are slower and should be used when the target MSR
- may not actually exist. */
 int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
 {
-return _wrmsr_on_cpu(cpu, msr_no, l, h, 1);
-}
+int err;
+struct msr_info rv;
 
-int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
-{
-return _rdmsr_on_cpu(cpu, msr_no, l, h, 1);
+rv.msr_no = msr_no;
+rv.l = l;
+rv.h = h;
+err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
+
+return err ? err : rv.err;
 }
 
 EXPORT_SYMBOL(rdmsr_on_cpu);
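The refactor above trades the safe/unsafe multiplexer for four straight-line functions, but the shape is the same throughout: marshal the MSR arguments into a struct, hand it to smp_call_function_single() so the helper runs on the target CPU, then read the results back. A user-space sketch of that shape, with a direct call standing in for the cross-CPU hop (names mimic the kernel's; the rdmsr result is faked):

#include <stdio.h>

struct msr_info { unsigned msr_no, l, h; int err; };

static void __rdmsr_on_cpu(void *info)   /* would execute on the target CPU */
{
	struct msr_info *rv = info;
	rv->l = 0xdead;                  /* pretend rdmsr() output */
	rv->h = 0xbeef;
	rv->err = 0;
}

static int rdmsr_on_cpu(unsigned cpu, unsigned msr_no, unsigned *l, unsigned *h)
{
	struct msr_info rv = { .msr_no = msr_no };

	(void)cpu;
	__rdmsr_on_cpu(&rv);             /* stand-in for the IPI */
	*l = rv.l;
	*h = rv.h;
	return rv.err;
}

int main(void)
{
	unsigned l, h;
	int err = rdmsr_on_cpu(0, 0x1b, &l, &h);  /* 0x1b: IA32_APIC_BASE */

	printf("err=%d l=%#x h=%#x\n", err, l, h);
	return 0;
}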
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094..82004d2bf05 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
22 "testb %%al,%%al\n\t" 22 "testb %%al,%%al\n\t"
23 "jne 1b" 23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2) 24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src), "1" (dest) : "memory"); 25 : "0" (src), "1" (dest) : "memory");
26 return dest; 26 return dest;
27} 27}
28EXPORT_SYMBOL(strcpy); 28EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
42 "stosb\n" 42 "stosb\n"
43 "2:" 43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) 44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src), "1" (dest), "2" (count) : "memory"); 45 : "0" (src), "1" (dest), "2" (count) : "memory");
46 return dest; 46 return dest;
47} 47}
48EXPORT_SYMBOL(strncpy); 48EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
60 "testb %%al,%%al\n\t" 60 "testb %%al,%%al\n\t"
61 "jne 1b" 61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) 62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); 63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
64 return dest; 64 return dest;
65} 65}
66EXPORT_SYMBOL(strcat); 66EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
105 "2:\tsbbl %%eax,%%eax\n\t" 105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n" 106 "orb $1,%%al\n"
107 "3:" 107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1) 108 : "=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs), "2" (ct) 109 : "1" (cs), "2" (ct)
110 :"memory"); 110 : "memory");
111 return res; 111 return res;
112} 112}
113EXPORT_SYMBOL(strcmp); 113EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
130 "3:\tsbbl %%eax,%%eax\n\t" 130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n" 131 "orb $1,%%al\n"
132 "4:" 132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) 133 : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs), "2" (ct), "3" (count) 134 : "1" (cs), "2" (ct), "3" (count)
135 :"memory"); 135 : "memory");
136 return res; 136 return res;
137} 137}
138EXPORT_SYMBOL(strncmp); 138EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
152 "movl $1,%1\n" 152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t" 153 "2:\tmovl %1,%0\n\t"
154 "decl %0" 154 "decl %0"
155 :"=a" (res), "=&S" (d0) 155 : "=a" (res), "=&S" (d0)
156 :"1" (s), "0" (c) 156 : "1" (s), "0" (c)
157 :"memory"); 157 : "memory");
158 return res; 158 return res;
159} 159}
160EXPORT_SYMBOL(strchr); 160EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
169 "scasb\n\t" 169 "scasb\n\t"
170 "notl %0\n\t" 170 "notl %0\n\t"
171 "decl %0" 171 "decl %0"
172 :"=c" (res), "=&D" (d0) 172 : "=c" (res), "=&D" (d0)
173 :"1" (s), "a" (0), "0" (0xffffffffu) 173 : "1" (s), "a" (0), "0" (0xffffffffu)
174 :"memory"); 174 : "memory");
175 return res; 175 return res;
176} 176}
177EXPORT_SYMBOL(strlen); 177EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
189 "je 1f\n\t" 189 "je 1f\n\t"
190 "movl $1,%0\n" 190 "movl $1,%0\n"
191 "1:\tdecl %0" 191 "1:\tdecl %0"
192 :"=D" (res), "=&c" (d0) 192 : "=D" (res), "=&c" (d0)
193 :"a" (c), "0" (cs), "1" (count) 193 : "a" (c), "0" (cs), "1" (count)
194 :"memory"); 194 : "memory");
195 return res; 195 return res;
196} 196}
197EXPORT_SYMBOL(memchr); 197EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
228 "cmpl $-1,%1\n\t" 228 "cmpl $-1,%1\n\t"
229 "jne 1b\n" 229 "jne 1b\n"
230 "3:\tsubl %2,%0" 230 "3:\tsubl %2,%0"
231 :"=a" (res), "=&d" (d0) 231 : "=a" (res), "=&d" (d0)
232 :"c" (s), "1" (count) 232 : "c" (s), "1" (count)
233 :"memory"); 233 : "memory");
234 return res; 234 return res;
235} 235}
236EXPORT_SYMBOL(strnlen); 236EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f..8e2d55f754b 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
23 "jne 1b\n\t" 23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t" 24 "xorl %%eax,%%eax\n\t"
25 "2:" 25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1) 26 : "=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) 27 : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di"); 28 : "dx", "di");
29return __res; 29return __res;
30} 30}
31 31
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index fab5faba1d3..4a20b2f9a38 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
 #include <asm/uaccess.h>
 #include <asm/mmx.h>
 
+#ifdef CONFIG_X86_INTEL_USERCOPY
+/*
+ * Alignment at which movsl is preferred for bulk memory copies.
+ */
+struct movsl_mask movsl_mask __read_mostly;
+#endif
+
 static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
 {
 #ifdef CONFIG_X86_INTEL_USERCOPY
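The movsl_mask variable defined above feeds __movsl_is_ok() just below it: on the Intel parts this cares about, rep movsl only pays off for large copies whose source and destination share the same alignment. A hedged re-creation of that veto (mask value and size threshold are illustrative; the kernel consults movsl_mask.mask):

#include <stdio.h>

static unsigned long movsl_mask = 7;     /* illustrative alignment mask */

static int movsl_is_ok(unsigned long src, unsigned long dst, unsigned long n)
{
	if (n >= 64 && ((src ^ dst) & movsl_mask))
		return 0;                /* skewed alignment: skip movsl */
	return 1;
}

int main(void)
{
	printf("%d\n", movsl_is_ok(0x1000, 0x2000, 256));  /* 1: aligned alike */
	printf("%d\n", movsl_is_ok(0x1001, 0x2000, 256));  /* 0: skewed by 1 */
	return 0;
}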
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 3d317836be9..37b9ae4d44c 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,13 +10,15 @@
 #include <asm/e820.h>
 #include <asm/setup.h>
 
+#include <mach_ipi.h>
+
 #ifdef CONFIG_HOTPLUG_CPU
 #define DEFAULT_SEND_IPI (1)
 #else
 #define DEFAULT_SEND_IPI (0)
 #endif
 
-int no_broadcast=DEFAULT_SEND_IPI;
+int no_broadcast = DEFAULT_SEND_IPI;
 
 /**
 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
@@ -36,15 +38,6 @@ void __init pre_intr_init_hook(void)
 init_ISA_irqs();
 }
 
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-.handler = no_action,
-.mask = CPU_MASK_NONE,
-.name = "cascade",
-};
-
 /**
 * intr_init_hook - post gate setup interrupt initialisation
 *
@@ -60,12 +53,6 @@ void __init intr_init_hook(void)
 if (x86_quirks->arch_intr_init())
 return;
 }
-#ifdef CONFIG_X86_LOCAL_APIC
-apic_intr_init();
-#endif
-
-if (!acpi_ioapic)
-setup_irq(2, &irq2);
 }
 
 /**
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43b62f..00000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
-#
-# Makefile for the linux kernel.
-#
-
-obj-$(CONFIG_X86_ES7000) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa..00000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
-/*
- * Written by: Garry Forsgren, Unisys Corporation
- * Natalie Protasevich, Unisys Corporation
- * This file contains the code to configure and interface
- * with Unisys ES7000 series hardware system manager.
- *
- * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of version 2 of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write the Free Software Foundation, Inc., 59
- * Temple Place - Suite 330, Boston MA 02111-1307, USA.
- *
- * Contact information: Unisys Corporation, Township Line & Union Meeting
- * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
- *
- * http://www.unisys.com
- */
-
-/*
- * ES7000 chipsets
- */
-
-#define NON_UNISYS 0
-#define ES7000_CLASSIC 1
-#define ES7000_ZORRO 2
-
-
-#define MIP_REG 1
-#define MIP_PSAI_REG 4
-
-#define MIP_BUSY 1
-#define MIP_SPIN 0xf0000
-#define MIP_VALID 0x0100000000000000ULL
-#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
-
-#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
-
-struct mip_reg_info {
-unsigned long long mip_info;
-unsigned long long delivery_info;
-unsigned long long host_reg;
-unsigned long long mip_reg;
-};
-
-struct part_info {
-unsigned char type;
-unsigned char length;
-unsigned char part_id;
-unsigned char apic_mode;
-unsigned long snum;
-char ptype[16];
-char sname[64];
-char pname[64];
-};
-
-struct psai {
-unsigned long long entry_type;
-unsigned long long addr;
-unsigned long long bep_addr;
-};
-
-struct es7000_mem_info {
-unsigned char type;
-unsigned char length;
-unsigned char resv[6];
-unsigned long long start;
-unsigned long long size;
-};
-
-struct es7000_oem_table {
-unsigned long long hdr;
-struct mip_reg_info mip;
-struct part_info pif;
-struct es7000_mem_info shm;
-struct psai psai;
-};
-
-#ifdef CONFIG_ACPI
-
-struct oem_table {
-struct acpi_table_header Header;
-u32 OEMTableAddr;
-u32 OEMTableSize;
-};
-
-extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
-#endif
-
-struct mip_reg {
-unsigned long long off_0;
-unsigned long long off_8;
-unsigned long long off_10;
-unsigned long long off_18;
-unsigned long long off_20;
-unsigned long long off_28;
-unsigned long long off_30;
-unsigned long long off_38;
-};
-
-#define MIP_SW_APIC 0x1020b
-#define MIP_FUNC(VALUE) (VALUE & 0xff)
-
-extern int parse_unisys_oem (char *oemptr);
-extern void setup_unisys(void);
-extern int es7000_start_cpu(int cpu, unsigned long eip);
-extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd7803a1d..6730f4e7c74 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
 obj-$(CONFIG_X86_SUMMIT) += summit.o
 obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
 obj-$(CONFIG_X86_ES7000) += es7000.o
-obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d77171455..3c3b471ea49 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
-#include <linux/smp.h>
 #include <linux/init.h>
 #include <linux/dmi.h>
-#include <asm/mach-bigsmp/mach_apic.h>
-#include <asm/mach-bigsmp/mach_apicdef.h>
-#include <asm/mach-bigsmp/mach_ipi.h>
+#include <asm/bigsmp/apicdef.h>
+#include <linux/smp.h>
+#include <asm/bigsmp/apic.h>
+#include <asm/bigsmp/ipi.h>
 #include <asm/mach-default/mach_mpparse.h>
 
 static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -42,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
 { }
 };
 
+static cpumask_t vector_allocation_domain(int cpu)
+{
+return cpumask_of_cpu(cpu);
+}
 
 static int probe_bigsmp(void)
 {
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c..28459cab3dd 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/smp.h>
 #include <linux/init.h>
-#include <asm/mach-es7000/mach_apicdef.h>
-#include <asm/mach-es7000/mach_apic.h>
-#include <asm/mach-es7000/mach_ipi.h>
-#include <asm/mach-es7000/mach_mpparse.h>
-#include <asm/mach-es7000/mach_wakecpu.h>
+#include <asm/es7000/apicdef.h>
+#include <linux/smp.h>
+#include <asm/es7000/apic.h>
+#include <asm/es7000/ipi.h>
+#include <asm/es7000/mpparse.h>
+#include <asm/es7000/wakecpu.h>
 
 static int probe_es7000(void)
 {
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
 /* Hook from generic ACPI tables.c */
 static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
-unsigned long oem_addr;
+unsigned long oem_addr = 0;
+int check_dsdt;
+int ret = 0;
+
+/* check dsdt at first to avoid clear fix_map for oem_addr */
+check_dsdt = es7000_check_dsdt();
+
 if (!find_unisys_acpi_oem_table(&oem_addr)) {
-if (es7000_check_dsdt())
-return parse_unisys_oem((char *)oem_addr);
+if (check_dsdt)
+ret = parse_unisys_oem((char *)oem_addr);
 else {
 setup_unisys();
-return 1;
+ret = 1;
 }
+/*
+ * we need to unmap it
+ */
+unmap_unisys_acpi_oem_table(oem_addr);
 }
-return 0;
+return ret;
 }
 #else
 static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -66,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 }
 #endif
 
+static cpumask_t vector_allocation_domain(int cpu)
+{
+/* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+return domain;
+}
+
 struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
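The vector_allocation_domain() comment above is the heart of the es7000/numaq/summit additions: in lowest-priority delivery mode some hyperthreaded CPUs misroute interrupts when only one sibling is named, so these subarchitectures hand back every CPU and let the APIC arbitrate, while bigsmp (earlier in this series) pins each vector to exactly one CPU. A toy contrast of the two policies on an 8-bit cpumask:

#include <stdio.h>

typedef unsigned char cpumask8;          /* toy stand-in for cpumask_t */

static cpumask8 domain_single(int cpu) { return (cpumask8)(1u << cpu); }
static cpumask8 domain_all(int cpu)    { (void)cpu; return 0xff; }

int main(void)
{
	printf("bigsmp cpu2: %#x\n", domain_single(2));  /* 0x04 */
	printf("es7000 cpu2: %#x\n", domain_all(2));     /* 0xff */
	return 0;
}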
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68764c..71a309b122e 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <linux/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
@@ -12,11 +11,12 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/init.h>
-#include <asm/mach-numaq/mach_apic.h>
-#include <asm/mach-numaq/mach_apicdef.h>
-#include <asm/mach-numaq/mach_ipi.h>
-#include <asm/mach-numaq/mach_mpparse.h>
-#include <asm/mach-numaq/mach_wakecpu.h>
+#include <asm/numaq/apicdef.h>
+#include <linux/smp.h>
+#include <asm/numaq/apic.h>
+#include <asm/numaq/ipi.h>
+#include <asm/numaq/mpparse.h>
+#include <asm/numaq/wakecpu.h>
 #include <asm/numaq.h>
 
 static int mps_oem_check(struct mp_config_table *mpc, char *oem,
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 return 0;
 }
 
+static cpumask_t vector_allocation_domain(int cpu)
+{
+/* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+return domain;
+}
+
 struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1..6272b5e69da 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
 #define APIC_DEFINITION 1
 #include <linux/threads.h>
 #include <linux/cpumask.h>
-#include <asm/smp.h>
 #include <asm/mpspec.h>
 #include <asm/genapic.h>
 #include <asm/fixmap.h>
 #include <asm/apicdef.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/smp.h>
 #include <linux/init.h>
-#include <asm/mach-summit/mach_apic.h>
-#include <asm/mach-summit/mach_apicdef.h>
-#include <asm/mach-summit/mach_ipi.h>
-#include <asm/mach-summit/mach_mpparse.h>
+#include <asm/summit/apicdef.h>
+#include <linux/smp.h>
+#include <asm/summit/apic.h>
+#include <asm/summit/ipi.h>
+#include <asm/summit/mpparse.h>
 
 static int probe_summit(void)
 {
@@ -24,4 +23,18 @@ static int probe_summit(void)
 return 0;
 }
 
+static cpumask_t vector_allocation_domain(int cpu)
+{
+/* Careful. Some cpus do not strictly honor the set of cpus
+ * specified in the interrupt destination when using lowest
+ * priority interrupt delivery mode.
+ *
+ * In particular there was a hyperthreading cpu observed to
+ * deliver interrupts to the wrong hyperthread when only one
+ * hyperthread was specified in the interrupt destination.
+ */
+cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
+return domain;
+}
+
 struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba09215..0f6e8a6523a 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
 
 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
 
+notify_cpu_starting(cpuid);
+
 /* enable interrupts */
 local_irq_enable();
 
@@ -1481,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq)
 * the interrupt off to another CPU */
 static void before_handle_vic_irq(unsigned int irq)
 {
-irq_desc_t *desc = irq_desc + irq;
+irq_desc_t *desc = irq_to_desc(irq);
 __u8 cpu = smp_processor_id();
 
 _raw_spin_lock(&vic_irq_lock);
@@ -1516,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq)
 /* Finish the VIC interrupt: basically mask */
 static void after_handle_vic_irq(unsigned int irq)
 {
-irq_desc_t *desc = irq_desc + irq;
+irq_desc_t *desc = irq_to_desc(irq);
 
 _raw_spin_lock(&vic_irq_lock);
 {
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index dfb932dcf13..59f89b434b4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
 mmiotrace-y := pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
 
-ifeq ($(CONFIG_X86_32),y)
-obj-$(CONFIG_NUMA) += discontig_32.o
-else
-obj-$(CONFIG_NUMA) += numa_64.o
+obj-$(CONFIG_NUMA) += numa_$(BITS).o
 obj-$(CONFIG_K8_NUMA) += k8topology_64.o
-endif
 obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
 
 obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index a20d1fa64b4..e7277cbcfb4 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
 * we have now. "break" is either changing perms, levels or
 * address space marker.
 */
-prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK);
-cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK);
+prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
+cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
 
 if (!st->level) {
 /* First entry */
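The dump_pagetables change is purely notational: PTE_FLAGS_MASK is the complement of PTE_PFN_MASK, so "& PTE_FLAGS_MASK" and "& ~(PTE_PFN_MASK)" select the same attribute bits. A tiny demonstration on an illustrative 32-bit non-PAE layout (frame in bits 12-31, flags in bits 0-11):

#include <stdint.h>
#include <stdio.h>

#define PTE_PFN_MASK   ((uint32_t)0xfffff000u)
#define PTE_FLAGS_MASK (~PTE_PFN_MASK)

int main(void)
{
	uint32_t pte = 0x12345067;  /* frame 0x12345, flags 0x067 */

	printf("pfn bits  = %#x\n", pte & PTE_PFN_MASK);    /* 0x12345000 */
	printf("flag bits = %#x\n", pte & PTE_FLAGS_MASK);  /* 0x67 */
	return 0;
}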
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b4..31e8730fa24 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
 #include <asm/tlbflush.h>
 #include <asm/proto.h>
 #include <asm-generic/sections.h>
+#include <asm/traps.h>
 
 /*
 * Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
 return 0;
 }
 
-void do_invalid_op(struct pt_regs *, unsigned long);
-
 static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_F00F_BUG
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 unsigned long flags;
 #endif
 
-/*
- * We can fault from pretty much anywhere, with unknown IRQ state.
- */
-trace_hardirqs_fixup();
-
 tsk = current;
 mm = tsk->mm;
 prefetchw(&mm->mmap_sem);
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 }
 
 
-#ifdef CONFIG_X86_32
-/* It's safe to allow irq's after cr2 has been saved and the vmalloc
- fault has been handled. */
-if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
-local_irq_enable();
-
 /*
- * If we're in an interrupt, have no user context or are running in an
- * atomic region then we must not take the fault.
+ * It's safe to allow irq's after cr2 has been saved and the
+ * vmalloc fault has been handled.
+ *
+ * User-mode registers count as a user access even for any
+ * potential system fault or CPU buglet.
 */
-if (in_atomic() || !mm)
-goto bad_area_nosemaphore;
-#else /* CONFIG_X86_64 */
-if (likely(regs->flags & X86_EFLAGS_IF))
+if (user_mode_vm(regs)) {
 local_irq_enable();
+error_code |= PF_USER;
+} else if (regs->flags & X86_EFLAGS_IF)
+local_irq_enable();
 
+#ifdef CONFIG_X86_64
 if (unlikely(error_code & PF_RSVD))
 pgtable_bad(address, regs, error_code);
+#endif
 
 /*
 * If we're in an interrupt, have no user context or are running in an
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 if (unlikely(in_atomic() || !mm))
 goto bad_area_nosemaphore;
 
-/*
- * User-mode registers count as a user access even for any
- * potential system fault or CPU buglet.
- */
-if (user_mode_vm(regs))
-error_code |= PF_USER;
 again:
-#endif
-/* When running in the kernel we expect faults to occur only to
+/*
+ * When running in the kernel we expect faults to occur only to
 * addresses in user space. All other faults represent errors in the
 * kernel and should generate an OOPS. Unfortunately, in the case of an
 * erroneous fault occurring in a code path which already holds mmap_sem
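For orientation in the fault-path rewrite above: the hardware hands do_page_fault() a small bitmask, and the unified code now ORs in PF_USER itself whenever the saved registers show user mode, instead of keeping separate 32/64-bit branches. The bits involved are fixed by the x86 architecture:

#include <stdio.h>

#define PF_PROT  (1 << 0)  /* fault on a present page */
#define PF_WRITE (1 << 1)  /* write access */
#define PF_USER  (1 << 2)  /* fault while in user mode */
#define PF_RSVD  (1 << 3)  /* reserved bit set in a paging entry */

int main(void)
{
	unsigned long error_code = PF_WRITE;  /* e.g. write to missing page */
	int from_user = 1;                    /* pretend user_mode_vm(regs) */

	if (from_user)
		error_code |= PF_USER;        /* as the rewritten path does */
	printf("error_code=%#lx\n", error_code);
	return 0;
}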
@@ -743,9 +730,6 @@ good_area:
 goto bad_area;
 }
 
-#ifdef CONFIG_X86_32
-survive:
-#endif
 /*
 * If for any reason at all we couldn't handle the fault,
 * make sure we exit gracefully rather than endlessly redo
@@ -880,12 +864,11 @@ out_of_memory:
 up_read(&mm->mmap_sem);
 if (is_global_init(tsk)) {
 yield();
-#ifdef CONFIG_X86_32
-down_read(&mm->mmap_sem);
-goto survive;
-#else
+/*
+ * Re-lookup the vma - in theory the vma tree might
+ * have changed:
+ */
 goto again;
-#endif
 }
 
 printk("VM: killing process %s\n", tsk->comm);
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list);
 
 void vmalloc_sync_all(void)
 {
-#ifdef CONFIG_X86_32
-unsigned long start = VMALLOC_START & PGDIR_MASK;
 unsigned long address;
 
+#ifdef CONFIG_X86_32
 if (SHARED_KERNEL_PMD)
 return;
 
-BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK);
-for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) {
+for (address = VMALLOC_START & PMD_MASK;
+address >= TASK_SIZE && address < FIXADDR_TOP;
+address += PMD_SIZE) {
 unsigned long flags;
 struct page *page;
 
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void)
 spin_unlock_irqrestore(&pgd_lock, flags);
 }
 #else /* CONFIG_X86_64 */
-unsigned long start = VMALLOC_START & PGDIR_MASK;
-unsigned long address;
-
-for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
+for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
+address += PGDIR_SIZE) {
 const pgd_t *pgd_ref = pgd_offset_k(address);
 unsigned long flags;
 struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 007bb06c750..4ba373c5b8c 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 pte_t pte = gup_get_pte(ptep);
 struct page *page;
 
-if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) {
+if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
 pte_unmap(ptep);
 return 0;
 }
@@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
 mask = _PAGE_PRESENT|_PAGE_USER;
 if (write)
 mask |= _PAGE_RW;
-if ((pte_val(pte) & mask) != mask)
+if ((pte_flags(pte) & mask) != mask)
 return 0;
 /* hugepages are never "special" */
-VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
 refs = 0;
@@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
 mask = _PAGE_PRESENT|_PAGE_USER;
 if (write)
 mask |= _PAGE_RW;
-if ((pte_val(pte) & mask) != mask)
+if ((pte_flags(pte) & mask) != mask)
 return 0;
 /* hugepages are never "special" */
-VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL);
+VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
 
 refs = 0;
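The gup.c hunks all make one substitution: permission checks should look only at a PTE's attribute bits, so pte_flags() (pte_val() with the page-frame bits masked off) replaces raw pte_val(). A toy 64-bit layout showing why the test is then unaffected by whatever frame number the PTE carries:

#include <stdint.h>
#include <stdio.h>

#define _PAGE_PRESENT (1ull << 0)
#define _PAGE_RW      (1ull << 1)
#define _PAGE_USER    (1ull << 2)
#define PTE_PFN_MASK  0x000ffffffffff000ull   /* illustrative frame bits */

static uint64_t pte_flags(uint64_t pte) { return pte & ~PTE_PFN_MASK; }

int main(void)
{
	uint64_t mask = _PAGE_PRESENT | _PAGE_USER | _PAGE_RW;
	uint64_t pte = 0x123456000ull | mask;     /* writable user page */

	printf("ok=%d\n", (pte_flags(pte) & mask) == mask);  /* ok=1 */
	return 0;
}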
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9a..bcc079c282d 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 
 return (void*) vaddr;
 }
+EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
 
 struct page *kmap_atomic_to_page(void *ptr)
 {
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 60ec1d08ff2..8396868e82c 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
 #include <linux/cpumask.h>
 
 #include <asm/asm.h>
+#include <asm/bios_ebda.h>
 #include <asm/processor.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -47,6 +48,7 @@
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/smp.h>
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 pgd_t *pgd;
 pmd_t *pmd;
 pte_t *pte;
-unsigned pages_2m = 0, pages_4k = 0;
+unsigned pages_2m, pages_4k;
+int mapping_iter;
+
+/*
+ * First iteration will setup identity mapping using large/small pages
+ * based on use_pse, with other attributes same as set by
+ * the early code in head_32.S
+ *
+ * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
+ * as desired for the kernel identity mapping.
+ *
+ * This two pass mechanism conforms to the TLB app note which says:
+ *
+ * "Software should not write to a paging-structure entry in a way
+ * that would change, for any linear address, both the page size
+ * and either the page frame or attributes."
+ */
+mapping_iter = 1;
 
 if (!cpu_has_pse)
 use_pse = 0;
 
+repeat:
+pages_2m = pages_4k = 0;
 pfn = start_pfn;
 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
 pgd = pgd_base + pgd_idx;
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 if (use_pse) {
 unsigned int addr2;
 pgprot_t prot = PAGE_KERNEL_LARGE;
+/*
+ * first pass will use the same initial
+ * identity mapping attribute + _PAGE_PSE.
+ */
+pgprot_t init_prot =
+__pgprot(PTE_IDENT_ATTR |
+_PAGE_PSE);
 
 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
 PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
 prot = PAGE_KERNEL_LARGE_EXEC;
 
 pages_2m++;
-set_pmd(pmd, pfn_pmd(pfn, prot));
+if (mapping_iter == 1)
+set_pmd(pmd, pfn_pmd(pfn, init_prot));
+else
+set_pmd(pmd, pfn_pmd(pfn, prot));
 
 pfn += PTRS_PER_PTE;
 continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; 276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
247 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
248 284
249 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
250 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
251 287
252 pages_4k++; 288 pages_4k++;
253 set_pte(pte, pfn_pte(pfn, prot)); 289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
254 } 293 }
255 } 294 }
256 } 295 }
257 update_page_count(PG_LEVEL_2M, pages_2m); 296 if (mapping_iter == 1) {
258 update_page_count(PG_LEVEL_4K, pages_4k); 297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
303
304 /*
305 * Flush the TLB on this CPU (including global entries) to get rid
306 * of the previous mappings in both the small- and large-page TLBs.
307 */
308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
259} 316}
260 317
261/* 318/*
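
The two-pass rework above is easiest to see in isolation. Below is a minimal, self-contained C sketch (not kernel code) of the control flow: pass one installs every entry with the boot-time attributes, a full flush follows, and pass two rewrites only the attribute bits, so no single write changes both the page size and the frame/attributes. The entry layout, constants, and flush stub are all stand-ins.

/*
 * Sketch only: the two-pass pattern used by the hunk above,
 * modeled in plain C with made-up constants.
 */
#include <stdint.h>
#include <stdio.h>

#define NR_ENTRIES   8
#define INIT_ATTR    0x003u   /* stand-in for PTE_IDENT_ATTR */
#define FINAL_ATTR   0x163u   /* stand-in for the final PAGE_KERNEL bits */

static uint64_t pgtable[NR_ENTRIES];

static void flush_tlb_all_stub(void)
{
	/* in the kernel this is __flush_tlb_all() */
	puts("flush: drop all cached translations");
}

int main(void)
{
	int pass = 1;
	unsigned long frame;

repeat:
	for (frame = 0; frame < NR_ENTRIES; frame++) {
		uint64_t attr = (pass == 1) ? INIT_ATTR : FINAL_ATTR;

		/* the page frame never changes between the passes */
		pgtable[frame] = (frame << 12) | attr;
	}

	if (pass == 1) {
		/* flush before switching attributes, per the app note */
		flush_tlb_all_stub();
		pass = 2;
		goto repeat;
	}

	for (frame = 0; frame < NR_ENTRIES; frame++)
		printf("entry %lu: %#llx\n", frame,
		       (unsigned long long)pgtable[frame]);
	return 0;
}
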
@@ -501,7 +558,7 @@ void zap_low_mappings(void)
501 558
502int nx_enabled; 559int nx_enabled;
503 560
504pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
505EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
506 563
507#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -718,7 +775,7 @@ void __init setup_bootmem_allocator(void)
718 after_init_bootmem = 1; 775 after_init_bootmem = 1;
719} 776}
720 777
721static void __init find_early_table_space(unsigned long end) 778static void __init find_early_table_space(unsigned long end, int use_pse)
722{ 779{
723 unsigned long puds, pmds, ptes, tables, start; 780 unsigned long puds, pmds, ptes, tables, start;
724 781
@@ -728,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
728 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
729 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
730 787
731 if (cpu_has_pse) { 788 if (use_pse) {
732 unsigned long extra; 789 unsigned long extra;
733 790
734 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
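
find_early_table_space() now sizes the early pagetable reservation from the caller's use_pse decision rather than raw cpu_has_pse. The arithmetic is simple: with PSE, only the tail of the range that is not large-page aligned needs 4 KB PTEs. A compilable sketch, with constants mirroring 32-bit non-PAE (the kernel's PMD_SHIFT differs under PAE):

/* Sketch only: the PTE-count estimate behind find_early_table_space(). */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PMD_SHIFT  22          /* 4 MB large pages on 32-bit non-PAE */

static unsigned long ptes_needed(unsigned long end, int use_pse)
{
	unsigned long extra;

	if (use_pse) {
		/* only the tail that isn't 4 MB aligned needs 4 KB PTEs */
		extra = end - ((end >> PMD_SHIFT) << PMD_SHIFT);
		return (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
	}
	/* otherwise every page of the range needs a PTE */
	return (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
}

int main(void)
{
	unsigned long end = (512UL << 20) + (3UL << 20); /* 515 MB */

	printf("PSE:    %lu ptes\n", ptes_needed(end, 1));
	printf("no PSE: %lu ptes\n", ptes_needed(end, 0));
	return 0;
}
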
@@ -768,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
768 pgd_t *pgd_base = swapper_pg_dir; 825 pgd_t *pgd_base = swapper_pg_dir;
769 unsigned long start_pfn, end_pfn; 826 unsigned long start_pfn, end_pfn;
770 unsigned long big_page_start; 827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This simplifies cpa(), which otherwise needs to support splitting
832 * large pages into small ones in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
771 838
772 /* 839 /*
773 * Find space for the kernel direct mapping tables. 840 * Find space for the kernel direct mapping tables.
774 */ 841 */
775 if (!after_init_bootmem) 842 if (!after_init_bootmem)
776 find_early_table_space(end); 843 find_early_table_space(end, use_pse);
777 844
778#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
779 set_nx(); 846 set_nx();
@@ -819,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
819 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
820 if (start_pfn < end_pfn) 887 if (start_pfn < end_pfn)
821 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
822 cpu_has_pse); 889 use_pse);
823 890
824 /* tail is not big page alignment ? */ 891 /* tail is not big page alignment ? */
825 start_pfn = end_pfn; 892 start_pfn = end_pfn;
@@ -903,6 +970,8 @@ void __init mem_init(void)
903 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
904 int tmp; 971 int tmp;
905 972
973 start_periodic_check_for_corruption();
974
906#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
907 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
908#endif 977#endif
@@ -982,7 +1051,6 @@ void __init mem_init(void)
982 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
983 test_wp_bit(); 1052 test_wp_bit();
984 1053
985 cpa_init();
986 save_pg_dir(); 1054 save_pg_dir();
987 zap_low_mappings(); 1055 zap_low_mappings();
988} 1056}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index d3746efb060..b8e461d4941 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32 32
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -88,6 +89,62 @@ early_param("gbpages", parse_direct_gbpages_on);
88 89
89int after_bootmem; 90int after_bootmem;
90 91
92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
93EXPORT_SYMBOL_GPL(__supported_pte_mask);
94
95static int do_not_nx __cpuinitdata;
96
97/*
98 * noexec=on|off
99 * Control non-executable mappings for 64-bit processes.
100 *
101 * on Enable (default)
102 * off Disable
103 */
104static int __init nonx_setup(char *str)
105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
114 }
115 return 0;
116}
117early_param("noexec", nonx_setup);
118
119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
127
128int force_personality32;
129
130/*
131 * noexec32=on|off
132 * Control the non-executable heap for 32-bit processes.
133 * To control the stack too, use noexec=off.
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
147
91/* 148/*
92 * NOTE: This function is marked __ref because it calls __init function 149 * NOTE: This function is marked __ref because it calls __init function
93 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
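
nonx_setup() and nonx32_setup() above follow the usual early_param pattern: parse an "on"/"off" string and flip a global mask. A standalone sketch of that pattern (the mask and bit value are stand-ins, not the kernel's):

/* Sketch only: the "on|off" boot-parameter pattern of nonx_setup(). */
#include <stdio.h>
#include <string.h>

#define FAKE_PAGE_NX (1ULL << 63)

static unsigned long long supported_pte_mask = ~0ULL;

static int nonx_setup_sketch(const char *str)
{
	if (!str)
		return -1;
	if (!strncmp(str, "on", 2))
		supported_pte_mask |= FAKE_PAGE_NX;
	else if (!strncmp(str, "off", 3))
		supported_pte_mask &= ~FAKE_PAGE_NX;
	return 0;
}

int main(void)
{
	nonx_setup_sketch("off");
	printf("NX %s\n",
	       (supported_pte_mask & FAKE_PAGE_NX) ? "enabled" : "masked off");
	return 0;
}
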
@@ -139,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
139 } 196 }
140 197
141 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
142 if (!pte_none(*pte) && pte_val(new_pte) &&
143 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
144 pte_ERROR(*pte);
145 set_pte(pte, new_pte); 199 set_pte(pte, new_pte);
146 200
147 /* 201 /*
@@ -225,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
225void __init cleanup_highmap(void) 279void __init cleanup_highmap(void)
226{ 280{
227 unsigned long vaddr = __START_KERNEL_map; 281 unsigned long vaddr = __START_KERNEL_map;
228 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 282 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
229 pmd_t *pmd = level2_kernel_pgt; 283 pmd_t *pmd = level2_kernel_pgt;
230 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 284 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
231 285
@@ -256,7 +310,7 @@ static __ref void *alloc_low_page(unsigned long *phys)
256 if (pfn >= table_top) 310 if (pfn >= table_top)
257 panic("alloc_low_page: ran out of memory"); 311 panic("alloc_low_page: ran out of memory");
258 312
259 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
260 memset(adr, 0, PAGE_SIZE); 314 memset(adr, 0, PAGE_SIZE);
261 *phys = pfn * PAGE_SIZE; 315 *phys = pfn * PAGE_SIZE;
262 return adr; 316 return adr;
@@ -271,7 +325,8 @@ static __ref void unmap_low_page(void *adr)
271} 325}
272 326
273static unsigned long __meminit 327static unsigned long __meminit
274phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 328phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
329 pgprot_t prot)
275{ 330{
276 unsigned pages = 0; 331 unsigned pages = 0;
277 unsigned long last_map_addr = end; 332 unsigned long last_map_addr = end;
@@ -289,36 +344,43 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
289 break; 344 break;
290 } 345 }
291 346
347 /*
348 * We will reuse the existing mapping.
349 * Xen, for example, has some special requirements, like mapping
350 * pagetable pages as RO. So assume that whoever pre-set up
351 * these mappings knew what they were doing.
352 */
292 if (pte_val(*pte)) 353 if (pte_val(*pte))
293 continue; 354 continue;
294 355
295 if (0) 356 if (0)
296 printk(" pte=%p addr=%lx pte=%016lx\n", 357 printk(" pte=%p addr=%lx pte=%016lx\n",
297 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 358 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
298 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
299 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
300 pages++; 359 pages++;
360 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
361 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
301 } 362 }
363
302 update_page_count(PG_LEVEL_4K, pages); 364 update_page_count(PG_LEVEL_4K, pages);
303 365
304 return last_map_addr; 366 return last_map_addr;
305} 367}
306 368
307static unsigned long __meminit 369static unsigned long __meminit
308phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 370phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
371 pgprot_t prot)
309{ 372{
310 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 373 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
311 374
312 return phys_pte_init(pte, address, end); 375 return phys_pte_init(pte, address, end, prot);
313} 376}
314 377
315static unsigned long __meminit 378static unsigned long __meminit
316phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 379phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
317 unsigned long page_size_mask) 380 unsigned long page_size_mask, pgprot_t prot)
318{ 381{
319 unsigned long pages = 0; 382 unsigned long pages = 0;
320 unsigned long last_map_addr = end; 383 unsigned long last_map_addr = end;
321 unsigned long start = address;
322 384
323 int i = pmd_index(address); 385 int i = pmd_index(address);
324 386
@@ -326,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
326 unsigned long pte_phys; 388 unsigned long pte_phys;
327 pmd_t *pmd = pmd_page + pmd_index(address); 389 pmd_t *pmd = pmd_page + pmd_index(address);
328 pte_t *pte; 390 pte_t *pte;
391 pgprot_t new_prot = prot;
329 392
330 if (address >= end) { 393 if (address >= end) {
331 if (!after_bootmem) { 394 if (!after_bootmem) {
@@ -339,27 +402,40 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
339 if (!pmd_large(*pmd)) { 402 if (!pmd_large(*pmd)) {
340 spin_lock(&init_mm.page_table_lock); 403 spin_lock(&init_mm.page_table_lock);
341 last_map_addr = phys_pte_update(pmd, address, 404 last_map_addr = phys_pte_update(pmd, address,
342 end); 405 end, prot);
343 spin_unlock(&init_mm.page_table_lock); 406 spin_unlock(&init_mm.page_table_lock);
407 continue;
344 } 408 }
345 /* Count entries we're using from level2_ident_pgt */ 409 /*
346 if (start == 0) 410 * If we are ok with PG_LEVEL_2M mapping, then we will
347 pages++; 411 * use the existing mapping.
348 continue; 412 *
413 * Otherwise, we will split the large page mapping but
414 * use the same existing protection bits except for
415 * the large-page bit, so that we don't violate Intel's TLB
416 * Application note (317080), which says that while changing
417 * the page sizes, new and old translations should
418 * not differ with respect to page frame and
419 * attributes.
420 */
421 if (page_size_mask & (1 << PG_LEVEL_2M))
422 continue;
423 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
349 } 424 }
350 425
351 if (page_size_mask & (1<<PG_LEVEL_2M)) { 426 if (page_size_mask & (1<<PG_LEVEL_2M)) {
352 pages++; 427 pages++;
353 spin_lock(&init_mm.page_table_lock); 428 spin_lock(&init_mm.page_table_lock);
354 set_pte((pte_t *)pmd, 429 set_pte((pte_t *)pmd,
355 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 430 pfn_pte(address >> PAGE_SHIFT,
431 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
356 spin_unlock(&init_mm.page_table_lock); 432 spin_unlock(&init_mm.page_table_lock);
357 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 433 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
358 continue; 434 continue;
359 } 435 }
360 436
361 pte = alloc_low_page(&pte_phys); 437 pte = alloc_low_page(&pte_phys);
362 last_map_addr = phys_pte_init(pte, address, end); 438 last_map_addr = phys_pte_init(pte, address, end, new_prot);
363 unmap_low_page(pte); 439 unmap_low_page(pte);
364 440
365 spin_lock(&init_mm.page_table_lock); 441 spin_lock(&init_mm.page_table_lock);
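
The key move in the hunk above is deriving new_prot from the live large-page entry with pte_pgprot(pte_clrhuge(*pmd)), so the 4K children inherit every attribute of the parent except the large-page bit. A sketch of that bit manipulation with made-up bit positions:

/* Sketch only: child protections from an existing large-page entry. */
#include <stdint.h>
#include <stdio.h>

#define FAKE_PSE    (1u << 7)   /* "this is a large page" */
#define FAKE_NX     (1u << 15)
#define FAKE_GLOBAL (1u << 8)
#define ATTR_MASK   0xffffu     /* low bits = attributes, rest = frame */

static uint32_t clrhuge(uint32_t entry)
{
	return entry & ~FAKE_PSE;
}

int main(void)
{
	uint32_t pmd_entry = 0x40000000u | FAKE_PSE | FAKE_GLOBAL | FAKE_NX;

	/* child PTEs keep every attribute except the large-page bit */
	uint32_t child_prot = clrhuge(pmd_entry) & ATTR_MASK;

	printf("parent attrs %#x -> child attrs %#x\n",
	       pmd_entry & ATTR_MASK, child_prot);
	return 0;
}
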
@@ -372,12 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
372 448
373static unsigned long __meminit 449static unsigned long __meminit
374phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 450phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
375 unsigned long page_size_mask) 451 unsigned long page_size_mask, pgprot_t prot)
376{ 452{
377 pmd_t *pmd = pmd_offset(pud, 0); 453 pmd_t *pmd = pmd_offset(pud, 0);
378 unsigned long last_map_addr; 454 unsigned long last_map_addr;
379 455
380 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); 456 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
381 __flush_tlb_all(); 457 __flush_tlb_all();
382 return last_map_addr; 458 return last_map_addr;
383} 459}
@@ -394,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
394 unsigned long pmd_phys; 470 unsigned long pmd_phys;
395 pud_t *pud = pud_page + pud_index(addr); 471 pud_t *pud = pud_page + pud_index(addr);
396 pmd_t *pmd; 472 pmd_t *pmd;
473 pgprot_t prot = PAGE_KERNEL;
397 474
398 if (addr >= end) 475 if (addr >= end)
399 break; 476 break;
@@ -405,10 +482,26 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
405 } 482 }
406 483
407 if (pud_val(*pud)) { 484 if (pud_val(*pud)) {
408 if (!pud_large(*pud)) 485 if (!pud_large(*pud)) {
409 last_map_addr = phys_pmd_update(pud, addr, end, 486 last_map_addr = phys_pmd_update(pud, addr, end,
410 page_size_mask); 487 page_size_mask, prot);
411 continue; 488 continue;
489 }
490 /*
491 * If we are ok with PG_LEVEL_1G mapping, then we will
492 * use the existing mapping.
493 *
494 * Otherwise, we will split the gbpage mapping but use
495 * the same existing protection bits, except for the
496 * large-page bit, so that we don't violate Intel's TLB
497 * Application note (317080), which says that while changing
498 * the page sizes, new and old translations should
499 * not differ with respect to page frame and
500 * attributes.
501 */
502 if (page_size_mask & (1 << PG_LEVEL_1G))
503 continue;
504 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
412 } 505 }
413 506
414 if (page_size_mask & (1<<PG_LEVEL_1G)) { 507 if (page_size_mask & (1<<PG_LEVEL_1G)) {
@@ -422,7 +515,8 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
422 } 515 }
423 516
424 pmd = alloc_low_page(&pmd_phys); 517 pmd = alloc_low_page(&pmd_phys);
425 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); 518 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
519 prot);
426 unmap_low_page(pmd); 520 unmap_low_page(pmd);
427 521
428 spin_lock(&init_mm.page_table_lock); 522 spin_lock(&init_mm.page_table_lock);
@@ -430,6 +524,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
430 spin_unlock(&init_mm.page_table_lock); 524 spin_unlock(&init_mm.page_table_lock);
431 } 525 }
432 __flush_tlb_all(); 526 __flush_tlb_all();
527
433 update_page_count(PG_LEVEL_1G, pages); 528 update_page_count(PG_LEVEL_1G, pages);
434 529
435 return last_map_addr; 530 return last_map_addr;
@@ -446,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
446 return phys_pud_init(pud, addr, end, page_size_mask); 541 return phys_pud_init(pud, addr, end, page_size_mask);
447} 542}
448 543
449static void __init find_early_table_space(unsigned long end) 544static void __init find_early_table_space(unsigned long end, int use_pse,
545 int use_gbpages)
450{ 546{
451 unsigned long puds, pmds, ptes, tables, start; 547 unsigned long puds, pmds, ptes, tables, start;
452 548
453 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 549 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
454 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 550 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
455 if (direct_gbpages) { 551 if (use_gbpages) {
456 unsigned long extra; 552 unsigned long extra;
457 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 553 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
458 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 554 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
459 } else 555 } else
460 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 556 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
461 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 557 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
462 558
463 if (cpu_has_pse) { 559 if (use_pse) {
464 unsigned long extra; 560 unsigned long extra;
465 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 561 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
466 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 562 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
467 } else 563 } else
468 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 564 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
469 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 565 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
470 566
471 /* 567 /*
472 * RED-PEN putting page tables only on node 0 could 568 * RED-PEN putting page tables only on node 0 could
@@ -528,6 +624,7 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
528 pgd_populate(&init_mm, pgd, __va(pud_phys)); 624 pgd_populate(&init_mm, pgd, __va(pud_phys));
529 spin_unlock(&init_mm.page_table_lock); 625 spin_unlock(&init_mm.page_table_lock);
530 } 626 }
627 __flush_tlb_all();
531 628
532 return last_map_addr; 629 return last_map_addr;
533} 630}
@@ -571,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
571 668
572 struct map_range mr[NR_RANGE_MR]; 669 struct map_range mr[NR_RANGE_MR];
573 int nr_range, i; 670 int nr_range, i;
671 int use_pse, use_gbpages;
574 672
575 printk(KERN_INFO "init_memory_mapping\n"); 673 printk(KERN_INFO "init_memory_mapping\n");
576 674
@@ -584,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
584 if (!after_bootmem) 682 if (!after_bootmem)
585 init_gbpages(); 683 init_gbpages();
586 684
587 if (direct_gbpages) 685#ifdef CONFIG_DEBUG_PAGEALLOC
686 /*
687 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
688 * This simplifies cpa(), which otherwise needs to support splitting
689 * large pages into small ones in interrupt context, etc.
690 */
691 use_pse = use_gbpages = 0;
692#else
693 use_pse = cpu_has_pse;
694 use_gbpages = direct_gbpages;
695#endif
696
697 if (use_gbpages)
588 page_size_mask |= 1 << PG_LEVEL_1G; 698 page_size_mask |= 1 << PG_LEVEL_1G;
589 if (cpu_has_pse) 699 if (use_pse)
590 page_size_mask |= 1 << PG_LEVEL_2M; 700 page_size_mask |= 1 << PG_LEVEL_2M;
591 701
592 memset(mr, 0, sizeof(mr)); 702 memset(mr, 0, sizeof(mr));
@@ -636,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
636 old_start = mr[i].start; 746 old_start = mr[i].start;
637 memmove(&mr[i], &mr[i+1], 747 memmove(&mr[i], &mr[i+1],
638 (nr_range - 1 - i) * sizeof (struct map_range)); 748 (nr_range - 1 - i) * sizeof (struct map_range));
639 mr[i].start = old_start; 749 mr[i--].start = old_start;
640 nr_range--; 750 nr_range--;
641 } 751 }
642 752
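
The one-character fix above, mr[i--].start = old_start, matters more than it looks: after memmove() compacts the array, the merged slot has a new right-hand neighbour and must be re-examined, otherwise two merges in a row skip an entry. A compilable demonstration of the loop:

/* Sketch only: the range-merge loop fixed above, with toy ranges. */
#include <stdio.h>
#include <string.h>

struct range { unsigned long start, end, mask; };

int main(void)
{
	struct range mr[] = {
		{ 0, 4, 0 }, { 4, 8, 0 }, { 8, 12, 0 }, { 12, 16, 1 },
	};
	int nr = 4, i;

	for (i = 0; nr > 1 && i < nr - 1; i++) {
		unsigned long old_start;

		if (mr[i].mask != mr[i + 1].mask)
			continue;
		/* merge mr[i] into mr[i+1], then drop mr[i] */
		old_start = mr[i].start;
		memmove(&mr[i], &mr[i + 1],
			(nr - 1 - i) * sizeof(struct range));
		mr[i--].start = old_start;   /* re-test the merged slot */
		nr--;
	}

	for (i = 0; i < nr; i++)
		printf("range %d: %lu-%lu mask %lu\n",
		       i, mr[i].start, mr[i].end, mr[i].mask);
	return 0;
}

Without the decrement, the first merge leaves the combined 0-8 range un-compared against 8-12, and the loop ends with three ranges instead of two.
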
@@ -647,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
647 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 757 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
648 758
649 if (!after_bootmem) 759 if (!after_bootmem)
650 find_early_table_space(end); 760 find_early_table_space(end, use_pse, use_gbpages);
651 761
652 for (i = 0; i < nr_range; i++) 762 for (i = 0; i < nr_range; i++)
653 last_map_addr = kernel_physical_mapping_init( 763 last_map_addr = kernel_physical_mapping_init(
@@ -769,6 +879,8 @@ void __init mem_init(void)
769{ 879{
770 long codesize, reservedpages, datasize, initsize; 880 long codesize, reservedpages, datasize, initsize;
771 881
882 start_periodic_check_for_corruption();
883
772 pci_iommu_alloc(); 884 pci_iommu_alloc();
773 885
774 /* clear_bss() already clear the empty_zero_page */ 886 /* clear_bss() already clear the empty_zero_page */
@@ -806,8 +918,6 @@ void __init mem_init(void)
806 reservedpages << (PAGE_SHIFT-10), 918 reservedpages << (PAGE_SHIFT-10),
807 datasize >> 10, 919 datasize >> 10,
808 initsize >> 10); 920 initsize >> 10);
809
810 cpa_init();
811} 921}
812 922
813void free_init_pages(char *what, unsigned long begin, unsigned long end) 923void free_init_pages(char *what, unsigned long begin, unsigned long end)
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index d4b6e6a29ae..ae71e11eb3e 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26 26
27static inline int phys_addr_valid(unsigned long addr)
28{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30}
31
27unsigned long __phys_addr(unsigned long x) 32unsigned long __phys_addr(unsigned long x)
28{ 33{
29 if (x >= __START_KERNEL_map) 34 if (x >= __START_KERNEL_map) {
30 return x - __START_KERNEL_map + phys_base; 35 x -= __START_KERNEL_map;
31 return x - PAGE_OFFSET; 36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
32} 45}
33EXPORT_SYMBOL(__phys_addr); 46EXPORT_SYMBOL(__phys_addr);
34 47
35static inline int phys_addr_valid(unsigned long addr) 48bool __virt_addr_valid(unsigned long x)
36{ 49{
37 return addr < (1UL << boot_cpu_data.x86_phys_bits); 50 if (x >= __START_KERNEL_map) {
51 x -= __START_KERNEL_map;
52 if (x >= KERNEL_IMAGE_SIZE)
53 return false;
54 x += phys_base;
55 } else {
56 if (x < PAGE_OFFSET)
57 return false;
58 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ?
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false;
62 }
63 }
64
65 return pfn_valid(x >> PAGE_SHIFT);
38} 66}
67EXPORT_SYMBOL(__virt_addr_valid);
39 68
40#else 69#else
41 70
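
__phys_addr() now distinguishes the kernel-text mapping (addresses at or above __START_KERNEL_map) from the direct mapping (PAGE_OFFSET) and sanity-checks each range. A sketch of the dual-range translation, with made-up layout constants and assert() standing in for VIRTUAL_BUG_ON:

/* Sketch only: the two-range virt-to-phys translation above. */
#include <assert.h>
#include <stdio.h>

#define PAGE_OFFSET        0xffff880000000000ULL /* direct map base */
#define START_KERNEL_MAP   0xffffffff80000000ULL /* kernel text map */
#define KERNEL_IMAGE_SIZE  (512ULL << 20)
#define PHYS_BASE          0x0ULL

static unsigned long long phys_addr_sketch(unsigned long long x)
{
	if (x >= START_KERNEL_MAP) {
		x -= START_KERNEL_MAP;
		assert(x < KERNEL_IMAGE_SIZE);   /* text map is small */
		x += PHYS_BASE;
	} else {
		assert(x >= PAGE_OFFSET);        /* must be direct map */
		x -= PAGE_OFFSET;
	}
	return x;
}

int main(void)
{
	printf("%#llx\n", phys_addr_sketch(START_KERNEL_MAP + 0x100000ULL));
	printf("%#llx\n", phys_addr_sketch(PAGE_OFFSET + 0x2000ULL));
	return 0;
}
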
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
44 return 1; 73 return 1;
45} 74}
46 75
76#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x)
78{
79 /* VMALLOC_* aren't constants; not available at boot time */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET;
84}
85EXPORT_SYMBOL(__phys_addr);
86#endif
87
88bool __virt_addr_valid(unsigned long x)
89{
90 if (x < PAGE_OFFSET)
91 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
93 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95}
96EXPORT_SYMBOL(__virt_addr_valid);
97
47#endif 98#endif
48 99
49int page_is_ram(unsigned long pagenr) 100int page_is_ram(unsigned long pagenr)
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
83 return 0; 134 return 0;
84} 135}
85 136
137int pagerange_is_ram(unsigned long start, unsigned long end)
138{
139 int ram_page = 0, not_rampage = 0;
140 unsigned long page_nr;
141
142 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
143 ++page_nr) {
144 if (page_is_ram(page_nr))
145 ram_page = 1;
146 else
147 not_rampage = 1;
148
149 if (ram_page == not_rampage)
150 return -1;
151 }
152
153 return ram_page;
154}
155
86/* 156/*
87 * Fix up the linear direct mapping of the kernel to avoid cache attribute 157 * Fix up the linear direct mapping of the kernel to avoid cache attribute
88 * conflicts. 158 * conflicts.
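
The new pagerange_is_ram() is tri-state: 1 means the whole range is RAM, 0 means none of it is, and -1 means mixed. The trick is that once both flags have been set they compare equal. A toy version:

/* Sketch only: the tri-state accumulation in pagerange_is_ram(). */
#include <stdio.h>

static int toy_page_is_ram(unsigned long pfn)
{
	return pfn < 4;        /* pretend the first 4 pages are RAM */
}

static int toy_range_is_ram(unsigned long start_pfn, unsigned long end_pfn)
{
	int ram = 0, not_ram = 0;
	unsigned long pfn;

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		if (toy_page_is_ram(pfn))
			ram = 1;
		else
			not_ram = 1;
		/* both set -> the range is mixed */
		if (ram == not_ram)
			return -1;
	}
	return ram;
}

int main(void)
{
	printf("%d %d %d\n",
	       toy_range_is_ram(0, 4),   /* 1: all RAM */
	       toy_range_is_ram(4, 8),   /* 0: no RAM */
	       toy_range_is_ram(2, 6));  /* -1: mixed */
	return 0;
}
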
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
150 return (__force void __iomem *)phys_to_virt(phys_addr); 220 return (__force void __iomem *)phys_to_virt(phys_addr);
151 221
152 /* 222 /*
223 * Check if the request spans more than any BAR in the iomem resource
224 * tree.
225 */
226 WARN_ON(iomem_map_sanity_check(phys_addr, size));
227
228 /*
153 * Don't allow anybody to remap normal RAM that we're using.. 229 * Don't allow anybody to remap normal RAM that we're using..
154 */ 230 */
155 for (pfn = phys_addr >> PAGE_SHIFT; 231 for (pfn = phys_addr >> PAGE_SHIFT;
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
204 switch (prot_val) { 280 switch (prot_val) {
205 case _PAGE_CACHE_UC: 281 case _PAGE_CACHE_UC:
206 default: 282 default:
207 prot = PAGE_KERNEL_NOCACHE; 283 prot = PAGE_KERNEL_IO_NOCACHE;
208 break; 284 break;
209 case _PAGE_CACHE_UC_MINUS: 285 case _PAGE_CACHE_UC_MINUS:
210 prot = PAGE_KERNEL_UC_MINUS; 286 prot = PAGE_KERNEL_IO_UC_MINUS;
211 break; 287 break;
212 case _PAGE_CACHE_WC: 288 case _PAGE_CACHE_WC:
213 prot = PAGE_KERNEL_WC; 289 prot = PAGE_KERNEL_IO_WC;
214 break; 290 break;
215 case _PAGE_CACHE_WB: 291 case _PAGE_CACHE_WB:
216 prot = PAGE_KERNEL; 292 prot = PAGE_KERNEL_IO;
217 break; 293 break;
218 } 294 }
219 295
@@ -421,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
421 return; 497 return;
422} 498}
423 499
424int __initdata early_ioremap_debug; 500static int __initdata early_ioremap_debug;
425 501
426static int __init early_ioremap_debug_setup(char *str) 502static int __init early_ioremap_debug_setup(char *str)
427{ 503{
@@ -530,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
530} 606}
531 607
532static inline void __init early_set_fixmap(enum fixed_addresses idx, 608static inline void __init early_set_fixmap(enum fixed_addresses idx,
533 unsigned long phys) 609 unsigned long phys, pgprot_t prot)
534{ 610{
535 if (after_paging_init) 611 if (after_paging_init)
536 set_fixmap(idx, phys); 612 __set_fixmap(idx, phys, prot);
537 else 613 else
538 __early_set_fixmap(idx, phys, PAGE_KERNEL); 614 __early_set_fixmap(idx, phys, prot);
539} 615}
540 616
541static inline void __init early_clear_fixmap(enum fixed_addresses idx) 617static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -546,16 +622,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
546 __early_set_fixmap(idx, 0, __pgprot(0)); 622 __early_set_fixmap(idx, 0, __pgprot(0));
547} 623}
548 624
549 625static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
550int __initdata early_ioremap_nested; 626static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
551
552static int __init check_early_ioremap_leak(void) 627static int __init check_early_ioremap_leak(void)
553{ 628{
554 if (!early_ioremap_nested) 629 int count = 0;
630 int i;
631
632 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
633 if (prev_map[i])
634 count++;
635
636 if (!count)
555 return 0; 637 return 0;
556 WARN(1, KERN_WARNING 638 WARN(1, KERN_WARNING
557 "Debug warning: early ioremap leak of %d areas detected.\n", 639 "Debug warning: early ioremap leak of %d areas detected.\n",
558 early_ioremap_nested); 640 count);
559 printk(KERN_WARNING 641 printk(KERN_WARNING
560 "please boot with early_ioremap_debug and report the dmesg.\n"); 642 "please boot with early_ioremap_debug and report the dmesg.\n");
561 643
@@ -563,18 +645,33 @@ static int __init check_early_ioremap_leak(void)
563} 645}
564late_initcall(check_early_ioremap_leak); 646late_initcall(check_early_ioremap_leak);
565 647
566void __init *early_ioremap(unsigned long phys_addr, unsigned long size) 648static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
567{ 649{
568 unsigned long offset, last_addr; 650 unsigned long offset, last_addr;
569 unsigned int nrpages, nesting; 651 unsigned int nrpages;
570 enum fixed_addresses idx0, idx; 652 enum fixed_addresses idx0, idx;
653 int i, slot;
571 654
572 WARN_ON(system_state != SYSTEM_BOOTING); 655 WARN_ON(system_state != SYSTEM_BOOTING);
573 656
574 nesting = early_ioremap_nested; 657 slot = -1;
658 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
659 if (!prev_map[i]) {
660 slot = i;
661 break;
662 }
663 }
664
665 if (slot < 0) {
666 printk(KERN_INFO "early_ioremap(%08lx, %08lx): no free slot found\n",
667 phys_addr, size);
668 WARN_ON(1);
669 return NULL;
670 }
671
575 if (early_ioremap_debug) { 672 if (early_ioremap_debug) {
576 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", 673 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
577 phys_addr, size, nesting); 674 phys_addr, size, slot);
578 dump_stack(); 675 dump_stack();
579 } 676 }
580 677
@@ -585,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
585 return NULL; 682 return NULL;
586 } 683 }
587 684
588 if (nesting >= FIX_BTMAPS_NESTING) { 685 prev_size[slot] = size;
589 WARN_ON(1);
590 return NULL;
591 }
592 early_ioremap_nested++;
593 /* 686 /*
594 * Mappings have to be page-aligned 687 * Mappings have to be page-aligned
595 */ 688 */
596 offset = phys_addr & ~PAGE_MASK; 689 offset = phys_addr & ~PAGE_MASK;
597 phys_addr &= PAGE_MASK; 690 phys_addr &= PAGE_MASK;
598 size = PAGE_ALIGN(last_addr) - phys_addr; 691 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
599 692
600 /* 693 /*
601 * Mappings have to fit in the FIX_BTMAP area. 694 * Mappings have to fit in the FIX_BTMAP area.
@@ -609,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
609 /* 702 /*
610 * Ok, go for it.. 703 * Ok, go for it..
611 */ 704 */
612 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 705 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
613 idx = idx0; 706 idx = idx0;
614 while (nrpages > 0) { 707 while (nrpages > 0) {
615 early_set_fixmap(idx, phys_addr); 708 early_set_fixmap(idx, phys_addr, prot);
616 phys_addr += PAGE_SIZE; 709 phys_addr += PAGE_SIZE;
617 --idx; 710 --idx;
618 --nrpages; 711 --nrpages;
@@ -620,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
620 if (early_ioremap_debug) 713 if (early_ioremap_debug)
621 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 714 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
622 715
623 return (void *) (offset + fix_to_virt(idx0)); 716 prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
717 return prev_map[slot];
718}
719
720/* Remap an IO device */
721void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
722{
723 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
724}
725
726/* Remap memory */
727void __init *early_memremap(unsigned long phys_addr, unsigned long size)
728{
729 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
624} 730}
625 731
626void __init early_iounmap(void *addr, unsigned long size) 732void __init early_iounmap(void *addr, unsigned long size)
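
The rework above replaces the fragile nesting counter with a fixed table of slots (prev_map/prev_size), so early mappings can be torn down in any order and a size mismatch is caught at unmap time. A compilable sketch of that bookkeeping, with SLOTS and the payloads as stand-ins:

/* Sketch only: the fixed-slot bookkeeping used by early_ioremap(). */
#include <stddef.h>
#include <stdio.h>

#define SLOTS 4

static void *prev_map[SLOTS];
static unsigned long prev_size[SLOTS];

static void *slot_map(void *what, unsigned long size)
{
	int i;

	for (i = 0; i < SLOTS; i++) {
		if (!prev_map[i]) {
			prev_map[i] = what;
			prev_size[i] = size;
			return what;
		}
	}
	return NULL;                       /* no free slot */
}

static int slot_unmap(void *addr, unsigned long size)
{
	int i;

	for (i = 0; i < SLOTS; i++) {
		if (prev_map[i] == addr) {
			if (prev_size[i] != size)
				return -1; /* size mismatch, like the WARN */
			prev_map[i] = NULL;
			return 0;
		}
	}
	return -1;                         /* never mapped */
}

int main(void)
{
	char a, b;

	slot_map(&a, 16);
	slot_map(&b, 32);
	printf("unmap b: %d\n", slot_unmap(&b, 32)); /* any order works */
	printf("unmap a, wrong size: %d\n", slot_unmap(&a, 8));
	printf("unmap a: %d\n", slot_unmap(&a, 16));
	return 0;
}
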
@@ -629,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
629 unsigned long offset; 735 unsigned long offset;
630 unsigned int nrpages; 736 unsigned int nrpages;
631 enum fixed_addresses idx; 737 enum fixed_addresses idx;
632 int nesting; 738 int i, slot;
739
740 slot = -1;
741 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
742 if (prev_map[i] == addr) {
743 slot = i;
744 break;
745 }
746 }
747
748 if (slot < 0) {
749 printk(KERN_INFO "early_iounmap(%p, %08lx): slot not found\n",
750 addr, size);
751 WARN_ON(1);
752 return;
753 }
633 754
634 nesting = --early_ioremap_nested; 755 if (prev_size[slot] != size) {
635 if (WARN_ON(nesting < 0)) 756 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
757 addr, size, slot, prev_size[slot]);
758 WARN_ON(1);
636 return; 759 return;
760 }
637 761
638 if (early_ioremap_debug) { 762 if (early_ioremap_debug) {
639 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 763 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
640 size, nesting); 764 size, slot);
641 dump_stack(); 765 dump_stack();
642 } 766 }
643 767
@@ -649,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
649 offset = virt_addr & ~PAGE_MASK; 773 offset = virt_addr & ~PAGE_MASK;
650 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 774 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
651 775
652 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 776 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
653 while (nrpages > 0) { 777 while (nrpages > 0) {
654 early_clear_fixmap(idx); 778 early_clear_fixmap(idx);
655 --idx; 779 --idx;
656 --nrpages; 780 --nrpages;
657 } 781 }
782 prev_map[slot] = NULL;
658} 783}
659 784
660void __this_fixmap_does_not_exist(void) 785void __this_fixmap_does_not_exist(void)
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 672e17f8262..9cab18b0b85 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
61 last_bad += incr; 61 last_bad += incr;
62 } else { 62 } else {
63 if (start_bad) { 63 if (start_bad) {
64 printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved", 64 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr); 65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); 66 reserve_early(start_bad, last_bad + incr, "BAD RAM");
67 } 67 }
68 start_bad = last_bad = start_phys_aligned; 68 start_bad = last_bad = start_phys_aligned;
69 } 69 }
@@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size,
72 if (start_bad) { 72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", 73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr); 74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); 75 reserve_early(start_bad, last_bad + incr, "BAD RAM");
76 } 76 }
77
78} 77}
79 78
80/* default is disabled */ 79/* default is disabled */
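
The memtest change replaces last_bad - start_bad (a length) with last_bad + incr (an end address), because reserve_early() expects an end address rather than a size; the old call under-reserved and dropped the final bad increment. A tiny sketch, with reserve() standing in for reserve_early():

/* Sketch only: the (start, end) vs (start, size) confusion fixed above. */
#include <stdio.h>

static void reserve(unsigned long start, unsigned long end, const char *why)
{
	printf("reserve [%#lx, %#lx) %s\n", start, end, why);
}

int main(void)
{
	unsigned long start_bad = 0x1000, last_bad = 0x3000, incr = 0x1000;

	/* buggy: passes a size where an end address is expected */
	reserve(start_bad, last_bad - start_bad, "(old, wrong)");

	/* fixed: end is one increment past the last bad address */
	reserve(start_bad, last_bad + incr, "(new)");
	return 0;
}
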
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 635b50e8558..2c4baa88f2c 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
56static DEFINE_PER_CPU(struct trap_reason, pf_reason); 56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); 57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58 58
59#if 0 /* XXX: no way gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex); 59static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock); 60static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled; 61static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
75 * and trace_lock. 68 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock. 69 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock. 70 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed. 71 * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true. 72 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */ 73 */
81 74
@@ -97,44 +90,6 @@ static bool is_enabled(void)
97 return atomic_read(&mmiotrace_enabled); 90 return atomic_read(&mmiotrace_enabled);
98} 91}
99 92
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address) 93static void print_pte(unsigned long address)
139{ 94{
140 unsigned int level; 95 unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
307 map.map_id = trace->id; 262 map.map_id = trace->id;
308 263
309 spin_lock_irq(&trace_lock); 264 spin_lock_irq(&trace_lock);
310 if (!is_enabled()) 265 if (!is_enabled()) {
266 kfree(trace);
311 goto not_enabled; 267 goto not_enabled;
268 }
312 269
313 mmio_trace_mapping(&map); 270 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list); 271 list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
377 iounmap_trace_core(addr); 334 iounmap_trace_core(addr);
378} 335}
379 336
337int mmiotrace_printk(const char *fmt, ...)
338{
339 int ret = 0;
340 va_list args;
341 unsigned long flags;
342 va_start(args, fmt);
343
344 spin_lock_irqsave(&trace_lock, flags);
345 if (is_enabled())
346 ret = mmio_trace_printk(fmt, args);
347 spin_unlock_irqrestore(&trace_lock, flags);
348
349 va_end(args);
350 return ret;
351}
352EXPORT_SYMBOL(mmiotrace_printk);
353
380static void clear_trace_list(void) 354static void clear_trace_list(void)
381{ 355{
382 struct remap_trace *trace; 356 struct remap_trace *trace;
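
mmiotrace_printk() above shows the standard shape for forwarding varargs under a lock: va_start/va_end live in the wrapper, and a v-variant does the work while the lock is held. A userspace sketch (build with -pthread), with vfprintf() and a pthread mutex standing in for mmio_trace_printk() and trace_lock:

/* Sketch only: forwarding varargs under a lock. */
#include <pthread.h>
#include <stdarg.h>
#include <stdio.h>

static pthread_mutex_t trace_lock = PTHREAD_MUTEX_INITIALIZER;
static int enabled = 1;

static int trace_printk_sketch(const char *fmt, ...)
{
	int ret = 0;
	va_list args;

	va_start(args, fmt);
	pthread_mutex_lock(&trace_lock);
	if (enabled)
		ret = vfprintf(stderr, fmt, args); /* the v-variant */
	pthread_mutex_unlock(&trace_lock);
	va_end(args);
	return ret;
}

int main(void)
{
	trace_printk_sketch("marker %d at %#x\n", 1, 0xf00);
	return 0;
}
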
@@ -462,26 +436,12 @@ static void leave_uniprocessor(void)
462} 436}
463#endif 437#endif
464 438
465#if 0 /* XXX: out of order */
466static struct file_operations fops_marker = {
467 .owner = THIS_MODULE,
468 .write = write_marker
469};
470#endif
471
472void enable_mmiotrace(void) 439void enable_mmiotrace(void)
473{ 440{
474 mutex_lock(&mmiotrace_mutex); 441 mutex_lock(&mmiotrace_mutex);
475 if (is_enabled()) 442 if (is_enabled())
476 goto out; 443 goto out;
477 444
478#if 0 /* XXX: tracing does not support text entries */
479 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
480 &fops_marker);
481 if (!marker_file)
482 pr_err(NAME "marker file creation failed.\n");
483#endif
484
485 if (nommiotrace) 445 if (nommiotrace)
486 pr_info(NAME "MMIO tracing disabled.\n"); 446 pr_info(NAME "MMIO tracing disabled.\n");
487 enter_uniprocessor(); 447 enter_uniprocessor();
@@ -506,11 +466,6 @@ void disable_mmiotrace(void)
506 466
507 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 467 clear_trace_list(); /* guarantees: no more kmmio callbacks */
508 leave_uniprocessor(); 468 leave_uniprocessor();
509 if (marker_file) {
510 debugfs_remove(marker_file);
511 marker_file = NULL;
512 }
513
514 pr_info(NAME "disabled.\n"); 469 pr_info(NAME "disabled.\n");
515out: 470out:
516 mutex_unlock(&mmiotrace_mutex); 471 mutex_unlock(&mmiotrace_mutex);
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 62fa440678d..847c164725f 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
328 328
329 get_memcfg_numa(); 329 get_memcfg_numa();
330 330
331 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); 331 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
332 332
333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
334 do { 334 do {
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a4dd793d600..cebcbf152d4 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
79 return 0; 79 return 0;
80 80
81 addr = 0x8000; 81 addr = 0x8000;
82 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
84 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
85 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
@@ -176,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
177 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
178 void *bootmap; 178 void *bootmap;
179 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
180 int nid; 180 int nid;
181 181
182 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
183 183
184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
185 start, end); 185 start, end);
@@ -210,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
211 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
212 if (nid == nodeid) 212 if (nid == nodeid)
213 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
214 else 214 else
215 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
216 /* 216 /*
217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
218 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index d4aa503caaa..e1d10690921 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 43e2f8483e4..f1dc1b75d16 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC, which uses large identity mappings)
40 * using cpa_lock, so that we don't allow any other CPU with stale large TLB
41 * entries to change a page attribute in parallel while some other CPU is
42 * splitting a large page entry and changing its attributes.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
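
struct cpa_data trades the single flushtlb int for a flags word (CPA_FLUSHTLB, CPA_ARRAY), so one field carries several orthogonal bits. A minimal sketch of the idiom:

/* Sketch only: packing orthogonal booleans into one flags word. */
#include <stdio.h>

#define CPA_FLUSHTLB 1
#define CPA_ARRAY    2

struct cpa_sketch {
	int flags;
};

int main(void)
{
	struct cpa_sketch cpa = { 0 };

	cpa.flags |= CPA_FLUSHTLB;        /* was: cpa->flushtlb = 1 */

	if (cpa.flags & CPA_ARRAY)
		puts("array mode");
	if (cpa.flags & CPA_FLUSHTLB)
		puts("needs TLB flush");

	cpa.flags &= ~CPA_FLUSHTLB;       /* clear one bit only */
	return 0;
}
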
@@ -53,23 +65,22 @@ static void split_page_count(int level)
53 direct_pages_count[level - 1] += PTRS_PER_PTE; 65 direct_pages_count[level - 1] += PTRS_PER_PTE;
54} 66}
55 67
56int arch_report_meminfo(char *page) 68void arch_report_meminfo(struct seq_file *m)
57{ 69{
58 int n = sprintf(page, "DirectMap4k: %8lu kB\n", 70 seq_printf(m, "DirectMap4k: %8lu kB\n",
59 direct_pages_count[PG_LEVEL_4K] << 2); 71 direct_pages_count[PG_LEVEL_4K] << 2);
60#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) 72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 n += sprintf(page + n, "DirectMap2M: %8lu kB\n", 73 seq_printf(m, "DirectMap2M: %8lu kB\n",
62 direct_pages_count[PG_LEVEL_2M] << 11); 74 direct_pages_count[PG_LEVEL_2M] << 11);
63#else 75#else
64 n += sprintf(page + n, "DirectMap4M: %8lu kB\n", 76 seq_printf(m, "DirectMap4M: %8lu kB\n",
65 direct_pages_count[PG_LEVEL_2M] << 12); 77 direct_pages_count[PG_LEVEL_2M] << 12);
66#endif 78#endif
67#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
68 if (direct_gbpages) 80 if (direct_gbpages)
69 n += sprintf(page + n, "DirectMap1G: %8lu kB\n", 81 seq_printf(m, "DirectMap1G: %8lu kB\n",
70 direct_pages_count[PG_LEVEL_1G] << 20); 82 direct_pages_count[PG_LEVEL_1G] << 20);
71#endif 83#endif
72 return n;
73} 84}
74#else 85#else
75static inline void split_page_count(int level) { } 86static inline void split_page_count(int level) { }
@@ -84,7 +95,7 @@ static inline unsigned long highmap_start_pfn(void)
84 95
85static inline unsigned long highmap_end_pfn(void) 96static inline unsigned long highmap_end_pfn(void)
86{ 97{
87 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
88} 99}
89 100
90#endif 101#endif
@@ -190,6 +201,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
190 } 201 }
191} 202}
192 203
204static void cpa_flush_array(unsigned long *start, int numpages, int cache)
205{
206 unsigned int i, level;
207 unsigned long *addr;
208
209 BUG_ON(irqs_disabled());
210
211 on_each_cpu(__cpa_flush_range, NULL, 1);
212
213 if (!cache)
214 return;
215
216 /* 4M threshold */
217 if (numpages >= 1024) {
218 if (boot_cpu_data.x86_model >= 4)
219 wbinvd();
220 return;
221 }
222 /*
223 * We only need to flush on one CPU:
224 * clflush is a MESI-coherent instruction that
225 * will cause all other CPUs to flush the same
226 * cachelines:
227 */
228 for (i = 0, addr = start; i < numpages; i++, addr++) {
229 pte_t *pte = lookup_address(*addr, &level);
230
231 /*
232 * Only flush present addresses:
233 */
234 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
235 clflush_cache_range((void *) *addr, PAGE_SIZE);
236 }
237}
238
193/* 239/*
194 * Certain areas of memory on x86 require very specific protection flags, 240 * Certain areas of memory on x86 require very specific protection flags,
195 * for example the BIOS area or kernel text. Callers don't always get this 241 * for example the BIOS area or kernel text. Callers don't always get this
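
cpa_flush_array() above flushes per page with clflush only below a 4 MB threshold (1024 pages); past that, invalidating the whole cache is cheaper than issuing thousands of line flushes. A sketch of the decision, with flush_page() and flush_everything() as stand-ins for clflush_cache_range() and wbinvd():

/* Sketch only: per-page vs whole-cache flush decision. */
#include <stdio.h>

#define FLUSH_ALL_THRESHOLD 1024   /* pages: 4 MB with 4 KB pages */

static void flush_page(unsigned long addr)
{
	printf("clflush page at %#lx\n", addr);
}

static void flush_everything(void)
{
	puts("wbinvd: write back and invalidate the whole cache");
}

static void flush_array_sketch(const unsigned long *addr, int numpages)
{
	int i;

	if (numpages >= FLUSH_ALL_THRESHOLD) {
		flush_everything();    /* cheaper than 1024+ clflushes */
		return;
	}
	for (i = 0; i < numpages; i++)
		flush_page(addr[i]);
}

int main(void)
{
	unsigned long pages[] = { 0x1000, 0x5000, 0x9000 };

	flush_array_sketch(pages, 3);
	return 0;
}
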
@@ -398,7 +444,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
398 */ 444 */
399 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 445 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
400 __set_pmd_pte(kpte, address, new_pte); 446 __set_pmd_pte(kpte, address, new_pte);
401 cpa->flushtlb = 1; 447 cpa->flags |= CPA_FLUSHTLB;
402 do_split = 0; 448 do_split = 0;
403 } 449 }
404 450
@@ -408,84 +454,6 @@ out_unlock:
408 return do_split; 454 return do_split;
409} 455}
410 456
411static LIST_HEAD(page_pool);
412static unsigned long pool_size, pool_pages, pool_low;
413static unsigned long pool_used, pool_failed;
414
415static void cpa_fill_pool(struct page **ret)
416{
417 gfp_t gfp = GFP_KERNEL;
418 unsigned long flags;
419 struct page *p;
420
421 /*
422 * Avoid recursion (on debug-pagealloc) and also signal
423 * our priority to get to these pagetables:
424 */
425 if (current->flags & PF_MEMALLOC)
426 return;
427 current->flags |= PF_MEMALLOC;
428
429 /*
430 * Allocate atomically from atomic contexts:
431 */
432 if (in_atomic() || irqs_disabled() || debug_pagealloc)
433 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
434
435 while (pool_pages < pool_size || (ret && !*ret)) {
436 p = alloc_pages(gfp, 0);
437 if (!p) {
438 pool_failed++;
439 break;
440 }
441 /*
442 * If the call site needs a page right now, provide it:
443 */
444 if (ret && !*ret) {
445 *ret = p;
446 continue;
447 }
448 spin_lock_irqsave(&pgd_lock, flags);
449 list_add(&p->lru, &page_pool);
450 pool_pages++;
451 spin_unlock_irqrestore(&pgd_lock, flags);
452 }
453
454 current->flags &= ~PF_MEMALLOC;
455}
456
457#define SHIFT_MB (20 - PAGE_SHIFT)
458#define ROUND_MB_GB ((1 << 10) - 1)
459#define SHIFT_MB_GB 10
460#define POOL_PAGES_PER_GB 16
461
462void __init cpa_init(void)
463{
464 struct sysinfo si;
465 unsigned long gb;
466
467 si_meminfo(&si);
468 /*
469 * Calculate the number of pool pages:
470 *
471 * Convert totalram (nr of pages) to MiB and round to the next
472 * GiB. Shift MiB to Gib and multiply the result by
473 * POOL_PAGES_PER_GB:
474 */
475 if (debug_pagealloc) {
476 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
477 pool_size = POOL_PAGES_PER_GB * gb;
478 } else {
479 pool_size = 1;
480 }
481 pool_low = pool_size;
482
483 cpa_fill_pool(NULL);
484 printk(KERN_DEBUG
485 "CPA: page pool initialized %lu of %lu pages preallocated\n",
486 pool_pages, pool_size);
487}
488
489static int split_large_page(pte_t *kpte, unsigned long address) 457static int split_large_page(pte_t *kpte, unsigned long address)
490{ 458{
491 unsigned long flags, pfn, pfninc = 1; 459 unsigned long flags, pfn, pfninc = 1;
@@ -494,28 +462,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
494 pgprot_t ref_prot; 462 pgprot_t ref_prot;
495 struct page *base; 463 struct page *base;
496 464
497 /* 465 if (!debug_pagealloc)
498 * Get a page from the pool. The pool list is protected by the 466 spin_unlock(&cpa_lock);
499 * pgd_lock, which we have to take anyway for the split 467 base = alloc_pages(GFP_KERNEL, 0);
500 * operation: 468 if (!debug_pagealloc)
501 */ 469 spin_lock(&cpa_lock);
502 spin_lock_irqsave(&pgd_lock, flags); 470 if (!base)
503 if (list_empty(&page_pool)) { 471 return -ENOMEM;
504 spin_unlock_irqrestore(&pgd_lock, flags);
505 base = NULL;
506 cpa_fill_pool(&base);
507 if (!base)
508 return -ENOMEM;
509 spin_lock_irqsave(&pgd_lock, flags);
510 } else {
511 base = list_first_entry(&page_pool, struct page, lru);
512 list_del(&base->lru);
513 pool_pages--;
514
515 if (pool_pages < pool_low)
516 pool_low = pool_pages;
517 }
518 472
473 spin_lock_irqsave(&pgd_lock, flags);
519 /* 474 /*
520 * Check for races, another CPU might have split this page 475 * Check for races, another CPU might have split this page
521 * up for us already: 476 * up for us already:
@@ -572,11 +527,8 @@ out_unlock:
572 * If we dropped out via the lookup_address check under 527 * If we dropped out via the lookup_address check under
573 * pgd_lock then stick the page back into the pool: 528 * pgd_lock then stick the page back into the pool:
574 */ 529 */
575 if (base) { 530 if (base)
576 list_add(&base->lru, &page_pool); 531 __free_page(base);
577 pool_pages++;
578 } else
579 pool_used++;
580 spin_unlock_irqrestore(&pgd_lock, flags); 532 spin_unlock_irqrestore(&pgd_lock, flags);
581 533
582 return 0; 534 return 0;
@@ -584,11 +536,16 @@ out_unlock:
584 536
585static int __change_page_attr(struct cpa_data *cpa, int primary) 537static int __change_page_attr(struct cpa_data *cpa, int primary)
586{ 538{
587 unsigned long address = cpa->vaddr; 539 unsigned long address;
588 int do_split, err; 540 int do_split, err;
589 unsigned int level; 541 unsigned int level;
590 pte_t *kpte, old_pte; 542 pte_t *kpte, old_pte;
591 543
544 if (cpa->flags & CPA_ARRAY)
545 address = cpa->vaddr[cpa->curpage];
546 else
547 address = *cpa->vaddr;
548
592repeat: 549repeat:
593 kpte = lookup_address(address, &level); 550 kpte = lookup_address(address, &level);
594 if (!kpte) 551 if (!kpte)
@@ -600,7 +557,7 @@ repeat:
600 return 0; 557 return 0;
601 WARN(1, KERN_WARNING "CPA: called for zero pte. " 558 WARN(1, KERN_WARNING "CPA: called for zero pte. "
602 "vaddr = %lx cpa->vaddr = %lx\n", address, 559 "vaddr = %lx cpa->vaddr = %lx\n", address,
603 cpa->vaddr); 560 *cpa->vaddr);
604 return -EINVAL; 561 return -EINVAL;
605 } 562 }
606 563
@@ -626,7 +583,7 @@ repeat:
626 */ 583 */
627 if (pte_val(old_pte) != pte_val(new_pte)) { 584 if (pte_val(old_pte) != pte_val(new_pte)) {
628 set_pte_atomic(kpte, new_pte); 585 set_pte_atomic(kpte, new_pte);
629 cpa->flushtlb = 1; 586 cpa->flags |= CPA_FLUSHTLB;
630 } 587 }
631 cpa->numpages = 1; 588 cpa->numpages = 1;
632 return 0; 589 return 0;
@@ -650,7 +607,25 @@ repeat:
650 */ 607 */
651 err = split_large_page(kpte, address); 608 err = split_large_page(kpte, address);
652 if (!err) { 609 if (!err) {
653 cpa->flushtlb = 1; 610 /*
611 * Do a global flush tlb after splitting the large page
612 * and before we do the actual change page attribute in the PTE.
613 *
614 * Without this, we violate the TLB application note, which says
615 * "The TLBs may contain both ordinary and large-page
616 * translations for a 4-KByte range of linear addresses. This
617 * may occur if software modifies the paging structures so that
618 * the page size used for the address range changes. If the two
619 * translations differ with respect to page frame or attributes
620 * (e.g., permissions), processor behavior is undefined and may
621 * be implementation-specific."
622 *
623 * We do this global tlb flush inside the cpa_lock, so that we
624 * don't allow any other CPU with stale TLB entries to change the
625 * page attributes in parallel for an address that falls into the
626 * just-split large page entry.
627 */
628 flush_tlb_all();
654 goto repeat; 629 goto repeat;
655 } 630 }
656 631
@@ -663,6 +638,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
663{ 638{
664 struct cpa_data alias_cpa; 639 struct cpa_data alias_cpa;
665 int ret = 0; 640 int ret = 0;
641 unsigned long temp_cpa_vaddr, vaddr;
666 642
667 if (cpa->pfn >= max_pfn_mapped) 643 if (cpa->pfn >= max_pfn_mapped)
668 return 0; 644 return 0;
@@ -675,16 +651,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
675 * No need to redo, when the primary call touched the direct 651 * No need to redo, when the primary call touched the direct
676 * mapping already: 652 * mapping already:
677 */ 653 */
678 if (!(within(cpa->vaddr, PAGE_OFFSET, 654 if (cpa->flags & CPA_ARRAY)
655 vaddr = cpa->vaddr[cpa->curpage];
656 else
657 vaddr = *cpa->vaddr;
658
659 if (!(within(vaddr, PAGE_OFFSET,
679 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 660 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
680#ifdef CONFIG_X86_64 661#ifdef CONFIG_X86_64
681 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 662 || within(vaddr, PAGE_OFFSET + (1UL<<32),
682 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 663 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
683#endif 664#endif
684 )) { 665 )) {
685 666
686 alias_cpa = *cpa; 667 alias_cpa = *cpa;
687 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 668 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
669 alias_cpa.vaddr = &temp_cpa_vaddr;
670 alias_cpa.flags &= ~CPA_ARRAY;
671
688 672
689 ret = __change_page_attr_set_clr(&alias_cpa, 0); 673 ret = __change_page_attr_set_clr(&alias_cpa, 0);
690 } 674 }
@@ -696,7 +680,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
696 * No need to redo, when the primary call touched the high 680 * No need to redo, when the primary call touched the high
697 * mapping already: 681 * mapping already:
698 */ 682 */
699 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 683 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
700 return 0; 684 return 0;
701 685
702 /* 686 /*
@@ -707,8 +691,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
707 return 0; 691 return 0;
708 692
709 alias_cpa = *cpa; 693 alias_cpa = *cpa;
710 alias_cpa.vaddr = 694 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
711 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 695 alias_cpa.vaddr = &temp_cpa_vaddr;
696 alias_cpa.flags &= ~CPA_ARRAY;
712 697
713 /* 698 /*
714 * The high mapping range is imprecise, so ignore the return value. 699 * The high mapping range is imprecise, so ignore the return value.
@@ -728,8 +713,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
728 * preservation check. 713 * preservation check.
729 */ 714 */
730 cpa->numpages = numpages; 715 cpa->numpages = numpages;
716 /* for array changes, we can't use large page */
717 if (cpa->flags & CPA_ARRAY)
718 cpa->numpages = 1;
731 719
720 if (!debug_pagealloc)
721 spin_lock(&cpa_lock);
732 ret = __change_page_attr(cpa, checkalias); 722 ret = __change_page_attr(cpa, checkalias);
723 if (!debug_pagealloc)
724 spin_unlock(&cpa_lock);
733 if (ret) 725 if (ret)
734 return ret; 726 return ret;
735 727
@@ -746,7 +738,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
746 */ 738 */
747 BUG_ON(cpa->numpages > numpages); 739 BUG_ON(cpa->numpages > numpages);
748 numpages -= cpa->numpages; 740 numpages -= cpa->numpages;
749 cpa->vaddr += cpa->numpages * PAGE_SIZE; 741 if (cpa->flags & CPA_ARRAY)
742 cpa->curpage++;
743 else
744 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
745
750 } 746 }
751 return 0; 747 return 0;
752} 748}
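The curpage/vaddr bookkeeping above is easier to see in isolation. A minimal sketch, assuming a simplified cpa_data-like struct: the CPA_ARRAY semantics mirror the patch, but the field and flag names here are illustrative, not the kernel's.

#include <stdio.h>

#define PAGE_SZ 4096UL
#define F_ARRAY 1u

struct req {
        unsigned long *vaddr;   /* one address (via &tmp) or an array */
        int curpage;            /* index used only in array mode */
        unsigned flags;
};

static unsigned long current_addr(const struct req *r)
{
        return (r->flags & F_ARRAY) ? r->vaddr[r->curpage] : *r->vaddr;
}

static void advance(struct req *r, int done_pages)
{
        if (r->flags & F_ARRAY)
                r->curpage++;                       /* next entry, one page each */
        else
                *r->vaddr += done_pages * PAGE_SZ;  /* slide the single range */
}

int main(void)
{
        unsigned long one = 0x1000;
        unsigned long many[] = { 0xa000, 0xc000 };
        struct req r1 = { &one, 0, 0 };
        struct req r2 = { many, 0, F_ARRAY };

        advance(&r1, 2);
        advance(&r2, 1);
        printf("%lx %lx\n", current_addr(&r1), current_addr(&r2)); /* 3000 c000 */
        return 0;
}

Storing a pointer in both modes is what lets a single address live in a stack temporary (the temp_cpa_vaddr pattern seen earlier) while arrays pass through unchanged.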
@@ -757,9 +753,9 @@ static inline int cache_attr(pgprot_t attr)
757 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 753 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
758} 754}
759 755
760static int change_page_attr_set_clr(unsigned long addr, int numpages, 756static int change_page_attr_set_clr(unsigned long *addr, int numpages,
761 pgprot_t mask_set, pgprot_t mask_clr, 757 pgprot_t mask_set, pgprot_t mask_clr,
762 int force_split) 758 int force_split, int array)
763{ 759{
764 struct cpa_data cpa; 760 struct cpa_data cpa;
765 int ret, cache, checkalias; 761 int ret, cache, checkalias;
@@ -774,21 +770,40 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
774 return 0; 770 return 0;
775 771
776 /* Ensure we are PAGE_SIZE aligned */ 772 /* Ensure we are PAGE_SIZE aligned */
777 if (addr & ~PAGE_MASK) { 773 if (!array) {
778 addr &= PAGE_MASK; 774 if (*addr & ~PAGE_MASK) {
779 /* 775 *addr &= PAGE_MASK;
780 * People should not be passing in unaligned addresses: 776 /*
781 */ 777 * People should not be passing in unaligned addresses:
782 WARN_ON_ONCE(1); 778 */
779 WARN_ON_ONCE(1);
780 }
781 } else {
782 int i;
783 for (i = 0; i < numpages; i++) {
784 if (addr[i] & ~PAGE_MASK) {
785 addr[i] &= PAGE_MASK;
786 WARN_ON_ONCE(1);
787 }
788 }
783 } 789 }
784 790
791 /* Must avoid aliasing mappings in the highmem code */
792 kmap_flush_unused();
793
794 vm_unmap_aliases();
795
785 cpa.vaddr = addr; 796 cpa.vaddr = addr;
786 cpa.numpages = numpages; 797 cpa.numpages = numpages;
787 cpa.mask_set = mask_set; 798 cpa.mask_set = mask_set;
788 cpa.mask_clr = mask_clr; 799 cpa.mask_clr = mask_clr;
789 cpa.flushtlb = 0; 800 cpa.flags = 0;
801 cpa.curpage = 0;
790 cpa.force_split = force_split; 802 cpa.force_split = force_split;
791 803
804 if (array)
805 cpa.flags |= CPA_ARRAY;
806
792 /* No alias checking for _NX bit modifications */ 807 /* No alias checking for _NX bit modifications */
793 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 808 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
794 809
@@ -797,7 +812,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
797 /* 812 /*
798 * Check whether we really changed something: 813 * Check whether we really changed something:
799 */ 814 */
800 if (!cpa.flushtlb) 815 if (!(cpa.flags & CPA_FLUSHTLB))
801 goto out; 816 goto out;
802 817
803 /* 818 /*
@@ -812,27 +827,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
812 * error case we fall back to cpa_flush_all (which uses 827 * error case we fall back to cpa_flush_all (which uses
813 * wbinvd): 828 * wbinvd):
814 */ 829 */
815 if (!ret && cpu_has_clflush) 830 if (!ret && cpu_has_clflush) {
816 cpa_flush_range(addr, numpages, cache); 831 if (cpa.flags & CPA_ARRAY)
817 else 832 cpa_flush_array(addr, numpages, cache);
833 else
834 cpa_flush_range(*addr, numpages, cache);
835 } else
818 cpa_flush_all(cache); 836 cpa_flush_all(cache);
819 837
820out: 838out:
821 cpa_fill_pool(NULL);
822
823 return ret; 839 return ret;
824} 840}
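The flush selection above reduces to a small decision tree: fine-grained clflush per listed page (array) or per linear range when the CPU supports it and the change succeeded, otherwise a cache-flushing fallback. A sketch with stub helpers standing in for cpa_flush_array()/cpa_flush_range()/cpa_flush_all():

#include <stdio.h>

static void flush_array(int n) { printf("clflush %d scattered pages\n", n); }
static void flush_range(int n) { printf("clflush %d contiguous pages\n", n); }
static void flush_all(void)    { printf("wbinvd on every cpu\n"); }

static void flush(int ok, int has_clflush, int is_array, int npages)
{
        if (ok && has_clflush) {
                if (is_array)
                        flush_array(npages);   /* one clflush loop per page */
                else
                        flush_range(npages);   /* clflush one linear range */
        } else {
                flush_all();                   /* error or no clflush */
        }
}

int main(void)
{
        flush(1, 1, 1, 8);
        flush(0, 1, 0, 8);
        return 0;
}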
825 841
826static inline int change_page_attr_set(unsigned long addr, int numpages, 842static inline int change_page_attr_set(unsigned long *addr, int numpages,
827 pgprot_t mask) 843 pgprot_t mask, int array)
828{ 844{
829 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 845 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
846 array);
830} 847}
831 848
832static inline int change_page_attr_clear(unsigned long addr, int numpages, 849static inline int change_page_attr_clear(unsigned long *addr, int numpages,
833 pgprot_t mask) 850 pgprot_t mask, int array)
834{ 851{
835 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 852 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
853 array);
836} 854}
837 855
838int _set_memory_uc(unsigned long addr, int numpages) 856int _set_memory_uc(unsigned long addr, int numpages)
@@ -840,8 +858,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
840 /* 858 /*
841 * for now UC MINUS. see comments in ioremap_nocache() 859 * for now UC MINUS. see comments in ioremap_nocache()
842 */ 860 */
843 return change_page_attr_set(addr, numpages, 861 return change_page_attr_set(&addr, numpages,
844 __pgprot(_PAGE_CACHE_UC_MINUS)); 862 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
845} 863}
846 864
847int set_memory_uc(unsigned long addr, int numpages) 865int set_memory_uc(unsigned long addr, int numpages)
@@ -857,10 +875,48 @@ int set_memory_uc(unsigned long addr, int numpages)
857} 875}
858EXPORT_SYMBOL(set_memory_uc); 876EXPORT_SYMBOL(set_memory_uc);
859 877
878int set_memory_array_uc(unsigned long *addr, int addrinarray)
879{
880 unsigned long start;
881 unsigned long end;
882 int i;
883 /*
884 * for now UC MINUS. see comments in ioremap_nocache()
885 */
886 for (i = 0; i < addrinarray; i++) {
887 start = __pa(addr[i]);
888 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
889 if (end != __pa(addr[i + 1]))
890 break;
891 i++;
892 }
893 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
894 goto out;
895 }
896
897 return change_page_attr_set(addr, addrinarray,
898 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
899out:
900 for (i = 0; i < addrinarray; i++) {
901 unsigned long tmp = __pa(addr[i]);
902
903 if (tmp == start)
904 break;
905 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
906 if (end != __pa(addr[i + 1]))
907 break;
908 i++;
909 }
910 free_memtype(tmp, end);
911 }
912 return -EINVAL;
913}
914EXPORT_SYMBOL(set_memory_array_uc);
915
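The inner loop of set_memory_array_uc() coalesces the page list into maximal physically contiguous runs, so reserve_memtype() is called once per run rather than once per page. A standalone sketch of that coalescing, with phys() as a toy identity stand-in for __pa():

#include <stdio.h>

#define PAGE_SZ 4096UL

/* toy identity "translation" for demonstration only */
static unsigned long phys(unsigned long vaddr) { return vaddr; }

static void for_each_run(unsigned long *addr, int n,
                         void (*fn)(unsigned long start, unsigned long end))
{
        int i;

        for (i = 0; i < n; i++) {
                unsigned long start = phys(addr[i]);
                unsigned long end = start + PAGE_SZ;

                /* extend the run while the next entry is physically adjacent */
                while (i < n - 1 && end == phys(addr[i + 1])) {
                        end += PAGE_SZ;
                        i++;
                }
                fn(start, end);  /* one reserve/free_memtype() per run */
        }
}

static void show(unsigned long s, unsigned long e)
{
        printf("run: 0x%lx-0x%lx\n", s, e);
}

int main(void)
{
        unsigned long pages[] = { 0x1000, 0x2000, 0x3000, 0x8000, 0x9000 };
        for_each_run(pages, 5, show);   /* prints two runs */
        return 0;
}

set_memory_array_wb() further down reuses the same run-building loop for free_memtype(), so reservations and frees cover identical ranges.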
860int _set_memory_wc(unsigned long addr, int numpages) 916int _set_memory_wc(unsigned long addr, int numpages)
861{ 917{
862 return change_page_attr_set(addr, numpages, 918 return change_page_attr_set(&addr, numpages,
863 __pgprot(_PAGE_CACHE_WC)); 919 __pgprot(_PAGE_CACHE_WC), 0);
864} 920}
865 921
866int set_memory_wc(unsigned long addr, int numpages) 922int set_memory_wc(unsigned long addr, int numpages)
@@ -878,8 +934,8 @@ EXPORT_SYMBOL(set_memory_wc);
878 934
879int _set_memory_wb(unsigned long addr, int numpages) 935int _set_memory_wb(unsigned long addr, int numpages)
880{ 936{
881 return change_page_attr_clear(addr, numpages, 937 return change_page_attr_clear(&addr, numpages,
882 __pgprot(_PAGE_CACHE_MASK)); 938 __pgprot(_PAGE_CACHE_MASK), 0);
883} 939}
884 940
885int set_memory_wb(unsigned long addr, int numpages) 941int set_memory_wb(unsigned long addr, int numpages)
@@ -890,37 +946,59 @@ int set_memory_wb(unsigned long addr, int numpages)
890} 946}
891EXPORT_SYMBOL(set_memory_wb); 947EXPORT_SYMBOL(set_memory_wb);
892 948
949int set_memory_array_wb(unsigned long *addr, int addrinarray)
950{
951 int i;
952
953 for (i = 0; i < addrinarray; i++) {
954 unsigned long start = __pa(addr[i]);
955 unsigned long end;
956
957 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
958 if (end != __pa(addr[i + 1]))
959 break;
960 i++;
961 }
962 free_memtype(start, end);
963 }
964 return change_page_attr_clear(addr, addrinarray,
965 __pgprot(_PAGE_CACHE_MASK), 1);
966}
967EXPORT_SYMBOL(set_memory_array_wb);
968
893int set_memory_x(unsigned long addr, int numpages) 969int set_memory_x(unsigned long addr, int numpages)
894{ 970{
895 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 971 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
896} 972}
897EXPORT_SYMBOL(set_memory_x); 973EXPORT_SYMBOL(set_memory_x);
898 974
899int set_memory_nx(unsigned long addr, int numpages) 975int set_memory_nx(unsigned long addr, int numpages)
900{ 976{
901 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 977 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
902} 978}
903EXPORT_SYMBOL(set_memory_nx); 979EXPORT_SYMBOL(set_memory_nx);
904 980
905int set_memory_ro(unsigned long addr, int numpages) 981int set_memory_ro(unsigned long addr, int numpages)
906{ 982{
907 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 983 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 984}
985EXPORT_SYMBOL_GPL(set_memory_ro);
909 986
910int set_memory_rw(unsigned long addr, int numpages) 987int set_memory_rw(unsigned long addr, int numpages)
911{ 988{
912 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 989 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
913} 990}
991EXPORT_SYMBOL_GPL(set_memory_rw);
914 992
915int set_memory_np(unsigned long addr, int numpages) 993int set_memory_np(unsigned long addr, int numpages)
916{ 994{
917 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 995 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
918} 996}
919 997
920int set_memory_4k(unsigned long addr, int numpages) 998int set_memory_4k(unsigned long addr, int numpages)
921{ 999{
922 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 1000 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
923 __pgprot(0), 1); 1001 __pgprot(0), 1, 0);
924} 1002}
925 1003
926int set_pages_uc(struct page *page, int numpages) 1004int set_pages_uc(struct page *page, int numpages)
@@ -973,22 +1051,38 @@ int set_pages_rw(struct page *page, int numpages)
973 1051
974static int __set_pages_p(struct page *page, int numpages) 1052static int __set_pages_p(struct page *page, int numpages)
975{ 1053{
976 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1054 unsigned long tempaddr = (unsigned long) page_address(page);
1055 struct cpa_data cpa = { .vaddr = &tempaddr,
977 .numpages = numpages, 1056 .numpages = numpages,
978 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1057 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
979 .mask_clr = __pgprot(0)}; 1058 .mask_clr = __pgprot(0),
1059 .flags = 0};
980 1060
981 return __change_page_attr_set_clr(&cpa, 1); 1061 /*
1062 * No alias checking needed for setting the present flag; otherwise,
1063 * we may need to break large pages for 64-bit kernel text
1064 * mappings (this adds to complexity if we want to do this from
1065 * atomic context especially). Let's keep it simple!
1066 */
1067 return __change_page_attr_set_clr(&cpa, 0);
982} 1068}
983 1069
984static int __set_pages_np(struct page *page, int numpages) 1070static int __set_pages_np(struct page *page, int numpages)
985{ 1071{
986 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1072 unsigned long tempaddr = (unsigned long) page_address(page);
1073 struct cpa_data cpa = { .vaddr = &tempaddr,
987 .numpages = numpages, 1074 .numpages = numpages,
988 .mask_set = __pgprot(0), 1075 .mask_set = __pgprot(0),
989 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1076 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1077 .flags = 0};
990 1078
991 return __change_page_attr_set_clr(&cpa, 1); 1079 /*
1080 * No alias checking needed for setting not present flag. otherwise,
1081 * we may need to break large pages for 64-bit kernel text
1082 * mappings (this adds to complexity if we want to do this from
1083 * atomic context especially). Let's keep it simple!
1084 */
1085 return __change_page_attr_set_clr(&cpa, 0);
992} 1086}
993 1087
994void kernel_map_pages(struct page *page, int numpages, int enable) 1088void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1008,11 +1102,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1008 1102
1009 /* 1103 /*
1010 * The return value is ignored as the calls cannot fail. 1104 * The return value is ignored as the calls cannot fail.
1011 * Large pages are kept enabled at boot time, and are 1105 * Large pages for identity mappings are not used at boot time
1012 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1106 * and hence no memory allocations during large page split.
1013 * fails here (due to temporary memory shortage) no damage
1014 * is done because we just keep the largepage intact up
1015 * to the next attempt when it will likely be split up:
1016 */ 1107 */
1017 if (enable) 1108 if (enable)
1018 __set_pages_p(page, numpages); 1109 __set_pages_p(page, numpages);
@@ -1024,53 +1115,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1024 * but that can deadlock->flush only current cpu: 1115 * but that can deadlock->flush only current cpu:
1025 */ 1116 */
1026 __flush_tlb_all(); 1117 __flush_tlb_all();
1027
1028 /*
1029 * Try to refill the page pool here. We can do this only after
1030 * the tlb flush.
1031 */
1032 cpa_fill_pool(NULL);
1033} 1118}
1034 1119
1035#ifdef CONFIG_DEBUG_FS
1036static int dpa_show(struct seq_file *m, void *v)
1037{
1038 seq_puts(m, "DEBUG_PAGEALLOC\n");
1039 seq_printf(m, "pool_size : %lu\n", pool_size);
1040 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1041 seq_printf(m, "pool_low : %lu\n", pool_low);
1042 seq_printf(m, "pool_used : %lu\n", pool_used);
1043 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1044
1045 return 0;
1046}
1047
1048static int dpa_open(struct inode *inode, struct file *filp)
1049{
1050 return single_open(filp, dpa_show, NULL);
1051}
1052
1053static const struct file_operations dpa_fops = {
1054 .open = dpa_open,
1055 .read = seq_read,
1056 .llseek = seq_lseek,
1057 .release = single_release,
1058};
1059
1060static int __init debug_pagealloc_proc_init(void)
1061{
1062 struct dentry *de;
1063
1064 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1065 &dpa_fops);
1066 if (!de)
1067 return -ENOMEM;
1068
1069 return 0;
1070}
1071__initcall(debug_pagealloc_proc_init);
1072#endif
1073
1074#ifdef CONFIG_HIBERNATION 1120#ifdef CONFIG_HIBERNATION
1075 1121
1076bool kernel_page_present(struct page *page) 1122bool kernel_page_present(struct page *page)
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index 2a50e0fa64a..738fd0f2495 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,24 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15#include <linux/debugfs.h>
16#include <linux/seq_file.h>
17 17
18#include <asm/msr.h> 18#include <asm/cacheflush.h>
19#include <asm/tlbflush.h>
20#include <asm/processor.h> 19#include <asm/processor.h>
21#include <asm/page.h> 20#include <asm/tlbflush.h>
22#include <asm/pgtable.h> 21#include <asm/pgtable.h>
23#include <asm/pat.h>
24#include <asm/e820.h>
25#include <asm/cacheflush.h>
26#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
27#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
28#include <asm/io.h> 28#include <asm/io.h>
29 29
30#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -46,6 +46,7 @@ early_param("nopat", nopat);
46 46
47 47
48static int debug_enable; 48static int debug_enable;
49
49static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
50{ 51{
51 debug_enable = 1; 52 debug_enable = 1;
@@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags)
145 */ 146 */
146 147
147struct memtype { 148struct memtype {
148 u64 start; 149 u64 start;
149 u64 end; 150 u64 end;
150 unsigned long type; 151 unsigned long type;
151 struct list_head nd; 152 struct list_head nd;
152}; 153};
153 154
154static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
155static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
156 157
157/* 158/*
158 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
180 return req_type; 181 return req_type;
181} 182}
182 183
183static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
184 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
185{ 186{
186 if (new->type != entry->type) { 187 if (new->type != entry->type) {
187 if (type) { 188 if (type) {
@@ -211,6 +212,66 @@ static struct memtype *cached_entry;
211static u64 cached_start; 212static u64 cached_start;
212 213
213/* 214/*
215 * For RAM pages, mark the pages as non-WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will own the page and be
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non-WB. In the future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
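reserve_ram_pages_type() follows a mark-or-roll-back pattern: flag pages one by one, and on the first conflict unwind exactly the pages already flagged. A userspace sketch using plain arrays in place of page_mapped() and the PageNonWB page flag:

#include <stdbool.h>
#include <stdio.h>

#define NPAGES 16
static bool non_wb[NPAGES];   /* stands in for PageNonWB(page) */
static bool mapped[NPAGES];   /* stands in for page_mapped(page) */

static int reserve_range(unsigned first, unsigned last) /* [first, last) */
{
        unsigned pfn;

        for (pfn = first; pfn < last; pfn++) {
                /* refuse pages already mapped or already tracked non-WB */
                if (mapped[pfn] || non_wb[pfn])
                        goto rollback;
                non_wb[pfn] = true;
        }
        return 0;

rollback:
        /* undo exactly the pages we marked before the failure */
        while (pfn-- > first)
                non_wb[pfn] = false;
        return -1;
}

int main(void)
{
        mapped[3] = true;
        printf("%d\n", reserve_range(0, 8));  /* -1: pfn 3 is mapped */
        printf("%d\n", non_wb[2]);            /* 0: rollback cleared it */
        return 0;
}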
274/*
214 * req_type typically has one of the: 275 * req_type typically has one of the:
215 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
216 * - _PAGE_CACHE_WC 277 * - _PAGE_CACHE_WC
@@ -226,14 +287,15 @@ static u64 cached_start;
226 * it will return a negative return value. 287 * it will return a negative return value.
227 */ 288 */
228int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
229 unsigned long *new_type) 290 unsigned long *new_type)
230{ 291{
231 struct memtype *new, *entry; 292 struct memtype *new, *entry;
232 unsigned long actual_type; 293 unsigned long actual_type;
233 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
234 int err = 0; 296 int err = 0;
235 297
236 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
237 299
238 if (!pat_enabled) { 300 if (!pat_enabled) {
239 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -266,17 +328,24 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
266 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
267 else 329 else
268 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
269 } else 331 } else {
270 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
271 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
272 341
273 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
274 if (!new) 343 if (!new)
275 return -ENOMEM; 344 return -ENOMEM;
276 345
277 new->start = start; 346 new->start = start;
278 new->end = end; 347 new->end = end;
279 new->type = actual_type; 348 new->type = actual_type;
280 349
281 if (new_type) 350 if (new_type)
282 *new_type = actual_type; 351 *new_type = actual_type;
@@ -335,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
335 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
336 kfree(new); 405 kfree(new);
337 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
338 return err; 408 return err;
339 } 409 }
340 410
@@ -358,6 +428,7 @@ int free_memtype(u64 start, u64 end)
358{ 428{
359 struct memtype *entry; 429 struct memtype *entry;
360 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
361 432
362 if (!pat_enabled) 433 if (!pat_enabled)
363 return 0; 434 return 0;
@@ -366,6 +437,12 @@ int free_memtype(u64 start, u64 end)
366 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
367 return 0; 438 return 0;
368 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
369 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
370 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
371 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
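Both reserve_memtype() and free_memtype() now dispatch on what kind of range they were handed. A sketch of the three-way split, with classify() as a toy stand-in for pagerange_is_ram() (1 = all RAM, 0 = no RAM, negative = mixed):

#include <stdio.h>

static int classify(unsigned long start, unsigned long end)
{
        /* toy rule for the demo: everything below 1 MiB counts as RAM */
        int lo_ram = start < 0x100000;
        int hi_ram = end <= 0x100000;

        if (lo_ram && hi_ram)
                return 1;
        if (!lo_ram && !hi_ram)
                return 0;
        return -1;           /* straddles the boundary: mixed */
}

static int reserve(unsigned long start, unsigned long end)
{
        int is_ram = classify(start, end);

        if (is_ram == 1)
                return 0;    /* per-page path: reserve_ram_pages_type() */
        if (is_ram < 0)
                return -1;   /* mixed ranges are rejected outright */
        return 0;            /* non-RAM: fall through to the memtype list */
}

int main(void)
{
        printf("%d %d %d\n",
               reserve(0x0, 0x1000),          /* RAM page */
               reserve(0x200000, 0x201000),   /* non-RAM */
               reserve(0xff000, 0x101000));   /* mixed -> -1 */
        return 0;
}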
@@ -386,6 +463,7 @@ int free_memtype(u64 start, u64 end)
386 } 463 }
387 464
388 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
389 return err; 467 return err;
390} 468}
391 469
@@ -492,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
492 570
493void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
494{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
495 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
496 unsigned long flags; 575 unsigned long flags;
497 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
498 576
499 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
500 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -514,7 +592,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
514 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
515} 593}
516 594
517#if defined(CONFIG_DEBUG_FS) 595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
518 596
519/* get Nth element of the linked list */ 597/* get Nth element of the linked list */
520static struct memtype *memtype_get_idx(loff_t pos) 598static struct memtype *memtype_get_idx(loff_t pos)
@@ -537,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos)
537 } 615 }
538 spin_unlock(&memtype_lock); 616 spin_unlock(&memtype_lock);
539 kfree(print_entry); 617 kfree(print_entry);
618
540 return NULL; 619 return NULL;
541} 620}
542 621
@@ -567,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v)
567 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), 646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
568 print_entry->start, print_entry->end); 647 print_entry->start, print_entry->end);
569 kfree(print_entry); 648 kfree(print_entry);
649
570 return 0; 650 return 0;
571} 651}
572 652
@@ -598,4 +678,4 @@ static int __init pat_memtype_list_init(void)
598 678
599late_initcall(pat_memtype_list_init); 679late_initcall(pat_memtype_list_init);
600 680
601#endif /* CONFIG_DEBUG_FS */ 681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20c..df3d5c861cd 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, 82struct prefix_bits {
83 int *rexr) 83 unsigned shorted:1;
84 unsigned enlarged:1;
85 unsigned rexr:1;
86 unsigned rex:1;
87};
88
89static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
84{ 90{
85 int i; 91 int i;
86 unsigned char *p = addr; 92 unsigned char *p = addr;
87 *shorted = 0; 93 prf->shorted = 0;
88 *enlarged = 0; 94 prf->enlarged = 0;
89 *rexr = 0; 95 prf->rexr = 0;
96 prf->rex = 0;
90 97
91restart: 98restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { 99 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) { 100 if (*p == prefix_codes[i]) {
94 if (*p == 0x66) 101 if (*p == 0x66)
95 *shorted = 1; 102 prf->shorted = 1;
96#ifdef __amd64__ 103#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48) 104 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1; 105 prf->enlarged = 1;
99 if ((*p & 0xf4) == 0x44) 106 if ((*p & 0xf4) == 0x44)
100 *rexr = 1; 107 prf->rexr = 1;
108 if ((*p & 0xf0) == 0x40)
109 prf->rex = 1;
101#endif 110#endif
102 p++; 111 p++;
103 goto restart; 112 goto restart;
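The reworked skip_prefix() returns all decoded prefix facts in one struct instead of three out-parameters. A simplified, runnable version that handles only the 0x66 operand-size prefix and the REX family (the kernel's version also walks its full prefix_codes table):

#include <stdio.h>

struct prefix_bits {
        unsigned shorted:1;     /* 0x66: 16-bit operand size */
        unsigned enlarged:1;    /* REX.W: 64-bit operand size */
        unsigned rexr:1;        /* REX.R: extends ModRM.reg */
        unsigned rex:1;         /* any REX prefix seen */
};

static int skip_prefix(const unsigned char *addr, struct prefix_bits *prf)
{
        const unsigned char *p = addr;

        prf->shorted = prf->enlarged = prf->rexr = prf->rex = 0;

        for (;;) {
                unsigned char c = *p;

                if (c == 0x66) {
                        prf->shorted = 1;
                } else if ((c & 0xf0) == 0x40) {  /* REX: 64-bit mode only */
                        prf->rex = 1;
                        if (c & 0x08)             /* REX.W */
                                prf->enlarged = 1;
                        if (c & 0x04)             /* REX.R */
                                prf->rexr = 1;
                } else {
                        break;                    /* first non-prefix byte */
                }
                p++;
        }
        return (int)(p - addr);                   /* bytes consumed */
}

int main(void)
{
        /* 0x66, then REX.W (0x48), then opcode 0x89 */
        unsigned char ins[] = { 0x66, 0x48, 0x89, 0x00 };
        struct prefix_bits prf;
        int n = skip_prefix(ins, &prf);

        printf("consumed=%d shorted=%d enlarged=%d rexr=%d rex=%d\n",
               n, prf.shorted, prf.enlarged, prf.rexr, prf.rex);
        return 0;
}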
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
135{ 144{
136 unsigned int opcode; 145 unsigned int opcode;
137 unsigned char *p; 146 unsigned char *p;
138 int shorted, enlarged, rexr; 147 struct prefix_bits prf;
139 int i; 148 int i;
140 enum reason_type rv = OTHERS; 149 enum reason_type rv = OTHERS;
141 150
142 p = (unsigned char *)ins_addr; 151 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr); 152 p += skip_prefix(p, &prf);
144 p += get_opcode(p, &opcode); 153 p += get_opcode(p, &opcode);
145 154
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ); 155 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{ 165{
157 unsigned int opcode; 166 unsigned int opcode;
158 unsigned char *p; 167 unsigned char *p;
159 int i, shorted, enlarged, rexr; 168 struct prefix_bits prf;
169 int i;
160 170
161 p = (unsigned char *)ins_addr; 171 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr); 172 p += skip_prefix(p, &prf);
163 p += get_opcode(p, &opcode); 173 p += get_opcode(p, &opcode);
164 174
165 for (i = 0; i < ARRAY_SIZE(rw8); i++) 175 for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
168 178
169 for (i = 0; i < ARRAY_SIZE(rw32); i++) 179 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode) 180 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4)); 181 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
172 182
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 183 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0; 184 return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
178{ 188{
179 unsigned int opcode; 189 unsigned int opcode;
180 unsigned char *p; 190 unsigned char *p;
181 int i, shorted, enlarged, rexr; 191 struct prefix_bits prf;
192 int i;
182 193
183 p = (unsigned char *)ins_addr; 194 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr); 195 p += skip_prefix(p, &prf);
185 p += get_opcode(p, &opcode); 196 p += get_opcode(p, &opcode);
186 197
187 for (i = 0; i < ARRAY_SIZE(mw8); i++) 198 for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
194 205
195 for (i = 0; i < ARRAY_SIZE(mw32); i++) 206 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode) 207 if (mw32[i] == opcode)
197 return shorted ? 2 : 4; 208 return prf.shorted ? 2 : 4;
198 209
199 for (i = 0; i < ARRAY_SIZE(mw64); i++) 210 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode) 211 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4); 212 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
202 213
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 214 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0; 215 return 0;
@@ -238,7 +249,7 @@ enum {
238#endif 249#endif
239}; 250};
240 251
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs) 252static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
242{ 253{
243 unsigned char *rv = NULL; 254 unsigned char *rv = NULL;
244 255
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
255 case arg_DL: 266 case arg_DL:
256 rv = (unsigned char *)&regs->dx; 267 rv = (unsigned char *)&regs->dx;
257 break; 268 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__ 269#ifdef __amd64__
271 case arg_R8: 270 case arg_R8:
272 rv = (unsigned char *)&regs->r8; 271 rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
294 break; 293 break;
295#endif 294#endif
296 default: 295 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break; 296 break;
299 } 297 }
298
299 if (rv)
300 return rv;
301
302 if (rex) {
303 /*
304 * If REX prefix exists, access low bytes of SI etc.
305 * instead of AH etc.
306 */
307 switch (no) {
308 case arg_SI:
309 rv = (unsigned char *)&regs->si;
310 break;
311 case arg_DI:
312 rv = (unsigned char *)&regs->di;
313 break;
314 case arg_BP:
315 rv = (unsigned char *)&regs->bp;
316 break;
317 case arg_SP:
318 rv = (unsigned char *)&regs->sp;
319 break;
320 default:
321 break;
322 }
323 } else {
324 switch (no) {
325 case arg_AH:
326 rv = 1 + (unsigned char *)&regs->ax;
327 break;
328 case arg_BH:
329 rv = 1 + (unsigned char *)&regs->bx;
330 break;
331 case arg_CH:
332 rv = 1 + (unsigned char *)&regs->cx;
333 break;
334 case arg_DH:
335 rv = 1 + (unsigned char *)&regs->dx;
336 break;
337 default:
338 break;
339 }
340 }
341
342 if (!rv)
343 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
344
300 return rv; 345 return rv;
301} 346}
302 347
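The REX special case above exists because x86-64 reuses byte-register encodings 4-7: without a REX prefix they name AH/CH/DH/BH, with any REX prefix they name SPL/BPL/SIL/DIL. A small table-lookup sketch of that mapping:

#include <stdio.h>

static const char *reg8_name(int no, int rex)
{
        static const char *low[]   = { "al", "cl", "dl", "bl" };
        static const char *high[]  = { "ah", "ch", "dh", "bh" };
        static const char *rexed[] = { "spl", "bpl", "sil", "dil" };

        if (no < 4)
                return low[no];            /* same with or without REX */
        if (no < 8)
                return rex ? rexed[no - 4] : high[no - 4];
        return "r8b..r15b";                /* REX-extended registers */
}

int main(void)
{
        printf("%s %s\n", reg8_name(4, 0), reg8_name(4, 1)); /* ah spl */
        return 0;
}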
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
368 unsigned char mod_rm; 413 unsigned char mod_rm;
369 int reg; 414 int reg;
370 unsigned char *p; 415 unsigned char *p;
371 int i, shorted, enlarged, rexr; 416 struct prefix_bits prf;
417 int i;
372 unsigned long rv; 418 unsigned long rv;
373 419
374 p = (unsigned char *)ins_addr; 420 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr); 421 p += skip_prefix(p, &prf);
376 p += get_opcode(p, &opcode); 422 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 423 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) { 424 if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
392 438
393do_work: 439do_work:
394 mod_rm = *p; 440 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); 441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) { 442 switch (get_ins_reg_width(ins_addr)) {
397 case 1: 443 case 1:
398 return *get_reg_w8(reg, regs); 444 return *get_reg_w8(reg, prf.rex, regs);
399 445
400 case 2: 446 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs); 447 return *(unsigned short *)get_reg_w32(reg, regs);
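The reg computation above decodes the ModRM byte: reg occupies bits 5:3, and REX.R contributes its fourth bit. For example:

#include <stdio.h>

int main(void)
{
        unsigned char mod_rm = 0xF8;              /* mod=11 reg=111 rm=000 */
        int rexr = 1;
        int reg = ((mod_rm >> 3) & 0x7) | (rexr << 3);

        printf("reg=%d\n", reg);                  /* 15: r15 with REX.R set */
        return 0;
}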
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
422 unsigned char mod_rm; 468 unsigned char mod_rm;
423 unsigned char mod; 469 unsigned char mod;
424 unsigned char *p; 470 unsigned char *p;
425 int i, shorted, enlarged, rexr; 471 struct prefix_bits prf;
472 int i;
426 unsigned long rv; 473 unsigned long rv;
427 474
428 p = (unsigned char *)ins_addr; 475 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr); 476 p += skip_prefix(p, &prf);
430 p += get_opcode(p, &opcode); 477 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 478 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) { 479 if (imm_wop[i] == opcode) {
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d50302774fe..86f2ffc43c3 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
63#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
65 65
66static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
67{ 67{
68 pgd_t *pgd = p;
69
70 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
71 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
72 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
87 pgd_list_add(pgd); 85 pgd_list_add(pgd);
88} 86}
89 87
90static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
91{ 89{
92 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
93 91
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index cab0abbd1eb..0951db9ee51 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
123 if (!arg) 123 if (!arg)
124 return -EINVAL; 124 return -EINVAL;
125 125
126 __VMALLOC_RESERVE = memparse(arg, &arg); 126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
127 return 0; 128 return 0;
128} 129}
129early_param("vmalloc", parse_vmalloc); 130early_param("vmalloc", parse_vmalloc);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea..51c0a2fc14f 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
138 return; 138 return;
139 } 139 }
140 140
141 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 else 143 else
144 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423e..ab50a8d7402 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
6#include <linux/mmiotrace.h>
6 7
7#define MODULE_NAME "testmmiotrace" 8#define MODULE_NAME "testmmiotrace"
8 9
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
13static void do_write_test(void __iomem *p) 14static void do_write_test(void __iomem *p)
14{ 15{
15 unsigned int i; 16 unsigned int i;
17 mmiotrace_printk("Write test.\n");
16 for (i = 0; i < 256; i++) 18 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i); 19 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2) 20 for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
24static void do_read_test(void __iomem *p) 26static void do_read_test(void __iomem *p)
25{ 27{
26 unsigned int i; 28 unsigned int i;
29 mmiotrace_printk("Read test.\n");
27 for (i = 0; i < 256; i++) 30 for (i = 0; i < 256; i++)
28 ioread8(p + i); 31 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2) 32 for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return; 43 return;
41 } 44 }
45 mmiotrace_printk("ioremap returned %p.\n", p);
42 do_write_test(p); 46 do_write_test(p);
43 do_read_test(p); 47 do_read_test(p);
44 iounmap(p); 48 iounmap(p);
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 30f3eb36666..446902b2a6b 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
7 timer_int.o ) 7 timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ 10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
11 op_model_ppro.o op_model_p4.o 11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o 12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index e2095cba409..04df67f8a7b 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -52,8 +52,7 @@ struct frame_head {
52 unsigned long ret; 52 unsigned long ret;
53} __attribute__((packed)); 53} __attribute__((packed));
54 54
55static struct frame_head * 55static struct frame_head *dump_user_backtrace(struct frame_head *head)
56dump_user_backtrace(struct frame_head * head)
57{ 56{
58 struct frame_head bufhead[2]; 57 struct frame_head bufhead[2];
59 58
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 0227694f7da..022cd41ea9b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,10 +1,11 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002-2008 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com>
8 */ 9 */
9 10
10#include <linux/init.h> 11#include <linux/init.h>
@@ -27,85 +28,9 @@ static struct op_x86_model_spec const *model;
27static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 28static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
28static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 29static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
29 30
30static int nmi_start(void);
31static void nmi_stop(void);
32static void nmi_cpu_start(void *dummy);
33static void nmi_cpu_stop(void *dummy);
34
35/* 0 == registered but off, 1 == registered and on */ 31/* 0 == registered but off, 1 == registered and on */
36static int nmi_enabled = 0; 32static int nmi_enabled = 0;
37 33
38#ifdef CONFIG_SMP
39static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
40 void *data)
41{
42 int cpu = (unsigned long)data;
43 switch (action) {
44 case CPU_DOWN_FAILED:
45 case CPU_ONLINE:
46 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
47 break;
48 case CPU_DOWN_PREPARE:
49 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
50 break;
51 }
52 return NOTIFY_DONE;
53}
54
55static struct notifier_block oprofile_cpu_nb = {
56 .notifier_call = oprofile_cpu_notifier
57};
58#endif
59
60#ifdef CONFIG_PM
61
62static int nmi_suspend(struct sys_device *dev, pm_message_t state)
63{
64 /* Only one CPU left, just stop that one */
65 if (nmi_enabled == 1)
66 nmi_cpu_stop(NULL);
67 return 0;
68}
69
70static int nmi_resume(struct sys_device *dev)
71{
72 if (nmi_enabled == 1)
73 nmi_cpu_start(NULL);
74 return 0;
75}
76
77static struct sysdev_class oprofile_sysclass = {
78 .name = "oprofile",
79 .resume = nmi_resume,
80 .suspend = nmi_suspend,
81};
82
83static struct sys_device device_oprofile = {
84 .id = 0,
85 .cls = &oprofile_sysclass,
86};
87
88static int __init init_sysfs(void)
89{
90 int error;
91
92 error = sysdev_class_register(&oprofile_sysclass);
93 if (!error)
94 error = sysdev_register(&device_oprofile);
95 return error;
96}
97
98static void exit_sysfs(void)
99{
100 sysdev_unregister(&device_oprofile);
101 sysdev_class_unregister(&oprofile_sysclass);
102}
103
104#else
105#define init_sysfs() do { } while (0)
106#define exit_sysfs() do { } while (0)
107#endif /* CONFIG_PM */
108
109static int profile_exceptions_notify(struct notifier_block *self, 34static int profile_exceptions_notify(struct notifier_block *self,
110 unsigned long val, void *data) 35 unsigned long val, void *data)
111{ 36{
@@ -295,10 +220,12 @@ static void nmi_cpu_shutdown(void *dummy)
295 220
296static void nmi_shutdown(void) 221static void nmi_shutdown(void)
297{ 222{
298 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 223 struct op_msrs *msrs;
224
299 nmi_enabled = 0; 225 nmi_enabled = 0;
300 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 226 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
301 unregister_die_notifier(&profile_exceptions_nb); 227 unregister_die_notifier(&profile_exceptions_nb);
228 msrs = &get_cpu_var(cpu_msrs);
302 model->shutdown(msrs); 229 model->shutdown(msrs);
303 free_msrs(); 230 free_msrs();
304 put_cpu_var(cpu_msrs); 231 put_cpu_var(cpu_msrs);
@@ -358,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
358 return 0; 285 return 0;
359} 286}
360 287
288#ifdef CONFIG_SMP
289static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
290 void *data)
291{
292 int cpu = (unsigned long)data;
293 switch (action) {
294 case CPU_DOWN_FAILED:
295 case CPU_ONLINE:
296 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
297 break;
298 case CPU_DOWN_PREPARE:
299 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
300 break;
301 }
302 return NOTIFY_DONE;
303}
304
305static struct notifier_block oprofile_cpu_nb = {
306 .notifier_call = oprofile_cpu_notifier
307};
308#endif
309
310#ifdef CONFIG_PM
311
312static int nmi_suspend(struct sys_device *dev, pm_message_t state)
313{
314 /* Only one CPU left, just stop that one */
315 if (nmi_enabled == 1)
316 nmi_cpu_stop(NULL);
317 return 0;
318}
319
320static int nmi_resume(struct sys_device *dev)
321{
322 if (nmi_enabled == 1)
323 nmi_cpu_start(NULL);
324 return 0;
325}
326
327static struct sysdev_class oprofile_sysclass = {
328 .name = "oprofile",
329 .resume = nmi_resume,
330 .suspend = nmi_suspend,
331};
332
333static struct sys_device device_oprofile = {
334 .id = 0,
335 .cls = &oprofile_sysclass,
336};
337
338static int __init init_sysfs(void)
339{
340 int error;
341
342 error = sysdev_class_register(&oprofile_sysclass);
343 if (!error)
344 error = sysdev_register(&device_oprofile);
345 return error;
346}
347
348static void exit_sysfs(void)
349{
350 sysdev_unregister(&device_oprofile);
351 sysdev_class_unregister(&oprofile_sysclass);
352}
353
354#else
355#define init_sysfs() do { } while (0)
356#define exit_sysfs() do { } while (0)
357#endif /* CONFIG_PM */
358
361static int p4force; 359static int p4force;
362module_param(p4force, int, 0); 360module_param(p4force, int, 0);
363 361
@@ -417,9 +415,6 @@ static int __init ppro_init(char **cpu_type)
417 case 15: case 23: 415 case 15: case 23:
418 *cpu_type = "i386/core_2"; 416 *cpu_type = "i386/core_2";
419 break; 417 break;
420 case 26:
421 *cpu_type = "i386/core_2";
422 break;
423 default: 418 default:
424 /* Unknown */ 419 /* Unknown */
425 return 0; 420 return 0;
@@ -429,6 +424,16 @@ static int __init ppro_init(char **cpu_type)
429 return 1; 424 return 1;
430} 425}
431 426
427static int __init arch_perfmon_init(char **cpu_type)
428{
429 if (!cpu_has_arch_perfmon)
430 return 0;
431 *cpu_type = "i386/arch_perfmon";
432 model = &op_arch_perfmon_spec;
433 arch_perfmon_setup_counters();
434 return 1;
435}
436
432/* in order to get sysfs right */ 437/* in order to get sysfs right */
433static int using_nmi; 438static int using_nmi;
434 439
@@ -436,7 +441,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
436{ 441{
437 __u8 vendor = boot_cpu_data.x86_vendor; 442 __u8 vendor = boot_cpu_data.x86_vendor;
438 __u8 family = boot_cpu_data.x86; 443 __u8 family = boot_cpu_data.x86;
439 char *cpu_type; 444 char *cpu_type = NULL;
445 int ret = 0;
440 446
441 if (!cpu_has_apic) 447 if (!cpu_has_apic)
442 return -ENODEV; 448 return -ENODEV;
@@ -449,19 +455,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
449 default: 455 default:
450 return -ENODEV; 456 return -ENODEV;
451 case 6: 457 case 6:
452 model = &op_athlon_spec; 458 model = &op_amd_spec;
453 cpu_type = "i386/athlon"; 459 cpu_type = "i386/athlon";
454 break; 460 break;
455 case 0xf: 461 case 0xf:
456 model = &op_athlon_spec; 462 model = &op_amd_spec;
457 /* Actually it could be i386/hammer too, but give 463 /* Actually it could be i386/hammer too, but give
458 user space a consistent name. */ 464 user space a consistent name. */
459 cpu_type = "x86-64/hammer"; 465 cpu_type = "x86-64/hammer";
460 break; 466 break;
461 case 0x10: 467 case 0x10:
462 model = &op_athlon_spec; 468 model = &op_amd_spec;
463 cpu_type = "x86-64/family10"; 469 cpu_type = "x86-64/family10";
464 break; 470 break;
471 case 0x11:
472 model = &op_amd_spec;
473 cpu_type = "x86-64/family11h";
474 break;
465 } 475 }
466 break; 476 break;
467 477
@@ -469,36 +479,44 @@ int __init op_nmi_init(struct oprofile_operations *ops)
469 switch (family) { 479 switch (family) {
470 /* Pentium IV */ 480 /* Pentium IV */
471 case 0xf: 481 case 0xf:
472 if (!p4_init(&cpu_type)) 482 p4_init(&cpu_type);
473 return -ENODEV;
474 break; 483 break;
475 484
476 /* A P6-class processor */ 485 /* A P6-class processor */
477 case 6: 486 case 6:
478 if (!ppro_init(&cpu_type)) 487 ppro_init(&cpu_type);
479 return -ENODEV;
480 break; 488 break;
481 489
482 default: 490 default:
483 return -ENODEV; 491 break;
484 } 492 }
493
494 if (!cpu_type && !arch_perfmon_init(&cpu_type))
495 return -ENODEV;
485 break; 496 break;
486 497
487 default: 498 default:
488 return -ENODEV; 499 return -ENODEV;
489 } 500 }
490 501
491 init_sysfs();
492#ifdef CONFIG_SMP 502#ifdef CONFIG_SMP
493 register_cpu_notifier(&oprofile_cpu_nb); 503 register_cpu_notifier(&oprofile_cpu_nb);
494#endif 504#endif
495 using_nmi = 1; 505 /* default values, can be overwritten by model */
496 ops->create_files = nmi_create_files; 506 ops->create_files = nmi_create_files;
497 ops->setup = nmi_setup; 507 ops->setup = nmi_setup;
498 ops->shutdown = nmi_shutdown; 508 ops->shutdown = nmi_shutdown;
499 ops->start = nmi_start; 509 ops->start = nmi_start;
500 ops->stop = nmi_stop; 510 ops->stop = nmi_stop;
501 ops->cpu_type = cpu_type; 511 ops->cpu_type = cpu_type;
512
513 if (model->init)
514 ret = model->init(ops);
515 if (ret)
516 return ret;
517
518 init_sysfs();
519 using_nmi = 1;
502 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 520 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
503 return 0; 521 return 0;
504} 522}
@@ -511,4 +529,6 @@ void op_nmi_exit(void)
511 unregister_cpu_notifier(&oprofile_cpu_nb); 529 unregister_cpu_notifier(&oprofile_cpu_nb);
512#endif 530#endif
513 } 531 }
532 if (model->exit)
533 model->exit();
514} 534}
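op_nmi_init()/op_nmi_exit() now treat model init/exit as optional hooks. A sketch of that pattern with an illustrative model_spec struct; the field names loosely follow op_x86_model_spec but this is not the kernel structure:

#include <stdio.h>

struct ops { const char *cpu_type; };

struct model_spec {
        int  (*init)(struct ops *ops);   /* optional */
        void (*exit)(void);              /* optional */
};

static int amd_init(struct ops *ops) { (void)ops; puts("amd init"); return 0; }
static void amd_exit(void)           { puts("amd exit"); }

static struct model_spec op_amd  = { .init = amd_init, .exit = amd_exit };
static struct model_spec op_ppro = { 0 };   /* no hooks: both calls skipped */

static int setup(struct model_spec *model, struct ops *ops)
{
        int ret = 0;

        if (model->init)
                ret = model->init(ops); /* model may override defaults in ops */
        return ret;
}

static void teardown(struct model_spec *model)
{
        if (model->exit)
                model->exit();
}

int main(void)
{
        struct ops ops = { "x86-64/hammer" };

        setup(&op_amd, &ops);
        teardown(&op_amd);
        setup(&op_ppro, &ops);
        teardown(&op_ppro);
        return 0;
}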
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 2880b15c467..91b6a116165 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -6,22 +6,22 @@
6 * 6 *
7 * @author John Levon 7 * @author John Levon
8 */ 8 */
9 9
10#ifndef OP_COUNTER_H 10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H 11#define OP_COUNTER_H
12 12
13#define OP_MAX_COUNTER 8 13#define OP_MAX_COUNTER 8
14 14
15/* Per-perfctr configuration as set via 15/* Per-perfctr configuration as set via
16 * oprofilefs. 16 * oprofilefs.
17 */ 17 */
18struct op_counter_config { 18struct op_counter_config {
19 unsigned long count; 19 unsigned long count;
20 unsigned long enabled; 20 unsigned long enabled;
21 unsigned long event; 21 unsigned long event;
22 unsigned long kernel; 22 unsigned long kernel;
23 unsigned long user; 23 unsigned long user;
24 unsigned long unit_mask; 24 unsigned long unit_mask;
25}; 25};
26 26
27extern struct op_counter_config counter_config[]; 27extern struct op_counter_config counter_config[];
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 00000000000..509513760a6
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,546 @@
1/*
2 * @file op_model_amd.c
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002-2008 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf
13*/
14
15#include <linux/oprofile.h>
16#include <linux/device.h>
17#include <linux/pci.h>
18
19#include <asm/ptrace.h>
20#include <asm/msr.h>
21#include <asm/nmi.h>
22
23#include "op_x86_model.h"
24#include "op_counter.h"
25
26#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4
28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51
52#ifdef CONFIG_OPROFILE_IBS
53
54/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
58
59/* IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
62
63/* Codes used in cpu_buffer.c */
64/* This produces duplicate code, needs to be fixed */
65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4
67
68/* The function interface needs to be fixed, something like an
69 add-data call, and should then be added to linux/oprofile.h. */
70extern void
71oprofile_add_ibs_sample(struct pt_regs *const regs,
72 unsigned int *const ibs_sample, int ibs_code);
73
74struct ibs_fetch_sample {
75 /* MSRC001_1031 IBS Fetch Linear Address Register */
76 unsigned int ibs_fetch_lin_addr_low;
77 unsigned int ibs_fetch_lin_addr_high;
78 /* MSRC001_1030 IBS Fetch Control Register */
79 unsigned int ibs_fetch_ctl_low;
80 unsigned int ibs_fetch_ctl_high;
81 /* MSRC001_1032 IBS Fetch Physical Address Register */
82 unsigned int ibs_fetch_phys_addr_low;
83 unsigned int ibs_fetch_phys_addr_high;
84};
85
86struct ibs_op_sample {
87 /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
88 unsigned int ibs_op_rip_low;
89 unsigned int ibs_op_rip_high;
90 /* MSRC001_1035 IBS Op Data Register */
91 unsigned int ibs_op_data1_low;
92 unsigned int ibs_op_data1_high;
93 /* MSRC001_1036 IBS Op Data 2 Register */
94 unsigned int ibs_op_data2_low;
95 unsigned int ibs_op_data2_high;
96 /* MSRC001_1037 IBS Op Data 3 Register */
97 unsigned int ibs_op_data3_low;
98 unsigned int ibs_op_data3_high;
99 /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
100 unsigned int ibs_dc_linear_low;
101 unsigned int ibs_dc_linear_high;
102 /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
103 unsigned int ibs_dc_phys_low;
104 unsigned int ibs_dc_phys_high;
105};
106
107/*
108 * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h+
109 */
110static void clear_ibs_nmi(void);
111
112static int ibs_allowed; /* AMD Family10h and later */
113
114struct op_ibs_config {
115 unsigned long op_enabled;
116 unsigned long fetch_enabled;
117 unsigned long max_cnt_fetch;
118 unsigned long max_cnt_op;
119 unsigned long rand_en;
120 unsigned long dispatched_ops;
121};
122
123static struct op_ibs_config ibs_config;
124
125#endif
126
127/* functions for op_amd_spec */
128
129static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
130{
131 int i;
132
133 for (i = 0; i < NUM_COUNTERS; i++) {
134 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
135 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
136 else
137 msrs->counters[i].addr = 0;
138 }
139
140 for (i = 0; i < NUM_CONTROLS; i++) {
141 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
142 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
143 else
144 msrs->controls[i].addr = 0;
145 }
146}
147
148
149static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
150{
151 unsigned int low, high;
152 int i;
153
154 /* clear all counters */
155 for (i = 0 ; i < NUM_CONTROLS; ++i) {
156 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
157 continue;
158 CTRL_READ(low, high, msrs, i);
159 CTRL_CLEAR_LO(low);
160 CTRL_CLEAR_HI(high);
161 CTRL_WRITE(low, high, msrs, i);
162 }
163
164 /* avoid a false detection of ctr overflows in NMI handler */
165 for (i = 0; i < NUM_COUNTERS; ++i) {
166 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
167 continue;
168 CTR_WRITE(1, msrs, i);
169 }
170
171 /* enable active counters */
172 for (i = 0; i < NUM_COUNTERS; ++i) {
173 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
174 reset_value[i] = counter_config[i].count;
175
176 CTR_WRITE(counter_config[i].count, msrs, i);
177
178 CTRL_READ(low, high, msrs, i);
179 CTRL_CLEAR_LO(low);
180 CTRL_CLEAR_HI(high);
181 CTRL_SET_ENABLE(low);
182 CTRL_SET_USR(low, counter_config[i].user);
183 CTRL_SET_KERN(low, counter_config[i].kernel);
184 CTRL_SET_UM(low, counter_config[i].unit_mask);
185 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
186 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
187 CTRL_SET_HOST_ONLY(high, 0);
188 CTRL_SET_GUEST_ONLY(high, 0);
189
190 CTRL_WRITE(low, high, msrs, i);
191 } else {
192 reset_value[i] = 0;
193 }
194 }
195}
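/*
 * Worked example (illustrative, assuming CTR_WRITE() negates the count
 * as the identical macro in op_model_athlon.c below does): the perfctr
 * counts upward and raises an NMI on overflow, so CTR_WRITE(count) arms
 * the counter at -count; after "count" events bit 31 clears and
 * CTR_OVERFLOWED() in the check handler reports it.
 */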
196
197#ifdef CONFIG_OPROFILE_IBS
198
199static inline int
200op_amd_handle_ibs(struct pt_regs * const regs,
201 struct op_msrs const * const msrs)
202{
203 unsigned int low, high;
204 struct ibs_fetch_sample ibs_fetch;
205 struct ibs_op_sample ibs_op;
206
207 if (!ibs_allowed)
208 return 1;
209
210 if (ibs_config.fetch_enabled) {
211 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
212 if (high & IBS_FETCH_HIGH_VALID_BIT) {
213 ibs_fetch.ibs_fetch_ctl_high = high;
214 ibs_fetch.ibs_fetch_ctl_low = low;
215 rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
216 ibs_fetch.ibs_fetch_lin_addr_high = high;
217 ibs_fetch.ibs_fetch_lin_addr_low = low;
218 rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
219 ibs_fetch.ibs_fetch_phys_addr_high = high;
220 ibs_fetch.ibs_fetch_phys_addr_low = low;
221
222 oprofile_add_ibs_sample(regs,
223 (unsigned int *)&ibs_fetch,
224 IBS_FETCH_BEGIN);
225
226 /* re-enable the IRQ */
227 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
228 high &= ~IBS_FETCH_HIGH_VALID_BIT;
229 high |= IBS_FETCH_HIGH_ENABLE;
230 low &= IBS_FETCH_LOW_MAX_CNT_MASK;
231 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
232 }
233 }
234
235 if (ibs_config.op_enabled) {
236 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
237 if (low & IBS_OP_LOW_VALID_BIT) {
238 rdmsr(MSR_AMD64_IBSOPRIP, low, high);
239 ibs_op.ibs_op_rip_low = low;
240 ibs_op.ibs_op_rip_high = high;
241 rdmsr(MSR_AMD64_IBSOPDATA, low, high);
242 ibs_op.ibs_op_data1_low = low;
243 ibs_op.ibs_op_data1_high = high;
244 rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
245 ibs_op.ibs_op_data2_low = low;
246 ibs_op.ibs_op_data2_high = high;
247 rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
248 ibs_op.ibs_op_data3_low = low;
249 ibs_op.ibs_op_data3_high = high;
250 rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
251 ibs_op.ibs_dc_linear_low = low;
252 ibs_op.ibs_dc_linear_high = high;
253 rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
254 ibs_op.ibs_dc_phys_low = low;
255 ibs_op.ibs_dc_phys_high = high;
256
257 /* re-enable the IRQ */
258 oprofile_add_ibs_sample(regs,
259 (unsigned int *)&ibs_op,
260 IBS_OP_BEGIN);
261 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
262 high = 0;
263 low &= ~IBS_OP_LOW_VALID_BIT;
264 low |= IBS_OP_LOW_ENABLE;
265 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
266 }
267 }
268
269 return 1;
270}
271
272#endif
273
274static int op_amd_check_ctrs(struct pt_regs * const regs,
275 struct op_msrs const * const msrs)
276{
277 unsigned int low, high;
278 int i;
279
280 for (i = 0 ; i < NUM_COUNTERS; ++i) {
281 if (!reset_value[i])
282 continue;
283 CTR_READ(low, high, msrs, i);
284 if (CTR_OVERFLOWED(low)) {
285 oprofile_add_sample(regs, i);
286 CTR_WRITE(reset_value[i], msrs, i);
287 }
288 }
289
290#ifdef CONFIG_OPROFILE_IBS
291 op_amd_handle_ibs(regs, msrs);
292#endif
293
294 /* See op_model_ppro.c */
295 return 1;
296}
297
298static void op_amd_start(struct op_msrs const * const msrs)
299{
300 unsigned int low, high;
301 int i;
302 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
303 if (reset_value[i]) {
304 CTRL_READ(low, high, msrs, i);
305 CTRL_SET_ACTIVE(low);
306 CTRL_WRITE(low, high, msrs, i);
307 }
308 }
309
310#ifdef CONFIG_OPROFILE_IBS
311 if (ibs_allowed && ibs_config.fetch_enabled) {
312 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
313 high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
314 + IBS_FETCH_HIGH_ENABLE;
315 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
316 }
317
318 if (ibs_allowed && ibs_config.op_enabled) {
319 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
320 + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
321 + IBS_OP_LOW_ENABLE;
322 high = 0;
323 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
324 }
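/*
 * Illustrative arithmetic: the IbsFetchMaxCnt/IbsOpMaxCnt fields hold
 * the event count divided by 16, hence the ">> 4" above. With the
 * default max_cnt of 250000, the value programmed is 250000 >> 4 =
 * 15625, i.e. one sample roughly every 250000 fetches or ops.
 */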
325#endif
326}
327
328
329static void op_amd_stop(struct op_msrs const * const msrs)
330{
331 unsigned int low, high;
332 int i;
333
334 /* Subtle: stop on all counters to avoid race with
335 * setting our pm callback */
336 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
337 if (!reset_value[i])
338 continue;
339 CTRL_READ(low, high, msrs, i);
340 CTRL_SET_INACTIVE(low);
341 CTRL_WRITE(low, high, msrs, i);
342 }
343
344#ifdef CONFIG_OPROFILE_IBS
345 if (ibs_allowed && ibs_config.fetch_enabled) {
346 low = 0; /* clear max count and enable */
347 high = 0;
348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
349 }
350
351 if (ibs_allowed && ibs_config.op_enabled) {
352 low = 0; /* clear max count and enable */
353 high = 0;
354 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
355 }
356#endif
357}
358
359static void op_amd_shutdown(struct op_msrs const * const msrs)
360{
361 int i;
362
363 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
364 if (CTR_IS_RESERVED(msrs, i))
365 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
366 }
367 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
368 if (CTRL_IS_RESERVED(msrs, i))
369 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
370 }
371}
372
373#ifndef CONFIG_OPROFILE_IBS
374
375/* no IBS support */
376
377static int op_amd_init(struct oprofile_operations *ops)
378{
379 return 0;
380}
381
382static void op_amd_exit(void) {}
383
384#else
385
386static u8 ibs_eilvt_off;
387
388static inline void apic_init_ibs_nmi_per_cpu(void *arg)
389{
390 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
391}
392
393static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
394{
395 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
396}
397
398static int pfm_amd64_setup_eilvt(void)
399{
400#define IBSCTL_LVTOFFSETVAL (1 << 8)
401#define IBSCTL 0x1cc
402 struct pci_dev *cpu_cfg;
403 int nodes;
404 u32 value = 0;
405
406 /* per CPU setup */
407 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
408
409 nodes = 0;
410 cpu_cfg = NULL;
411 do {
412 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
413 PCI_DEVICE_ID_AMD_10H_NB_MISC,
414 cpu_cfg);
415 if (!cpu_cfg)
416 break;
417 ++nodes;
418 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
419 | IBSCTL_LVTOFFSETVAL);
420 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
421 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
422 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
423 "IBSCTL = 0x%08x", value);
424 return 1;
425 }
426 } while (1);
427
428 if (!nodes) {
429 printk(KERN_DEBUG "No CPU node configured for IBS\n");
430 return 1;
431 }
432
433#ifdef CONFIG_NUMA
434 /* Sanity check */
435 /* Works only for 64bit with proper numa implementation. */
436 if (nodes != num_possible_nodes()) {
437 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
438 "found: %d, expected %d",
439 nodes, num_possible_nodes());
440 return 1;
441 }
442#endif
443 return 0;
444}
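/*
 * Background note (editorial sketch): IBSCTL (0x1cc) lives in the PCI
 * config space of each node's northbridge MISC function, so the LVT
 * offset has to be written once per node; hence the pci_get_device()
 * walk above.
 */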
445
446/*
447 * initialize the APIC for the IBS interrupts
448 * if available (AMD Family10h rev B0 and later)
449 */
450static void setup_ibs(void)
451{
452 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
453
454 if (!ibs_allowed)
455 return;
456
457 if (pfm_amd64_setup_eilvt()) {
458 ibs_allowed = 0;
459 return;
460 }
461
462 printk(KERN_INFO "oprofile: AMD IBS detected\n");
463}
464
465
466/*
467 * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h
468 * rev B0 and later */
469static void clear_ibs_nmi(void)
470{
471 if (ibs_allowed)
472 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
473}
474
475static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
476
477static int setup_ibs_files(struct super_block *sb, struct dentry *root)
478{
479 struct dentry *dir;
480 int ret = 0;
481
482 /* architecture specific files */
483 if (create_arch_files)
484 ret = create_arch_files(sb, root);
485
486 if (ret)
487 return ret;
488
489 if (!ibs_allowed)
490 return ret;
491
492 /* model specific files */
493
494 /* setup some reasonable defaults */
495 ibs_config.max_cnt_fetch = 250000;
496 ibs_config.fetch_enabled = 0;
497 ibs_config.max_cnt_op = 250000;
498 ibs_config.op_enabled = 0;
499 ibs_config.dispatched_ops = 1;
500
501 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
502 oprofilefs_create_ulong(sb, dir, "enable",
503 &ibs_config.fetch_enabled);
504 oprofilefs_create_ulong(sb, dir, "max_count",
505 &ibs_config.max_cnt_fetch);
506 oprofilefs_create_ulong(sb, dir, "rand_enable",
507 &ibs_config.rand_en);
508
509 dir = oprofilefs_mkdir(sb, root, "ibs_op");
510 oprofilefs_create_ulong(sb, dir, "enable",
511 &ibs_config.op_enabled);
512 oprofilefs_create_ulong(sb, dir, "max_count",
513 &ibs_config.max_cnt_op);
514 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
515 &ibs_config.dispatched_ops);
516
517 return 0;
518}
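/*
 * Resulting layout (illustrative): with oprofilefs typically mounted at
 * /dev/oprofile, this creates ibs_fetch/{enable,max_count,rand_enable}
 * and ibs_op/{enable,max_count,dispatched_ops}, each file backed by the
 * corresponding ibs_config field above.
 */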
519
520static int op_amd_init(struct oprofile_operations *ops)
521{
522 setup_ibs();
523 create_arch_files = ops->create_files;
524 ops->create_files = setup_ibs_files;
525 return 0;
526}
527
528static void op_amd_exit(void)
529{
530 clear_ibs_nmi();
531}
532
533#endif
534
535struct op_x86_model_spec const op_amd_spec = {
536 .init = op_amd_init,
537 .exit = op_amd_exit,
538 .num_counters = NUM_COUNTERS,
539 .num_controls = NUM_CONTROLS,
540 .fill_in_addresses = &op_amd_fill_in_addresses,
541 .setup_ctrs = &op_amd_setup_ctrs,
542 .check_ctrs = &op_amd_check_ctrs,
543 .start = &op_amd_start,
544 .stop = &op_amd_stop,
545 .shutdown = &op_amd_shutdown
546};
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
deleted file mode 100644
index 3d534879a9d..00000000000
--- a/arch/x86/oprofile/op_model_athlon.c
+++ /dev/null
@@ -1,190 +0,0 @@
1/*
2 * @file op_model_athlon.c
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
26#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
31#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR_LO(x) (x &= (1<<21))
35#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
36#define CTRL_SET_ENABLE(val) (val |= 1<<20)
37#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
38#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
39#define CTRL_SET_UM(val, m) (val |= (m << 8))
40#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
41#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
42#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
43#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
44
45static unsigned long reset_value[NUM_COUNTERS];
46
47static void athlon_fill_in_addresses(struct op_msrs * const msrs)
48{
49 int i;
50
51 for (i = 0; i < NUM_COUNTERS; i++) {
52 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
53 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
54 else
55 msrs->counters[i].addr = 0;
56 }
57
58 for (i = 0; i < NUM_CONTROLS; i++) {
59 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
60 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
61 else
62 msrs->controls[i].addr = 0;
63 }
64}
65
66
67static void athlon_setup_ctrs(struct op_msrs const * const msrs)
68{
69 unsigned int low, high;
70 int i;
71
72 /* clear all counters */
73 for (i = 0 ; i < NUM_CONTROLS; ++i) {
74 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
75 continue;
76 CTRL_READ(low, high, msrs, i);
77 CTRL_CLEAR_LO(low);
78 CTRL_CLEAR_HI(high);
79 CTRL_WRITE(low, high, msrs, i);
80 }
81
82 /* avoid a false detection of ctr overflows in NMI handler */
83 for (i = 0; i < NUM_COUNTERS; ++i) {
84 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
85 continue;
86 CTR_WRITE(1, msrs, i);
87 }
88
89 /* enable active counters */
90 for (i = 0; i < NUM_COUNTERS; ++i) {
91 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
92 reset_value[i] = counter_config[i].count;
93
94 CTR_WRITE(counter_config[i].count, msrs, i);
95
96 CTRL_READ(low, high, msrs, i);
97 CTRL_CLEAR_LO(low);
98 CTRL_CLEAR_HI(high);
99 CTRL_SET_ENABLE(low);
100 CTRL_SET_USR(low, counter_config[i].user);
101 CTRL_SET_KERN(low, counter_config[i].kernel);
102 CTRL_SET_UM(low, counter_config[i].unit_mask);
103 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
104 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
105 CTRL_SET_HOST_ONLY(high, 0);
106 CTRL_SET_GUEST_ONLY(high, 0);
107
108 CTRL_WRITE(low, high, msrs, i);
109 } else {
110 reset_value[i] = 0;
111 }
112 }
113}
114
115
116static int athlon_check_ctrs(struct pt_regs * const regs,
117 struct op_msrs const * const msrs)
118{
119 unsigned int low, high;
120 int i;
121
122 for (i = 0 ; i < NUM_COUNTERS; ++i) {
123 if (!reset_value[i])
124 continue;
125 CTR_READ(low, high, msrs, i);
126 if (CTR_OVERFLOWED(low)) {
127 oprofile_add_sample(regs, i);
128 CTR_WRITE(reset_value[i], msrs, i);
129 }
130 }
131
132 /* See op_model_ppro.c */
133 return 1;
134}
135
136
137static void athlon_start(struct op_msrs const * const msrs)
138{
139 unsigned int low, high;
140 int i;
141 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
142 if (reset_value[i]) {
143 CTRL_READ(low, high, msrs, i);
144 CTRL_SET_ACTIVE(low);
145 CTRL_WRITE(low, high, msrs, i);
146 }
147 }
148}
149
150
151static void athlon_stop(struct op_msrs const * const msrs)
152{
153 unsigned int low, high;
154 int i;
155
156 /* Subtle: stop on all counters to avoid race with
157 * setting our pm callback */
158 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
159 if (!reset_value[i])
160 continue;
161 CTRL_READ(low, high, msrs, i);
162 CTRL_SET_INACTIVE(low);
163 CTRL_WRITE(low, high, msrs, i);
164 }
165}
166
167static void athlon_shutdown(struct op_msrs const * const msrs)
168{
169 int i;
170
171 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
172 if (CTR_IS_RESERVED(msrs, i))
173 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
174 }
175 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
176 if (CTRL_IS_RESERVED(msrs, i))
177 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
178 }
179}
180
181struct op_x86_model_spec const op_athlon_spec = {
182 .num_counters = NUM_COUNTERS,
183 .num_controls = NUM_CONTROLS,
184 .fill_in_addresses = &athlon_fill_in_addresses,
185 .setup_ctrs = &athlon_setup_ctrs,
186 .check_ctrs = &athlon_check_ctrs,
187 .start = &athlon_start,
188 .stop = &athlon_stop,
189 .shutdown = &athlon_shutdown
190};
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f4..4c4a51c90bc 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
10 10
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h>
14#include <linux/nmi.h>
13#include <asm/msr.h> 15#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h> 16#include <asm/fixmap.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18
18 19
19#include "op_x86_model.h" 20#include "op_x86_model.h"
20#include "op_counter.h" 21#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
40static inline void setup_num_counters(void) 41static inline void setup_num_counters(void)
41{ 42{
42#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){ 44 if (smp_num_siblings == 2) {
44 num_counters = NUM_COUNTERS_HT2; 45 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2; 46 num_controls = NUM_CONTROLS_HT2;
46 } 47 }
@@ -86,7 +87,7 @@ struct p4_event_binding {
86#define CTR_FLAME_2 (1 << 6) 87#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7) 88#define CTR_IQ_5 (1 << 7)
88 89
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { 90static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, 91 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, 92 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, 93 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } 98 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98}; 99};
99 100
100#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT 101#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101 102
102/* p4 event codes in libop/op_event.h are indices into this table. */ 103/* p4 event codes in libop/op_event.h are indices into this table. */
103 104
104static struct p4_event_binding p4_events[NUM_EVENTS] = { 105static struct p4_event_binding p4_events[NUM_EVENTS] = {
105 106
106 { /* BRANCH_RETIRED */ 107 { /* BRANCH_RETIRED */
107 0x05, 0x06, 108 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, 109 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} } 110 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 }, 111 },
111 112
112 { /* MISPRED_BRANCH_RETIRED */ 113 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03, 114 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 115 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 116 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 }, 117 },
117 118
118 { /* TC_DELIVER_MODE */ 119 { /* TC_DELIVER_MODE */
119 0x01, 0x01, 120 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0}, 121 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} } 122 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 }, 123 },
123 124
124 { /* BPU_FETCH_REQUEST */ 125 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03, 126 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, 127 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} } 128 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 }, 129 },
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
146 }, 147 },
147 148
148 { /* LOAD_PORT_REPLAY */ 149 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04, 150 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, 151 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } 152 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 }, 153 },
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
170 }, 171 },
171 172
172 { /* BSQ_CACHE_REFERENCE */ 173 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c, 174 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 175 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} } 176 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 }, 177 },
177 178
178 { /* IOQ_ALLOCATION */ 179 { /* IOQ_ALLOCATION */
179 0x06, 0x03, 180 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 181 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } } 182 { 0, 0 } }
182 }, 183 },
183 184
184 { /* IOQ_ACTIVE_ENTRIES */ 185 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a, 186 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, 187 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } } 188 { 0, 0 } }
188 }, 189 },
189 190
190 { /* FSB_DATA_ACTIVITY */ 191 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17, 192 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 193 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 194 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 }, 195 },
195 196
196 { /* BSQ_ALLOCATION */ 197 { /* BSQ_ALLOCATION */
197 0x07, 0x05, 198 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 199 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } } 200 { 0, 0 } }
200 }, 201 },
201 202
202 { /* BSQ_ACTIVE_ENTRIES */ 203 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06, 204 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, 205 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } } 206 { 0, 0 } }
206 }, 207 },
207 208
208 { /* X87_ASSIST */ 209 { /* X87_ASSIST */
209 0x05, 0x03, 210 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 211 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 212 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 }, 213 },
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 217 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 218 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 }, 219 },
219 220
220 { /* PACKED_SP_UOP */ 221 { /* PACKED_SP_UOP */
221 0x01, 0x08, 222 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 223 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 224 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 }, 225 },
225 226
226 { /* PACKED_DP_UOP */ 227 { /* PACKED_DP_UOP */
227 0x01, 0x0c, 228 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 229 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 230 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 }, 231 },
231 232
232 { /* SCALAR_SP_UOP */ 233 { /* SCALAR_SP_UOP */
233 0x01, 0x0a, 234 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 235 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 236 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 }, 237 },
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
242 }, 243 },
243 244
244 { /* 64BIT_MMX_UOP */ 245 { /* 64BIT_MMX_UOP */
245 0x01, 0x02, 246 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 247 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 248 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 }, 249 },
249 250
250 { /* 128BIT_MMX_UOP */ 251 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a, 252 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 253 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 254 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 }, 255 },
255 256
256 { /* X87_FP_UOP */ 257 { /* X87_FP_UOP */
257 0x01, 0x04, 258 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 259 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 260 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 }, 261 },
261 262
262 { /* X87_SIMD_MOVES_UOP */ 263 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e, 264 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 265 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 266 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 }, 267 },
267 268
268 { /* MACHINE_CLEAR */ 269 { /* MACHINE_CLEAR */
269 0x05, 0x02, 270 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 271 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 272 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 }, 273 },
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 277 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 278 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 }, 279 },
279 280
280 { /* TC_MS_XFER */ 281 { /* TC_MS_XFER */
281 0x00, 0x05, 282 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0}, 283 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} } 284 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 }, 285 },
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
308 }, 309 },
309 310
310 { /* INSTR_RETIRED */ 311 { /* INSTR_RETIRED */
311 0x04, 0x02, 312 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 313 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 314 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 }, 315 },
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 320 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 }, 321 },
321 322
322 { /* UOP_TYPE */ 323 { /* UOP_TYPE */
323 0x02, 0x02, 324 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, 325 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} } 326 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 }, 327 },
327 328
328 { /* RETIRED_MISPRED_BRANCH_TYPE */ 329 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05, 330 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, 331 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} } 332 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 }, 333 },
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354 355
355#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367 368
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) 371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) 372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) 373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373 374
374 375
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
380#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
381 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
382 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
383#endif 384#endif
384 return 0; 385 return 0;
385} 386}
386 387
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
395 396
396static void p4_fill_in_addresses(struct op_msrs * const msrs) 397static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{ 398{
398 unsigned int i; 399 unsigned int i;
399 unsigned int addr, cccraddr, stag; 400 unsigned int addr, cccraddr, stag;
400 401
401 setup_num_counters(); 402 setup_num_counters();
402 stag = get_stagger(); 403 stag = get_stagger();
403 404
404 /* initialize some registers */ 405 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) { 406 for (i = 0; i < num_counters; ++i)
406 msrs->counters[i].addr = 0; 407 msrs->counters[i].addr = 0;
407 } 408 for (i = 0; i < num_controls; ++i)
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0; 409 msrs->controls[i].addr = 0;
410 } 410
411
412 /* the counter & cccr registers we pay attention to */ 411 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) { 412 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 413 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; 414 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){ 415 if (reserve_perfctr_nmi(addr)) {
417 msrs->counters[i].addr = addr; 416 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr; 417 msrs->controls[i].addr = cccraddr;
419 } 418 }
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
447 if (reserve_evntsel_nmi(addr)) 446 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr; 447 msrs->controls[i].addr = addr;
449 } 448 }
450 449
451 for (addr = MSR_P4_MS_ESCR0 + stag; 450 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { 451 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr)) 452 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr; 453 msrs->controls[i].addr = addr;
455 } 454 }
456 455
457 for (addr = MSR_P4_IX_ESCR0 + stag; 456 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { 457 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr)) 458 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr; 459 msrs->controls[i].addr = addr;
461 } 460 }
462 461
463 /* there are 2 remaining non-contiguously located ESCRs */ 462 /* there are 2 remaining non-contiguously located ESCRs */
464 463
465 if (num_counters == NUM_COUNTERS_NON_HT) { 464 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/ 465 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) 466 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 467 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
498 unsigned int stag; 497 unsigned int stag;
499 498
500 stag = get_stagger(); 499 stag = get_stagger();
501 500
502 /* convert from counter *number* to counter *bit* */ 501 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr); 502 counter_bit = 1 << VIRT_CTR(stag, ctr);
504 503
505 /* find our event binding structure. */ 504 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { 505 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR 506 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n", 507 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event); 508 counter_config[ctr].event);
510 return; 509 return;
511 } 510 }
512 511
513 ev = &(p4_events[counter_config[ctr].event - 1]); 512 ev = &(p4_events[counter_config[ctr].event - 1]);
514 513
515 for (i = 0; i < maxbind; i++) { 514 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) { 515 if (ev->bindings[i].virt_counter & counter_bit) {
517 516
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel); 525 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 } 526 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 527 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i); 529 ESCR_WRITE(escr, high, ev, i);
531 530
532 /* modify CCCR */ 531 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr); 533 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr); 534 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) { 536 if (stag == 0)
538 CCCR_SET_PMI_OVF_0(cccr); 537 CCCR_SET_PMI_OVF_0(cccr);
539 } else { 538 else
540 CCCR_SET_PMI_OVF_1(cccr); 539 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return; 541 return;
544 } 542 }
545 } 543 }
546 544
547 printk(KERN_ERR 545 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", 546 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr); 547 counter_config[ctr].event, stag, ctr);
550} 548}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
559 stag = get_stagger(); 557 stag = get_stagger();
560 558
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 559 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) { 560 if (!MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n"); 561 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return; 562 return;
565 } 563 }
566 564
567 /* clear the cccrs we will use */ 565 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) { 566 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 567 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
570 continue; 568 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low); 570 CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
576 574
577 /* clear all escrs (including those outside our concern) */ 575 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) { 576 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 577 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
580 continue; 578 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0); 579 wrmsr(msrs->controls[i].addr, 0, 0);
582 } 580 }
583 581
584 /* setup all counters */ 582 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) { 583 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { 584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
587 reset_value[i] = counter_config[i].count; 585 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i); 586 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
603 stag = get_stagger(); 601 stag = get_stagger();
604 602
605 for (i = 0; i < num_counters; ++i) { 603 for (i = 0; i < num_counters; ++i) {
606 604
607 if (!reset_value[i]) 605 if (!reset_value[i])
608 continue; 606 continue;
609 607
610 /* 608 /*
611 * there is some eccentricity in the hardware which 609 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections: 610 * requires that we perform 2 extra corrections:
613 * 611 *
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
616 * 614 *
617 * - write the counter back twice to ensure it gets 615 * - write the counter back twice to ensure it gets
618 * updated properly. 616 * updated properly.
619 * 617 *
620 * the former seems to be related to extra NMIs happening 618 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata 619 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification 620 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not 621 * update, though their suggested work-around does not
624 * appear to solve the problem. 622 * appear to solve the problem.
625 */ 623 */
626 624
627 real = VIRT_CTR(stag, i); 625 real = VIRT_CTR(stag, i);
628 626
629 CCCR_READ(low, high, real); 627 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real); 628 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i); 630 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real); 631 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low); 632 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real); 633 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real); 634 CTR_WRITE(reset_value[i], real);
637 } 635 }
638 } 636 }
639 637
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
683 int i; 681 int i;
684 682
685 for (i = 0 ; i < num_counters ; ++i) { 683 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i)) 684 if (CTR_IS_RESERVED(msrs, i))
687 release_perfctr_nmi(msrs->counters[i].addr); 685 release_perfctr_nmi(msrs->counters[i].addr);
688 } 686 }
689 /* some of the control registers are specially reserved in 687 /*
688 * some of the control registers are specially reserved in
690 * conjunction with the counter registers (hence the starting offset). 689 * conjunction with the counter registers (hence the starting offset).
691 * This saves a few bits. 690 * This saves a few bits.
692 */ 691 */
693 for (i = num_counters ; i < num_controls ; ++i) { 692 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i)) 693 if (CTRL_IS_RESERVED(msrs, i))
695 release_evntsel_nmi(msrs->controls[i].addr); 694 release_evntsel_nmi(msrs->controls[i].addr);
696 } 695 }
697} 696}
@@ -699,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs)
699 698
700#ifdef CONFIG_SMP 699#ifdef CONFIG_SMP
701struct op_x86_model_spec const op_p4_ht2_spec = { 700struct op_x86_model_spec const op_p4_ht2_spec = {
702 .num_counters = NUM_COUNTERS_HT2, 701 .num_counters = NUM_COUNTERS_HT2,
703 .num_controls = NUM_CONTROLS_HT2, 702 .num_controls = NUM_CONTROLS_HT2,
704 .fill_in_addresses = &p4_fill_in_addresses, 703 .fill_in_addresses = &p4_fill_in_addresses,
705 .setup_ctrs = &p4_setup_ctrs, 704 .setup_ctrs = &p4_setup_ctrs,
706 .check_ctrs = &p4_check_ctrs, 705 .check_ctrs = &p4_check_ctrs,
707 .start = &p4_start, 706 .start = &p4_start,
708 .stop = &p4_stop, 707 .stop = &p4_stop,
709 .shutdown = &p4_shutdown 708 .shutdown = &p4_shutdown
710}; 709};
711#endif 710#endif
712 711
713struct op_x86_model_spec const op_p4_spec = { 712struct op_x86_model_spec const op_p4_spec = {
714 .num_counters = NUM_COUNTERS_NON_HT, 713 .num_counters = NUM_COUNTERS_NON_HT,
715 .num_controls = NUM_CONTROLS_NON_HT, 714 .num_controls = NUM_CONTROLS_NON_HT,
716 .fill_in_addresses = &p4_fill_in_addresses, 715 .fill_in_addresses = &p4_fill_in_addresses,
717 .setup_ctrs = &p4_setup_ctrs, 716 .setup_ctrs = &p4_setup_ctrs,
718 .check_ctrs = &p4_check_ctrs, 717 .check_ctrs = &p4_check_ctrs,
719 .start = &p4_start, 718 .start = &p4_start,
720 .stop = &p4_stop, 719 .stop = &p4_stop,
721 .shutdown = &p4_shutdown 720 .shutdown = &p4_shutdown
722}; 721};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index eff431f6c57..0620d6d45f7 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -1,32 +1,34 @@
1/* 1/*
2 * @file op_model_ppro.h 2 * @file op_model_ppro.h
3 * pentium pro / P6 model-specific MSR operations 3 * Family 6 perfmon and architectural perfmon MSR operations
4 * 4 *
5 * @remark Copyright 2002 OProfile authors 5 * @remark Copyright 2002 OProfile authors
6 * @remark Copyright 2008 Intel Corporation
6 * @remark Read the file COPYING 7 * @remark Read the file COPYING
7 * 8 *
8 * @author John Levon 9 * @author John Levon
9 * @author Philippe Elie 10 * @author Philippe Elie
10 * @author Graydon Hoare 11 * @author Graydon Hoare
12 * @author Andi Kleen
11 */ 13 */
12 14
13#include <linux/oprofile.h> 15#include <linux/oprofile.h>
16#include <linux/slab.h>
14#include <asm/ptrace.h> 17#include <asm/ptrace.h>
15#include <asm/msr.h> 18#include <asm/msr.h>
16#include <asm/apic.h> 19#include <asm/apic.h>
17#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h>
18 22
19#include "op_x86_model.h" 23#include "op_x86_model.h"
20#include "op_counter.h" 24#include "op_counter.h"
21 25
22#define NUM_COUNTERS 2 26static int num_counters = 2;
23#define NUM_CONTROLS 2 27static int counter_width = 32;
24 28
25#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
26#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) 30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
27#define CTR_32BIT_WRITE(l, msrs, c) \ 31#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1))))
28 do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
29#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
30 32
31#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) 33#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
32#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) 34#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
@@ -40,20 +42,20 @@
40#define CTRL_SET_UM(val, m) (val |= (m << 8)) 42#define CTRL_SET_UM(val, m) (val |= (m << 8))
41#define CTRL_SET_EVENT(val, e) (val |= e) 43#define CTRL_SET_EVENT(val, e) (val |= e)
42 44
43static unsigned long reset_value[NUM_COUNTERS]; 45static u64 *reset_value;
44 46
45static void ppro_fill_in_addresses(struct op_msrs * const msrs) 47static void ppro_fill_in_addresses(struct op_msrs * const msrs)
46{ 48{
47 int i; 49 int i;
48 50
49 for (i = 0; i < NUM_COUNTERS; i++) { 51 for (i = 0; i < num_counters; i++) {
50 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 52 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
51 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 53 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
52 else 54 else
53 msrs->counters[i].addr = 0; 55 msrs->counters[i].addr = 0;
54 } 56 }
55 57
56 for (i = 0; i < NUM_CONTROLS; i++) { 58 for (i = 0; i < num_counters; i++) {
57 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 59 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
58 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 60 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
59 else 61 else
@@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
67 unsigned int low, high; 69 unsigned int low, high;
68 int i; 70 int i;
69 71
72 if (!reset_value) {
73 reset_value = kmalloc(sizeof(reset_value[0]) * num_counters,
74 GFP_ATOMIC);
75 if (!reset_value)
76 return;
77 }
78
79 if (cpu_has_arch_perfmon) {
80 union cpuid10_eax eax;
81 eax.full = cpuid_eax(0xa);
82 if (counter_width < eax.split.bit_width)
83 counter_width = eax.split.bit_width;
84 }
85
70 /* clear all counters */ 86 /* clear all counters */
71 for (i = 0 ; i < NUM_CONTROLS; ++i) { 87 for (i = 0 ; i < num_counters; ++i) {
72 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 88 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
73 continue; 89 continue;
74 CTRL_READ(low, high, msrs, i); 90 CTRL_READ(low, high, msrs, i);
@@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
77 } 93 }
78 94
79 /* avoid a false detection of ctr overflows in NMI handler */ 95 /* avoid a false detection of ctr overflows in NMI handler */
80 for (i = 0; i < NUM_COUNTERS; ++i) { 96 for (i = 0; i < num_counters; ++i) {
81 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 97 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
82 continue; 98 continue;
83 CTR_32BIT_WRITE(1, msrs, i); 99 wrmsrl(msrs->counters[i].addr, -1LL);
84 } 100 }
85 101
86 /* enable active counters */ 102 /* enable active counters */
87 for (i = 0; i < NUM_COUNTERS; ++i) { 103 for (i = 0; i < num_counters; ++i) {
88 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 104 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
89 reset_value[i] = counter_config[i].count; 105 reset_value[i] = counter_config[i].count;
90 106
91 CTR_32BIT_WRITE(counter_config[i].count, msrs, i); 107 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
92 108
93 CTRL_READ(low, high, msrs, i); 109 CTRL_READ(low, high, msrs, i);
94 CTRL_CLEAR(low); 110 CTRL_CLEAR(low);
@@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
111 unsigned int low, high; 127 unsigned int low, high;
112 int i; 128 int i;
113 129
114 for (i = 0 ; i < NUM_COUNTERS; ++i) { 130 for (i = 0 ; i < num_counters; ++i) {
115 if (!reset_value[i]) 131 if (!reset_value[i])
116 continue; 132 continue;
117 CTR_READ(low, high, msrs, i); 133 CTR_READ(low, high, msrs, i);
118 if (CTR_OVERFLOWED(low)) { 134 if (CTR_OVERFLOWED(low)) {
119 oprofile_add_sample(regs, i); 135 oprofile_add_sample(regs, i);
120 CTR_32BIT_WRITE(reset_value[i], msrs, i); 136 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
121 } 137 }
122 } 138 }
123 139
@@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs)
141 unsigned int low, high; 157 unsigned int low, high;
142 int i; 158 int i;
143 159
144 for (i = 0; i < NUM_COUNTERS; ++i) { 160 for (i = 0; i < num_counters; ++i) {
145 if (reset_value[i]) { 161 if (reset_value[i]) {
146 CTRL_READ(low, high, msrs, i); 162 CTRL_READ(low, high, msrs, i);
147 CTRL_SET_ACTIVE(low); 163 CTRL_SET_ACTIVE(low);
@@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
156 unsigned int low, high; 172 unsigned int low, high;
157 int i; 173 int i;
158 174
159 for (i = 0; i < NUM_COUNTERS; ++i) { 175 for (i = 0; i < num_counters; ++i) {
160 if (!reset_value[i]) 176 if (!reset_value[i])
161 continue; 177 continue;
162 CTRL_READ(low, high, msrs, i); 178 CTRL_READ(low, high, msrs, i);
@@ -169,24 +185,70 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
169{ 185{
170 int i; 186 int i;
171 187
172 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 188 for (i = 0 ; i < num_counters ; ++i) {
173 if (CTR_IS_RESERVED(msrs, i)) 189 if (CTR_IS_RESERVED(msrs, i))
174 release_perfctr_nmi(MSR_P6_PERFCTR0 + i); 190 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
175 } 191 }
176 for (i = 0 ; i < NUM_CONTROLS ; ++i) { 192 for (i = 0 ; i < num_counters ; ++i) {
177 if (CTRL_IS_RESERVED(msrs, i)) 193 if (CTRL_IS_RESERVED(msrs, i))
178 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); 194 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
179 } 195 }
196 if (reset_value) {
197 kfree(reset_value);
198 reset_value = NULL;
199 }
180} 200}
181 201
182 202
183struct op_x86_model_spec const op_ppro_spec = { 203struct op_x86_model_spec op_ppro_spec = {
184 .num_counters = NUM_COUNTERS, 204 .num_counters = 2, /* can be overridden */
185 .num_controls = NUM_CONTROLS, 205 .num_controls = 2, /* ditto */
186 .fill_in_addresses = &ppro_fill_in_addresses, 206 .fill_in_addresses = &ppro_fill_in_addresses,
187 .setup_ctrs = &ppro_setup_ctrs, 207 .setup_ctrs = &ppro_setup_ctrs,
188 .check_ctrs = &ppro_check_ctrs, 208 .check_ctrs = &ppro_check_ctrs,
189 .start = &ppro_start, 209 .start = &ppro_start,
190 .stop = &ppro_stop, 210 .stop = &ppro_stop,
191 .shutdown = &ppro_shutdown 211 .shutdown = &ppro_shutdown
212};
213
214/*
215 * Architectural performance monitoring.
216 *
217 * Newer Intel CPUs (Core1+) have support for architectural
218 * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
219 * The advantage of this is that it can be done without knowing about
220 * the specific CPU.
221 */
222
223void arch_perfmon_setup_counters(void)
224{
225 union cpuid10_eax eax;
226
227 eax.full = cpuid_eax(0xa);
228
229 /* Workaround for BIOS bugs on family 6, model 15 CPUs. Taken from perfmon2 */
230 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
231 current_cpu_data.x86_model == 15) {
232 eax.split.version_id = 2;
233 eax.split.num_counters = 2;
234 eax.split.bit_width = 40;
235 }
236
237 num_counters = eax.split.num_counters;
238
239 op_arch_perfmon_spec.num_counters = num_counters;
240 op_arch_perfmon_spec.num_controls = num_counters;
241 op_ppro_spec.num_counters = num_counters;
242 op_ppro_spec.num_controls = num_counters;
243}
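/*
 * Sketch of the CPUID 0xA EAX layout assumed above (see IA32 SDM Vol 3b):
 * bits 7:0 = version_id, bits 15:8 = num_counters, bits 23:16 = bit_width.
 * For example, eax = 0x07280202 decodes to version 2, 2 counters,
 * 40-bit wide counters.
 */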
244
245struct op_x86_model_spec op_arch_perfmon_spec = {
246 /* num_counters/num_controls filled in at runtime */
247 .fill_in_addresses = &ppro_fill_in_addresses,
248 /* user space does the cpuid check for available events */
249 .setup_ctrs = &ppro_setup_ctrs,
250 .check_ctrs = &ppro_check_ctrs,
251 .start = &ppro_start,
252 .stop = &ppro_stop,
253 .shutdown = &ppro_shutdown
192}; 254};
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605fa71d..825e79064d6 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -22,8 +22,8 @@ struct op_msr {
22}; 22};
23 23
24struct op_msrs { 24struct op_msrs {
25 struct op_msr * counters; 25 struct op_msr *counters;
26 struct op_msr * controls; 26 struct op_msr *controls;
27}; 27};
28 28
29struct pt_regs; 29struct pt_regs;
@@ -32,8 +32,10 @@ struct pt_regs;
32 * various x86 CPU models' perfctr support. 32 * various x86 CPU models' perfctr support.
33 */ 33 */
34struct op_x86_model_spec { 34struct op_x86_model_spec {
35 unsigned int const num_counters; 35 int (*init)(struct oprofile_operations *ops);
36 unsigned int const num_controls; 36 void (*exit)(void);
37 unsigned int num_counters;
38 unsigned int num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs); 39 void (*fill_in_addresses)(struct op_msrs * const msrs);
38 void (*setup_ctrs)(struct op_msrs const * const msrs); 40 void (*setup_ctrs)(struct op_msrs const * const msrs);
39 int (*check_ctrs)(struct pt_regs * const regs, 41 int (*check_ctrs)(struct pt_regs * const regs,
@@ -43,9 +45,12 @@ struct op_x86_model_spec {
43 void (*shutdown)(struct op_msrs const * const msrs); 45 void (*shutdown)(struct op_msrs const * const msrs);
44}; 46};
45 47
46extern struct op_x86_model_spec const op_ppro_spec; 48extern struct op_x86_model_spec op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec; 49extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec; 50extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec; 51extern struct op_x86_model_spec const op_amd_spec;
52extern struct op_x86_model_spec op_arch_perfmon_spec;
53
54extern void arch_perfmon_setup_counters(void);
50 55
51#endif /* OP_X86_MODEL_H */ 56#endif /* OP_X86_MODEL_H */
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af06927fb..1d88d2b3977 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void)
250 acpi_pci_irq_enable(dev); 250 acpi_pci_irq_enable(dev);
251 } 251 }
252 252
253#ifdef CONFIG_X86_IO_APIC
254 if (acpi_ioapic)
255 print_IO_APIC();
256#endif
257
258 return 0; 253 return 0;
259} 254}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 6a0fca78c36..22e057665e5 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu) 580 unsigned long action, void *hcpu)
581{ 581{
582 int cpu = (long)hcpu; 582 int cpu = (long)hcpu;
583 switch(action) { 583 switch (action) {
584 case CPU_ONLINE: 584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN: 585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); 586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index 4bdaa590375..3c27a809393 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); 511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); 513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
514
515/*
516 * SB600: Disable BAR1 on device 14.0 to keep HPET resources from
517 * confusing the PCI engine:
518 */
519static void sb600_disable_hpet_bar(struct pci_dev *dev)
520{
521 u8 val;
522
523 /*
524 * The SB600 and SB700 both share the same device
525 * ID, but the PM register 0x55 does something different
526 * for the SB700, so make sure we are dealing with the
527 * SB600 before touching the bit:
528 */
529
530 pci_read_config_byte(dev, 0x08, &val);
531
532 if (val < 0x2F) {
533 outb(0x55, 0xCD6);
534 val = inb(0xCD7);
535
536 /* Set bit 7 in PM register 0x55 */
537 outb(0x55, 0xCD6);
538 outb(val | 0x80, 0xCD7);
539 }
540}
541DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
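/*
 * Note (illustrative, assuming the SBx00 PM access convention): ports
 * 0xCD6/0xCD7 form the SB600's PM index/data pair, so the sequence above
 * reads indexed PM register 0x55 and sets bit 7, which hides the HPET
 * BAR on pre-0x2F revisions.
 */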
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 8791fc55e71..844df0cbbd3 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h>
36 37
37#include "pci.h" 38#include "pci.h"
38 39
@@ -227,6 +228,8 @@ void __init pcibios_resource_survey(void)
227 pcibios_allocate_bus_resources(&pci_root_buses); 228 pcibios_allocate_bus_resources(&pci_root_buses);
228 pcibios_allocate_resources(0); 229 pcibios_allocate_resources(0);
229 pcibios_allocate_resources(1); 230 pcibios_allocate_resources(1);
231
232 e820_reserve_resources_late();
230} 233}
231 234
232/** 235/**
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8e077185e18..bf69dbe08bf 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 dev_info(&dev->dev, 495 dev_info(&dev->dev,
496 "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", 496 "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq); 497 dev->vendor, dev->device, pirq, irq);
498 return irq; 498 return irq;
499} 499}
@@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
502{ 502{
503 dev_info(&dev->dev, 503 dev_info(&dev->dev,
504 "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", 504 "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq); 505 dev->vendor, dev->device, pirq, irq);
506 if (pirq <= 4) 506 if (pirq <= 4)
507 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
@@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 590 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 591 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 592 case PCI_DEVICE_ID_INTEL_ICH10_3:
593 case PCI_DEVICE_ID_INTEL_PCH_0:
594 case PCI_DEVICE_ID_INTEL_PCH_1:
595 r->name = "PIIX/ICH"; 593 r->name = "PIIX/ICH";
596 r->get = pirq_piix_get; 594 r->get = pirq_piix_get;
597 r->set = pirq_piix_set; 595 r->set = pirq_piix_set;
598 return 1; 596 return 1;
599 } 597 }
598
599 if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
600 (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
601 r->name = "PIIX/ICH";
602 r->get = pirq_piix_get;
603 r->set = pirq_piix_set;
604 return 1;
605 }
606
600 return 0; 607 return 0;
601} 608}
602 609
@@ -823,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r)
 	r->get = NULL;
 	r->set = NULL;
 
-	DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n",
+	DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
 	    rt->rtr_vendor, rt->rtr_device);
 
 	pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
@@ -843,7 +850,7 @@ static void __init pirq_find_router(struct irq_router *r)
 		    h->probe(r, pirq_router_dev, pirq_router_dev->device))
 			break;
 	}
-	dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n",
+	dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
 		 pirq_router.name,
 		 pirq_router_dev->vendor, pirq_router_dev->device);
 
@@ -1043,35 +1050,44 @@ static void __init pcibios_fixup_irqs(void)
 		if (io_apic_assign_pci_irqs) {
 			int irq;
 
-			if (pin) {
-				/*
-				 * interrupt pins are numbered starting
-				 * from 1
-				 */
-				pin--;
-				irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-					PCI_SLOT(dev->devfn), pin);
-				/*
-				 * Busses behind bridges are typically not listed in the MP-table.
-				 * In this case we have to look up the IRQ based on the parent bus,
-				 * parent slot, and pin number. The SMP code detects such bridged
-				 * busses itself so we should get into this branch reliably.
-				 */
-				if (irq < 0 && dev->bus->parent) { /* go back to the bridge */
-					struct pci_dev *bridge = dev->bus->self;
-
-					pin = (pin + PCI_SLOT(dev->devfn)) % 4;
-					irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
-						PCI_SLOT(bridge->devfn), pin);
-					if (irq >= 0)
-						dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n",
-							 pci_name(bridge),
-							 'A' + pin, irq);
-				}
-				if (irq >= 0) {
-					dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq);
-					dev->irq = irq;
-				}
+			if (!pin)
+				continue;
+
+			/*
+			 * interrupt pins are numbered starting from 1
+			 */
+			pin--;
+			irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+				PCI_SLOT(dev->devfn), pin);
+			/*
+			 * Busses behind bridges are typically not listed in the
+			 * MP-table.  In this case we have to look up the IRQ
+			 * based on the parent bus, parent slot, and pin number.
+			 * The SMP code detects such bridged busses itself so we
+			 * should get into this branch reliably.
+			 */
+			if (irq < 0 && dev->bus->parent) {
+				/* go back to the bridge */
+				struct pci_dev *bridge = dev->bus->self;
+				int bus;
+
+				pin = (pin + PCI_SLOT(dev->devfn)) % 4;
+				bus = bridge->bus->number;
+				irq = IO_APIC_get_PCI_irq_vector(bus,
+						PCI_SLOT(bridge->devfn), pin);
+				if (irq >= 0)
+					dev_warn(&dev->dev,
+						 "using bridge %s INT %c to "
+						 "get IRQ %d\n",
+						 pci_name(bridge),
+						 'A' + pin, irq);
+			}
+			if (irq >= 0) {
+				dev_info(&dev->dev,
+					 "PCI->APIC IRQ transform: INT %c "
+					 "-> IRQ %d\n",
+					 'A' + pin, irq);
+				dev->irq = irq;
 			}
 		}
 #endif
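The bridge fallback in the pcibios_fixup_irqs() hunk relies on the conventional PCI INTx swizzle: when a device sits behind a PCI-PCI bridge that the MP-table does not describe, its interrupt pin as seen on the parent bus is the pin rotated by the device's slot number, modulo four. A standalone sketch of that rule (the function name is illustrative, not from the patch):

/* Sketch: one level of conventional INTx swizzling.
   pin is 0-based (INTA == 0), slot is PCI_SLOT(devfn). */
static int pirq_swizzle(int pin, int slot)
{
	return (pin + slot) % 4;	/* pin as seen at the parent bus */
}

This matches the pin = (pin + PCI_SLOT(dev->devfn)) % 4; line above; the vector lookup is then retried with the bridge's own bus and slot.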
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index d9635764ce3..654a2234f8f 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
 	return name != NULL;
 }
 
-static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
+static void __init pci_mmcfg_insert_resources(void)
 {
 #define PCI_MMCFG_RESOURCE_NAME_LEN 19
 	int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
 			 cfg->pci_segment);
 		res->start = cfg->address;
 		res->end = res->start + (num_buses << 20) - 1;
-		res->flags = IORESOURCE_MEM | resource_flags;
+		res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
 		insert_resource(&iomem_resource, res);
 		names += PCI_MMCFG_RESOURCE_NAME_LEN;
 	}
@@ -434,11 +434,9 @@ static void __init __pci_mmcfg_init(int early)
 	    (pci_mmcfg_config[0].address == 0))
 		return;
 
-	if (pci_mmcfg_arch_init()) {
-		if (known_bridge)
-			pci_mmcfg_insert_resources(IORESOURCE_BUSY);
+	if (pci_mmcfg_arch_init())
 		pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
-	} else {
+	else {
 		/*
 		 * Signal not to attempt to insert mmcfg resources because
 		 * the architecture mmcfg setup could not initialize.
@@ -475,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
 	 * marked so it won't cause request errors when __request_region is
 	 * called.
 	 */
-	pci_mmcfg_insert_resources(0);
+	pci_mmcfg_insert_resources();
 
 	return 0;
 }
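For reference, each PCI bus decodes 1 MB of MMCONFIG space (256 devfns at 4 KB of extended config space apiece), which is where the num_buses << 20 sizing above comes from. A hedged sketch of the same insert_resource() pattern, using a helper name invented for illustration:

/* Sketch only: publish one MMCONFIG window in the iomem tree,
   mirroring the res->start/res->end arithmetic in the hunk above. */
static void reserve_mmcfg_window(struct resource *res, u64 base,
				 unsigned int start_bus, unsigned int end_bus)
{
	res->name  = "PCI MMCONFIG";
	res->start = base;
	res->end   = base + ((u64)(end_bus - start_bus + 1) << 20) - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	insert_resource(&iomem_resource, res);
}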
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index d3e083dea72..274d06082f4 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
 #include <linux/suspend.h>
 #include <asm/mtrr.h>
 #include <asm/mce.h>
+#include <asm/xcr.h>
 
 static struct saved_context saved_context;
 
@@ -126,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
 	if (boot_cpu_has(X86_FEATURE_SEP))
 		enable_sep_cpu();
 
+	/*
+	 * restore XCR0 for xsave capable cpu's.
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
 	fix_processor_context();
 	do_fpu_end();
 	mtrr_ap_init();
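xsetbv() used above is a thin wrapper around the XSETBV instruction: ECX selects the extended control register (XCR0, i.e. XCR_XFEATURE_ENABLED_MASK), and EDX:EAX carry the 64-bit feature mask. A sketch of how such a wrapper is typically written, emitting the raw opcode for assemblers that predate the mnemonic (the kernel's real version lives in asm/xcr.h; this is illustrative):

static inline void xsetbv_sketch(u32 index, u64 value)
{
	u32 eax = value;	/* low 32 bits of the XCR value */
	u32 edx = value >> 32;	/* high 32 bits */

	/* xsetbv == opcode bytes 0f 01 d1 */
	asm volatile(".byte 0x0f,0x01,0xd1"
		     : : "a" (eax), "d" (edx), "c" (index));
}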
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd..e3b6cf70d62 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
+#include <asm/xcr.h>
 
 static void fix_processor_context(void);
 
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
 	wrmsrl(MSR_GS_BASE, ctxt->gs_base);
 	wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
 
+	/*
+	 * restore XCR0 for xsave capable cpu's.
+	 */
+	if (cpu_has_xsave)
+		xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
+
 	fix_processor_context();
 
 	do_fpu_end();
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index 4fc7e872c85..d1e9b53f9d3 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
-.text
-
 /*
  * This may not use any stack, nor any variable that is not "NoSave":
  *
@@ -12,17 +10,18 @@
 #include <asm/segment.h>
 #include <asm/page.h>
 #include <asm/asm-offsets.h>
+#include <asm/processor-flags.h>
 
-	.text
+.text
 
 ENTRY(swsusp_arch_suspend)
-
 	movl %esp, saved_context_esp
 	movl %ebx, saved_context_ebx
 	movl %ebp, saved_context_ebp
 	movl %esi, saved_context_esi
 	movl %edi, saved_context_edi
-	pushfl ; popl saved_context_eflags
+	pushfl
+	popl saved_context_eflags
 
 	call swsusp_save
 	ret
@@ -59,7 +58,7 @@ done:
 	movl	mmu_cr4_features, %ecx
 	jecxz	1f	# cr4 Pentium and higher, skip if zero
 	movl	%ecx, %edx
-	andl	$~(1<<7), %edx;  # PGE
+	andl	$~(X86_CR4_PGE), %edx
 	movl	%edx, %cr4;  # turn off PGE
 1:
 	movl	%cr3, %eax;  # flush TLB
@@ -74,7 +73,8 @@ done:
 	movl	saved_context_esi, %esi
 	movl	saved_context_edi, %edi
 
-	pushl	saved_context_eflags ; popfl
+	pushl	saved_context_eflags
+	popfl
 
 	xorl	%eax, %eax
 
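The block around the andl $~(X86_CR4_PGE) change performs the classic full TLB flush: global pages survive a plain CR3 reload, so CR4.PGE is cleared first (which itself flushes global entries), CR3 is rewritten to flush the rest, and PGE is restored afterwards from the saved CR4 value. The same sequence, as a hedged C sketch using the kernel's control-register accessors:

/* Sketch of the flush done by the assembly above. */
static inline void flush_tlb_including_globals(void)
{
	unsigned long cr4 = read_cr4();

	write_cr4(cr4 & ~X86_CR4_PGE);	/* clearing PGE flushes global entries */
	write_cr3(read_cr3());		/* CR3 reload flushes the non-global ones */
	write_cr4(cr4);			/* restore global-page enable */
}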
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 3815e425f47..87b9ab16642 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -26,5 +26,13 @@ config XEN_MAX_DOMAIN_MEMORY
 
 config XEN_SAVE_RESTORE
 	bool
-	depends on PM
-	default y
\ No newline at end of file
+	depends on XEN && PM
+	default y
+
+config XEN_DEBUG_FS
+	bool "Enable Xen debug and tuning parameters in debugfs"
+	depends on XEN && DEBUG_FS
+	default n
+	help
+	  Enable statistics output and various tuning options in debugfs.
+	  Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 59c1e539aed..313947940a1 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
-obj-y		:= enlighten.o setup.o multicalls.o mmu.o \
+ifdef CONFIG_FTRACE
+# Do not profile debug and lowlevel utilities
+CFLAGS_REMOVE_spinlock.o = -pg
+CFLAGS_REMOVE_time.o = -pg
+CFLAGS_REMOVE_irq.o = -pg
+endif
+
+obj-y		:= enlighten.o setup.o multicalls.o mmu.o irq.o \
 			time.o xen-asm_$(BITS).o grant-table.o suspend.o
 
-obj-$(CONFIG_SMP)	+= smp.o
+obj-$(CONFIG_SMP)	+= smp.o spinlock.o
+obj-$(CONFIG_XEN_DEBUG_FS)	+= debugfs.o
\ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 00000000000..b53225d2cac
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
+#include <linux/init.h>
+#include <linux/debugfs.h>
+#include <linux/module.h>
+
+#include "debugfs.h"
+
+static struct dentry *d_xen_debug;
+
+struct dentry * __init xen_init_debugfs(void)
+{
+	if (!d_xen_debug) {
+		d_xen_debug = debugfs_create_dir("xen", NULL);
+
+		if (!d_xen_debug)
+			pr_warning("Could not create 'xen' debugfs directory\n");
+	}
+
+	return d_xen_debug;
+}
+
+struct array_data
+{
+	void *array;
+	unsigned elements;
+};
+
+static int u32_array_open(struct inode *inode, struct file *file)
+{
+	file->private_data = NULL;
+	return nonseekable_open(inode, file);
+}
+
+static size_t format_array(char *buf, size_t bufsize, const char *fmt,
+			   u32 *array, unsigned array_size)
+{
+	size_t ret = 0;
+	unsigned i;
+
+	for(i = 0; i < array_size; i++) {
+		size_t len;
+
+		len = snprintf(buf, bufsize, fmt, array[i]);
+		len++;	/* ' ' or '\n' */
+		ret += len;
+
+		if (buf) {
+			buf += len;
+			bufsize -= len;
+			buf[-1] = (i == array_size-1) ? '\n' : ' ';
+		}
+	}
+
+	ret++;		/* \0 */
+	if (buf)
+		*buf = '\0';
+
+	return ret;
+}
+
+static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
+{
+	size_t len = format_array(NULL, 0, fmt, array, array_size);
+	char *ret;
+
+	ret = kmalloc(len, GFP_KERNEL);
+	if (ret == NULL)
+		return NULL;
+
+	format_array(ret, len, fmt, array, array_size);
+	return ret;
+}
+
+static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
+			      loff_t *ppos)
+{
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct array_data *data = inode->i_private;
+	size_t size;
+
+	if (*ppos == 0) {
+		if (file->private_data) {
+			kfree(file->private_data);
+			file->private_data = NULL;
+		}
+
+		file->private_data = format_array_alloc("%u", data->array, data->elements);
+	}
+
+	size = 0;
+	if (file->private_data)
+		size = strlen(file->private_data);
+
+	return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
+}
+
+static int xen_array_release(struct inode *inode, struct file *file)
+{
+	kfree(file->private_data);
+
+	return 0;
+}
+
+static struct file_operations u32_array_fops = {
+	.owner	= THIS_MODULE,
+	.open	= u32_array_open,
+	.release= xen_array_release,
+	.read	= u32_array_read,
+};
+
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+					    struct dentry *parent,
+					    u32 *array, unsigned elements)
+{
+	struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
+
+	if (data == NULL)
+		return NULL;
+
+	data->array = array;
+	data->elements = elements;
+
+	return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
+}
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 00000000000..e2813208483
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
+#ifndef _XEN_DEBUGFS_H
+#define _XEN_DEBUGFS_H
+
+struct dentry * __init xen_init_debugfs(void);
+
+struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
+					    struct dentry *parent,
+					    u32 *array, unsigned elements);
+
+#endif /* _XEN_DEBUGFS_H */
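Callers hand xen_debugfs_create_u32_array() a live u32 array; every read re-formats the current contents (note that format_array() runs twice, first with a NULL buffer to compute the size, then again to fill the allocation). A hypothetical caller might look like this (all names invented for illustration):

/* Sketch only: expose a statistics array under the xen debugfs dir. */
static u32 sample_stats[16];

static int __init sample_debugfs_init(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	if (d_xen == NULL)
		return -ENOMEM;

	xen_debugfs_create_u32_array("sample_histo", 0444, d_xen,
				     sample_stats, ARRAY_SIZE(sample_stats));
	return 0;
}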
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a4e201b47f6..b61534c7a4c 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,12 +30,12 @@
 #include <xen/interface/xen.h>
 #include <xen/interface/physdev.h>
 #include <xen/interface/vcpu.h>
-#include <xen/interface/sched.h>
 #include <xen/features.h>
 #include <xen/page.h>
 #include <xen/hvc-console.h>
 
 #include <asm/paravirt.h>
+#include <asm/apic.h>
 #include <asm/page.h>
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -57,6 +57,9 @@ EXPORT_SYMBOL_GPL(hypercall_page);
 DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
 DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
 
+enum xen_domain_type xen_domain_type = XEN_NATIVE;
+EXPORT_SYMBOL_GPL(xen_domain_type);
+
 /*
  * Identity map, in addition to plain kernel map.  This needs to be
  * large enough to allocate page table pages to allocate the rest.
@@ -110,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
  *
  * 0: not available, 1: available
  */
-static int have_vcpu_info_placement = 1;
+static int have_vcpu_info_placement =
+#ifdef CONFIG_X86_32
+	1
+#else
+	0
+#endif
+	;
+
 
 static void xen_vcpu_setup(int cpu)
 {
@@ -226,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
 	return HYPERVISOR_get_debugreg(reg);
 }
 
-static unsigned long xen_save_fl(void)
+static void xen_leave_lazy(void)
 {
-	struct vcpu_info *vcpu;
-	unsigned long flags;
-
-	vcpu = x86_read_percpu(xen_vcpu);
-
-	/* flag has opposite sense of mask */
-	flags = !vcpu->evtchn_upcall_mask;
-
-	/* convert to IF type flag
-	   -0 -> 0x00000000
-	   -1 -> 0xffffffff
-	*/
-	return (-flags) & X86_EFLAGS_IF;
+	paravirt_leave_lazy(paravirt_get_lazy_mode());
+	xen_mc_flush();
 }
 
-static void xen_restore_fl(unsigned long flags)
+static unsigned long xen_store_tr(void)
 {
-	struct vcpu_info *vcpu;
-
-	/* convert from IF type flag */
-	flags = !(flags & X86_EFLAGS_IF);
-
-	/* There's a one instruction preempt window here.  We need to
-	   make sure we're don't switch CPUs between getting the vcpu
-	   pointer and updating the mask. */
-	preempt_disable();
-	vcpu = x86_read_percpu(xen_vcpu);
-	vcpu->evtchn_upcall_mask = flags;
-	preempt_enable_no_resched();
-
-	/* Doesn't matter if we get preempted here, because any
-	   pending event will get dealt with anyway. */
-
-	if (flags == 0) {
-		preempt_check_resched();
-		barrier(); /* unmask then check (avoid races) */
-		if (unlikely(vcpu->evtchn_upcall_pending))
-			force_evtchn_callback();
-	}
+	return 0;
 }
 
-static void xen_irq_disable(void)
+/*
+ * Set the page permissions for a particular virtual address.  If the
+ * address is a vmalloc mapping (or other non-linear mapping), then
+ * find the linear mapping of the page and also set its protections to
+ * match.
+ */
+static void set_aliased_prot(void *v, pgprot_t prot)
 {
-	/* There's a one instruction preempt window here.  We need to
-	   make sure we're don't switch CPUs between getting the vcpu
-	   pointer and updating the mask. */
-	preempt_disable();
-	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
-	preempt_enable_no_resched();
-}
+	int level;
+	pte_t *ptep;
+	pte_t pte;
+	unsigned long pfn;
+	struct page *page;
 
-static void xen_irq_enable(void)
-{
-	struct vcpu_info *vcpu;
+	ptep = lookup_address((unsigned long)v, &level);
+	BUG_ON(ptep == NULL);
 
-	/* We don't need to worry about being preempted here, since
-	   either a) interrupts are disabled, so no preemption, or b)
-	   the caller is confused and is trying to re-enable interrupts
-	   on an indeterminate processor. */
+	pfn = pte_pfn(*ptep);
+	page = pfn_to_page(pfn);
 
-	vcpu = x86_read_percpu(xen_vcpu);
-	vcpu->evtchn_upcall_mask = 0;
+	pte = pfn_pte(pfn, prot);
 
-	/* Doesn't matter if we get preempted here, because any
-	   pending event will get dealt with anyway. */
+	if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
+		BUG();
 
-	barrier(); /* unmask then check (avoid races) */
-	if (unlikely(vcpu->evtchn_upcall_pending))
-		force_evtchn_callback();
-}
+	if (!PageHighMem(page)) {
+		void *av = __va(PFN_PHYS(pfn));
 
-static void xen_safe_halt(void)
-{
-	/* Blocking includes an implicit local_irq_enable(). */
-	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
-		BUG();
+		if (av != v)
+			if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
+				BUG();
+	} else
+		kmap_flush_unused();
 }
 
-static void xen_halt(void)
+static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
-	if (irqs_disabled())
-		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
-	else
-		xen_safe_halt();
-}
+	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+	int i;
 
-static void xen_leave_lazy(void)
-{
-	paravirt_leave_lazy(paravirt_get_lazy_mode());
-	xen_mc_flush();
+	for(i = 0; i < entries; i += entries_per_page)
+		set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
 }
 
-static unsigned long xen_store_tr(void)
+static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
 {
-	return 0;
+	const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
+	int i;
+
+	for(i = 0; i < entries; i += entries_per_page)
+		set_aliased_prot(ldt + i, PAGE_KERNEL);
 }
 
 static void xen_set_ldt(const void *addr, unsigned entries)
@@ -425,8 +400,7 @@ static void xen_load_gs_index(unsigned int idx)
 static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 				const void *ptr)
 {
-	unsigned long lp = (unsigned long)&dt[entrynum];
-	xmaddr_t mach_lp = virt_to_machine(lp);
+	xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
 	u64 entry = *(u64 *)ptr;
 
 	preempt_disable();
@@ -559,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
 }
 
 static void xen_load_sp0(struct tss_struct *tss,
-			  struct thread_struct *thread)
+			 struct thread_struct *thread)
 {
 	struct multicall_space mcs = xen_mc_entry(0);
 	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -580,16 +554,47 @@ static void xen_io_delay(void)
 }
 
 #ifdef CONFIG_X86_LOCAL_APIC
-static u32 xen_apic_read(unsigned long reg)
+static u32 xen_apic_read(u32 reg)
 {
 	return 0;
 }
 
-static void xen_apic_write(unsigned long reg, u32 val)
+static void xen_apic_write(u32 reg, u32 val)
 {
 	/* Warn to see if there's any stray references */
 	WARN_ON(1);
 }
+
+static u64 xen_apic_icr_read(void)
+{
+	return 0;
+}
+
+static void xen_apic_icr_write(u32 low, u32 id)
+{
+	/* Warn to see if there's any stray references */
+	WARN_ON(1);
+}
+
+static void xen_apic_wait_icr_idle(void)
+{
+	return;
+}
+
+static u32 xen_safe_apic_wait_icr_idle(void)
+{
+	return 0;
+}
+
+static struct apic_ops xen_basic_apic_ops = {
+	.read = xen_apic_read,
+	.write = xen_apic_write,
+	.icr_read = xen_apic_icr_read,
+	.icr_write = xen_apic_icr_write,
+	.wait_icr_idle = xen_apic_wait_icr_idle,
+	.safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
+};
+
 #endif
 
 static void xen_flush_tlb(void)
@@ -803,6 +808,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 		ret = -EFAULT;
 		break;
 #endif
+
+	case MSR_STAR:
+	case MSR_CSTAR:
+	case MSR_LSTAR:
+	case MSR_SYSCALL_MASK:
+	case MSR_IA32_SYSENTER_CS:
+	case MSR_IA32_SYSENTER_ESP:
+	case MSR_IA32_SYSENTER_EIP:
+		/* Fast syscall setup is all done in hypercalls, so
+		   these are all ignored.  Stub them out here to stop
+		   Xen console noise. */
+		break;
+
 	default:
 		ret = native_write_msr_safe(msr, low, high);
 	}
@@ -812,7 +830,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
 
 /* Early in boot, while setting up the initial pagetable, assume
    everything is pinned. */
-static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
+static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
 {
 #ifdef CONFIG_FLATMEM
 	BUG_ON(mem_map);	/* should only be used early */
@@ -822,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
 
 /* Early release_pte assumes that all pts are pinned, since there's
    only init_mm and anything attached to that is pinned. */
-static void xen_release_pte_init(u32 pfn)
+static void xen_release_pte_init(unsigned long pfn)
 {
 	make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 }
@@ -838,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
 
 /* This needs to make sure the new pte page is pinned iff its being
    attached to a pinned pagetable. */
-static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
+static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
 {
 	struct page *page = pfn_to_page(pfn);
 
@@ -846,22 +864,23 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
 		SetPagePinned(page);
 
 		if (!PageHighMem(page)) {
-			make_lowmem_page_readonly(__va(PFN_PHYS(pfn)));
-			if (level == PT_PTE)
+			make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
 				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
 		} else
 			/* make sure there are no stray mappings of
 			   this page */
 			kmap_flush_unused();
+			vm_unmap_aliases();
 	}
 }
 
-static void xen_alloc_pte(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PTE);
 }
 
-static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PMD);
 }
@@ -909,13 +928,13 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
 }
 
 /* This should never happen until we're OK to use struct page */
-static void xen_release_ptpage(u32 pfn, unsigned level)
+static void xen_release_ptpage(unsigned long pfn, unsigned level)
 {
 	struct page *page = pfn_to_page(pfn);
 
 	if (PagePinned(page)) {
 		if (!PageHighMem(page)) {
-			if (level == PT_PTE)
+			if (level == PT_PTE && USE_SPLIT_PTLOCKS)
 				pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
 			make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
 		}
@@ -923,23 +942,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
 	}
 }
 
-static void xen_release_pte(u32 pfn)
+static void xen_release_pte(unsigned long pfn)
 {
 	xen_release_ptpage(pfn, PT_PTE);
 }
 
-static void xen_release_pmd(u32 pfn)
+static void xen_release_pmd(unsigned long pfn)
 {
 	xen_release_ptpage(pfn, PT_PMD);
 }
 
 #if PAGETABLE_LEVELS == 4
-static void xen_alloc_pud(struct mm_struct *mm, u32 pfn)
+static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PUD);
 }
 
-static void xen_release_pud(u32 pfn)
+static void xen_release_pud(unsigned long pfn)
 {
 	xen_release_ptpage(pfn, PT_PUD);
 }
@@ -962,6 +981,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
 }
 #endif
 
+#ifdef CONFIG_X86_32
 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
 {
 	/* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -980,6 +1000,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
 
 	xen_set_pte(ptep, pte);
 }
+#endif
 
 static __init void xen_pagetable_setup_start(pgd_t *base)
 {
@@ -1046,7 +1067,6 @@ void xen_setup_vcpu_info_placement(void)
 
 	/* xen_vcpu_setup managed to place the vcpu_info within the
 	   percpu area for all cpus, so make use of it */
-#ifdef CONFIG_X86_32
 	if (have_vcpu_info_placement) {
 		printk(KERN_INFO "Xen: using vcpu_info placement\n");
 
@@ -1056,7 +1076,6 @@ void xen_setup_vcpu_info_placement(void)
 		pv_irq_ops.irq_enable = xen_irq_enable_direct;
 		pv_mmu_ops.read_cr2 = xen_read_cr2_direct;
 	}
-#endif
 }
 
 static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
@@ -1077,12 +1096,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
 		goto patch_site
 
 	switch (type) {
-#ifdef CONFIG_X86_32
 		SITE(pv_irq_ops, irq_enable);
 		SITE(pv_irq_ops, irq_disable);
 		SITE(pv_irq_ops, save_fl);
 		SITE(pv_irq_ops, restore_fl);
-#endif /* CONFIG_X86_32 */
 #undef SITE
 
 	patch_site:
@@ -1220,6 +1237,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	.load_gs_index = xen_load_gs_index,
 #endif
 
+	.alloc_ldt = xen_alloc_ldt,
+	.free_ldt = xen_free_ldt,
+
 	.store_gdt = native_store_gdt,
 	.store_idt = native_store_idt,
 	.store_tr = xen_store_tr,
@@ -1241,40 +1261,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 	},
 };
 
-static void __init __xen_init_IRQ(void)
-{
-#ifdef CONFIG_X86_64
-	int i;
-
-	/* Create identity vector->irq map */
-	for(i = 0; i < NR_VECTORS; i++) {
-		int cpu;
-
-		for_each_possible_cpu(cpu)
-			per_cpu(vector_irq, cpu)[i] = i;
-	}
-#endif	/* CONFIG_X86_64 */
-
-	xen_init_IRQ();
-}
-
-static const struct pv_irq_ops xen_irq_ops __initdata = {
-	.init_IRQ = __xen_init_IRQ,
-	.save_fl = xen_save_fl,
-	.restore_fl = xen_restore_fl,
-	.irq_disable = xen_irq_disable,
-	.irq_enable = xen_irq_enable,
-	.safe_halt = xen_safe_halt,
-	.halt = xen_halt,
-#ifdef CONFIG_X86_64
-	.adjust_exception_frame = xen_adjust_exception_frame,
-#endif
-};
-
 static const struct pv_apic_ops xen_apic_ops __initdata = {
 #ifdef CONFIG_X86_LOCAL_APIC
-	.apic_write = xen_apic_write,
-	.apic_read = xen_apic_read,
 	.setup_boot_clock = paravirt_nop,
 	.setup_secondary_clock = paravirt_nop,
 	.startup_ipi_hook = paravirt_nop,
@@ -1413,7 +1401,7 @@ static void __init xen_reserve_top(void)
 	if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
 		top = pp.virt_start;
 
-	reserve_top_address(-top + 2 * PAGE_SIZE);
+	reserve_top_address(-top);
 #endif	/* CONFIG_X86_32 */
 }
 
@@ -1447,48 +1435,11 @@ static void *m2v(phys_addr_t maddr)
 	return __ka(m2p(maddr));
 }
 
-#ifdef CONFIG_X86_64
-static void walk(pgd_t *pgd, unsigned long addr)
-{
-	unsigned l4idx = pgd_index(addr);
-	unsigned l3idx = pud_index(addr);
-	unsigned l2idx = pmd_index(addr);
-	unsigned l1idx = pte_index(addr);
-	pgd_t l4;
-	pud_t l3;
-	pmd_t l2;
-	pte_t l1;
-
-	xen_raw_printk("walk %p, %lx -> %d %d %d %d\n",
-		       pgd, addr, l4idx, l3idx, l2idx, l1idx);
-
-	l4 = pgd[l4idx];
-	xen_raw_printk("  l4: %016lx\n", l4.pgd);
-	xen_raw_printk("      %016lx\n", pgd_val(l4));
-
-	l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx];
-	xen_raw_printk("  l3: %016lx\n", l3.pud);
-	xen_raw_printk("      %016lx\n", pud_val(l3));
-
-	l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx];
-	xen_raw_printk("  l2: %016lx\n", l2.pmd);
-	xen_raw_printk("      %016lx\n", pmd_val(l2));
-
-	l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx];
-	xen_raw_printk("  l1: %016lx\n", l1.pte);
-	xen_raw_printk("      %016lx\n", pte_val(l1));
-}
-#endif
-
 static void set_page_prot(void *addr, pgprot_t prot)
 {
 	unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
 	pte_t pte = pfn_pte(pfn, prot);
 
-	xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n",
-		       addr, pfn, get_phys_to_machine(pfn),
-		       pgprot_val(prot), pte.pte);
-
 	if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
 		BUG();
 }
@@ -1664,6 +1615,8 @@ asmlinkage void __init xen_start_kernel(void)
 	if (!xen_start_info)
 		return;
 
+	xen_domain_type = XEN_PV_DOMAIN;
+
 	BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
 
 	xen_setup_features();
@@ -1673,10 +1626,18 @@ asmlinkage void __init xen_start_kernel(void)
 	pv_init_ops = xen_init_ops;
 	pv_time_ops = xen_time_ops;
 	pv_cpu_ops = xen_cpu_ops;
-	pv_irq_ops = xen_irq_ops;
 	pv_apic_ops = xen_apic_ops;
 	pv_mmu_ops = xen_mmu_ops;
 
+	xen_init_irq_ops();
+
+#ifdef CONFIG_X86_LOCAL_APIC
+	/*
+	 * set up the basic apic ops.
+	 */
+	apic_ops = &xen_basic_apic_ops;
+#endif
+
 	if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
 		pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
 		pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1700,7 +1661,7 @@ asmlinkage void __init xen_start_kernel(void)
 
 	/* Prevent unwanted bits from being set in PTEs. */
 	__supported_pte_mask &= ~_PAGE_GLOBAL;
-	if (!is_initial_xendomain())
+	if (!xen_initial_domain())
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
 	/* Don't do the full vcpu_info placement stuff until we have a
@@ -1735,7 +1696,7 @@ asmlinkage void __init xen_start_kernel(void)
 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
 	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
-	if (!is_initial_xendomain()) {
+	if (!xen_initial_domain()) {
 		add_preferred_console("xenboot", 0, NULL);
 		add_preferred_console("tty", 0, NULL);
 		add_preferred_console("hvc", 0, NULL);
@@ -1743,15 +1704,6 @@ asmlinkage void __init xen_start_kernel(void)
 
 	xen_raw_console_write("about to get started...\n");
 
-#if 0
-	xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n",
-		       &boot_params, __pa_symbol(&boot_params),
-		       __va(__pa_symbol(&boot_params)));
-
-	walk(pgd, &boot_params);
-	walk(pgd, __va(__pa(&boot_params)));
-#endif
-
 	/* Start the world */
 #ifdef CONFIG_X86_32
 	i386_start_kernel();
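A detail worth noting in the enlighten.c changes: xen_alloc_ldt()/xen_free_ldt() step through the LDT in units of entries, not bytes. With 8-byte descriptors, PAGE_SIZE / LDT_ENTRY_SIZE is 512 entries per page, so set_aliased_prot() fires exactly once per page backing the table. The arithmetic, as a small sketch (constants assumed for illustration, not taken from the patch):

/* Sketch: protection changes needed for an LDT of a given size. */
#define SKETCH_LDT_ENTRY_SIZE	8	/* bytes per descriptor */

static unsigned int ldt_pages_touched(unsigned int entries)
{
	const unsigned int per_page = 4096 / SKETCH_LDT_ENTRY_SIZE; /* 512 */

	return (entries + per_page - 1) / per_page;	/* e.g. 8192 -> 16 */
}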
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 00000000000..bb042608c60
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,141 @@
+#include <linux/hardirq.h>
+
+#include <xen/interface/xen.h>
+#include <xen/interface/sched.h>
+#include <xen/interface/vcpu.h>
+
+#include <asm/xen/hypercall.h>
+#include <asm/xen/hypervisor.h>
+
+#include "xen-ops.h"
+
+/*
+ * Force a proper event-channel callback from Xen after clearing the
+ * callback mask. We do this in a very simple manner, by making a call
+ * down into Xen. The pending flag will be checked by Xen on return.
+ */
+void xen_force_evtchn_callback(void)
+{
+	(void)HYPERVISOR_xen_version(0, NULL);
+}
+
+static void __init __xen_init_IRQ(void)
+{
+	int i;
+
+	/* Create identity vector->irq map */
+	for(i = 0; i < NR_VECTORS; i++) {
+		int cpu;
+
+		for_each_possible_cpu(cpu)
+			per_cpu(vector_irq, cpu)[i] = i;
+	}
+
+	xen_init_IRQ();
+}
+
+static unsigned long xen_save_fl(void)
+{
+	struct vcpu_info *vcpu;
+	unsigned long flags;
+
+	vcpu = x86_read_percpu(xen_vcpu);
+
+	/* flag has opposite sense of mask */
+	flags = !vcpu->evtchn_upcall_mask;
+
+	/* convert to IF type flag
+	   -0 -> 0x00000000
+	   -1 -> 0xffffffff
+	*/
+	return (-flags) & X86_EFLAGS_IF;
+}
+
+static void xen_restore_fl(unsigned long flags)
+{
+	struct vcpu_info *vcpu;
+
+	/* convert from IF type flag */
+	flags = !(flags & X86_EFLAGS_IF);
+
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = flags;
+	preempt_enable_no_resched();
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	if (flags == 0) {
+		preempt_check_resched();
+		barrier(); /* unmask then check (avoid races) */
+		if (unlikely(vcpu->evtchn_upcall_pending))
+			xen_force_evtchn_callback();
+	}
+}
+
+static void xen_irq_disable(void)
+{
+	/* There's a one instruction preempt window here.  We need to
+	   make sure we're don't switch CPUs between getting the vcpu
+	   pointer and updating the mask. */
+	preempt_disable();
+	x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
+	preempt_enable_no_resched();
+}
+
+static void xen_irq_enable(void)
+{
+	struct vcpu_info *vcpu;
+
+	/* We don't need to worry about being preempted here, since
+	   either a) interrupts are disabled, so no preemption, or b)
+	   the caller is confused and is trying to re-enable interrupts
+	   on an indeterminate processor. */
+
+	vcpu = x86_read_percpu(xen_vcpu);
+	vcpu->evtchn_upcall_mask = 0;
+
+	/* Doesn't matter if we get preempted here, because any
+	   pending event will get dealt with anyway. */
+
+	barrier(); /* unmask then check (avoid races) */
+	if (unlikely(vcpu->evtchn_upcall_pending))
+		xen_force_evtchn_callback();
+}
+
+static void xen_safe_halt(void)
+{
+	/* Blocking includes an implicit local_irq_enable(). */
+	if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
+		BUG();
+}
+
+static void xen_halt(void)
+{
+	if (irqs_disabled())
+		HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
+	else
+		xen_safe_halt();
+}
+
+static const struct pv_irq_ops xen_irq_ops __initdata = {
+	.init_IRQ = __xen_init_IRQ,
+	.save_fl = xen_save_fl,
+	.restore_fl = xen_restore_fl,
+	.irq_disable = xen_irq_disable,
+	.irq_enable = xen_irq_enable,
+	.safe_halt = xen_safe_halt,
+	.halt = xen_halt,
+#ifdef CONFIG_X86_64
+	.adjust_exception_frame = xen_adjust_exception_frame,
+#endif
+};
+
+void __init xen_init_irq_ops()
+{
+	pv_irq_ops = xen_irq_ops;
+}
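The flag conversion in xen_save_fl() above is branch-free. evtchn_upcall_mask is 1 when events (the paravirtual analogue of interrupts) are masked, so !mask is 1 exactly when they are enabled; negating that yields 0 or all-ones, and the AND collapses it to X86_EFLAGS_IF (0x200) or 0. As a self-contained sketch of the identity:

/* Sketch: the mask -> EFLAGS.IF mapping used by xen_save_fl(). */
static unsigned long upcall_mask_to_eflags(u8 evtchn_upcall_mask)
{
	unsigned long flags = !evtchn_upcall_mask;	/* 1 iff enabled */

	/* -0 -> 0x00000000 (IF clear), -1 -> 0xffffffff (IF set) */
	return (-flags) & X86_EFLAGS_IF;
}

xen_restore_fl() applies the inverse, and re-checks evtchn_upcall_pending after unmasking so that an event arriving in the window is not lost.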
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index aa37469da69..d4d52f5a1cf 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,6 +40,7 @@
  */
 #include <linux/sched.h>
 #include <linux/highmem.h>
+#include <linux/debugfs.h>
 #include <linux/bug.h>
 
 #include <asm/pgtable.h>
@@ -57,6 +58,61 @@
 
 #include "multicalls.h"
 #include "mmu.h"
+#include "debugfs.h"
+
+#define MMU_UPDATE_HISTO	30
+
+#ifdef CONFIG_XEN_DEBUG_FS
+
+static struct {
+	u32 pgd_update;
+	u32 pgd_update_pinned;
+	u32 pgd_update_batched;
+
+	u32 pud_update;
+	u32 pud_update_pinned;
+	u32 pud_update_batched;
+
+	u32 pmd_update;
+	u32 pmd_update_pinned;
+	u32 pmd_update_batched;
+
+	u32 pte_update;
+	u32 pte_update_pinned;
+	u32 pte_update_batched;
+
+	u32 mmu_update;
+	u32 mmu_update_extended;
+	u32 mmu_update_histo[MMU_UPDATE_HISTO];
+
+	u32 prot_commit;
+	u32 prot_commit_batched;
+
+	u32 set_pte_at;
+	u32 set_pte_at_batched;
+	u32 set_pte_at_pinned;
+	u32 set_pte_at_current;
+	u32 set_pte_at_kernel;
+} mmu_stats;
+
+static u8 zero_stats;
+
+static inline void check_zero(void)
+{
+	if (unlikely(zero_stats)) {
+		memset(&mmu_stats, 0, sizeof(mmu_stats));
+		zero_stats = 0;
+	}
+}
+
+#define ADD_STATS(elem, val)			\
+	do { check_zero(); mmu_stats.elem += (val); } while(0)
+
+#else  /* !CONFIG_XEN_DEBUG_FS */
+
+#define ADD_STATS(elem, val)	do { (void)(val); } while(0)
+
+#endif /* CONFIG_XEN_DEBUG_FS */
 
 /*
  * Just beyond the highest usermode address.  STACK_TOP_MAX has a
@@ -229,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
 }
 
 
-static bool page_pinned(void *ptr)
+static bool xen_page_pinned(void *ptr)
 {
 	struct page *page = virt_to_page(ptr);
 
 	return PagePinned(page);
 }
 
-static void extend_mmu_update(const struct mmu_update *update)
+static void xen_extend_mmu_update(const struct mmu_update *update)
 {
 	struct multicall_space mcs;
 	struct mmu_update *u;
 
 	mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
 
-	if (mcs.mc != NULL)
+	if (mcs.mc != NULL) {
+		ADD_STATS(mmu_update_extended, 1);
+		ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
+
 		mcs.mc->args[1]++;
-	else {
+
+		if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
+			ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
+		else
+			ADD_STATS(mmu_update_histo[0], 1);
+	} else {
+		ADD_STATS(mmu_update, 1);
 		mcs = __xen_mc_entry(sizeof(*u));
 		MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
+		ADD_STATS(mmu_update_histo[1], 1);
 	}
 
 	u = mcs.args;
@@ -265,7 +331,9 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 	/* ptr may be ioremapped for 64-bit pagetable setup */
 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pmd_val_ma(val);
-	extend_mmu_update(&u);
+	xen_extend_mmu_update(&u);
+
+	ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -274,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
 
 void xen_set_pmd(pmd_t *ptr, pmd_t val)
 {
+	ADD_STATS(pmd_update, 1);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
-	if (!page_pinned(ptr)) {
+	if (!xen_page_pinned(ptr)) {
 		*ptr = val;
 		return;
 	}
 
+	ADD_STATS(pmd_update_pinned, 1);
+
 	xen_set_pmd_hyper(ptr, val);
 }
 
@@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
 	if (mm == &init_mm)
 		preempt_disable();
 
+	ADD_STATS(set_pte_at, 1);
+//	ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
+	ADD_STATS(set_pte_at_current, mm == current->mm);
+	ADD_STATS(set_pte_at_kernel, mm == &init_mm);
+
 	if (mm == current->mm || mm == &init_mm) {
 		if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
 			struct multicall_space mcs;
 			mcs = xen_mc_entry(0);
 
 			MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
+			ADD_STATS(set_pte_at_batched, 1);
 			xen_mc_issue(PARAVIRT_LAZY_MMU);
 			goto out;
 		} else
@@ -334,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
 
 	u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
 	u.val = pte_val_ma(pte);
-	extend_mmu_update(&u);
+	xen_extend_mmu_update(&u);
+
+	ADD_STATS(prot_commit, 1);
+	ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
@@ -400,7 +481,9 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 	/* ptr may be ioremapped for 64-bit pagetable setup */
 	u.ptr = arbitrary_virt_to_machine(ptr).maddr;
 	u.val = pud_val_ma(val);
-	extend_mmu_update(&u);
+	xen_extend_mmu_update(&u);
+
+	ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 
@@ -409,18 +492,26 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
 
 void xen_set_pud(pud_t *ptr, pud_t val)
 {
+	ADD_STATS(pud_update, 1);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
-	if (!page_pinned(ptr)) {
+	if (!xen_page_pinned(ptr)) {
 		*ptr = val;
 		return;
 	}
 
+	ADD_STATS(pud_update_pinned, 1);
+
 	xen_set_pud_hyper(ptr, val);
 }
 
 void xen_set_pte(pte_t *ptep, pte_t pte)
 {
+	ADD_STATS(pte_update, 1);
+//	ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
+	ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
 #ifdef CONFIG_X86_PAE
 	ptep->pte_high = pte.pte_high;
 	smp_wmb();
@@ -490,7 +581,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
 
 	u.ptr = virt_to_machine(ptr).maddr;
 	u.val = pgd_val_ma(val);
-	extend_mmu_update(&u);
+	xen_extend_mmu_update(&u);
 }
 
 /*
@@ -517,17 +608,22 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
 {
 	pgd_t *user_ptr = xen_get_user_pgd(ptr);
 
+	ADD_STATS(pgd_update, 1);
+
 	/* If page is not pinned, we can just update the entry
 	   directly */
-	if (!page_pinned(ptr)) {
+	if (!xen_page_pinned(ptr)) {
 		*ptr = val;
 		if (user_ptr) {
-			WARN_ON(page_pinned(user_ptr));
+			WARN_ON(xen_page_pinned(user_ptr));
 			*user_ptr = val;
 		}
 		return;
 	}
 
+	ADD_STATS(pgd_update_pinned, 1);
+	ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
+
 	/* If it's pinned, then we can at least batch the kernel and
 	   user updates together. */
 	xen_mc_batch();
@@ -555,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
  * For 64-bit, we must skip the Xen hole in the middle of the address
  * space, just after the big x86-64 virtual hole.
  */
-static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
-		    unsigned long limit)
+static int xen_pgd_walk(struct mm_struct *mm,
+			int (*func)(struct mm_struct *mm, struct page *,
+				    enum pt_level),
+			unsigned long limit)
 {
+	pgd_t *pgd = mm->pgd;
 	int flush = 0;
 	unsigned hole_low, hole_high;
 	unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
@@ -590,8 +689,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 	pmdidx_limit = 0;
 #endif
 
-	flush |= (*func)(virt_to_page(pgd), PT_PGD);
-
 	for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
 		pud_t *pud;
 
@@ -604,7 +701,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 		pud = pud_offset(&pgd[pgdidx], 0);
 
 		if (PTRS_PER_PUD > 1) /* not folded */
-			flush |= (*func)(virt_to_page(pud), PT_PUD);
+			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
 
 		for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
 			pmd_t *pmd;
@@ -619,7 +716,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 			pmd = pmd_offset(&pud[pudidx], 0);
 
 			if (PTRS_PER_PMD > 1) /* not folded */
-				flush |= (*func)(virt_to_page(pmd), PT_PMD);
+				flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
 
 			for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
 				struct page *pte;
@@ -633,28 +730,34 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level),
 					continue;
 
 				pte = pmd_page(pmd[pmdidx]);
-				flush |= (*func)(pte, PT_PTE);
+				flush |= (*func)(mm, pte, PT_PTE);
 			}
 		}
 	}
+
 out:
+	/* Do the top level last, so that the callbacks can use it as
+	   a cue to do final things like tlb flushes. */
+	flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
 
 	return flush;
 }
 
-static spinlock_t *lock_pte(struct page *page)
+/* If we're using split pte locks, then take the page's lock and
+   return a pointer to it.  Otherwise return NULL. */
+static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
 {
 	spinlock_t *ptl = NULL;
 
-#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+#if USE_SPLIT_PTLOCKS
 	ptl = __pte_lockptr(page);
-	spin_lock(ptl);
+	spin_lock_nest_lock(ptl, &mm->page_table_lock);
 #endif
 
 	return ptl;
 }
 
-static void do_unlock(void *v)
+static void xen_pte_unlock(void *v)
 {
 	spinlock_t *ptl = v;
 	spin_unlock(ptl);
@@ -672,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
 	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
 }
 
-static int pin_page(struct page *page, enum pt_level level)
+static int xen_pin_page(struct mm_struct *mm, struct page *page,
+			enum pt_level level)
 {
 	unsigned pgfl = TestSetPagePinned(page);
 	int flush;
@@ -691,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
 
 		flush = 0;
 
+		/*
+		 * We need to hold the pagetable lock between the time
+		 * we make the pagetable RO and when we actually pin
+		 * it.  If we don't, then other users may come in and
+		 * attempt to update the pagetable by writing it,
+		 * which will fail because the memory is RO but not
+		 * pinned, so Xen won't do the trap'n'emulate.
+		 *
+		 * If we're using split pte locks, we can't hold the
+		 * entire pagetable's worth of locks during the
+		 * traverse, because we may wrap the preempt count (8
+		 * bits).  The solution is to mark RO and pin each PTE
+		 * page while holding the lock.  This means the number
+		 * of locks we end up holding is never more than a
+		 * batch size (~32 entries, at present).
+		 *
+		 * If we're not using split pte locks, we needn't pin
+		 * the PTE pages independently, because we're
+		 * protected by the overall pagetable lock.
+		 */
 		ptl = NULL;
 		if (level == PT_PTE)
-			ptl = lock_pte(page);
+			ptl = xen_pte_lock(page, mm);
 
 		MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
 					pfn_pte(pfn, PAGE_KERNEL_RO),
 					level == PT_PGD ? UVMF_TLB_FLUSH : 0);
 
-		if (level == PT_PTE)
+		if (ptl) {
 			xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
 
-		if (ptl) {
 			/* Queue a deferred unlock for when this batch
 			   is completed. */
-			xen_mc_callback(do_unlock, ptl);
+			xen_mc_callback(xen_pte_unlock, ptl);
 		}
 	}
 
@@ -715,14 +838,15 @@ static int pin_page(struct page *page, enum pt_level level)
 /* This is called just after a mm has been created, but it has not
    been used yet.  We need to make sure that its pagetable is all
    read-only, and can be pinned. */
-void xen_pgd_pin(pgd_t *pgd)
+static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
 {
 	xen_mc_batch();
 
-	if (pgd_walk(pgd, pin_page, USER_LIMIT)) {
+	if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
 		/* re-enable interrupts for kmap_flush_unused */
 		xen_mc_issue(0);
 		kmap_flush_unused();
+		vm_unmap_aliases();
 		xen_mc_batch();
 	}
 
@@ -733,25 +857,35 @@ void xen_pgd_pin(pgd_t *pgd)
733 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); 857 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
734 858
735 if (user_pgd) { 859 if (user_pgd) {
736 pin_page(virt_to_page(user_pgd), PT_PGD); 860 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
737 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); 861 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
738 } 862 }
739 } 863 }
740#else /* CONFIG_X86_32 */ 864#else /* CONFIG_X86_32 */
741#ifdef CONFIG_X86_PAE 865#ifdef CONFIG_X86_PAE
742 /* Need to make sure unshared kernel PMD is pinnable */ 866 /* Need to make sure unshared kernel PMD is pinnable */
743 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 867 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
868 PT_PMD);
744#endif 869#endif
745 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 870 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
746#endif /* CONFIG_X86_64 */ 871#endif /* CONFIG_X86_64 */
747 xen_mc_issue(0); 872 xen_mc_issue(0);
748} 873}
749 874
875static void xen_pgd_pin(struct mm_struct *mm)
876{
877 __xen_pgd_pin(mm, mm->pgd);
878}
879
750/* 880/*
751 * On save, we need to pin all pagetables to make sure they get their 881 * On save, we need to pin all pagetables to make sure they get their
752 * mfns turned into pfns. Search the list for any unpinned pgds and pin 882 * mfns turned into pfns. Search the list for any unpinned pgds and pin
753 * them (unpinned pgds are not currently in use, probably because the 883 * them (unpinned pgds are not currently in use, probably because the
754 * process is under construction or destruction). 884 * process is under construction or destruction).
885 *
886 * Expected to be called in stop_machine() ("equivalent to taking
887 * every spinlock in the system"), so the locking doesn't really
888 * matter all that much.
755 */ 889 */
756void xen_mm_pin_all(void) 890void xen_mm_pin_all(void)
757{ 891{
@@ -762,7 +896,7 @@ void xen_mm_pin_all(void)
762 896
763 list_for_each_entry(page, &pgd_list, lru) { 897 list_for_each_entry(page, &pgd_list, lru) {
764 if (!PagePinned(page)) { 898 if (!PagePinned(page)) {
765 xen_pgd_pin((pgd_t *)page_address(page)); 899 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
766 SetPageSavePinned(page); 900 SetPageSavePinned(page);
767 } 901 }
768 } 902 }
@@ -775,7 +909,8 @@ void xen_mm_pin_all(void)
775 * that's before we have page structures to store the bits. So do all 909 * that's before we have page structures to store the bits. So do all
776 * the book-keeping now. 910 * the book-keeping now.
777 */ 911 */
778static __init int mark_pinned(struct page *page, enum pt_level level) 912static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
913 enum pt_level level)
779{ 914{
780 SetPagePinned(page); 915 SetPagePinned(page);
781 return 0; 916 return 0;
@@ -783,10 +918,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
783 918
784void __init xen_mark_init_mm_pinned(void) 919void __init xen_mark_init_mm_pinned(void)
785{ 920{
786 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 921 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
787} 922}
788 923
789static int unpin_page(struct page *page, enum pt_level level) 924static int xen_unpin_page(struct mm_struct *mm, struct page *page,
925 enum pt_level level)
790{ 926{
791 unsigned pgfl = TestClearPagePinned(page); 927 unsigned pgfl = TestClearPagePinned(page);
792 928
@@ -796,10 +932,18 @@ static int unpin_page(struct page *page, enum pt_level level)
796 spinlock_t *ptl = NULL; 932 spinlock_t *ptl = NULL;
797 struct multicall_space mcs; 933 struct multicall_space mcs;
798 934
935 /*
	 936	 * Do the converse to xen_pin_page. If we're using split
	 937	 * pte locks, we must be holding the lock while
938 * the pte page is unpinned but still RO to prevent
939 * concurrent updates from seeing it in this
940 * partially-pinned state.
941 */
799 if (level == PT_PTE) { 942 if (level == PT_PTE) {
800 ptl = lock_pte(page); 943 ptl = xen_pte_lock(page, mm);
801 944
802 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 945 if (ptl)
946 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
803 } 947 }
804 948
805 mcs = __xen_mc_entry(0); 949 mcs = __xen_mc_entry(0);
@@ -810,7 +954,7 @@ static int unpin_page(struct page *page, enum pt_level level)
810 954
811 if (ptl) { 955 if (ptl) {
812 /* unlock when batch completed */ 956 /* unlock when batch completed */
813 xen_mc_callback(do_unlock, ptl); 957 xen_mc_callback(xen_pte_unlock, ptl);
814 } 958 }
815 } 959 }
816 960
@@ -818,7 +962,7 @@ static int unpin_page(struct page *page, enum pt_level level)
818} 962}
819 963
 820/* Release a pagetable's pages back as normal RW */	 964
821static void xen_pgd_unpin(pgd_t *pgd) 965static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
822{ 966{
823 xen_mc_batch(); 967 xen_mc_batch();
824 968
@@ -830,21 +974,27 @@ static void xen_pgd_unpin(pgd_t *pgd)
830 974
831 if (user_pgd) { 975 if (user_pgd) {
832 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); 976 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
833 unpin_page(virt_to_page(user_pgd), PT_PGD); 977 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
834 } 978 }
835 } 979 }
836#endif 980#endif
837 981
838#ifdef CONFIG_X86_PAE 982#ifdef CONFIG_X86_PAE
839 /* Need to make sure unshared kernel PMD is unpinned */ 983 /* Need to make sure unshared kernel PMD is unpinned */
840 pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); 984 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
985 PT_PMD);
841#endif 986#endif
842 987
843 pgd_walk(pgd, unpin_page, USER_LIMIT); 988 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
844 989
845 xen_mc_issue(0); 990 xen_mc_issue(0);
846} 991}
847 992
993static void xen_pgd_unpin(struct mm_struct *mm)
994{
995 __xen_pgd_unpin(mm, mm->pgd);
996}
997
848/* 998/*
849 * On resume, undo any pinning done at save, so that the rest of the 999 * On resume, undo any pinning done at save, so that the rest of the
850 * kernel doesn't see any unexpected pinned pagetables. 1000 * kernel doesn't see any unexpected pinned pagetables.
@@ -859,7 +1009,7 @@ void xen_mm_unpin_all(void)
859 list_for_each_entry(page, &pgd_list, lru) { 1009 list_for_each_entry(page, &pgd_list, lru) {
860 if (PageSavePinned(page)) { 1010 if (PageSavePinned(page)) {
861 BUG_ON(!PagePinned(page)); 1011 BUG_ON(!PagePinned(page));
862 xen_pgd_unpin((pgd_t *)page_address(page)); 1012 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
863 ClearPageSavePinned(page); 1013 ClearPageSavePinned(page);
864 } 1014 }
865 } 1015 }
@@ -870,14 +1020,14 @@ void xen_mm_unpin_all(void)
870void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1020void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
871{ 1021{
872 spin_lock(&next->page_table_lock); 1022 spin_lock(&next->page_table_lock);
873 xen_pgd_pin(next->pgd); 1023 xen_pgd_pin(next);
874 spin_unlock(&next->page_table_lock); 1024 spin_unlock(&next->page_table_lock);
875} 1025}
876 1026
877void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1027void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
878{ 1028{
879 spin_lock(&mm->page_table_lock); 1029 spin_lock(&mm->page_table_lock);
880 xen_pgd_pin(mm->pgd); 1030 xen_pgd_pin(mm);
881 spin_unlock(&mm->page_table_lock); 1031 spin_unlock(&mm->page_table_lock);
882} 1032}
883 1033
@@ -907,7 +1057,7 @@ static void drop_other_mm_ref(void *info)
907 } 1057 }
908} 1058}
909 1059
910static void drop_mm_ref(struct mm_struct *mm) 1060static void xen_drop_mm_ref(struct mm_struct *mm)
911{ 1061{
912 cpumask_t mask; 1062 cpumask_t mask;
913 unsigned cpu; 1063 unsigned cpu;
@@ -937,7 +1087,7 @@ static void drop_mm_ref(struct mm_struct *mm)
937 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1087 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
938} 1088}
939#else 1089#else
940static void drop_mm_ref(struct mm_struct *mm) 1090static void xen_drop_mm_ref(struct mm_struct *mm)
941{ 1091{
942 if (current->active_mm == mm) 1092 if (current->active_mm == mm)
943 load_cr3(swapper_pg_dir); 1093 load_cr3(swapper_pg_dir);
@@ -961,14 +1111,77 @@ static void drop_mm_ref(struct mm_struct *mm)
961void xen_exit_mmap(struct mm_struct *mm) 1111void xen_exit_mmap(struct mm_struct *mm)
962{ 1112{
963 get_cpu(); /* make sure we don't move around */ 1113 get_cpu(); /* make sure we don't move around */
964 drop_mm_ref(mm); 1114 xen_drop_mm_ref(mm);
965 put_cpu(); 1115 put_cpu();
966 1116
967 spin_lock(&mm->page_table_lock); 1117 spin_lock(&mm->page_table_lock);
968 1118
969 /* pgd may not be pinned in the error exit path of execve */ 1119 /* pgd may not be pinned in the error exit path of execve */
970 if (page_pinned(mm->pgd)) 1120 if (xen_page_pinned(mm->pgd))
971 xen_pgd_unpin(mm->pgd); 1121 xen_pgd_unpin(mm);
972 1122
973 spin_unlock(&mm->page_table_lock); 1123 spin_unlock(&mm->page_table_lock);
974} 1124}
1125
1126#ifdef CONFIG_XEN_DEBUG_FS
1127
1128static struct dentry *d_mmu_debug;
1129
1130static int __init xen_mmu_debugfs(void)
1131{
1132 struct dentry *d_xen = xen_init_debugfs();
1133
1134 if (d_xen == NULL)
1135 return -ENOMEM;
1136
1137 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1138
1139 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1140
1141 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1142 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1143 &mmu_stats.pgd_update_pinned);
1144 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
	1145			   &mmu_stats.pgd_update_batched);
1146
1147 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1148 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1149 &mmu_stats.pud_update_pinned);
1150 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
	1151			   &mmu_stats.pud_update_batched);
1152
1153 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1154 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1155 &mmu_stats.pmd_update_pinned);
1156 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
	1157			   &mmu_stats.pmd_update_batched);
1158
1159 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1160// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1161// &mmu_stats.pte_update_pinned);
1162 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
	1163			   &mmu_stats.pte_update_batched);
1164
1165 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1166 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1167 &mmu_stats.mmu_update_extended);
1168 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1169 mmu_stats.mmu_update_histo, 20);
1170
1171 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1172 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1173 &mmu_stats.set_pte_at_batched);
1174 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1175 &mmu_stats.set_pte_at_current);
1176 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1177 &mmu_stats.set_pte_at_kernel);
1178
1179 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1180 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1181 &mmu_stats.prot_commit_batched);
1182
1183 return 0;
1184}
1185fs_initcall(xen_mmu_debugfs);
1186
1187#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 0f59bd03f9e..98d71659da5 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
19void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
20 20
21void xen_pgd_pin(pgd_t *pgd);
22//void xen_pgd_unpin(pgd_t *pgd);
23
24pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 9efd1c6c977..8ea8a0d0b0d 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
	  54/* Reasons for flushing a multicall batch early: out of entry slots, argument space, or callback slots. */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
	  65#define NHYPERCALLS	40		/* rough histogram bound, not the true hypercall count */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
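
check_zero() implements a lazy stat reset: userspace writes 1 to the zero_stats debugfs file, and the counters are wiped at the next accounting call rather than immediately. The reset is only approximately atomic against concurrent updates, which is acceptable for debug counters. A standalone model of the trick:

#include <stdio.h>
#include <string.h>

static struct { unsigned issued, hypercalls; } stats;
static unsigned char zero_flag;         /* set externally, e.g. via debugfs */

static void stats_check_zero(void)
{
        if (zero_flag) {
                memset(&stats, 0, sizeof(stats));
                zero_flag = 0;
        }
}

static void stats_account(unsigned calls)
{
        stats_check_zero();             /* reset, if one was requested */
        stats.issued++;
        stats.hypercalls += calls;
}

int main(void)
{
        stats_account(3);
        zero_flag = 1;                  /* "echo 1 > zero_stats" */
        stats_account(2);
        /* prints issued=1 hypercalls=2: the reset landed first */
        printf("issued=%u hypercalls=%u\n", stats.issued, stats.hypercalls);
        return 0;
}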
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
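
The #else branch above supplies empty static inline stubs so the accounting calls stay unconditional at their call sites; the compiler discards them entirely when CONFIG_XEN_DEBUG_FS is off. The idiom in isolation (illustrative names):

#define FEATURE_STATS 0

#if FEATURE_STATS
static unsigned nflushes;
static inline void count_flush(void) { nflushes++; }
#else
static inline void count_flush(void) { }   /* compiles away entirely */
#endif

int main(void)
{
        count_flush();                  /* no #ifdef needed at the call site */
        return 0;
}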
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
115 191
116 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
117 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
118 xen_mc_flush(); 195 xen_mc_flush();
119 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
120 } 197 }
@@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
158 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
159 struct callback *cb; 236 struct callback *cb;
160 237
161 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
162 xen_mc_flush(); 240 xen_mc_flush();
241 }
163 242
164 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
165 cb->fn = fn; 244 cb->fn = fn;
166 cb->data = data; 245 cb->data = data;
167} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index d8faf79a0a1..d77da613b1d 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,11 +11,8 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/kernel_stat.h>
19#include <linux/err.h> 16#include <linux/err.h>
20#include <linux/smp.h> 17#include <linux/smp.h>
21 18
@@ -36,8 +33,6 @@
36#include "xen-ops.h" 33#include "xen-ops.h"
37#include "mmu.h" 34#include "mmu.h"
38 35
39static void __cpuinit xen_init_lock_cpu(int cpu);
40
41cpumask_t xen_cpu_initialized_map; 36cpumask_t xen_cpu_initialized_map;
42 37
43static DEFINE_PER_CPU(int, resched_irq); 38static DEFINE_PER_CPU(int, resched_irq);
@@ -64,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
64 return IRQ_HANDLED; 59 return IRQ_HANDLED;
65} 60}
66 61
67static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
68{ 63{
69 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
70 65
71 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
72 preempt_disable(); 68 preempt_disable();
73 69
74 xen_enable_sysenter(); 70 xen_enable_sysenter();
@@ -89,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void)
89 local_irq_enable(); 85 local_irq_enable();
90 86
91 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
92 cpu_idle(); 93 cpu_idle();
93} 94}
94 95
@@ -212,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
212 213
213 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
214 } 215 }
215
216 //init_xenbus_allowed_cpumask();
217} 216}
218 217
219static __cpuinit int 218static __cpuinit int
@@ -281,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0
285 rc = cpu_up_check(cpu);
286 if (rc)
287 return rc;
288#endif
289
290#ifdef CONFIG_X86_64 283#ifdef CONFIG_X86_64
291 /* Allocate node local memory for AP pdas */ 284 /* Allocate node local memory for AP pdas */
292 WARN_ON(cpu == 0); 285 WARN_ON(cpu == 0);
@@ -339,6 +332,60 @@ static void xen_smp_cpus_done(unsigned int max_cpus)
339{ 332{
340} 333}
341 334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
345 return 0;
346}
347
348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
379{
380 BUG();
381}
382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
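
xen_cpu_die() above polls the hypervisor until the vcpu is really down, sleeping HZ/10 jiffies (~100 ms) between checks, before unbinding the CPU's irqs and tearing down its timer. The same poll-with-sleep shape in plain C (the hypercall is faked; names are illustrative):

#include <stdbool.h>
#include <time.h>

static int checks;

static bool vcpu_is_up(int cpu)         /* stands in for VCPUOP_is_up */
{
        (void)cpu;
        return ++checks < 3;            /* "goes down" on the third poll */
}

static void wait_for_cpu_down(int cpu)
{
        while (vcpu_is_up(cpu)) {
                struct timespec ts = { 0, 100 * 1000 * 1000 }; /* ~HZ/10 */
                nanosleep(&ts, NULL);
        }
}

int main(void)
{
        wait_for_cpu_down(1);
        return 0;
}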
342static void stop_self(void *v) 389static void stop_self(void *v)
343{ 390{
344 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -419,176 +466,16 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
419 return IRQ_HANDLED; 466 return IRQ_HANDLED;
420} 467}
421 468
422struct xen_spinlock {
423 unsigned char lock; /* 0 -> free; 1 -> locked */
424 unsigned short spinners; /* count of waiting cpus */
425};
426
427static int xen_spin_is_locked(struct raw_spinlock *lock)
428{
429 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
430
431 return xl->lock != 0;
432}
433
434static int xen_spin_is_contended(struct raw_spinlock *lock)
435{
436 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
437
438 /* Not strictly true; this is only the count of contended
439 lock-takers entering the slow path. */
440 return xl->spinners != 0;
441}
442
443static int xen_spin_trylock(struct raw_spinlock *lock)
444{
445 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
446 u8 old = 1;
447
448 asm("xchgb %b0,%1"
449 : "+q" (old), "+m" (xl->lock) : : "memory");
450
451 return old == 0;
452}
453
454static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
455static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
456
457static inline void spinning_lock(struct xen_spinlock *xl)
458{
459 __get_cpu_var(lock_spinners) = xl;
460 wmb(); /* set lock of interest before count */
461 asm(LOCK_PREFIX " incw %0"
462 : "+m" (xl->spinners) : : "memory");
463}
464
465static inline void unspinning_lock(struct xen_spinlock *xl)
466{
467 asm(LOCK_PREFIX " decw %0"
468 : "+m" (xl->spinners) : : "memory");
469 wmb(); /* decrement count before clearing lock */
470 __get_cpu_var(lock_spinners) = NULL;
471}
472
473static noinline int xen_spin_lock_slow(struct raw_spinlock *lock)
474{
475 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
476 int irq = __get_cpu_var(lock_kicker_irq);
477 int ret;
478
479 /* If kicker interrupts not initialized yet, just spin */
480 if (irq == -1)
481 return 0;
482
483 /* announce we're spinning */
484 spinning_lock(xl);
485
486 /* clear pending */
487 xen_clear_irq_pending(irq);
488
489 /* check again make sure it didn't become free while
490 we weren't looking */
491 ret = xen_spin_trylock(lock);
492 if (ret)
493 goto out;
494
495 /* block until irq becomes pending */
496 xen_poll_irq(irq);
497 kstat_this_cpu.irqs[irq]++;
498
499out:
500 unspinning_lock(xl);
501 return ret;
502}
503
504static void xen_spin_lock(struct raw_spinlock *lock)
505{
506 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
507 int timeout;
508 u8 oldval;
509
510 do {
511 timeout = 1 << 10;
512
513 asm("1: xchgb %1,%0\n"
514 " testb %1,%1\n"
515 " jz 3f\n"
516 "2: rep;nop\n"
517 " cmpb $0,%0\n"
518 " je 1b\n"
519 " dec %2\n"
520 " jnz 2b\n"
521 "3:\n"
522 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
523 : "1" (1)
524 : "memory");
525
526 } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock)));
527}
528
529static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
530{
531 int cpu;
532
533 for_each_online_cpu(cpu) {
534 /* XXX should mix up next cpu selection */
535 if (per_cpu(lock_spinners, cpu) == xl) {
536 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
537 break;
538 }
539 }
540}
541
542static void xen_spin_unlock(struct raw_spinlock *lock)
543{
544 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
545
546 smp_wmb(); /* make sure no writes get moved after unlock */
547 xl->lock = 0; /* release lock */
548
549 /* make sure unlock happens before kick */
550 barrier();
551
552 if (unlikely(xl->spinners))
553 xen_spin_unlock_slow(xl);
554}
555
556static __cpuinit void xen_init_lock_cpu(int cpu)
557{
558 int irq;
559 const char *name;
560
561 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
562 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
563 cpu,
564 xen_reschedule_interrupt,
565 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
566 name,
567 NULL);
568
569 if (irq >= 0) {
570 disable_irq(irq); /* make sure it's never delivered */
571 per_cpu(lock_kicker_irq, cpu) = irq;
572 }
573
574 printk("cpu %d spinlock event irq %d\n", cpu, irq);
575}
576
577static void __init xen_init_spinlocks(void)
578{
579 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
580 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
581 pv_lock_ops.spin_lock = xen_spin_lock;
582 pv_lock_ops.spin_trylock = xen_spin_trylock;
583 pv_lock_ops.spin_unlock = xen_spin_unlock;
584}
585
586static const struct smp_ops xen_smp_ops __initdata = { 469static const struct smp_ops xen_smp_ops __initdata = {
587 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
588 .smp_prepare_cpus = xen_smp_prepare_cpus, 471 .smp_prepare_cpus = xen_smp_prepare_cpus,
589 .cpu_up = xen_cpu_up,
590 .smp_cpus_done = xen_smp_cpus_done, 472 .smp_cpus_done = xen_smp_cpus_done,
591 473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
592 .smp_send_stop = xen_smp_send_stop, 479 .smp_send_stop = xen_smp_send_stop,
593 .smp_send_reschedule = xen_smp_send_reschedule, 480 .smp_send_reschedule = xen_smp_send_reschedule,
594 481
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 00000000000..5601506f2dd
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
74
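
__spin_time_accum() buckets a time delta by its integer log2, clamping everything past HISTO_BUCKETS into a final overflow slot, which gives an exponential-resolution latency histogram in fixed space. The equivalent in standalone C, using __builtin_clzll where the kernel uses ilog2():

#include <stdio.h>

#define BUCKETS 30                      /* mirrors HISTO_BUCKETS */

static unsigned histo[BUCKETS + 1];

static unsigned ilog2_u64(unsigned long long v)
{
        return v ? 63 - __builtin_clzll(v) : 0;
}

static void time_accum(unsigned long long delta)
{
        unsigned idx = ilog2_u64(delta);

        histo[idx < BUCKETS ? idx : BUCKETS]++;
}

int main(void)
{
        time_accum(1);                  /* bucket 0 */
        time_accum(1000);               /* bucket 9 (512..1023) */
        time_accum(1ULL << 40);         /* overflow bucket */
        for (int i = 0; i <= BUCKETS; i++)
                if (histo[i])
                        printf("bucket %d: %u\n", i, histo[i]);
        return 0;
}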
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
149
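
xen_spin_trylock() is a single byte-wide xchg: swap in 1, and the lock was taken iff the old value was 0. The inline asm exists because the byte overlays raw_spinlock_t; in portable C11 atomics the same operation is just this (sketch, illustrative names):

#include <stdatomic.h>
#include <stdbool.h>

struct byte_lock {
        atomic_uchar lock;              /* 0 -> free; 1 -> locked */
};

static bool byte_trylock(struct byte_lock *xl)
{
        return atomic_exchange(&xl->lock, 1) == 0;
}

static void byte_unlock(struct byte_lock *xl)
{
        atomic_store(&xl->lock, 0);     /* like the plain xl->lock = 0 */
}

int main(void)
{
        struct byte_lock l = { 0 };
        bool got = byte_trylock(&l);    /* true: lock was free */
        bool again = byte_trylock(&l);  /* false: already held */

        byte_unlock(&l);
        return (got && !again) ? 0 : 1;
}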
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
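
Because the slow path can itself be interrupted by a handler that contends another lock, the per-cpu lock_spinners slot is saved and restored like a one-deep stack, as the comments above describe. A sketch of just that save/restore nesting, with a thread-local standing in for the per-cpu variable and the spinner count and barriers omitted:

#include <stddef.h>

struct xlock;                           /* opaque for the sketch */

static _Thread_local struct xlock *lock_of_interest;

static struct xlock *start_spinning(struct xlock *xl)
{
        struct xlock *prev = lock_of_interest;

        lock_of_interest = xl;          /* announce the new lock */
        return prev;                    /* caller keeps the outer one */
}

static void stop_spinning(struct xlock *prev)
{
        lock_of_interest = prev;        /* restore outer lock, or NULL */
}

int main(void)
{
        struct xlock *a = (struct xlock *)0x1, *b = (struct xlock *)0x2;
        struct xlock *outer = start_spinning(a);
        struct xlock *inner = start_spinning(b);  /* "interrupt" nests */

        stop_spinning(inner);
        stop_spinning(outer);
        return lock_of_interest == NULL ? 0 : 1;
}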
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
	 215		/* check again to make sure it didn't become free while
216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
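
The loop above is the classic clear-pending / recheck / block sequence: any kick that lands after the recheck stays pending, so xen_poll_irq() returns immediately instead of sleeping through a lost wakeup. A userspace analogue with an eventfd standing in for the per-cpu kicker irq (assumes Linux and -pthread; everything here is illustrative):

#include <sys/eventfd.h>
#include <poll.h>
#include <pthread.h>
#include <unistd.h>
#include <stdint.h>
#include <stdbool.h>

static int kick_fd;                     /* stands in for lock_kicker_irq */
static int attempts;

static void clear_pending(void)         /* like xen_clear_irq_pending() */
{
        uint64_t v;

        (void)read(kick_fd, &v, sizeof(v));  /* nonblocking drain */
}

static void block_until_kicked(void)    /* like xen_poll_irq() */
{
        struct pollfd pfd = { .fd = kick_fd, .events = POLLIN };

        poll(&pfd, 1, -1);
}

static bool trylock(void)               /* fails once, then succeeds */
{
        return ++attempts > 1;
}

static void lock_slow(void)
{
        for (;;) {
                clear_pending();
                if (trylock())          /* became free while we looked? */
                        return;
                block_until_kicked();   /* a kick after trylock stays
                                           pending, so it can't be lost */
        }
}

static void *kicker(void *arg)          /* the unlocker's "IPI" */
{
        uint64_t one = 1;

        usleep(100 * 1000);
        (void)write(kick_fd, &one, sizeof(one));
        return arg;
}

int main(void)
{
        pthread_t t;

        kick_fd = eventfd(0, EFD_NONBLOCK);
        pthread_create(&t, NULL, kicker, NULL);
        lock_slow();                    /* blocks until kicked, then locks */
        pthread_join(t, NULL);
        return 0;
}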
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
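
__xen_spin_lock() spins on the byte for TIMEOUT iterations and only then falls into the blocking slow path, retrying the whole sequence until the lock is taken. The same control flow in portable C11 atomics (the asm version additionally watches the byte passively with cmpb between xchg attempts, and the slow path here is condensed to a stub):

#include <stdatomic.h>
#include <stdbool.h>

#define SPIN_TIMEOUT (1 << 10)

static bool lock_slow(atomic_uchar *lock)
{
        /* stands in for xen_spin_lock_slow(): block until the holder
           kicks us, then try once more; reduced to a bare trylock */
        return atomic_exchange(lock, 1) == 0;
}

static void spin_lock_model(atomic_uchar *lock)
{
        bool taken;

        do {
                int timeout = SPIN_TIMEOUT;

                /* fast path: bounded spinning on xchg */
                do {
                        taken = (atomic_exchange(lock, 1) == 0);
                } while (!taken && --timeout > 0);
        } while (!taken && !lock_slow(lock));
}

int main(void)
{
        atomic_uchar lock = 0;

        spin_lock_model(&lock);         /* free lock: fast path wins */
        return atomic_load(&lock) == 1 ? 0 : 1;
}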
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
	 357	printk(KERN_DEBUG "cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc..c9f7cda48ed 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
30#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
31#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
32 32
33static cycle_t xen_clocksource_read(void);
34
35/* runstate info updated by Xen */ 33/* runstate info updated by Xen */
36static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
37 35
@@ -200,20 +198,13 @@ unsigned long long xen_sched_clock(void)
200/* Get the TSC speed from Xen */ 198/* Get the TSC speed from Xen */
201unsigned long xen_tsc_khz(void) 199unsigned long xen_tsc_khz(void)
202{ 200{
203 u64 xen_khz = 1000000ULL << 32; 201 struct pvclock_vcpu_time_info *info =
204 const struct pvclock_vcpu_time_info *info =
205 &HYPERVISOR_shared_info->vcpu_info[0].time; 202 &HYPERVISOR_shared_info->vcpu_info[0].time;
206 203
207 do_div(xen_khz, info->tsc_to_system_mul); 204 return pvclock_tsc_khz(info);
208 if (info->tsc_shift < 0)
209 xen_khz <<= -info->tsc_shift;
210 else
211 xen_khz >>= info->tsc_shift;
212
213 return xen_khz;
214} 205}
215 206
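
The removed lines show the arithmetic now centralized in pvclock_tsc_khz(): pvclock converts TSC ticks to nanoseconds as ns = (ticks * tsc_to_system_mul) >> 32, with the ticks pre-scaled by tsc_shift, so inverting that ratio yields the TSC frequency in kHz. The same computation standalone (the mul/shift pair below is made up to describe a ~2.4 GHz TSC, not taken from real hardware):

#include <stdio.h>
#include <stdint.h>

static uint64_t tsc_khz_from_pvclock(uint32_t tsc_to_system_mul,
                                     int8_t tsc_shift)
{
        uint64_t khz = 1000000ULL << 32;

        khz /= tsc_to_system_mul;       /* do_div() in the kernel */
        if (tsc_shift < 0)
                khz <<= -tsc_shift;
        else
                khz >>= tsc_shift;

        return khz;
}

int main(void)
{
        /* mul = 0xd5555555, shift = -1 works out to 2400000 kHz */
        printf("%llu kHz\n",
               (unsigned long long)tsc_khz_from_pvclock(0xd5555555u, -1));
        return 0;
}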
216static cycle_t xen_clocksource_read(void) 207cycle_t xen_clocksource_read(void)
217{ 208{
218 struct pvclock_vcpu_time_info *src; 209 struct pvclock_vcpu_time_info *src;
219 cycle_t ret; 210 cycle_t ret;
@@ -452,6 +443,14 @@ void xen_setup_timer(int cpu)
452 setup_runstate_info(cpu); 443 setup_runstate_info(cpu);
453} 444}
454 445
446void xen_teardown_timer(int cpu)
447{
448 struct clock_event_device *evt;
449 BUG_ON(cpu == 0);
450 evt = &per_cpu(xen_clock_events, cpu);
451 unbind_from_irqhandler(evt->irq, NULL);
452}
453
455void xen_setup_cpu_clockevents(void) 454void xen_setup_cpu_clockevents(void)
456{ 455{
457 BUG_ON(preemptible()); 456 BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41d..42786f59d9c 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
298 push %eax 298 push %eax
299 push %ecx 299 push %ecx
300 push %edx 300 push %edx
301 call force_evtchn_callback 301 call xen_force_evtchn_callback
302 pop %edx 302 pop %edx
303 pop %ecx 303 pop %ecx
304 pop %eax 304 pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 7f58304fafb..05794c566e8 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -26,8 +26,15 @@
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */ 26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000 27#define XEN_EFLAGS_NMI 0x80000000
28 28
29#if 0 29#if 1
30#include <asm/percpu.h> 30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
31 38
32/* 39/*
33 Enable events. This clears the event mask and tests the pending 40 Enable events. This clears the event mask and tests the pending
@@ -35,6 +42,8 @@
35 events, then enter the hypervisor to get them handled. 42 events, then enter the hypervisor to get them handled.
36 */ 43 */
37ENTRY(xen_irq_enable_direct) 44ENTRY(xen_irq_enable_direct)
45 BUG
46
38 /* Unmask events */ 47 /* Unmask events */
39 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
40 49
@@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct)
58 non-zero. 67 non-zero.
59 */ 68 */
60ENTRY(xen_irq_disable_direct) 69ENTRY(xen_irq_disable_direct)
70 BUG
71
61 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
62ENDPATCH(xen_irq_disable_direct) 73ENDPATCH(xen_irq_disable_direct)
63 ret 74 ret
@@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct)
74 Xen and x86 use opposite senses (mask vs enable). 85 Xen and x86 use opposite senses (mask vs enable).
75 */ 86 */
76ENTRY(xen_save_fl_direct) 87ENTRY(xen_save_fl_direct)
88 BUG
89
77 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
78 setz %ah 91 setz %ah
79 addb %ah,%ah 92 addb %ah,%ah
@@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct)
91 if so. 104 if so.
92 */ 105 */
93ENTRY(xen_restore_fl_direct) 106ENTRY(xen_restore_fl_direct)
107 BUG
108
94 testb $X86_EFLAGS_IF>>8, %ah 109 testb $X86_EFLAGS_IF>>8, %ah
95 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) 110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
96 /* Preempt here doesn't matter because that will deal with 111 /* Preempt here doesn't matter because that will deal with
@@ -122,7 +137,7 @@ check_events:
122 push %r9 137 push %r9
123 push %r10 138 push %r10
124 push %r11 139 push %r11
125 call force_evtchn_callback 140 call xen_force_evtchn_callback
126 pop %r11 141 pop %r11
127 pop %r10 142 pop %r10
128 pop %r9 143 pop %r9
@@ -133,7 +148,6 @@ check_events:
133 pop %rcx 148 pop %rcx
134 pop %rax 149 pop %rax
135 ret 150 ret
136#endif
137 151
138ENTRY(xen_adjust_exception_frame) 152ENTRY(xen_adjust_exception_frame)
139 mov 8+0(%rsp),%rcx 153 mov 8+0(%rsp),%rcx
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index dd3c23152a2..d7422dc2a55 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
2#define XEN_OPS_H 2#define XEN_OPS_H
3 3
4#include <linux/init.h> 4#include <linux/init.h>
5#include <linux/clocksource.h>
5#include <linux/irqreturn.h> 6#include <linux/irqreturn.h>
6#include <xen/xen-ops.h> 7#include <xen/xen-ops.h>
7 8
@@ -31,7 +32,10 @@ void xen_vcpu_restore(void);
31 32
32void __init xen_build_dynamic_phys_to_machine(void); 33void __init xen_build_dynamic_phys_to_machine(void);
33 34
35void xen_init_irq_ops(void);
34void xen_setup_timer(int cpu); 36void xen_setup_timer(int cpu);
37void xen_teardown_timer(int cpu);
38cycle_t xen_clocksource_read(void);
35void xen_setup_cpu_clockevents(void); 39void xen_setup_cpu_clockevents(void);
36unsigned long xen_tsc_khz(void); 40unsigned long xen_tsc_khz(void);
37void __init xen_time_init(void); 41void __init xen_time_init(void);
@@ -50,6 +54,10 @@ void __init xen_setup_vcpu_info_placement(void);
50#ifdef CONFIG_SMP 54#ifdef CONFIG_SMP
51void xen_smp_init(void); 55void xen_smp_init(void);
52 56
57void __init xen_init_spinlocks(void);
58__cpuinit void xen_init_lock_cpu(int cpu);
59void xen_uninit_lock_cpu(int cpu);
60
53extern cpumask_t xen_cpu_initialized_map; 61extern cpumask_t xen_cpu_initialized_map;
54#else 62#else
55static inline void xen_smp_init(void) {} 63static inline void xen_smp_init(void) {}