author	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
committer	Ingo Molnar <mingo@elte.hu>	2008-10-28 11:26:12 -0400
commit	7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree	e730a4565e0318140d2fbd2f0415d18a339d7336 /arch/x86
parent	41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent	0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kconfig295
-rw-r--r--arch/x86/Kconfig.cpu166
-rw-r--r--arch/x86/Kconfig.debug24
-rw-r--r--arch/x86/Makefile11
-rw-r--r--arch/x86/Makefile_32.cpu6
-rw-r--r--arch/x86/boot/Makefile5
-rw-r--r--arch/x86/boot/boot.h10
-rw-r--r--arch/x86/boot/compressed/Makefile8
-rw-r--r--arch/x86/boot/compressed/head_32.S5
-rw-r--r--arch/x86/boot/compressed/misc.c51
-rw-r--r--arch/x86/boot/compressed/relocs.c2
-rw-r--r--arch/x86/boot/cpu.c20
-rw-r--r--arch/x86/boot/cpucheck.c18
-rw-r--r--arch/x86/boot/edd.c12
-rw-r--r--arch/x86/boot/header.S1
-rw-r--r--arch/x86/boot/main.c5
-rw-r--r--arch/x86/boot/memory.c1
-rw-r--r--arch/x86/boot/mkcpustr.c40
-rw-r--r--arch/x86/boot/pm.c6
-rw-r--r--arch/x86/boot/video-bios.c4
-rw-r--r--arch/x86/boot/video-vesa.c15
-rw-r--r--arch/x86/configs/i386_defconfig321
-rw-r--r--arch/x86/configs/x86_64_defconfig283
-rw-r--r--arch/x86/crypto/Makefile2
-rw-r--r--arch/x86/crypto/crc32c-intel.c197
-rw-r--r--arch/x86/ia32/ia32_aout.c17
-rw-r--r--arch/x86/ia32/ia32_signal.c149
-rw-r--r--arch/x86/ia32/ia32entry.S145
-rw-r--r--arch/x86/ia32/sys_ia32.c110
-rw-r--r--arch/x86/include/asm/Kbuild24
-rw-r--r--arch/x86/include/asm/a.out-core.h73
-rw-r--r--arch/x86/include/asm/a.out.h20
-rw-r--r--arch/x86/include/asm/acpi.h178
-rw-r--r--arch/x86/include/asm/agp.h35
-rw-r--r--arch/x86/include/asm/alternative-asm.h22
-rw-r--r--arch/x86/include/asm/alternative.h183
-rw-r--r--arch/x86/include/asm/amd_iommu.h35
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h404
-rw-r--r--arch/x86/include/asm/apic.h199
-rw-r--r--arch/x86/include/asm/apicdef.h417
-rw-r--r--arch/x86/include/asm/arch_hooks.h26
-rw-r--r--arch/x86/include/asm/asm.h47
-rw-r--r--arch/x86/include/asm/atomic.h5
-rw-r--r--arch/x86/include/asm/atomic_32.h259
-rw-r--r--arch/x86/include/asm/atomic_64.h473
-rw-r--r--arch/x86/include/asm/auxvec.h12
-rw-r--r--arch/x86/include/asm/bigsmp/apic.h139
-rw-r--r--arch/x86/include/asm/bigsmp/apicdef.h13
-rw-r--r--arch/x86/include/asm/bigsmp/ipi.h25
-rw-r--r--arch/x86/include/asm/bios_ebda.h36
-rw-r--r--arch/x86/include/asm/bitops.h451
-rw-r--r--arch/x86/include/asm/boot.h26
-rw-r--r--arch/x86/include/asm/bootparam.h111
-rw-r--r--arch/x86/include/asm/bug.h39
-rw-r--r--arch/x86/include/asm/bugs.h12
-rw-r--r--arch/x86/include/asm/byteorder.h81
-rw-r--r--arch/x86/include/asm/cache.h20
-rw-r--r--arch/x86/include/asm/cacheflush.h118
-rw-r--r--arch/x86/include/asm/calgary.h72
-rw-r--r--arch/x86/include/asm/calling.h170
-rw-r--r--arch/x86/include/asm/checksum.h5
-rw-r--r--arch/x86/include/asm/checksum_32.h189
-rw-r--r--arch/x86/include/asm/checksum_64.h191
-rw-r--r--arch/x86/include/asm/cmpxchg.h5
-rw-r--r--arch/x86/include/asm/cmpxchg_32.h344
-rw-r--r--arch/x86/include/asm/cmpxchg_64.h185
-rw-r--r--arch/x86/include/asm/compat.h218
-rw-r--r--arch/x86/include/asm/cpu.h20
-rw-r--r--arch/x86/include/asm/cpufeature.h271
-rw-r--r--arch/x86/include/asm/cputime.h1
-rw-r--r--arch/x86/include/asm/current.h39
-rw-r--r--arch/x86/include/asm/debugreg.h70
-rw-r--r--arch/x86/include/asm/delay.h31
-rw-r--r--arch/x86/include/asm/desc.h409
-rw-r--r--arch/x86/include/asm/desc_defs.h95
-rw-r--r--arch/x86/include/asm/device.h16
-rw-r--r--arch/x86/include/asm/div64.h60
-rw-r--r--arch/x86/include/asm/dma-mapping.h308
-rw-r--r--arch/x86/include/asm/dma.h318
-rw-r--r--arch/x86/include/asm/dmi.h26
-rw-r--r--arch/x86/include/asm/ds.h238
-rw-r--r--arch/x86/include/asm/dwarf2.h61
-rw-r--r--arch/x86/include/asm/e820.h146
-rw-r--r--arch/x86/include/asm/edac.h18
-rw-r--r--arch/x86/include/asm/efi.h110
-rw-r--r--arch/x86/include/asm/elf.h336
-rw-r--r--arch/x86/include/asm/emergency-restart.h18
-rw-r--r--arch/x86/include/asm/errno.h1
-rw-r--r--arch/x86/include/asm/es7000/apic.h193
-rw-r--r--arch/x86/include/asm/es7000/apicdef.h13
-rw-r--r--arch/x86/include/asm/es7000/ipi.h24
-rw-r--r--arch/x86/include/asm/es7000/mpparse.h30
-rw-r--r--arch/x86/include/asm/es7000/wakecpu.h59
-rw-r--r--arch/x86/include/asm/fb.h21
-rw-r--r--arch/x86/include/asm/fcntl.h1
-rw-r--r--arch/x86/include/asm/fixmap.h68
-rw-r--r--arch/x86/include/asm/fixmap_32.h123
-rw-r--r--arch/x86/include/asm/fixmap_64.h83
-rw-r--r--arch/x86/include/asm/floppy.h281
-rw-r--r--arch/x86/include/asm/frame.h27
-rw-r--r--arch/x86/include/asm/ftrace.h24
-rw-r--r--arch/x86/include/asm/futex.h140
-rw-r--r--arch/x86/include/asm/gart.h73
-rw-r--r--arch/x86/include/asm/genapic.h5
-rw-r--r--arch/x86/include/asm/genapic_32.h126
-rw-r--r--arch/x86/include/asm/genapic_64.h58
-rw-r--r--arch/x86/include/asm/geode.h253
-rw-r--r--arch/x86/include/asm/gpio.h56
-rw-r--r--arch/x86/include/asm/hardirq.h11
-rw-r--r--arch/x86/include/asm/hardirq_32.h28
-rw-r--r--arch/x86/include/asm/hardirq_64.h23
-rw-r--r--arch/x86/include/asm/highmem.h82
-rw-r--r--arch/x86/include/asm/hpet.h114
-rw-r--r--arch/x86/include/asm/hugetlb.h93
-rw-r--r--arch/x86/include/asm/hw_irq.h131
-rw-r--r--arch/x86/include/asm/hypertransport.h45
-rw-r--r--arch/x86/include/asm/i387.h400
-rw-r--r--arch/x86/include/asm/i8253.h18
-rw-r--r--arch/x86/include/asm/i8259.h63
-rw-r--r--arch/x86/include/asm/ia32.h170
-rw-r--r--arch/x86/include/asm/ia32_unistd.h18
-rw-r--r--arch/x86/include/asm/idle.h16
-rw-r--r--arch/x86/include/asm/intel_arch_perfmon.h31
-rw-r--r--arch/x86/include/asm/io.h91
-rw-r--r--arch/x86/include/asm/io_32.h284
-rw-r--r--arch/x86/include/asm/io_64.h244
-rw-r--r--arch/x86/include/asm/io_apic.h214
-rw-r--r--arch/x86/include/asm/ioctl.h1
-rw-r--r--arch/x86/include/asm/ioctls.h94
-rw-r--r--arch/x86/include/asm/iommu.h50
-rw-r--r--arch/x86/include/asm/ipcbuf.h28
-rw-r--r--arch/x86/include/asm/ipi.h138
-rw-r--r--arch/x86/include/asm/irq.h50
-rw-r--r--arch/x86/include/asm/irq_regs.h5
-rw-r--r--arch/x86/include/asm/irq_regs_32.h29
-rw-r--r--arch/x86/include/asm/irq_regs_64.h1
-rw-r--r--arch/x86/include/asm/irq_remapping.h8
-rw-r--r--arch/x86/include/asm/irq_vectors.h164
-rw-r--r--arch/x86/include/asm/irqflags.h211
-rw-r--r--arch/x86/include/asm/ist.h34
-rw-r--r--arch/x86/include/asm/k8.h15
-rw-r--r--arch/x86/include/asm/kdebug.h37
-rw-r--r--arch/x86/include/asm/kexec.h175
-rw-r--r--arch/x86/include/asm/kgdb.h79
-rw-r--r--arch/x86/include/asm/kmap_types.h29
-rw-r--r--arch/x86/include/asm/kprobes.h88
-rw-r--r--arch/x86/include/asm/kvm.h211
-rw-r--r--arch/x86/include/asm/kvm_host.h752
-rw-r--r--arch/x86/include/asm/kvm_para.h147
-rw-r--r--arch/x86/include/asm/kvm_x86_emulate.h184
-rw-r--r--arch/x86/include/asm/ldt.h40
-rw-r--r--arch/x86/include/asm/lguest.h94
-rw-r--r--arch/x86/include/asm/lguest_hcall.h71
-rw-r--r--arch/x86/include/asm/linkage.h61
-rw-r--r--arch/x86/include/asm/local.h235
-rw-r--r--arch/x86/include/asm/mach-default/apm.h73
-rw-r--r--arch/x86/include/asm/mach-default/do_timer.h16
-rw-r--r--arch/x86/include/asm/mach-default/entry_arch.h36
-rw-r--r--arch/x86/include/asm/mach-default/mach_apic.h156
-rw-r--r--arch/x86/include/asm/mach-default/mach_apicdef.h24
-rw-r--r--arch/x86/include/asm/mach-default/mach_ipi.h64
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpparse.h17
-rw-r--r--arch/x86/include/asm/mach-default/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-default/mach_timer.h48
-rw-r--r--arch/x86/include/asm/mach-default/mach_traps.h33
-rw-r--r--arch/x86/include/asm/mach-default/mach_wakecpu.h42
-rw-r--r--arch/x86/include/asm/mach-default/pci-functions.h19
-rw-r--r--arch/x86/include/asm/mach-default/setup_arch.h3
-rw-r--r--arch/x86/include/asm/mach-default/smpboot_hooks.h59
-rw-r--r--arch/x86/include/asm/mach-generic/gpio.h15
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apic.h33
-rw-r--r--arch/x86/include/asm/mach-generic/mach_apicdef.h11
-rw-r--r--arch/x86/include/asm/mach-generic/mach_ipi.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpparse.h10
-rw-r--r--arch/x86/include/asm/mach-generic/mach_mpspec.h12
-rw-r--r--arch/x86/include/asm/mach-rdc321x/gpio.h60
-rw-r--r--arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h12
-rw-r--r--arch/x86/include/asm/mach-voyager/do_timer.h17
-rw-r--r--arch/x86/include/asm/mach-voyager/entry_arch.h26
-rw-r--r--arch/x86/include/asm/mach-voyager/setup_arch.h12
-rw-r--r--arch/x86/include/asm/math_emu.h31
-rw-r--r--arch/x86/include/asm/mc146818rtc.h104
-rw-r--r--arch/x86/include/asm/mca.h43
-rw-r--r--arch/x86/include/asm/mca_dma.h201
-rw-r--r--arch/x86/include/asm/mce.h130
-rw-r--r--arch/x86/include/asm/microcode.h47
-rw-r--r--arch/x86/include/asm/mman.h20
-rw-r--r--arch/x86/include/asm/mmconfig.h12
-rw-r--r--arch/x86/include/asm/mmu.h26
-rw-r--r--arch/x86/include/asm/mmu_context.h37
-rw-r--r--arch/x86/include/asm/mmu_context_32.h56
-rw-r--r--arch/x86/include/asm/mmu_context_64.h54
-rw-r--r--arch/x86/include/asm/mmx.h14
-rw-r--r--arch/x86/include/asm/mmzone.h5
-rw-r--r--arch/x86/include/asm/mmzone_32.h134
-rw-r--r--arch/x86/include/asm/mmzone_64.h51
-rw-r--r--arch/x86/include/asm/module.h80
-rw-r--r--arch/x86/include/asm/mpspec.h145
-rw-r--r--arch/x86/include/asm/mpspec_def.h180
-rw-r--r--arch/x86/include/asm/msgbuf.h39
-rw-r--r--arch/x86/include/asm/msidef.h55
-rw-r--r--arch/x86/include/asm/msr-index.h332
-rw-r--r--arch/x86/include/asm/msr.h247
-rw-r--r--arch/x86/include/asm/mtrr.h173
-rw-r--r--arch/x86/include/asm/mutex.h5
-rw-r--r--arch/x86/include/asm/mutex_32.h125
-rw-r--r--arch/x86/include/asm/mutex_64.h100
-rw-r--r--arch/x86/include/asm/nmi.h81
-rw-r--r--arch/x86/include/asm/nops.h118
-rw-r--r--arch/x86/include/asm/numa.h5
-rw-r--r--arch/x86/include/asm/numa_32.h11
-rw-r--r--arch/x86/include/asm/numa_64.h43
-rw-r--r--arch/x86/include/asm/numaq.h169
-rw-r--r--arch/x86/include/asm/numaq/apic.h136
-rw-r--r--arch/x86/include/asm/numaq/apicdef.h14
-rw-r--r--arch/x86/include/asm/numaq/ipi.h25
-rw-r--r--arch/x86/include/asm/numaq/mpparse.h7
-rw-r--r--arch/x86/include/asm/numaq/wakecpu.h43
-rw-r--r--arch/x86/include/asm/olpc.h132
-rw-r--r--arch/x86/include/asm/page.h209
-rw-r--r--arch/x86/include/asm/page_32.h136
-rw-r--r--arch/x86/include/asm/page_64.h105
-rw-r--r--arch/x86/include/asm/param.h22
-rw-r--r--arch/x86/include/asm/paravirt.h1650
-rw-r--r--arch/x86/include/asm/parport.h10
-rw-r--r--arch/x86/include/asm/pat.h22
-rw-r--r--arch/x86/include/asm/pci-direct.h21
-rw-r--r--arch/x86/include/asm/pci.h116
-rw-r--r--arch/x86/include/asm/pci_32.h34
-rw-r--r--arch/x86/include/asm/pci_64.h66
-rw-r--r--arch/x86/include/asm/pda.h137
-rw-r--r--arch/x86/include/asm/percpu.h218
-rw-r--r--arch/x86/include/asm/pgalloc.h114
-rw-r--r--arch/x86/include/asm/pgtable-2level-defs.h20
-rw-r--r--arch/x86/include/asm/pgtable-2level.h79
-rw-r--r--arch/x86/include/asm/pgtable-3level-defs.h28
-rw-r--r--arch/x86/include/asm/pgtable-3level.h175
-rw-r--r--arch/x86/include/asm/pgtable.h562
-rw-r--r--arch/x86/include/asm/pgtable_32.h191
-rw-r--r--arch/x86/include/asm/pgtable_64.h285
-rw-r--r--arch/x86/include/asm/poll.h1
-rw-r--r--arch/x86/include/asm/posix_types.h13
-rw-r--r--arch/x86/include/asm/posix_types_32.h85
-rw-r--r--arch/x86/include/asm/posix_types_64.h119
-rw-r--r--arch/x86/include/asm/prctl.h10
-rw-r--r--arch/x86/include/asm/processor-cyrix.h38
-rw-r--r--arch/x86/include/asm/processor-flags.h100
-rw-r--r--arch/x86/include/asm/processor.h936
-rw-r--r--arch/x86/include/asm/proto.h32
-rw-r--r--arch/x86/include/asm/ptrace-abi.h145
-rw-r--r--arch/x86/include/asm/ptrace.h280
-rw-r--r--arch/x86/include/asm/pvclock-abi.h42
-rw-r--r--arch/x86/include/asm/pvclock.h14
-rw-r--r--arch/x86/include/asm/reboot.h21
-rw-r--r--arch/x86/include/asm/reboot_fixups.h6
-rw-r--r--arch/x86/include/asm/required-features.h82
-rw-r--r--arch/x86/include/asm/resource.h1
-rw-r--r--arch/x86/include/asm/resume-trace.h21
-rw-r--r--arch/x86/include/asm/rio.h63
-rw-r--r--arch/x86/include/asm/rtc.h1
-rw-r--r--arch/x86/include/asm/rwlock.h8
-rw-r--r--arch/x86/include/asm/rwsem.h265
-rw-r--r--arch/x86/include/asm/scatterlist.h33
-rw-r--r--arch/x86/include/asm/seccomp.h5
-rw-r--r--arch/x86/include/asm/seccomp_32.h17
-rw-r--r--arch/x86/include/asm/seccomp_64.h25
-rw-r--r--arch/x86/include/asm/sections.h1
-rw-r--r--arch/x86/include/asm/segment.h209
-rw-r--r--arch/x86/include/asm/sembuf.h24
-rw-r--r--arch/x86/include/asm/serial.h29
-rw-r--r--arch/x86/include/asm/setup.h105
-rw-r--r--arch/x86/include/asm/shmbuf.h51
-rw-r--r--arch/x86/include/asm/shmparam.h6
-rw-r--r--arch/x86/include/asm/sigcontext.h284
-rw-r--r--arch/x86/include/asm/sigcontext32.h75
-rw-r--r--arch/x86/include/asm/siginfo.h10
-rw-r--r--arch/x86/include/asm/signal.h262
-rw-r--r--arch/x86/include/asm/smp.h229
-rw-r--r--arch/x86/include/asm/socket.h57
-rw-r--r--arch/x86/include/asm/sockios.h13
-rw-r--r--arch/x86/include/asm/sparsemem.h34
-rw-r--r--arch/x86/include/asm/spinlock.h364
-rw-r--r--arch/x86/include/asm/spinlock_types.h20
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h21
-rw-r--r--arch/x86/include/asm/stat.h114
-rw-r--r--arch/x86/include/asm/statfs.h12
-rw-r--r--arch/x86/include/asm/string.h5
-rw-r--r--arch/x86/include/asm/string_32.h326
-rw-r--r--arch/x86/include/asm/string_64.h60
-rw-r--r--arch/x86/include/asm/summit/apic.h184
-rw-r--r--arch/x86/include/asm/summit/apicdef.h13
-rw-r--r--arch/x86/include/asm/summit/ipi.h25
-rw-r--r--arch/x86/include/asm/summit/mpparse.h109
-rw-r--r--arch/x86/include/asm/suspend.h5
-rw-r--r--arch/x86/include/asm/suspend_32.h51
-rw-r--r--arch/x86/include/asm/suspend_64.h52
-rw-r--r--arch/x86/include/asm/swiotlb.h58
-rw-r--r--arch/x86/include/asm/sync_bitops.h130
-rw-r--r--arch/x86/include/asm/syscall.h213
-rw-r--r--arch/x86/include/asm/syscalls.h93
-rw-r--r--arch/x86/include/asm/system.h425
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/tce.h48
-rw-r--r--arch/x86/include/asm/termbits.h198
-rw-r--r--arch/x86/include/asm/termios.h113
-rw-r--r--arch/x86/include/asm/therm_throt.h9
-rw-r--r--arch/x86/include/asm/thread_info.h264
-rw-r--r--arch/x86/include/asm/time.h63
-rw-r--r--arch/x86/include/asm/timer.h66
-rw-r--r--arch/x86/include/asm/timex.h19
-rw-r--r--arch/x86/include/asm/tlb.h11
-rw-r--r--arch/x86/include/asm/tlbflush.h178
-rw-r--r--arch/x86/include/asm/topology.h258
-rw-r--r--arch/x86/include/asm/trampoline.h21
-rw-r--r--arch/x86/include/asm/traps.h81
-rw-r--r--arch/x86/include/asm/tsc.h62
-rw-r--r--arch/x86/include/asm/types.h36
-rw-r--r--arch/x86/include/asm/uaccess.h454
-rw-r--r--arch/x86/include/asm/uaccess_32.h218
-rw-r--r--arch/x86/include/asm/uaccess_64.h202
-rw-r--r--arch/x86/include/asm/ucontext.h18
-rw-r--r--arch/x86/include/asm/unaligned.h14
-rw-r--r--arch/x86/include/asm/unistd.h13
-rw-r--r--arch/x86/include/asm/unistd_32.h379
-rw-r--r--arch/x86/include/asm/unistd_64.h693
-rw-r--r--arch/x86/include/asm/unwind.h13
-rw-r--r--arch/x86/include/asm/user.h5
-rw-r--r--arch/x86/include/asm/user32.h70
-rw-r--r--arch/x86/include/asm/user_32.h131
-rw-r--r--arch/x86/include/asm/user_64.h137
-rw-r--r--arch/x86/include/asm/uv/bios.h94
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h332
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h354
-rw-r--r--arch/x86/include/asm/uv/uv_irq.h36
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1295
-rw-r--r--arch/x86/include/asm/vdso.h47
-rw-r--r--arch/x86/include/asm/vga.h20
-rw-r--r--arch/x86/include/asm/vgtod.h29
-rw-r--r--arch/x86/include/asm/vic.h61
-rw-r--r--arch/x86/include/asm/visws/cobalt.h125
-rw-r--r--arch/x86/include/asm/visws/lithium.h53
-rw-r--r--arch/x86/include/asm/visws/piix4.h107
-rw-r--r--arch/x86/include/asm/visws/sgivw.h5
-rw-r--r--arch/x86/include/asm/vm86.h208
-rw-r--r--arch/x86/include/asm/vmi.h263
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/voyager.h528
-rw-r--r--arch/x86/include/asm/vsyscall.h44
-rw-r--r--arch/x86/include/asm/xcr.h49
-rw-r--r--arch/x86/include/asm/xen/events.h24
-rw-r--r--arch/x86/include/asm/xen/grant_table.h7
-rw-r--r--arch/x86/include/asm/xen/hypercall.h527
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h82
-rw-r--r--arch/x86/include/asm/xen/interface.h175
-rw-r--r--arch/x86/include/asm/xen/interface_32.h97
-rw-r--r--arch/x86/include/asm/xen/interface_64.h159
-rw-r--r--arch/x86/include/asm/xen/page.h165
-rw-r--r--arch/x86/include/asm/xor.h5
-rw-r--r--arch/x86/include/asm/xor_32.h888
-rw-r--r--arch/x86/include/asm/xor_64.h361
-rw-r--r--arch/x86/include/asm/xsave.h118
-rw-r--r--arch/x86/kernel/Makefile23
-rw-r--r--arch/x86/kernel/acpi/boot.c72
-rw-r--r--arch/x86/kernel/acpi/sleep.c25
-rw-r--r--arch/x86/kernel/alternative.c46
-rw-r--r--arch/x86/kernel/amd_iommu.c606
-rw-r--r--arch/x86/kernel/amd_iommu_init.c579
-rw-r--r--arch/x86/kernel/aperture_64.c7
-rw-r--r--arch/x86/kernel/apic.c (renamed from arch/x86/kernel/apic_32.c)1197
-rw-r--r--arch/x86/kernel/apic_64.c1393
-rw-r--r--arch/x86/kernel/apm_32.c5
-rw-r--r--arch/x86/kernel/asm-offsets_64.c13
-rw-r--r--arch/x86/kernel/bios_uv.c141
-rw-r--r--arch/x86/kernel/cpu/.gitignore1
-rw-r--r--arch/x86/kernel/cpu/Makefile34
-rw-r--r--arch/x86/kernel/cpu/addon_cpuid_features.c105
-rw-r--r--arch/x86/kernel/cpu/amd.c549
-rw-r--r--arch/x86/kernel/cpu/amd_64.c222
-rw-r--r--arch/x86/kernel/cpu/bugs.c29
-rw-r--r--arch/x86/kernel/cpu/centaur.c15
-rw-r--r--arch/x86/kernel/cpu/centaur_64.c6
-rw-r--r--arch/x86/kernel/cpu/cmpxchg.c72
-rw-r--r--arch/x86/kernel/cpu/common.c1009
-rw-r--r--arch/x86/kernel/cpu/common_64.c681
-rw-r--r--arch/x86/kernel/cpu/cpu.h19
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c22
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c44
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c55
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c151
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c6
-rw-r--r--arch/x86/kernel/cpu/cyrix.c73
-rw-r--r--arch/x86/kernel/cpu/feature_names.c83
-rw-r--r--arch/x86/kernel/cpu/intel.c357
-rw-r--r--arch/x86/kernel/cpu/intel_64.c95
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c178
-rw-r--r--arch/x86/kernel/cpu/mcheck/k7.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_32.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_64.c27
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd_64.c22
-rw-r--r--arch/x86/kernel/cpu/mcheck/non-fatal.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/p4.c4
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c1
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.pl32
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c15
-rw-r--r--arch/x86/kernel/cpu/mtrr/if.c4
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c281
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c109
-rw-r--r--arch/x86/kernel/cpu/powerflags.c20
-rw-r--r--arch/x86/kernel/cpu/proc.c6
-rw-r--r--arch/x86/kernel/cpu/transmeta.c32
-rw-r--r--arch/x86/kernel/cpu/umc.c3
-rw-r--r--arch/x86/kernel/cpuid.c18
-rw-r--r--arch/x86/kernel/crash_dump_32.c3
-rw-r--r--arch/x86/kernel/crash_dump_64.c14
-rw-r--r--arch/x86/kernel/doublefault_32.c2
-rw-r--r--arch/x86/kernel/ds.c954
-rw-r--r--arch/x86/kernel/dumpstack_32.c449
-rw-r--r--arch/x86/kernel/dumpstack_64.c575
-rw-r--r--arch/x86/kernel/e820.c65
-rw-r--r--arch/x86/kernel/early-quirks.c120
-rw-r--r--arch/x86/kernel/early_printk.c748
-rw-r--r--arch/x86/kernel/efi.c10
-rw-r--r--arch/x86/kernel/efi_32.c4
-rw-r--r--arch/x86/kernel/entry_32.S115
-rw-r--r--arch/x86/kernel/entry_64.S220
-rw-r--r--arch/x86/kernel/es7000_32.c (renamed from arch/x86/mach-es7000/es7000plat.c)123
-rw-r--r--arch/x86/kernel/ftrace.c124
-rw-r--r--arch/x86/kernel/genapic_64.c87
-rw-r--r--arch/x86/kernel/genapic_flat_64.c66
-rw-r--r--arch/x86/kernel/genx2apic_cluster.c159
-rw-r--r--arch/x86/kernel/genx2apic_phys.c154
-rw-r--r--arch/x86/kernel/genx2apic_uv_x.c157
-rw-r--r--arch/x86/kernel/head.c1
-rw-r--r--arch/x86/kernel/head64.c17
-rw-r--r--arch/x86/kernel/head_32.S42
-rw-r--r--arch/x86/kernel/head_64.S5
-rw-r--r--arch/x86/kernel/hpet.c510
-rw-r--r--arch/x86/kernel/i387.c168
-rw-r--r--arch/x86/kernel/i8259.c24
-rw-r--r--arch/x86/kernel/io_apic.c (renamed from arch/x86/kernel/io_apic_64.c)2147
-rw-r--r--arch/x86/kernel/io_apic_32.c2900
-rw-r--r--arch/x86/kernel/io_delay.c11
-rw-r--r--arch/x86/kernel/ioport.c1
-rw-r--r--arch/x86/kernel/ipi.c9
-rw-r--r--arch/x86/kernel/irq.c189
-rw-r--r--arch/x86/kernel/irq_32.c201
-rw-r--r--arch/x86/kernel/irq_64.c169
-rw-r--r--arch/x86/kernel/irqinit_32.c84
-rw-r--r--arch/x86/kernel/irqinit_64.c74
-rw-r--r--arch/x86/kernel/k8.c5
-rw-r--r--arch/x86/kernel/kdebugfs.c9
-rw-r--r--arch/x86/kernel/kgdb.c50
-rw-r--r--arch/x86/kernel/kprobes.c7
-rw-r--r--arch/x86/kernel/kvm.c2
-rw-r--r--arch/x86/kernel/kvmclock.c32
-rw-r--r--arch/x86/kernel/ldt.c16
-rw-r--r--arch/x86/kernel/machine_kexec_32.c57
-rw-r--r--arch/x86/kernel/machine_kexec_64.c2
-rw-r--r--arch/x86/kernel/mfgpt_32.c52
-rw-r--r--arch/x86/kernel/microcode.c851
-rw-r--r--arch/x86/kernel/microcode_amd.c435
-rw-r--r--arch/x86/kernel/microcode_core.c508
-rw-r--r--arch/x86/kernel/microcode_intel.c480
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c2
-rw-r--r--arch/x86/kernel/module_64.c11
-rw-r--r--arch/x86/kernel/mpparse.c225
-rw-r--r--arch/x86/kernel/msr.c42
-rw-r--r--arch/x86/kernel/nmi.c50
-rw-r--r--arch/x86/kernel/numaq_32.c204
-rw-r--r--arch/x86/kernel/olpc.c6
-rw-r--r--arch/x86/kernel/paravirt-spinlocks.c37
-rw-r--r--arch/x86/kernel/paravirt.c15
-rw-r--r--arch/x86/kernel/paravirt_patch_32.c2
-rw-r--r--arch/x86/kernel/pci-calgary_64.c210
-rw-r--r--arch/x86/kernel/pci-dma.c347
-rw-r--r--arch/x86/kernel/pci-gart_64.c181
-rw-r--r--arch/x86/kernel/pci-nommu.c20
-rw-r--r--arch/x86/kernel/pci-swiotlb_64.c4
-rw-r--r--arch/x86/kernel/pcspeaker.c13
-rw-r--r--arch/x86/kernel/process.c24
-rw-r--r--arch/x86/kernel/process_32.c108
-rw-r--r--arch/x86/kernel/process_64.c261
-rw-r--r--arch/x86/kernel/ptrace.c653
-rw-r--r--arch/x86/kernel/pvclock.c12
-rw-r--r--arch/x86/kernel/quirks.c44
-rw-r--r--arch/x86/kernel/reboot.c17
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S178
-rw-r--r--arch/x86/kernel/rtc.c42
-rw-r--r--arch/x86/kernel/setup.c305
-rw-r--r--arch/x86/kernel/setup_percpu.c53
-rw-r--r--arch/x86/kernel/sigframe.h19
-rw-r--r--arch/x86/kernel/signal_32.c280
-rw-r--r--arch/x86/kernel/signal_64.c313
-rw-r--r--arch/x86/kernel/smp.c6
-rw-r--r--arch/x86/kernel/smpboot.c382
-rw-r--r--arch/x86/kernel/smpcommon.c17
-rw-r--r--arch/x86/kernel/smpcommon_32.c1
-rw-r--r--arch/x86/kernel/step.c35
-rw-r--r--arch/x86/kernel/summit_32.c2
-rw-r--r--arch/x86/kernel/sys_i386_32.c2
-rw-r--r--arch/x86/kernel/sys_x86_64.c44
-rw-r--r--arch/x86/kernel/syscall_64.c4
-rw-r--r--arch/x86/kernel/syscall_table_32.S6
-rw-r--r--arch/x86/kernel/time_32.c9
-rw-r--r--arch/x86/kernel/time_64.c23
-rw-r--r--arch/x86/kernel/tlb_32.c8
-rw-r--r--arch/x86/kernel/tlb_uv.c5
-rw-r--r--arch/x86/kernel/tls.c1
-rw-r--r--arch/x86/kernel/traps.c (renamed from arch/x86/kernel/traps_32.c)902
-rw-r--r--arch/x86/kernel/traps_64.c1217
-rw-r--r--arch/x86/kernel/tsc.c424
-rw-r--r--arch/x86/kernel/tsc_sync.c6
-rw-r--r--arch/x86/kernel/uv_irq.c79
-rw-r--r--arch/x86/kernel/uv_sysfs.c72
-rw-r--r--arch/x86/kernel/visws_quirks.c96
-rw-r--r--arch/x86/kernel/vm86_32.c1
-rw-r--r--arch/x86/kernel/vmi_32.c20
-rw-r--r--arch/x86/kernel/vmiclock_32.c3
-rw-r--r--arch/x86/kernel/vmlinux_32.lds.S17
-rw-r--r--arch/x86/kernel/vmlinux_64.lds.S9
-rw-r--r--arch/x86/kernel/vsmp_64.c2
-rw-r--r--arch/x86/kernel/xsave.c345
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/Makefile6
-rw-r--r--arch/x86/kvm/i8254.c107
-rw-r--r--arch/x86/kvm/i8254.h7
-rw-r--r--arch/x86/kvm/i8259.c62
-rw-r--r--arch/x86/kvm/irq.c3
-rw-r--r--arch/x86/kvm/irq.h8
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h32
-rw-r--r--arch/x86/kvm/lapic.c63
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c827
-rw-r--r--arch/x86/kvm/mmu.h3
-rw-r--r--arch/x86/kvm/paging_tmpl.h281
-rw-r--r--arch/x86/kvm/svm.c301
-rw-r--r--arch/x86/kvm/vmx.c893
-rw-r--r--arch/x86/kvm/vmx.h32
-rw-r--r--arch/x86/kvm/x86.c984
-rw-r--r--arch/x86/kvm/x86.h22
-rw-r--r--arch/x86/kvm/x86_emulate.c421
-rw-r--r--arch/x86/lguest/boot.c44
-rw-r--r--arch/x86/lib/Makefile3
-rw-r--r--arch/x86/lib/copy_user_64.S2
-rw-r--r--arch/x86/lib/copy_user_nocache_64.S3
-rw-r--r--arch/x86/lib/msr-on-cpu.c76
-rw-r--r--arch/x86/lib/string_32.c42
-rw-r--r--arch/x86/lib/strstr_32.c6
-rw-r--r--arch/x86/lib/usercopy_32.c7
-rw-r--r--arch/x86/mach-default/setup.c51
-rw-r--r--arch/x86/mach-es7000/Makefile5
-rw-r--r--arch/x86/mach-es7000/es7000.h114
-rw-r--r--arch/x86/mach-generic/Makefile1
-rw-r--r--arch/x86/mach-generic/bigsmp.c13
-rw-r--r--arch/x86/mach-generic/es7000.c47
-rw-r--r--arch/x86/mach-generic/numaq.c26
-rw-r--r--arch/x86/mach-generic/summit.c25
-rw-r--r--arch/x86/mach-rdc321x/platform.c1
-rw-r--r--arch/x86/mach-voyager/voyager_smp.c6
-rw-r--r--arch/x86/mm/Makefile9
-rw-r--r--arch/x86/mm/dump_pagetables.c10
-rw-r--r--arch/x86/mm/fault.c67
-rw-r--r--arch/x86/mm/gup.c298
-rw-r--r--arch/x86/mm/highmem_32.c1
-rw-r--r--arch/x86/mm/hugetlbpage.c78
-rw-r--r--arch/x86/mm/init_32.c99
-rw-r--r--arch/x86/mm/init_64.c361
-rw-r--r--arch/x86/mm/ioremap.c213
-rw-r--r--arch/x86/mm/memtest.c122
-rw-r--r--arch/x86/mm/mmio-mod.c91
-rw-r--r--arch/x86/mm/numa_32.c (renamed from arch/x86/mm/discontig_32.c)5
-rw-r--r--arch/x86/mm/numa_64.c18
-rw-r--r--arch/x86/mm/pageattr-test.c12
-rw-r--r--arch/x86/mm/pageattr.c495
-rw-r--r--arch/x86/mm/pat.c268
-rw-r--r--arch/x86/mm/pf_in.c121
-rw-r--r--arch/x86/mm/pgtable.c9
-rw-r--r--arch/x86/mm/pgtable_32.c50
-rw-r--r--arch/x86/mm/srat_32.c12
-rw-r--r--arch/x86/mm/srat_64.c2
-rw-r--r--arch/x86/mm/testmmiotrace.c4
-rw-r--r--arch/x86/oprofile/Makefile2
-rw-r--r--arch/x86/oprofile/backtrace.c3
-rw-r--r--arch/x86/oprofile/nmi_int.c219
-rw-r--r--arch/x86/oprofile/op_counter.h18
-rw-r--r--arch/x86/oprofile/op_model_amd.c546
-rw-r--r--arch/x86/oprofile/op_model_athlon.c190
-rw-r--r--arch/x86/oprofile/op_model_p4.c207
-rw-r--r--arch/x86/oprofile/op_model_ppro.c120
-rw-r--r--arch/x86/oprofile/op_x86_model.h17
-rw-r--r--arch/x86/pci/Makefile12
-rw-r--r--arch/x86/pci/acpi.c5
-rw-r--r--arch/x86/pci/amd_bus.c52
-rw-r--r--arch/x86/pci/early.c16
-rw-r--r--arch/x86/pci/fixup.c31
-rw-r--r--arch/x86/pci/i386.c27
-rw-r--r--arch/x86/pci/irq.c176
-rw-r--r--arch/x86/pci/legacy.c11
-rw-r--r--arch/x86/pci/mmconfig-shared.c79
-rw-r--r--arch/x86/pci/numaq_32.c (renamed from arch/x86/pci/numa.c)9
-rw-r--r--arch/x86/pci/pci.h3
-rw-r--r--arch/x86/pci/visws.c23
-rw-r--r--arch/x86/power/cpu_32.c13
-rw-r--r--arch/x86/power/cpu_64.c7
-rw-r--r--arch/x86/power/hibernate_asm_32.S40
-rw-r--r--arch/x86/vdso/Makefile2
-rw-r--r--arch/x86/vdso/vdso32-setup.c19
-rw-r--r--arch/x86/vdso/vdso32.S13
-rw-r--r--arch/x86/vdso/vma.c11
-rw-r--r--arch/x86/xen/Kconfig22
-rw-r--r--arch/x86/xen/Makefile14
-rw-r--r--arch/x86/xen/debugfs.c123
-rw-r--r--arch/x86/xen/debugfs.h10
-rw-r--r--arch/x86/xen/enlighten.c873
-rw-r--r--arch/x86/xen/irq.c141
-rw-r--r--arch/x86/xen/mmu.c613
-rw-r--r--arch/x86/xen/mmu.h32
-rw-r--r--arch/x86/xen/multicalls.c116
-rw-r--r--arch/x86/xen/setup.c81
-rw-r--r--arch/x86/xen/smp.c215
-rw-r--r--arch/x86/xen/spinlock.c428
-rw-r--r--arch/x86/xen/suspend.c5
-rw-r--r--arch/x86/xen/time.c23
-rw-r--r--arch/x86/xen/xen-asm_32.S (renamed from arch/x86/xen/xen-asm.S)2
-rw-r--r--arch/x86/xen/xen-asm_64.S285
-rw-r--r--arch/x86/xen/xen-head.S28
-rw-r--r--arch/x86/xen/xen-ops.h29
634 files changed, 64242 insertions(+), 17341 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 09521332636b..f4ed47db79ee 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -18,25 +18,28 @@ config X86_64
 ### Arch settings
 config X86
 	def_bool y
+	select HAVE_AOUT if X86_32
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
+	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
+	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select HAVE_KRETPROBES
+	select HAVE_FTRACE_MCOUNT_RECORD
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FTRACE
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
+	select HAVE_ARCH_TRACEHOOK
+	select HAVE_GENERIC_DMA_COHERENT if X86_32
+	select HAVE_EFFICIENT_UNALIGNED_ACCESS
 
 config ARCH_DEFCONFIG
 	string
 	default "arch/x86/configs/i386_defconfig" if X86_32
 	default "arch/x86/configs/x86_64_defconfig" if X86_64
 
-
-config GENERIC_LOCKBREAK
-	def_bool n
-
 config GENERIC_TIME
 	def_bool y
 
@@ -89,7 +92,7 @@ config GENERIC_HWEIGHT
 	def_bool y
 
 config GENERIC_GPIO
-	def_bool n
+	bool
 
 config ARCH_MAY_HAVE_PC_FDC
 	def_bool y
@@ -100,12 +103,6 @@ config RWSEM_GENERIC_SPINLOCK
 config RWSEM_XCHGADD_ALGORITHM
 	def_bool X86_XADD
 
-config ARCH_HAS_ILOG2_U32
-	def_bool n
-
-config ARCH_HAS_ILOG2_U64
-	def_bool n
-
 config ARCH_HAS_CPU_IDLE_WAIT
 	def_bool y
 
@@ -119,6 +116,9 @@ config GENERIC_TIME_VSYSCALL
 config ARCH_HAS_CPU_RELAX
 	def_bool y
 
+config ARCH_HAS_DEFAULT_IDLE
+	def_bool y
+
 config ARCH_HAS_CACHE_LINE_SIZE
 	def_bool y
 
@@ -147,9 +147,6 @@ config AUDIT_ARCH
 	bool
 	default X86_64
 
-config ARCH_SUPPORTS_AOUT
-	def_bool y
-
 config ARCH_SUPPORTS_OPTIMIZED_INLINING
 	def_bool y
 
@@ -200,6 +197,7 @@ config X86_TRAMPOLINE
 config KTIME_SCALAR
 	def_bool X86_32
 source "init/Kconfig"
+source "kernel/Kconfig.freezer"
 
 menu "Processor type and features"
 
@@ -329,20 +327,6 @@ config X86_BIGSMP
 
 endif
 
-config X86_RDC321X
-	bool "RDC R-321x SoC"
-	depends on X86_32
-	select M486
-	select X86_REBOOTFIXUPS
-	select GENERIC_GPIO
-	select LEDS_CLASS
-	select LEDS_GPIO
-	select NEW_LEDS
-	help
-	  This option is needed for RDC R-321x system-on-chip, also known
-	  as R-8610-(G).
-	  If you don't have one of these chips, you should say N here.
-
 config X86_VSMP
 	bool "Support for ScaleMP vSMP"
 	select PARAVIRT
@@ -366,6 +350,16 @@ config X86_VISWS
 	  A kernel compiled for the Visual Workstation will run on general
 	  PCs as well. See <file:Documentation/sgi-visws.txt> for details.
 
+config X86_RDC321X
+	bool "RDC R-321x SoC"
+	depends on X86_32
+	select M486
+	select X86_REBOOTFIXUPS
+	help
+	  This option is needed for RDC R-321x system-on-chip, also known
+	  as R-8610-(G).
+	  If you don't have one of these chips, you should say N here.
+
 config SCHED_NO_NO_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
@@ -447,7 +441,6 @@ config PARAVIRT_DEBUG
 
 config MEMTEST
 	bool "Memtest"
-	depends on X86_64
 	help
 	  This option adds a kernel parameter 'memtest', which allows memtest
 	  to be set.
@@ -554,6 +547,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT
 config AMD_IOMMU
 	bool "AMD IOMMU support"
 	select SWIOTLB
+	select PCI_MSI
 	depends on X86_64 && PCI && ACPI
 	help
 	  With this option you can enable support for AMD IOMMU hardware in
@@ -578,35 +572,29 @@ config SWIOTLB
 
 config IOMMU_HELPER
 	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU)
+
 config MAXSMP
 	bool "Configure Maximum number of SMP Processors and NUMA Nodes"
-	depends on X86_64 && SMP
+	depends on X86_64 && SMP && BROKEN
 	default n
 	help
 	  Configure maximum number of CPUS and NUMA Nodes for this architecture.
 	  If unsure, say N.
 
-if MAXSMP
-config NR_CPUS
-	int
-	default "4096"
-endif
-
-if !MAXSMP
 config NR_CPUS
-	int "Maximum number of CPUs (2-4096)"
-	range 2 4096
+	int "Maximum number of CPUs (2-512)" if !MAXSMP
+	range 2 512
 	depends on SMP
+	default "4096" if MAXSMP
 	default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000
 	default "8"
 	help
 	  This allows you to specify the maximum number of CPUs which this
-	  kernel will support. The maximum supported value is 4096 and the
+	  kernel will support. The maximum supported value is 512 and the
 	  minimum value which makes sense is 2.
 
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
-endif
 
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
@@ -789,9 +777,8 @@ config I8K
 	  Say N otherwise.
 
 config X86_REBOOTFIXUPS
-	def_bool n
-	prompt "Enable X86 board specific fixups for reboot"
-	depends on X86_32 && X86
+	bool "Enable X86 board specific fixups for reboot"
+	depends on X86_32
 	---help---
 	  This enables chipset and/or board specific fixups to be done
 	  in order to get reboot to work correctly. This is only needed on
@@ -807,23 +794,45 @@ config X86_REBOOTFIXUPS
 	  Say N otherwise.
 
 config MICROCODE
-	tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support"
+	tristate "/dev/cpu/microcode - microcode support"
 	select FW_LOADER
 	---help---
 	  If you say Y here, you will be able to update the microcode on
-	  Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II,
-	  Pentium III, Pentium 4, Xeon etc. You will obviously need the
-	  actual microcode binary data itself which is not shipped with the
-	  Linux kernel.
+	  certain Intel and AMD processors. The Intel support is for the
+	  IA32 family, e.g. Pentium Pro, Pentium II, Pentium III,
+	  Pentium 4, Xeon etc. The AMD support is for family 0x10 and
+	  0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra.
+	  You will obviously need the actual microcode binary data itself
+	  which is not shipped with the Linux kernel.
 
-	  For latest news and information on obtaining all the required
-	  ingredients for this driver, check:
-	  <http://www.urbanmyth.org/microcode/>.
+	  This option selects the general module only, you need to select
+	  at least one vendor specific module as well.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called microcode.
 
-config MICROCODE_OLD_INTERFACE
+config MICROCODE_INTEL
+	bool "Intel microcode patch loading support"
+	depends on MICROCODE
+	default MICROCODE
+	select FW_LOADER
+	--help---
+	  This options enables microcode patch loading support for Intel
+	  processors.
+
+	  For latest news and information on obtaining all the required
+	  Intel ingredients for this driver, check:
+	  <http://www.urbanmyth.org/microcode/>.
+
+config MICROCODE_AMD
+	bool "AMD microcode patch loading support"
+	depends on MICROCODE
+	select FW_LOADER
+	--help---
+	  If you select this option, microcode patch loading support for AMD
+	  processors will be enabled.
+
+ config MICROCODE_OLD_INTERFACE
 	def_bool y
 	depends on MICROCODE
 
@@ -953,16 +962,17 @@ config HIGHMEM
 	depends on X86_32 && (HIGHMEM64G || HIGHMEM4G)
 
 config X86_PAE
-	def_bool n
-	prompt "PAE (Physical Address Extension) Support"
+	bool "PAE (Physical Address Extension) Support"
 	depends on X86_32 && !HIGHMEM4G
-	select RESOURCES_64BIT
 	help
 	  PAE is required for NX support, and furthermore enables
 	  larger swapspace support for non-overcommit purposes. It
 	  has the cost of more pagetable lookup overhead, and also
 	  consumes more pagetable space per process.
 
+config ARCH_PHYS_ADDR_T_64BIT
+	def_bool X86_64 || X86_PAE
+
 # Common NUMA Features
 config NUMA
 	bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)"
@@ -976,9 +986,9 @@ config NUMA
 	  local memory controller of the CPU and add some more
 	  NUMA awareness to the kernel.
 
-	  For i386 this is currently highly experimental and should be only
+	  For 32-bit this is currently highly experimental and should be only
 	  used for kernel development. It might also cause boot failures.
-	  For x86_64 this is recommended on all multiprocessor Opteron systems.
+	  For 64-bit this is recommended on all multiprocessor Opteron systems.
 	  If the system is EM64T, you should say N unless your system is
 	  EM64T NUMA.
 
@@ -1021,17 +1031,10 @@ config NUMA_EMU
 	  into virtual nodes when booted with "numa=fake=N", where N is the
 	  number of nodes. This is only useful for debugging.
 
-if MAXSMP
-
 config NODES_SHIFT
-	int
-	default "9"
-endif
-
-if !MAXSMP
-config NODES_SHIFT
-	int "Maximum NUMA Nodes (as a power of 2)"
+	int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
 	range 1 9 if X86_64
+	default "9" if MAXSMP
 	default "6" if X86_64
 	default "4" if X86_NUMAQ
 	default "3"
@@ -1039,7 +1042,6 @@ config NODES_SHIFT
 	help
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system. Increases memory reserved to accomodate various tables.
-endif
 
 config HAVE_ARCH_BOOTMEM_NODE
 	def_bool y
@@ -1059,7 +1061,7 @@ config HAVE_ARCH_ALLOC_REMAP
 
 config ARCH_FLATMEM_ENABLE
 	def_bool y
-	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA
+	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
@@ -1075,7 +1077,7 @@ config ARCH_SPARSEMEM_DEFAULT
 
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
-	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC)
+	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH
 	select SPARSEMEM_STATIC if X86_32
 	select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
@@ -1098,6 +1100,56 @@ config HIGHPTE
 	  low memory. Setting this option will put user-space page table
 	  entries in high memory.
 
+config X86_CHECK_BIOS_CORRUPTION
+	bool "Check for low memory corruption"
+	help
+	  Periodically check for memory corruption in low memory, which
+	  is suspected to be caused by BIOS. Even when enabled in the
+	  configuration, it is disabled at runtime. Enable it by
+	  setting "memory_corruption_check=1" on the kernel command
+	  line. By default it scans the low 64k of memory every 60
+	  seconds; see the memory_corruption_check_size and
+	  memory_corruption_check_period parameters in
+	  Documentation/kernel-parameters.txt to adjust this.
+
+	  When enabled with the default parameters, this option has
+	  almost no overhead, as it reserves a relatively small amount
+	  of memory and scans it infrequently. It both detects corruption
+	  and prevents it from affecting the running system.
+
+	  It is, however, intended as a diagnostic tool; if repeatable
+	  BIOS-originated corruption always affects the same memory,
+	  you can use memmap= to prevent the kernel from using that
+	  memory.
+
+config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
+	bool "Set the default setting of memory_corruption_check"
+	depends on X86_CHECK_BIOS_CORRUPTION
+	default y
+	help
+	  Set whether the default state of memory_corruption_check is
+	  on or off.
+
+config X86_RESERVE_LOW_64K
+	bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
+	default y
+	help
+	  Reserve the first 64K of physical RAM on BIOSes that are known
+	  to potentially corrupt that memory range. A numbers of BIOSes are
+	  known to utilize this area during suspend/resume, so it must not
+	  be used by the kernel.
+
+	  Set this to N if you are absolutely sure that you trust the BIOS
+	  to get all its memory reservations and usages right.
+
+	  If you have doubts about the BIOS (e.g. suspend/resume does not
+	  work or there's kernel crashes after certain hardware hotplug
+	  events) and it's not AMI or Phoenix, then you might want to enable
+	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
+	  corruption patterns.
+
+	  Say Y if unsure.
+
 config MATH_EMULATION
 	bool
 	prompt "Math emulation" if X86_32
@@ -1156,10 +1208,10 @@ config MTRR
 	  You can safely say Y even if your machine doesn't have MTRRs, you'll
 	  just add about 9 KB to your kernel.
 
-	  See <file:Documentation/mtrr.txt> for more information.
+	  See <file:Documentation/x86/mtrr.txt> for more information.
 
 config MTRR_SANITIZER
-	bool
+	def_bool y
 	prompt "MTRR cleanup support"
 	depends on MTRR
 	help
@@ -1170,7 +1222,7 @@ config MTRR_SANITIZER
 	  The largest mtrr entry size for a continous block can be set with
 	  mtrr_chunk_size.
 
-	  If unsure, say N.
+	  If unsure, say Y.
 
 config MTRR_SANITIZER_ENABLE_DEFAULT
 	int "MTRR cleanup enable value (0-1)"
@@ -1205,8 +1257,7 @@ config X86_PAT
 	  If unsure, say Y.
 
 config EFI
-	def_bool n
-	prompt "EFI runtime service support"
+	bool "EFI runtime service support"
 	depends on ACPI
 	---help---
 	  This enables the kernel to use EFI runtime services that are
@@ -1219,18 +1270,9 @@ config EFI
 	  resultant kernel should continue to boot on existing non-EFI
 	  platforms.
 
-config IRQBALANCE
-	def_bool y
-	prompt "Enable kernel irq balancing"
-	depends on X86_32 && SMP && X86_IO_APIC
-	help
-	  The default yes will allow the kernel to do irq load balancing.
-	  Saying no will keep the kernel from doing irq load balancing.
-
 config SECCOMP
 	def_bool y
 	prompt "Enable seccomp to safely compute untrusted bytecode"
-	depends on PROC_FS
 	help
 	  This kernel feature is useful for number crunching applications
 	  that may need to compute untrusted bytecode during their
@@ -1238,7 +1280,7 @@ config SECCOMP
 	  the process as file descriptors supporting the read/write
 	  syscalls, it's possible to isolate those applications in
 	  their own address space using seccomp. Once seccomp is
-	  enabled via /proc/<pid>/seccomp, it cannot be disabled
+	  enabled via prctl(PR_SET_SECCOMP), it cannot be disabled
 	  and the task is only allowed to execute a few safe syscalls
 	  defined by each seccomp mode.
 
@@ -1288,7 +1330,7 @@ config KEXEC
 	  strongly in flux, so no good recommendation can be made.
 
 config CRASH_DUMP
-	bool "kernel crash dumps (EXPERIMENTAL)"
+	bool "kernel crash dumps"
 	depends on X86_64 || (X86_32 && HIGHMEM)
 	help
 	  Generate crash dump after being started by kexec.
@@ -1301,6 +1343,14 @@ config CRASH_DUMP
 	  (CONFIG_RELOCATABLE=y).
 	  For more details see Documentation/kdump/kdump.txt
 
+config KEXEC_JUMP
+	bool "kexec jump (EXPERIMENTAL)"
+	depends on EXPERIMENTAL
+	depends on KEXEC && HIBERNATION && X86_32
+	help
+	  Jump between original kernel and kexeced kernel and invoke
+	  code in physical address mode via KEXEC
+
 config PHYSICAL_START
 	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
 	default "0x1000000" if X86_NUMAQ
@@ -1387,14 +1437,14 @@ config PHYSICAL_ALIGN
 	  Don't change this unless you know what you are doing.
 
 config HOTPLUG_CPU
-	bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)"
-	depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER
+	bool "Support for hot-pluggable CPUs"
+	depends on SMP && HOTPLUG && !X86_VOYAGER
 	---help---
-	  Say Y here to experiment with turning CPUs off and on, and to
-	  enable suspend on SMP systems. CPUs can be controlled through
-	  /sys/devices/system/cpu.
-	  Say N if you want to disable CPU hotplug and don't need to
-	  suspend.
+	  Say Y here to allow turning CPUs off and on. CPUs can be
+	  controlled through /sys/devices/system/cpu.
+	  ( Note: power management support will enable this option
+	    automatically on SMP systems. )
+	  Say N if you want to disable CPU hotplug.
 
 config COMPAT_VDSO
 	def_bool y
@@ -1409,6 +1459,51 @@ config COMPAT_VDSO
 
 	  If unsure, say Y.
 
+config CMDLINE_BOOL
+	bool "Built-in kernel command line"
+	default n
+	help
+	  Allow for specifying boot arguments to the kernel at
+	  build time. On some systems (e.g. embedded ones), it is
+	  necessary or convenient to provide some or all of the
+	  kernel boot arguments with the kernel itself (that is,
+	  to not rely on the boot loader to provide them.)
+
+	  To compile command line arguments into the kernel,
+	  set this option to 'Y', then fill in the
+	  the boot arguments in CONFIG_CMDLINE.
+
+	  Systems with fully functional boot loaders (i.e. non-embedded)
+	  should leave this option set to 'N'.
+
+config CMDLINE
+	string "Built-in kernel command string"
+	depends on CMDLINE_BOOL
+	default ""
+	help
+	  Enter arguments here that should be compiled into the kernel
+	  image and used at boot time. If the boot loader provides a
+	  command line at boot time, it is appended to this string to
+	  form the full kernel command line, when the system boots.
+
+	  However, you can use the CONFIG_CMDLINE_OVERRIDE option to
+	  change this behavior.
+
+	  In most cases, the command line (whether built-in or provided
+	  by the boot loader) should specify the device for the root
+	  file system.
+
+config CMDLINE_OVERRIDE
+	bool "Built-in command line overrides boot loader arguments"
+	default n
+	depends on CMDLINE_BOOL
+	help
+	  Set this option to 'Y' to have the kernel ignore the boot loader
+	  command line, and use ONLY the built-in command line.
+
+	  This is used to work around broken boot loaders. This should
+	  be set to 'N' under normal conditions.
+
 endmenu
 
 config ARCH_ENABLE_MEMORY_HOTPLUG
@@ -1567,6 +1662,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig"
 
 source "drivers/cpuidle/Kconfig"
 
+source "drivers/idle/Kconfig"
+
 endmenu
 
 
@@ -1674,6 +1771,14 @@ config DMAR_FLOPPY_WA
 	  workaround will setup a 1:1 mapping for the first
 	  16M to make floppy (an ISA device) work.
 
+config INTR_REMAP
+	bool "Support for Interrupt Remapping (EXPERIMENTAL)"
+	depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
+	help
+	  Supports Interrupt remapping for IO-APIC and MSI devices.
+	  To use x2apic mode in the CPU's which support x2APIC enhancements or
+	  to support platforms with CPU's having > 8 bit APIC ID, say Y.
+
 source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
@@ -1790,7 +1895,7 @@ config IA32_EMULATION
 
 config IA32_AOUT
 	tristate "IA32 a.out support"
-	depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT
+	depends on IA32_EMULATION
 	help
 	  Support old a.out binaries in the 32bit emulation.
 
@@ -1804,7 +1909,7 @@ config COMPAT_FOR_U64_ALIGNMENT
 
 config SYSVIPC_COMPAT
 	def_bool y
-	depends on X86_64 && COMPAT && SYSVIPC
+	depends on COMPAT && SYSVIPC
 
 endmenu
 
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index abff1b84ed5b..0b7c4a3f0651 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -38,8 +38,7 @@ config M386
 	  - "Crusoe" for the Transmeta Crusoe series.
 	  - "Efficeon" for the Transmeta Efficeon series.
 	  - "Winchip-C6" for original IDT Winchip.
-	  - "Winchip-2" for IDT Winchip 2.
-	  - "Winchip-2A" for IDT Winchips with 3dNow! capabilities.
+	  - "Winchip-2" for IDT Winchips with 3dNow! capabilities.
 	  - "GeodeGX1" for Geode GX1 (Cyrix MediaGX).
 	  - "Geode GX/LX" For AMD Geode GX and LX processors.
 	  - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3.
@@ -194,19 +193,11 @@ config MWINCHIPC6
 	  treat this chip as a 586TSC with some extended instructions
 	  and alignment requirements.
 
-config MWINCHIP2
-	bool "Winchip-2"
-	depends on X86_32
-	help
-	  Select this for an IDT Winchip-2. Linux and GCC
-	  treat this chip as a 586TSC with some extended instructions
-	  and alignment requirements.
-
 config MWINCHIP3D
-	bool "Winchip-2A/Winchip-3"
+	bool "Winchip-2/Winchip-2A/Winchip-3"
 	depends on X86_32
 	help
-	  Select this for an IDT Winchip-2A or 3. Linux and GCC
+	  Select this for an IDT Winchip-2, 2A or 3. Linux and GCC
 	  treat this chip as a 586TSC with some extended instructions
 	  and alignment requirements. Also enable out of order memory
 	  stores for this CPU, which can increase performance of some
@@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC
 	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
-	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
+	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7
 
 config X86_XADD
@@ -360,11 +351,7 @@ config X86_POPAD_OK
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
-
-config X86_GOOD_APIC
-	def_bool y
-	depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
@@ -372,7 +359,7 @@ config X86_INTEL_USERCOPY
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2
 
 config X86_USE_3DNOW
 	def_bool y
@@ -380,24 +367,27 @@ config X86_USE_3DNOW
 
 config X86_OOSTORE
 	def_bool y
-	depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR
+	depends on (MWINCHIP3D || MWINCHIPC6) && MTRR
 
 #
 # P6_NOPs are a relatively minor optimization that require a family >=
 # 6 processor, except that it is broken on certain VIA chips.
 # Furthermore, AMD chips prefer a totally different sequence of NOPs
-# (which work on all CPUs). As a result, disallow these if we're
-# compiling X86_GENERIC but not X86_64 (these NOPs do work on all
-# x86-64 capable chips); the list of processors in the right-hand clause
-# are the cores that benefit from this optimization.
+# (which work on all CPUs). In addition, it looks like Virtual PC
+# does not understand them.
+#
+# As a result, disallow these if we're not compiling for X86_64 (these
+# NOPs do work on all x86-64 capable chips); the list of processors in
+# the right-hand clause are the cores that benefit from this optimization.
 #
 config X86_P6_NOP
 	def_bool y
-	depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC)
+	depends on X86_64
+	depends on (MCORE2 || MPENTIUM4 || MPSC)
 
 config X86_TSC
 	def_bool y
-	depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
+	depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64
 
 config X86_CMPXCHG64
 	def_bool y
@@ -407,7 +397,7 @@ config X86_CMPXCHG64
 # generates cmov.
 config X86_CMOV
 	def_bool y
-	depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64)
+	depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64)
411 401
412config X86_MINIMUM_CPU_FAMILY 402config X86_MINIMUM_CPU_FAMILY
413 int 403 int
@@ -418,4 +408,124 @@ config X86_MINIMUM_CPU_FAMILY
418 408
419config X86_DEBUGCTLMSR 409config X86_DEBUGCTLMSR
420 def_bool y 410 def_bool y
421 depends on !(M586MMX || M586TSC || M586 || M486 || M386) 411 depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386)
412
413menuconfig PROCESSOR_SELECT
414 bool "Supported processor vendors" if EMBEDDED
415 help
416 This lets you choose what x86 vendor support code your kernel
417 will include.
418
419config CPU_SUP_INTEL
420 default y
421 bool "Support Intel processors" if PROCESSOR_SELECT
422 help
423 This enables detection, tunings and quirks for Intel processors
424
425 You need this enabled if you want your kernel to run on an
426 Intel CPU. Disabling this option on other types of CPUs
427 makes the kernel a tiny bit smaller. Disabling it on an Intel
428 CPU might render the kernel unbootable.
429
430 If unsure, say N.
431
432config CPU_SUP_CYRIX_32
433 default y
434 bool "Support Cyrix processors" if PROCESSOR_SELECT
435 depends on !64BIT
436 help
437 This enables detection, tunings and quirks for Cyrix processors
438
439 You need this enabled if you want your kernel to run on a
440 Cyrix CPU. Disabling this option on other types of CPUs
441 makes the kernel a tiny bit smaller. Disabling it on a Cyrix
442 CPU might render the kernel unbootable.
443
444 If unsure, say N.
445
446config CPU_SUP_AMD
447 default y
448 bool "Support AMD processors" if PROCESSOR_SELECT
449 help
450 This enables detection, tunings and quirks for AMD processors
451
452 You need this enabled if you want your kernel to run on an
453 AMD CPU. Disabling this option on other types of CPUs
454 makes the kernel a tiny bit smaller. Disabling it on an AMD
455 CPU might render the kernel unbootable.
456
457 If unsure, say N.
458
459config CPU_SUP_CENTAUR_32
460 default y
461 bool "Support Centaur processors" if PROCESSOR_SELECT
462 depends on !64BIT
463 help
464 This enables detection, tunings and quirks for Centaur processors
465
466 You need this enabled if you want your kernel to run on a
467 Centaur CPU. Disabling this option on other types of CPUs
468 makes the kernel a tiny bit smaller. Disabling it on a Centaur
469 CPU might render the kernel unbootable.
470
471 If unsure, say N.
472
473config CPU_SUP_CENTAUR_64
474 default y
475 bool "Support Centaur processors" if PROCESSOR_SELECT
476 depends on 64BIT
477 help
478 This enables detection, tunings and quirks for Centaur processors
479
480 You need this enabled if you want your kernel to run on a
481 Centaur CPU. Disabling this option on other types of CPUs
482 makes the kernel a tiny bit smaller. Disabling it on a Centaur
483 CPU might render the kernel unbootable.
484
485 If unsure, say N.
486
487config CPU_SUP_TRANSMETA_32
488 default y
489 bool "Support Transmeta processors" if PROCESSOR_SELECT
490 depends on !64BIT
491 help
492 This enables detection, tunings and quirks for Transmeta processors
493
494 You need this enabled if you want your kernel to run on a
495 Transmeta CPU. Disabling this option on other types of CPUs
496 makes the kernel a tiny bit smaller. Disabling it on a Transmeta
497 CPU might render the kernel unbootable.
498
499 If unsure, say N.
500
501config CPU_SUP_UMC_32
502 default y
503 bool "Support UMC processors" if PROCESSOR_SELECT
504 depends on !64BIT
505 help
506 This enables detection, tunings and quirks for UMC processors
507
508 You need this enabled if you want your kernel to run on a
509 UMC CPU. Disabling this option on other types of CPUs
510 makes the kernel a tiny bit smaller. Disabling it on a UMC
511 CPU might render the kernel unbootable.
512
513 If unsure, say N.
514
515config X86_DS
516 bool "Debug Store support"
517 default y
518 help
519 Add support for Debug Store.
520 This allows the kernel to provide a memory buffer to the hardware
521 to store various profiling and tracing events.
522
523config X86_PTRACE_BTS
524 bool "ptrace interface to Branch Trace Store"
525 default y
526 depends on (X86_DS && X86_DEBUGCTLMSR)
527 help
528 Add a ptrace interface to allow collecting an execution trace
529 of the traced task.
530 This collects control flow changes in a (cyclic) buffer and allows
531 debuggers to fill in the gaps and show an execution trace of the debuggee.
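Note on the X86_P6_NOP change in this file: keying the multi-byte P6 NOPs off the 64-bit build (where family >= 6 holds for every x86-64 capable chip, per the comment above) rather than individual 32-bit CPU choices reduces the whole question to the CPUID family. A minimal user-space sketch of that family check, using GCC's <cpuid.h>; this is an illustration, not the kernel's own detection path:

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
    unsigned int eax, ebx, ecx, edx;
    unsigned int family;

    if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
        return 1;

    /* Family is bits 11:8 of CPUID leaf 1 EAX; the extended family
     * (bits 27:20) is added when the base family reads 0xf. */
    family = (eax >> 8) & 0xf;
    if (family == 0xf)
        family += (eax >> 20) & 0xff;

    printf("CPU family %u: P6-style NOPs %s\n", family,
           family >= 6 ? "available in principle" : "not guaranteed");
    return 0;
}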
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index ae36bfa814e5..2a3dfbd5e677 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -5,13 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT
5 5
6source "lib/Kconfig.debug" 6source "lib/Kconfig.debug"
7 7
8config NONPROMISC_DEVMEM 8config STRICT_DEVMEM
9 bool "Filter access to /dev/mem" 9 bool "Filter access to /dev/mem"
10 help 10 help
11 If this option is left off, you allow userspace access to all 11 If this option is disabled, you allow userspace (root) access to all
12 of memory, including kernel and userspace memory. Accidental 12 of memory, including kernel and userspace memory. Accidental
13 access to this is obviously disastrous, but specific access can 13 access to this is obviously disastrous, but specific access can
14 be used by people debugging the kernel. 14 be used by people debugging the kernel. Note that with PAT support
15 enabled, even in this case there are restrictions on /dev/mem
16 use due to the cache aliasing requirements.
15 17
16 If this option is switched on, the /dev/mem file only allows 18 If this option is switched on, the /dev/mem file only allows
17 userspace access to PCI space and the BIOS code and data regions. 19 userspace access to PCI space and the BIOS code and data regions.
@@ -41,6 +43,19 @@ config EARLY_PRINTK
41 with klogd/syslogd or the X server. You should normally N here, 43 with klogd/syslogd or the X server. You should normally N here,
42 unless you want to debug such a crash. 44 unless you want to debug such a crash.
43 45
46config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port"
48 default n
49 depends on EARLY_PRINTK && PCI
50 help
51 Write kernel log output directly into the EHCI debug port.
52
53 This is useful for kernel debugging when your machine crashes very
54 early before the console code is initialized. For normal operation
55 it is not recommended because it looks ugly and doesn't cooperate
 56 with klogd/syslogd or the X server. You should normally say N here,
 57 unless you want to debug such a crash. You need a USB debug device.
58
44config DEBUG_STACKOVERFLOW 59config DEBUG_STACKOVERFLOW
45 bool "Check for stack overflows" 60 bool "Check for stack overflows"
46 depends on DEBUG_KERNEL 61 depends on DEBUG_KERNEL
@@ -287,7 +302,6 @@ config CPA_DEBUG
287 302
288config OPTIMIZE_INLINING 303config OPTIMIZE_INLINING
289 bool "Allow gcc to uninline functions marked 'inline'" 304 bool "Allow gcc to uninline functions marked 'inline'"
290 depends on BROKEN
291 help 305 help
292 This option determines if the kernel forces gcc to inline the functions 306 This option determines if the kernel forces gcc to inline the functions
293 developers have marked 'inline'. Doing so takes away freedom from gcc to 307 developers have marked 'inline'. Doing so takes away freedom from gcc to
@@ -298,5 +312,7 @@ config OPTIMIZE_INLINING
298 become the default in the future, until then this option is there to 312 become the default in the future, until then this option is there to
299 test gcc for this. 313 test gcc for this.
300 314
315 If unsure, say N.
316
301endmenu 317endmenu
302 318
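Note on OPTIMIZE_INLINING above: the option only loosens gcc's handling of functions marked with plain 'inline'; anything that genuinely must be inlined keeps using a forced attribute. A small stand-alone illustration of the difference - the attribute below is what the kernel's __always_inline macro essentially expands to:

#include <stdio.h>

/* gcc may ignore the plain hint once it is allowed to decide for itself;
 * the attribute form is honoured regardless of optimization settings. */
static inline int hint_only(int x)
{
    return x * 2;
}

static inline __attribute__((always_inline)) int forced(int x)
{
    return x + 1;
}

int main(void)
{
    printf("%d\n", hint_only(3) + forced(3));
    return 0;
}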
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 919ce21ea654..d1a47adb5aec 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -110,21 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,)
110mcore-y := arch/x86/mach-default/ 110mcore-y := arch/x86/mach-default/
111 111
112# Voyager subarch support 112# Voyager subarch support
113mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager 113mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager
114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ 114mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/
115 115
116# generic subarchitecture 116# generic subarchitecture
117mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic 117mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic
118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ 118fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/
119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ 119mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/
120 120
121# RDC R-321x subarch support
122mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x
123mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/
124core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/
125
126# default subarch .h files 121# default subarch .h files
127mflags-y += -Iinclude/asm-x86/mach-default 122mflags-y += -Iarch/x86/include/asm/mach-default
128 123
129# 64 bit does not support subarch support - clear sub arch variables 124# 64 bit does not support subarch support - clear sub arch variables
130fcore-$(CONFIG_X86_64) := 125fcore-$(CONFIG_X86_64) :=
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index e372b584e919..80177ec052f0 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon)
28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 28cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 29cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) 30cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586)
31cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586)
32cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) 31cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586)
33cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 32cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0
34cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) 33cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686)
@@ -45,3 +44,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
45# cpu entries 44# cpu entries
46cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 45cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
47 46
47# Bug fix for binutils: this option is required in order to keep
48# binutils from generating NOPL instructions against our will.
49ifneq ($(CONFIG_X86_P6_NOP),y)
50cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,)
51endif
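Note on the new assembler flag: it exists because NOPL (opcode 0f 1f /0, the long NOP recognised on P6-class and newer CPUs) is not implemented by every processor this Makefile still targets, so binutils must not use it for alignment padding. A tiny illustration of the instruction in question; it raises an invalid-opcode fault on CPUs without NOPL, so treat it purely as a demonstration:

int main(void)
{
    /* 0f 1f 00 is "nopl (%eax)" - the multi-byte NOP binutils would
     * otherwise emit when padding code alignment. */
    __asm__ volatile(".byte 0x0f, 0x1f, 0x00");
    return 0;
}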
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index 7ee102f9c4f8..cd48c7210016 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \
72KBUILD_CFLAGS += $(call cc-option,-m32) 72KBUILD_CFLAGS += $(call cc-option,-m32)
73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ 73KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__
74 74
75$(obj)/zImage: IMAGE_OFFSET := 0x1000
76$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) 75$(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK)
77$(obj)/bzImage: IMAGE_OFFSET := 0x100000
78$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ 76$(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__
79$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ 77$(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__
80$(obj)/bzImage: BUILDFLAGS := -b 78$(obj)/bzImage: BUILDFLAGS := -b
@@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE
117 $(call if_changed,objcopy) 115 $(call if_changed,objcopy)
118 116
119$(obj)/compressed/vmlinux: FORCE 117$(obj)/compressed/vmlinux: FORCE
120 $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ 118 $(Q)$(MAKE) $(build)=$(obj)/compressed $@
121 119
122# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel 120# Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel
123FDARGS = 121FDARGS =
@@ -181,6 +179,7 @@ isoimage: $(BOOTIMAGE)
181 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ 179 mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
182 -no-emul-boot -boot-load-size 4 -boot-info-table \ 180 -no-emul-boot -boot-load-size 4 -boot-info-table \
183 $(obj)/isoimage 181 $(obj)/isoimage
182 isohybrid $(obj)/image.iso 2>/dev/null || true
184 rm -rf $(obj)/isoimage 183 rm -rf $(obj)/isoimage
185 184
186zlilo: $(BOOTIMAGE) 185zlilo: $(BOOTIMAGE)
diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h
index a34b9982c7cb..cc0ef13fba7a 100644
--- a/arch/x86/boot/boot.h
+++ b/arch/x86/boot/boot.h
@@ -24,10 +24,14 @@
24#include <linux/edd.h> 24#include <linux/edd.h>
25#include <asm/boot.h> 25#include <asm/boot.h>
26#include <asm/setup.h> 26#include <asm/setup.h>
27#include "bitops.h"
28#include <asm/cpufeature.h>
27 29
28/* Useful macros */ 30/* Useful macros */
29#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) 31#define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
30 32
33#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))
34
31extern struct setup_header hdr; 35extern struct setup_header hdr;
32extern struct boot_params boot_params; 36extern struct boot_params boot_params;
33 37
@@ -242,6 +246,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize);
242int cmdline_find_option_bool(const char *option); 246int cmdline_find_option_bool(const char *option);
243 247
244/* cpu.c, cpucheck.c */ 248/* cpu.c, cpucheck.c */
249struct cpu_features {
250 int level; /* Family, or 64 for x86-64 */
251 int model;
252 u32 flags[NCAPINTS];
253};
254extern struct cpu_features cpu;
245int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); 255int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr);
246int validate_cpu(void); 256int validate_cpu(void);
247 257
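Note on the boot.h change: moving struct cpu_features (and the shared 'cpu' variable) out of cpucheck.c lets other setup files consult the detected family and feature words, as the query_ist() change further down does. A stand-alone sketch of that layout; only the struct shape and the (word, bit) addressing mirror the diff, the helper and sample values are illustrative, and NCAPINTS matching the eight-entry req_flags[] table is assumed:

#include <stdio.h>
#include <stdint.h>

#define NCAPINTS 8          /* number of 32-bit capability words (assumed) */
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x)))

struct cpu_features {
    int level;              /* family, or 64 for x86-64 */
    int model;
    uint32_t flags[NCAPINTS];
};

/* Test "bit j of word i" - the same (word, bit) addressing the boot code
 * and the new mkcpustr.c string table use. */
static int has_feature(const struct cpu_features *c, int word, int bit)
{
    return (c->flags[word] >> bit) & 1;
}

int main(void)
{
    struct cpu_features cpu = { .level = 6, .flags = { [0] = 1u << 4 } };

    printf("%zu capability words\n", ARRAY_SIZE(cpu.flags));
    printf("word 0, bit 4 set: %d\n", has_feature(&cpu, 0, 4));
    return 0;
}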
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 92fdd35bd93e..1771c804e02f 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE
27 $(call if_changed,objcopy) 27 $(call if_changed,objcopy)
28 28
29 29
30ifeq ($(CONFIG_X86_32),y) 30targets += vmlinux.bin.all vmlinux.relocs relocs
31targets += vmlinux.bin.all vmlinux.relocs 31hostprogs-$(CONFIG_X86_32) += relocs
32hostprogs-y := relocs
33 32
34quiet_cmd_relocs = RELOCS $@ 33quiet_cmd_relocs = RELOCS $@
35 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< 34 cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@
43$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE 42$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
44 $(call if_changed,relocbin) 43 $(call if_changed,relocbin)
45 44
45ifeq ($(CONFIG_X86_32),y)
46
46ifdef CONFIG_RELOCATABLE 47ifdef CONFIG_RELOCATABLE
47$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE 48$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
48 $(call if_changed,gzip) 49 $(call if_changed,gzip)
@@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
59LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T 60LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
60endif 61endif
61 62
62
63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE 63$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE
64 $(call if_changed,ld) 64 $(call if_changed,ld)
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index ba7736cf2ec7..29c5fbf08392 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -137,14 +137,15 @@ relocated:
137 */ 137 */
138 movl output_len(%ebx), %eax 138 movl output_len(%ebx), %eax
139 pushl %eax 139 pushl %eax
140 # push arguments for decompress_kernel:
140 pushl %ebp # output address 141 pushl %ebp # output address
141 movl input_len(%ebx), %eax 142 movl input_len(%ebx), %eax
142 pushl %eax # input_len 143 pushl %eax # input_len
143 leal input_data(%ebx), %eax 144 leal input_data(%ebx), %eax
144 pushl %eax # input_data 145 pushl %eax # input_data
145 leal boot_heap(%ebx), %eax 146 leal boot_heap(%ebx), %eax
146 pushl %eax # heap area as third argument 147 pushl %eax # heap area
147 pushl %esi # real mode pointer as second arg 148 pushl %esi # real mode pointer
148 call decompress_kernel 149 call decompress_kernel
149 addl $20, %esp 150 addl $20, %esp
150 popl %ecx 151 popl %ecx
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index bc5553b496f7..da062216948a 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -16,7 +16,7 @@
16 */ 16 */
17#undef CONFIG_PARAVIRT 17#undef CONFIG_PARAVIRT
18#ifdef CONFIG_X86_32 18#ifdef CONFIG_X86_32
19#define _ASM_DESC_H_ 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64 22#ifdef CONFIG_X86_64
@@ -27,7 +27,7 @@
27#include <linux/linkage.h> 27#include <linux/linkage.h>
28#include <linux/screen_info.h> 28#include <linux/screen_info.h>
29#include <linux/elf.h> 29#include <linux/elf.h>
30#include <asm/io.h> 30#include <linux/io.h>
31#include <asm/page.h> 31#include <asm/page.h>
32#include <asm/boot.h> 32#include <asm/boot.h>
33#include <asm/bootparam.h> 33#include <asm/bootparam.h>
@@ -182,8 +182,6 @@ static unsigned outcnt;
182static int fill_inbuf(void); 182static int fill_inbuf(void);
183static void flush_window(void); 183static void flush_window(void);
184static void error(char *m); 184static void error(char *m);
185static void gzip_mark(void **);
186static void gzip_release(void **);
187 185
188/* 186/*
189 * This is set up by the setup-routine at boot-time 187 * This is set up by the setup-routine at boot-time
@@ -196,9 +194,6 @@ extern int input_len;
196 194
197static long bytes_out; 195static long bytes_out;
198 196
199static void *malloc(int size);
200static void free(void *where);
201
202static void *memset(void *s, int c, unsigned n); 197static void *memset(void *s, int c, unsigned n);
203static void *memcpy(void *dest, const void *src, unsigned n); 198static void *memcpy(void *dest, const void *src, unsigned n);
204 199
@@ -220,40 +215,6 @@ static int lines, cols;
220 215
221#include "../../../../lib/inflate.c" 216#include "../../../../lib/inflate.c"
222 217
223static void *malloc(int size)
224{
225 void *p;
226
227 if (size < 0)
228 error("Malloc error");
229 if (free_mem_ptr <= 0)
230 error("Memory error");
231
232 free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */
233
234 p = (void *)free_mem_ptr;
235 free_mem_ptr += size;
236
237 if (free_mem_ptr >= free_mem_end_ptr)
238 error("Out of memory");
239
240 return p;
241}
242
243static void free(void *where)
244{ /* Don't care */
245}
246
247static void gzip_mark(void **ptr)
248{
249 *ptr = (void *) free_mem_ptr;
250}
251
252static void gzip_release(void **ptr)
253{
254 free_mem_ptr = (memptr) *ptr;
255}
256
257static void scroll(void) 218static void scroll(void)
258{ 219{
259 int i; 220 int i;
@@ -290,7 +251,7 @@ static void __putstr(int error, const char *s)
290 y--; 251 y--;
291 } 252 }
292 } else { 253 } else {
293 vidmem [(x + cols * y) * 2] = c; 254 vidmem[(x + cols * y) * 2] = c;
294 if (++x >= cols) { 255 if (++x >= cols) {
295 x = 0; 256 x = 0;
296 if (++y >= lines) { 257 if (++y >= lines) {
@@ -316,7 +277,8 @@ static void *memset(void *s, int c, unsigned n)
316 int i; 277 int i;
317 char *ss = s; 278 char *ss = s;
318 279
319 for (i = 0; i < n; i++) ss[i] = c; 280 for (i = 0; i < n; i++)
281 ss[i] = c;
320 return s; 282 return s;
321} 283}
322 284
@@ -326,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n)
326 const char *s = src; 288 const char *s = src;
327 char *d = dest; 289 char *d = dest;
328 290
329 for (i = 0; i < n; i++) d[i] = s[i]; 291 for (i = 0; i < n; i++)
292 d[i] = s[i];
330 return dest; 293 return dest;
331} 294}
332 295
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index a1310c52fc0c..857e492c571e 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
492 continue; 492 continue;
493 } 493 }
494 sh_symtab = sec_symtab->symtab; 494 sh_symtab = sec_symtab->symtab;
495 sym_strtab = sec->link->strtab; 495 sym_strtab = sec_symtab->link->strtab;
496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { 496 for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) {
497 Elf32_Rel *rel; 497 Elf32_Rel *rel;
498 Elf32_Sym *sym; 498 Elf32_Sym *sym;
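Note on the one-line relocs.c fix: it matters because of how ELF chains its tables - a relocation section's sh_link names the symbol table, and that symbol table's sh_link in turn names the string table holding the symbol names. Resolving names through the relocation section's own link (the old code) hands back the symbol table itself. A self-contained sketch of the correct two-step lookup, using a synthetic section-header table rather than a real vmlinux:

#include <elf.h>
#include <stdio.h>

static const Elf32_Shdr *sym_strtab_for_rel(const Elf32_Shdr *shdr,
                                            unsigned int shnum,
                                            unsigned int rel_idx)
{
    const Elf32_Shdr *rel = &shdr[rel_idx];
    const Elf32_Shdr *symtab;

    if (rel->sh_type != SHT_REL || rel->sh_link >= shnum)
        return NULL;
    symtab = &shdr[rel->sh_link];       /* .rel.* -> .symtab */
    if (symtab->sh_link >= shnum)
        return NULL;
    return &shdr[symtab->sh_link];      /* .symtab -> .strtab */
}

int main(void)
{
    /* Synthetic table: [1] .strtab, [2] .symtab -> 1, [3] .rel.text -> 2 */
    Elf32_Shdr shdr[4] = { 0 };

    shdr[1].sh_type = SHT_STRTAB;
    shdr[2].sh_type = SHT_SYMTAB;
    shdr[2].sh_link = 1;
    shdr[3].sh_type = SHT_REL;
    shdr[3].sh_link = 2;

    printf("string table index: %ld\n",
           (long)(sym_strtab_for_rel(shdr, 4, 3) - shdr));  /* prints 1 */
    return 0;
}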
diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c
index 92d6fd73dc7d..6ec6bb6e9957 100644
--- a/arch/x86/boot/cpu.c
+++ b/arch/x86/boot/cpu.c
@@ -16,9 +16,6 @@
16 */ 16 */
17 17
18#include "boot.h" 18#include "boot.h"
19#include "bitops.h"
20#include <asm/cpufeature.h>
21
22#include "cpustr.h" 19#include "cpustr.h"
23 20
24static char *cpu_name(int level) 21static char *cpu_name(int level)
@@ -62,17 +59,18 @@ int validate_cpu(void)
62 u32 e = err_flags[i]; 59 u32 e = err_flags[i];
63 60
64 for (j = 0; j < 32; j++) { 61 for (j = 0; j < 32; j++) {
65 int n = (i << 5)+j; 62 if (msg_strs[0] < i ||
66 if (*msg_strs < n) { 63 (msg_strs[0] == i && msg_strs[1] < j)) {
67 /* Skip to the next string */ 64 /* Skip to the next string */
68 do { 65 msg_strs += 2;
69 msg_strs++; 66 while (*msg_strs++)
70 } while (*msg_strs); 67 ;
71 msg_strs++;
72 } 68 }
73 if (e & 1) { 69 if (e & 1) {
74 if (*msg_strs == n && msg_strs[1]) 70 if (msg_strs[0] == i &&
75 printf("%s ", msg_strs+1); 71 msg_strs[1] == j &&
72 msg_strs[2])
73 printf("%s ", msg_strs+2);
76 else 74 else
77 printf("%d:%d ", i, j); 75 printf("%d:%d ", i, j);
78 } 76 }
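Note on the rewritten loop in validate_cpu(): it matches the new mkcpustr.c output, where every flag name is prefixed with two bytes - capability word index and bit index - and entries are emitted in sorted order so the scanner can skip forward cheaply. A minimal decoder over a toy table in that format (the three sample names are stand-ins for the generated x86_cap_strs[]):

#include <stdio.h>

/* Each entry: "\x<word>\x<bit>" followed by the NUL-terminated name,
 * entries sorted by (word, bit). */
static const char cap_strs[] =
    "\x00\x00" "fpu\0"
    "\x00\x04" "tsc\0"
    "\x00\x19" "sse\0";

static const char *cap_name(int word, int bit)
{
    const char *p = cap_strs;

    while (p < cap_strs + sizeof(cap_strs) - 1) {
        int w = p[0], b = p[1];

        if (w == word && b == bit)
            return p + 2;
        if (w > word || (w == word && b > bit))
            break;                  /* sorted table: no match possible */
        p += 2;                     /* skip the two-byte key ... */
        while (*p++)                /* ... and the name plus its NUL */
            ;
    }
    return NULL;
}

int main(void)
{
    const char *n = cap_name(0, 4);

    printf("0:4 -> %s\n", n ? n : "(unknown)");
    return 0;
}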
diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c
index 7804389ee005..4d3ff037201f 100644
--- a/arch/x86/boot/cpucheck.c
+++ b/arch/x86/boot/cpucheck.c
@@ -22,21 +22,13 @@
22 22
23#ifdef _SETUP 23#ifdef _SETUP
24# include "boot.h" 24# include "boot.h"
25# include "bitops.h"
26#endif 25#endif
27#include <linux/types.h> 26#include <linux/types.h>
28#include <asm/cpufeature.h>
29#include <asm/processor-flags.h> 27#include <asm/processor-flags.h>
30#include <asm/required-features.h> 28#include <asm/required-features.h>
31#include <asm/msr-index.h> 29#include <asm/msr-index.h>
32 30
33struct cpu_features { 31struct cpu_features cpu;
34 int level; /* Family, or 64 for x86-64 */
35 int model;
36 u32 flags[NCAPINTS];
37};
38
39static struct cpu_features cpu;
40static u32 cpu_vendor[3]; 32static u32 cpu_vendor[3];
41static u32 err_flags[NCAPINTS]; 33static u32 err_flags[NCAPINTS];
42 34
@@ -46,12 +38,12 @@ static const u32 req_flags[NCAPINTS] =
46{ 38{
47 REQUIRED_MASK0, 39 REQUIRED_MASK0,
48 REQUIRED_MASK1, 40 REQUIRED_MASK1,
49 REQUIRED_MASK2, 41 0, /* REQUIRED_MASK2 not implemented in this file */
50 REQUIRED_MASK3, 42 0, /* REQUIRED_MASK3 not implemented in this file */
51 REQUIRED_MASK4, 43 REQUIRED_MASK4,
52 REQUIRED_MASK5, 44 0, /* REQUIRED_MASK5 not implemented in this file */
53 REQUIRED_MASK6, 45 REQUIRED_MASK6,
54 REQUIRED_MASK7, 46 0, /* REQUIRED_MASK7 not implemented in this file */
55}; 47};
56 48
57#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) 49#define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a))
diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c
index 03399d64013b..1aae8f3e5ca1 100644
--- a/arch/x86/boot/edd.c
+++ b/arch/x86/boot/edd.c
@@ -41,6 +41,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
41 char *mbrbuf_ptr, *mbrbuf_end; 41 char *mbrbuf_ptr, *mbrbuf_end;
42 u32 buf_base, mbr_base; 42 u32 buf_base, mbr_base;
43 extern char _end[]; 43 extern char _end[];
44 u16 mbr_magic;
44 45
45 sector_size = ei->params.bytes_per_sector; 46 sector_size = ei->params.bytes_per_sector;
46 if (!sector_size) 47 if (!sector_size)
@@ -58,11 +59,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
58 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) 59 if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr)
59 return -1; 60 return -1;
60 61
62 memset(mbrbuf_ptr, 0, sector_size);
61 if (read_mbr(devno, mbrbuf_ptr)) 63 if (read_mbr(devno, mbrbuf_ptr))
62 return -1; 64 return -1;
63 65
64 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; 66 *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET];
65 return 0; 67 mbr_magic = *(u16 *)&mbrbuf_ptr[510];
68
69 /* check for valid MBR magic */
70 return mbr_magic == 0xAA55 ? 0 : -1;
66} 71}
67 72
68static int get_edd_info(u8 devno, struct edd_info *ei) 73static int get_edd_info(u8 devno, struct edd_info *ei)
@@ -167,9 +172,8 @@ void query_edd(void)
167 * Scan the BIOS-supported hard disks and query EDD 172 * Scan the BIOS-supported hard disks and query EDD
168 * information... 173 * information...
169 */ 174 */
170 get_edd_info(devno, &ei); 175 if (!get_edd_info(devno, &ei)
171 176 && boot_params.eddbuf_entries < EDDMAXNR) {
172 if (boot_params.eddbuf_entries < EDDMAXNR) {
173 memcpy(edp, &ei, sizeof ei); 177 memcpy(edp, &ei, sizeof ei);
174 edp++; 178 edp++;
175 boot_params.eddbuf_entries++; 179 boot_params.eddbuf_entries++;
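Note on the extra check in read_mbr_sig(): the 32-bit disk signature is only trusted if the sector really looks like an MBR, i.e. it carries the 0x55 0xAA magic at offset 510; the signature itself sits at EDD_MBR_SIG_OFFSET (440). A user-space sketch of the same validation over an in-memory sector; the offsets are the standard MBR layout, and the 0xAA55 comparison assumes a little-endian host like x86:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define SECTOR_SIZE     512
#define MBR_SIG_OFFSET  440     /* 32-bit disk signature */
#define MBR_MAGIC_OFF   510     /* 0x55 0xAA boot-record magic */

static int mbr_sig(const uint8_t *sector, uint32_t *sig)
{
    uint16_t magic;

    memcpy(&magic, sector + MBR_MAGIC_OFF, sizeof(magic));
    if (magic != 0xAA55)        /* bytes 0x55, 0xAA read on a LE host */
        return -1;

    memcpy(sig, sector + MBR_SIG_OFFSET, sizeof(*sig));
    return 0;
}

int main(void)
{
    uint8_t sector[SECTOR_SIZE] = { 0 };
    uint32_t sig;

    sector[510] = 0x55; sector[511] = 0xAA;
    sector[440] = 0x78; sector[441] = 0x56;
    sector[442] = 0x34; sector[443] = 0x12;

    if (mbr_sig(sector, &sig) == 0)
        printf("disk signature 0x%08x\n", sig);   /* 0x12345678 on LE */
    return 0;
}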
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index af86e431acfa..b993062e9a5f 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */
30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ 30SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */
31 /* to be loaded */ 31 /* to be loaded */
32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ 32ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */
33SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */
34 33
35#ifndef SVGA_MODE 34#ifndef SVGA_MODE
36#define SVGA_MODE ASK_VGA 35#define SVGA_MODE ASK_VGA
diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c
index 2296164b54d2..197421db1af1 100644
--- a/arch/x86/boot/main.c
+++ b/arch/x86/boot/main.c
@@ -73,6 +73,11 @@ static void keyboard_set_repeat(void)
73 */ 73 */
74static void query_ist(void) 74static void query_ist(void)
75{ 75{
76 /* Some older BIOSes apparently crash on this call, so filter
77 it from machines too old to have SpeedStep at all. */
78 if (cpu.level < 6)
79 return;
80
76 asm("int $0x15" 81 asm("int $0x15"
77 : "=a" (boot_params.ist_info.signature), 82 : "=a" (boot_params.ist_info.signature),
78 "=b" (boot_params.ist_info.command), 83 "=b" (boot_params.ist_info.command),
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 53165c97336b..8c3c25f35578 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -13,7 +13,6 @@
13 */ 13 */
14 14
15#include "boot.h" 15#include "boot.h"
16#include <linux/kernel.h>
17 16
18#define SMAP 0x534d4150 /* ASCII "SMAP" */ 17#define SMAP 0x534d4150 /* ASCII "SMAP" */
19 18
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index bbe76953bae9..8ef60f20b371 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -15,33 +15,33 @@
15 15
16#include <stdio.h> 16#include <stdio.h>
17 17
18#include "../kernel/cpu/feature_names.c" 18#include "../kernel/cpu/capflags.c"
19
20#if NCAPFLAGS > 8
21# error "Need to adjust the boot code handling of CPUID strings"
22#endif
23 19
24int main(void) 20int main(void)
25{ 21{
26 int i; 22 int i, j;
27 const char *str; 23 const char *str;
28 24
29 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] = \n");
30 26
31 for (i = 0; i < NCAPINTS*32; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
32 str = x86_cap_flags[i]; 28 for (j = 0; j < 32; j++) {
33 29 str = x86_cap_flags[i*32+j];
34 if (i == NCAPINTS*32-1) { 30
35 /* The last entry must be unconditional; this 31 if (i == NCAPINTS-1 && j == 31) {
36 also consumes the compiler-added null character */ 32 /* The last entry must be unconditional; this
37 if (!str) 33 also consumes the compiler-added null
38 str = ""; 34 character */
39 printf("\t\"\\x%02x\"\"%s\"\n", i, str); 35 if (!str)
40 } else if (str) { 36 str = "";
41 printf("#if REQUIRED_MASK%d & (1 << %d)\n" 37 printf("\t\"\\x%02x\\x%02x\"\"%s\"\n",
42 "\t\"\\x%02x\"\"%s\\0\"\n" 38 i, j, str);
43 "#endif\n", 39 } else if (str) {
44 i >> 5, i & 31, i, str); 40 printf("#if REQUIRED_MASK%d & (1 << %d)\n"
41 "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n"
42 "#endif\n",
43 i, j, i, j, str);
44 }
45 } 45 }
46 } 46 }
47 printf("\t;\n"); 47 printf("\t;\n");
diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c
index 328956fdb59e..85a1cd8a8ff8 100644
--- a/arch/x86/boot/pm.c
+++ b/arch/x86/boot/pm.c
@@ -98,12 +98,6 @@ static void reset_coprocessor(void)
98/* 98/*
99 * Set up the GDT 99 * Set up the GDT
100 */ 100 */
101#define GDT_ENTRY(flags, base, limit) \
102 (((u64)(base & 0xff000000) << 32) | \
103 ((u64)flags << 40) | \
104 ((u64)(limit & 0x00ff0000) << 32) | \
105 ((u64)(base & 0x00ffffff) << 16) | \
106 ((u64)(limit & 0x0000ffff)))
107 101
108struct gdt_ptr { 102struct gdt_ptr {
109 u16 len; 103 u16 len;
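Note on the GDT_ENTRY() macro removed here (presumably picked up from a shared header now): it shows how a segment descriptor scatters the base, limit and flags fields across a 64-bit word. A quick stand-alone check of that packing, using the flat 4 GiB code segment as the worked example:

#include <stdint.h>
#include <stdio.h>

/* Same packing as the macro deleted above. */
#define GDT_ENTRY(flags, base, limit)                   \
    ((((uint64_t)(base) & 0xff000000) << 32) |          \
     ((uint64_t)(flags) << 40) |                        \
     (((uint64_t)(limit) & 0x00ff0000) << 32) |         \
     (((uint64_t)(base) & 0x00ffffff) << 16) |          \
     ((uint64_t)(limit) & 0x0000ffff))

int main(void)
{
    /* Flat 32-bit code segment: base 0, limit 0xfffff pages,
     * flags 0xc09b = present, DPL 0, code read/exec/accessed,
     * 32-bit, 4 KiB granularity. */
    uint64_t desc = GDT_ENTRY(0xc09b, 0, 0xfffff);

    printf("descriptor = 0x%016llx\n", (unsigned long long)desc);
    /* Prints 0x00cf9b000000ffff - the textbook flat code descriptor. */
    return 0;
}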
diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c
index 49f26aaaebc8..3fa979c9c363 100644
--- a/arch/x86/boot/video-bios.c
+++ b/arch/x86/boot/video-bios.c
@@ -17,7 +17,7 @@
17#include "boot.h" 17#include "boot.h"
18#include "video.h" 18#include "video.h"
19 19
20__videocard video_bios; 20static __videocard video_bios;
21 21
22/* Set a conventional BIOS mode */ 22/* Set a conventional BIOS mode */
23static int set_bios_mode(u8 mode); 23static int set_bios_mode(u8 mode);
@@ -119,7 +119,7 @@ static int bios_probe(void)
119 return nmodes; 119 return nmodes;
120} 120}
121 121
122__videocard video_bios = 122static __videocard video_bios =
123{ 123{
124 .card_name = "BIOS", 124 .card_name = "BIOS",
125 .probe = bios_probe, 125 .probe = bios_probe,
diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c
index 401ad998ad08..75115849af33 100644
--- a/arch/x86/boot/video-vesa.c
+++ b/arch/x86/boot/video-vesa.c
@@ -20,7 +20,7 @@
20static struct vesa_general_info vginfo; 20static struct vesa_general_info vginfo;
21static struct vesa_mode_info vminfo; 21static struct vesa_mode_info vminfo;
22 22
23__videocard video_vesa; 23static __videocard video_vesa;
24 24
25#ifndef _WAKEUP 25#ifndef _WAKEUP
26static void vesa_store_mode_params_graphics(void); 26static void vesa_store_mode_params_graphics(void);
@@ -88,14 +88,11 @@ static int vesa_probe(void)
88 (vminfo.memory_layout == 4 || 88 (vminfo.memory_layout == 4 ||
89 vminfo.memory_layout == 6) && 89 vminfo.memory_layout == 6) &&
90 vminfo.memory_planes == 1) { 90 vminfo.memory_planes == 1) {
91#ifdef CONFIG_FB 91#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
92 /* Graphics mode, color, linear frame buffer 92 /* Graphics mode, color, linear frame buffer
93 supported. Only register the mode if 93 supported. Only register the mode if
94 if framebuffer is configured, however, 94 if framebuffer is configured, however,
95 otherwise the user will be left without a screen. 95 otherwise the user will be left without a screen. */
96 We don't require CONFIG_FB_VESA, however, since
97 some of the other framebuffer drivers can use
98 this mode-setting, too. */
99 mi = GET_HEAP(struct mode_info, 1); 96 mi = GET_HEAP(struct mode_info, 1);
100 mi->mode = mode + VIDEO_FIRST_VESA; 97 mi->mode = mode + VIDEO_FIRST_VESA;
101 mi->depth = vminfo.bpp; 98 mi->depth = vminfo.bpp;
@@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode)
133 if ((vminfo.mode_attr & 0x15) == 0x05) { 130 if ((vminfo.mode_attr & 0x15) == 0x05) {
134 /* It's a supported text mode */ 131 /* It's a supported text mode */
135 is_graphic = 0; 132 is_graphic = 0;
133#ifdef CONFIG_FB_BOOT_VESA_SUPPORT
136 } else if ((vminfo.mode_attr & 0x99) == 0x99) { 134 } else if ((vminfo.mode_attr & 0x99) == 0x99) {
137 /* It's a graphics mode with linear frame buffer */ 135 /* It's a graphics mode with linear frame buffer */
138 is_graphic = 1; 136 is_graphic = 1;
139 vesa_mode |= 0x4000; /* Request linear frame buffer */ 137 vesa_mode |= 0x4000; /* Request linear frame buffer */
138#endif
140 } else { 139 } else {
141 return -1; /* Invalid mode */ 140 return -1; /* Invalid mode */
142 } 141 }
@@ -224,7 +223,7 @@ static void vesa_store_pm_info(void)
224static void vesa_store_mode_params_graphics(void) 223static void vesa_store_mode_params_graphics(void)
225{ 224{
226 /* Tell the kernel we're in VESA graphics mode */ 225 /* Tell the kernel we're in VESA graphics mode */
227 boot_params.screen_info.orig_video_isVGA = 0x23; 226 boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB;
228 227
229 /* Mode parameters */ 228 /* Mode parameters */
230 boot_params.screen_info.vesa_attributes = vminfo.mode_attr; 229 boot_params.screen_info.vesa_attributes = vminfo.mode_attr;
@@ -294,7 +293,7 @@ void vesa_store_edid(void)
294 293
295#endif /* not _WAKEUP */ 294#endif /* not _WAKEUP */
296 295
297__videocard video_vesa = 296static __videocard video_vesa =
298{ 297{
299 .card_name = "VESA", 298 .card_name = "VESA",
300 .probe = vesa_probe, 299 .probe = vesa_probe,
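Note on the two mode_attr tests in this file: they read the VBE ModeAttributes word, where (attr & 0x15) == 0x05 accepts modes that are supported, have BIOS TTY output and are text modes, while (attr & 0x99) == 0x99 accepts supported colour graphics modes with a linear frame buffer, which is then requested by setting bit 14 (0x4000) of the mode number. A small classifier over those bits, assuming the standard VBE attribute bit assignments:

#include <stdio.h>

#define VBE_ATTR_SUPPORTED  0x01    /* bit 0: mode supported by hardware */
#define VBE_ATTR_TTY        0x04    /* bit 2: BIOS TTY output works */
#define VBE_ATTR_COLOR      0x08    /* bit 3: colour mode */
#define VBE_ATTR_GRAPHICS   0x10    /* bit 4: graphics (clear = text) */
#define VBE_ATTR_LFB        0x80    /* bit 7: linear frame buffer available */

static const char *classify(unsigned int attr)
{
    /* the (attr & 0x15) == 0x05 test in vesa_set_mode() */
    if ((attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_TTY | VBE_ATTR_GRAPHICS)) ==
        (VBE_ATTR_SUPPORTED | VBE_ATTR_TTY))
        return "supported text mode";

    /* the (attr & 0x99) == 0x99 test; set the mode with bit 0x4000 */
    if ((attr & (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
                 VBE_ATTR_GRAPHICS | VBE_ATTR_LFB)) ==
        (VBE_ATTR_SUPPORTED | VBE_ATTR_COLOR |
         VBE_ATTR_GRAPHICS | VBE_ATTR_LFB))
        return "supported graphics mode with linear frame buffer";

    return "skipped";
}

int main(void)
{
    unsigned int samples[] = { 0x0007, 0x009b, 0x001b };
    unsigned int i;

    for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
        printf("mode_attr 0x%04x: %s\n", samples[i], classify(samples[i]));
    return 0;
}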
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 9bc34e2033ec..13b8c86ae985 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -1,13 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.26-rc1 3# Linux kernel version: 2.6.27-rc5
4# Sun May 4 19:59:02 2008 4# Wed Sep 3 17:23:09 2008
5# 5#
6# CONFIG_64BIT is not set 6# CONFIG_64BIT is not set
7CONFIG_X86_32=y 7CONFIG_X86_32=y
8# CONFIG_X86_64 is not set 8# CONFIG_X86_64 is not set
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set 11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
55CONFIG_KTIME_SCALAR=y 55CONFIG_KTIME_SCALAR=y
56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
56 57
57# 58#
58# General setup 59# General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
82CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set 84# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y 85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
85CONFIG_GROUP_SCHED=y 87CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y 88CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set 89# CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
105# CONFIG_EMBEDDED is not set 107# CONFIG_EMBEDDED is not set
106CONFIG_UID16=y 108CONFIG_UID16=y
107CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
109CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
110CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
111CONFIG_KALLSYMS_EXTRA_PASS=y 112CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
113CONFIG_PRINTK=y 114CONFIG_PRINTK=y
114CONFIG_BUG=y 115CONFIG_BUG=y
115CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
116# CONFIG_COMPAT_BRK is not set 118# CONFIG_COMPAT_BRK is not set
117CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
118CONFIG_FUTEX=y 120CONFIG_FUTEX=y
@@ -132,27 +134,35 @@ CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set 134# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y 135CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y 136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
135CONFIG_KRETPROBES=y 138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
136CONFIG_HAVE_KPROBES=y 140CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y 141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
138# CONFIG_HAVE_DMA_ATTRS is not set 143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
139CONFIG_PROC_PAGE_MONITOR=y 146CONFIG_PROC_PAGE_MONITOR=y
147CONFIG_HAVE_GENERIC_DMA_COHERENT=y
140CONFIG_SLABINFO=y 148CONFIG_SLABINFO=y
141CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
142# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
143CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
144CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
145CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
146CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
147# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
148# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
149# CONFIG_KMOD is not set 158CONFIG_KMOD=y
150CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
151CONFIG_BLOCK=y 160CONFIG_BLOCK=y
152# CONFIG_LBD is not set 161# CONFIG_LBD is not set
153CONFIG_BLK_DEV_IO_TRACE=y 162CONFIG_BLK_DEV_IO_TRACE=y
154# CONFIG_LSF is not set 163# CONFIG_LSF is not set
155CONFIG_BLK_DEV_BSG=y 164CONFIG_BLK_DEV_BSG=y
165# CONFIG_BLK_DEV_INTEGRITY is not set
156 166
157# 167#
158# IO Schedulers 168# IO Schedulers
@@ -176,25 +186,23 @@ CONFIG_NO_HZ=y
176CONFIG_HIGH_RES_TIMERS=y 186CONFIG_HIGH_RES_TIMERS=y
177CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 187CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
178CONFIG_SMP=y 188CONFIG_SMP=y
189CONFIG_X86_FIND_SMP_CONFIG=y
190CONFIG_X86_MPPARSE=y
179CONFIG_X86_PC=y 191CONFIG_X86_PC=y
180# CONFIG_X86_ELAN is not set 192# CONFIG_X86_ELAN is not set
181# CONFIG_X86_VOYAGER is not set 193# CONFIG_X86_VOYAGER is not set
182# CONFIG_X86_NUMAQ is not set
183# CONFIG_X86_SUMMIT is not set
184# CONFIG_X86_BIGSMP is not set
185# CONFIG_X86_VISWS is not set
186# CONFIG_X86_GENERICARCH is not set 194# CONFIG_X86_GENERICARCH is not set
187# CONFIG_X86_ES7000 is not set
188# CONFIG_X86_RDC321X is not set
189# CONFIG_X86_VSMP is not set 195# CONFIG_X86_VSMP is not set
196# CONFIG_X86_RDC321X is not set
190CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y 197CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
191# CONFIG_PARAVIRT_GUEST is not set 198# CONFIG_PARAVIRT_GUEST is not set
199# CONFIG_MEMTEST is not set
192# CONFIG_M386 is not set 200# CONFIG_M386 is not set
193# CONFIG_M486 is not set 201# CONFIG_M486 is not set
194# CONFIG_M586 is not set 202# CONFIG_M586 is not set
195# CONFIG_M586TSC is not set 203# CONFIG_M586TSC is not set
196# CONFIG_M586MMX is not set 204# CONFIG_M586MMX is not set
197# CONFIG_M686 is not set 205CONFIG_M686=y
198# CONFIG_MPENTIUMII is not set 206# CONFIG_MPENTIUMII is not set
199# CONFIG_MPENTIUMIII is not set 207# CONFIG_MPENTIUMIII is not set
200# CONFIG_MPENTIUMM is not set 208# CONFIG_MPENTIUMM is not set
@@ -205,7 +213,6 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
205# CONFIG_MCRUSOE is not set 213# CONFIG_MCRUSOE is not set
206# CONFIG_MEFFICEON is not set 214# CONFIG_MEFFICEON is not set
207# CONFIG_MWINCHIPC6 is not set 215# CONFIG_MWINCHIPC6 is not set
208# CONFIG_MWINCHIP2 is not set
209# CONFIG_MWINCHIP3D is not set 216# CONFIG_MWINCHIP3D is not set
210# CONFIG_MGEODEGX1 is not set 217# CONFIG_MGEODEGX1 is not set
211# CONFIG_MGEODE_LX is not set 218# CONFIG_MGEODE_LX is not set
@@ -213,30 +220,30 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y
213# CONFIG_MVIAC3_2 is not set 220# CONFIG_MVIAC3_2 is not set
214# CONFIG_MVIAC7 is not set 221# CONFIG_MVIAC7 is not set
215# CONFIG_MPSC is not set 222# CONFIG_MPSC is not set
216CONFIG_MCORE2=y 223# CONFIG_MCORE2 is not set
217# CONFIG_GENERIC_CPU is not set 224# CONFIG_GENERIC_CPU is not set
218# CONFIG_X86_GENERIC is not set 225CONFIG_X86_GENERIC=y
219CONFIG_X86_CPU=y 226CONFIG_X86_CPU=y
220CONFIG_X86_CMPXCHG=y 227CONFIG_X86_CMPXCHG=y
221CONFIG_X86_L1_CACHE_SHIFT=6 228CONFIG_X86_L1_CACHE_SHIFT=7
222CONFIG_X86_XADD=y 229CONFIG_X86_XADD=y
230# CONFIG_X86_PPRO_FENCE is not set
223CONFIG_X86_WP_WORKS_OK=y 231CONFIG_X86_WP_WORKS_OK=y
224CONFIG_X86_INVLPG=y 232CONFIG_X86_INVLPG=y
225CONFIG_X86_BSWAP=y 233CONFIG_X86_BSWAP=y
226CONFIG_X86_POPAD_OK=y 234CONFIG_X86_POPAD_OK=y
227CONFIG_X86_GOOD_APIC=y
228CONFIG_X86_INTEL_USERCOPY=y 235CONFIG_X86_INTEL_USERCOPY=y
229CONFIG_X86_USE_PPRO_CHECKSUM=y 236CONFIG_X86_USE_PPRO_CHECKSUM=y
230CONFIG_X86_P6_NOP=y
231CONFIG_X86_TSC=y 237CONFIG_X86_TSC=y
232CONFIG_X86_MINIMUM_CPU_FAMILY=6 238CONFIG_X86_CMOV=y
239CONFIG_X86_MINIMUM_CPU_FAMILY=4
233CONFIG_X86_DEBUGCTLMSR=y 240CONFIG_X86_DEBUGCTLMSR=y
234CONFIG_HPET_TIMER=y 241CONFIG_HPET_TIMER=y
235CONFIG_HPET_EMULATE_RTC=y 242CONFIG_HPET_EMULATE_RTC=y
236CONFIG_DMI=y 243CONFIG_DMI=y
237# CONFIG_IOMMU_HELPER is not set 244# CONFIG_IOMMU_HELPER is not set
238CONFIG_NR_CPUS=4 245CONFIG_NR_CPUS=64
239# CONFIG_SCHED_SMT is not set 246CONFIG_SCHED_SMT=y
240CONFIG_SCHED_MC=y 247CONFIG_SCHED_MC=y
241# CONFIG_PREEMPT_NONE is not set 248# CONFIG_PREEMPT_NONE is not set
242CONFIG_PREEMPT_VOLUNTARY=y 249CONFIG_PREEMPT_VOLUNTARY=y
@@ -247,8 +254,9 @@ CONFIG_X86_IO_APIC=y
247CONFIG_VM86=y 254CONFIG_VM86=y
248# CONFIG_TOSHIBA is not set 255# CONFIG_TOSHIBA is not set
249# CONFIG_I8K is not set 256# CONFIG_I8K is not set
250# CONFIG_X86_REBOOTFIXUPS is not set 257CONFIG_X86_REBOOTFIXUPS=y
251# CONFIG_MICROCODE is not set 258CONFIG_MICROCODE=y
259CONFIG_MICROCODE_OLD_INTERFACE=y
252CONFIG_X86_MSR=y 260CONFIG_X86_MSR=y
253CONFIG_X86_CPUID=y 261CONFIG_X86_CPUID=y
254# CONFIG_NOHIGHMEM is not set 262# CONFIG_NOHIGHMEM is not set
@@ -256,34 +264,29 @@ CONFIG_HIGHMEM4G=y
256# CONFIG_HIGHMEM64G is not set 264# CONFIG_HIGHMEM64G is not set
257CONFIG_PAGE_OFFSET=0xC0000000 265CONFIG_PAGE_OFFSET=0xC0000000
258CONFIG_HIGHMEM=y 266CONFIG_HIGHMEM=y
259CONFIG_NEED_NODE_MEMMAP_SIZE=y
260CONFIG_ARCH_FLATMEM_ENABLE=y 267CONFIG_ARCH_FLATMEM_ENABLE=y
261CONFIG_ARCH_SPARSEMEM_ENABLE=y 268CONFIG_ARCH_SPARSEMEM_ENABLE=y
262CONFIG_ARCH_SELECT_MEMORY_MODEL=y 269CONFIG_ARCH_SELECT_MEMORY_MODEL=y
263CONFIG_SELECT_MEMORY_MODEL=y 270CONFIG_SELECT_MEMORY_MODEL=y
264# CONFIG_FLATMEM_MANUAL is not set 271CONFIG_FLATMEM_MANUAL=y
265# CONFIG_DISCONTIGMEM_MANUAL is not set 272# CONFIG_DISCONTIGMEM_MANUAL is not set
266CONFIG_SPARSEMEM_MANUAL=y 273# CONFIG_SPARSEMEM_MANUAL is not set
267CONFIG_SPARSEMEM=y 274CONFIG_FLATMEM=y
268CONFIG_HAVE_MEMORY_PRESENT=y 275CONFIG_FLAT_NODE_MEM_MAP=y
269CONFIG_SPARSEMEM_STATIC=y 276CONFIG_SPARSEMEM_STATIC=y
270# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set 277# CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set
271
272#
273# Memory hotplug is currently incompatible with Software Suspend
274#
275CONFIG_PAGEFLAGS_EXTENDED=y 278CONFIG_PAGEFLAGS_EXTENDED=y
276CONFIG_SPLIT_PTLOCK_CPUS=4 279CONFIG_SPLIT_PTLOCK_CPUS=4
277CONFIG_RESOURCES_64BIT=y 280CONFIG_RESOURCES_64BIT=y
278CONFIG_ZONE_DMA_FLAG=1 281CONFIG_ZONE_DMA_FLAG=1
279CONFIG_BOUNCE=y 282CONFIG_BOUNCE=y
280CONFIG_VIRT_TO_BUS=y 283CONFIG_VIRT_TO_BUS=y
281# CONFIG_HIGHPTE is not set 284CONFIG_HIGHPTE=y
282# CONFIG_MATH_EMULATION is not set 285# CONFIG_MATH_EMULATION is not set
283CONFIG_MTRR=y 286CONFIG_MTRR=y
284# CONFIG_X86_PAT is not set 287# CONFIG_MTRR_SANITIZER is not set
288CONFIG_X86_PAT=y
285CONFIG_EFI=y 289CONFIG_EFI=y
286# CONFIG_IRQBALANCE is not set
287CONFIG_SECCOMP=y 290CONFIG_SECCOMP=y
288# CONFIG_HZ_100 is not set 291# CONFIG_HZ_100 is not set
289# CONFIG_HZ_250 is not set 292# CONFIG_HZ_250 is not set
@@ -293,6 +296,7 @@ CONFIG_HZ=1000
293CONFIG_SCHED_HRTICK=y 296CONFIG_SCHED_HRTICK=y
294CONFIG_KEXEC=y 297CONFIG_KEXEC=y
295CONFIG_CRASH_DUMP=y 298CONFIG_CRASH_DUMP=y
299# CONFIG_KEXEC_JUMP is not set
296CONFIG_PHYSICAL_START=0x1000000 300CONFIG_PHYSICAL_START=0x1000000
297CONFIG_RELOCATABLE=y 301CONFIG_RELOCATABLE=y
298CONFIG_PHYSICAL_ALIGN=0x200000 302CONFIG_PHYSICAL_ALIGN=0x200000
@@ -312,6 +316,7 @@ CONFIG_PM_TRACE_RTC=y
312CONFIG_PM_SLEEP_SMP=y 316CONFIG_PM_SLEEP_SMP=y
313CONFIG_PM_SLEEP=y 317CONFIG_PM_SLEEP=y
314CONFIG_SUSPEND=y 318CONFIG_SUSPEND=y
319# CONFIG_PM_TEST_SUSPEND is not set
315CONFIG_SUSPEND_FREEZER=y 320CONFIG_SUSPEND_FREEZER=y
316CONFIG_HIBERNATION=y 321CONFIG_HIBERNATION=y
317CONFIG_PM_STD_PARTITION="" 322CONFIG_PM_STD_PARTITION=""
@@ -337,6 +342,7 @@ CONFIG_ACPI_THERMAL=y
337CONFIG_ACPI_BLACKLIST_YEAR=0 342CONFIG_ACPI_BLACKLIST_YEAR=0
338# CONFIG_ACPI_DEBUG is not set 343# CONFIG_ACPI_DEBUG is not set
339CONFIG_ACPI_EC=y 344CONFIG_ACPI_EC=y
345# CONFIG_ACPI_PCI_SLOT is not set
340CONFIG_ACPI_POWER=y 346CONFIG_ACPI_POWER=y
341CONFIG_ACPI_SYSTEM=y 347CONFIG_ACPI_SYSTEM=y
342CONFIG_X86_PM_TIMER=y 348CONFIG_X86_PM_TIMER=y
@@ -395,8 +401,8 @@ CONFIG_PCI=y
395# CONFIG_PCI_GOBIOS is not set 401# CONFIG_PCI_GOBIOS is not set
396# CONFIG_PCI_GOMMCONFIG is not set 402# CONFIG_PCI_GOMMCONFIG is not set
397# CONFIG_PCI_GODIRECT is not set 403# CONFIG_PCI_GODIRECT is not set
398CONFIG_PCI_GOANY=y
399# CONFIG_PCI_GOOLPC is not set 404# CONFIG_PCI_GOOLPC is not set
405CONFIG_PCI_GOANY=y
400CONFIG_PCI_BIOS=y 406CONFIG_PCI_BIOS=y
401CONFIG_PCI_DIRECT=y 407CONFIG_PCI_DIRECT=y
402CONFIG_PCI_MMCONFIG=y 408CONFIG_PCI_MMCONFIG=y
@@ -448,10 +454,6 @@ CONFIG_HOTPLUG_PCI=y
448CONFIG_BINFMT_ELF=y 454CONFIG_BINFMT_ELF=y
449# CONFIG_BINFMT_AOUT is not set 455# CONFIG_BINFMT_AOUT is not set
450CONFIG_BINFMT_MISC=y 456CONFIG_BINFMT_MISC=y
451
452#
453# Networking
454#
455CONFIG_NET=y 457CONFIG_NET=y
456 458
457# 459#
@@ -475,7 +477,10 @@ CONFIG_IP_FIB_HASH=y
475CONFIG_IP_MULTIPLE_TABLES=y 477CONFIG_IP_MULTIPLE_TABLES=y
476CONFIG_IP_ROUTE_MULTIPATH=y 478CONFIG_IP_ROUTE_MULTIPATH=y
477CONFIG_IP_ROUTE_VERBOSE=y 479CONFIG_IP_ROUTE_VERBOSE=y
478# CONFIG_IP_PNP is not set 480CONFIG_IP_PNP=y
481CONFIG_IP_PNP_DHCP=y
482CONFIG_IP_PNP_BOOTP=y
483CONFIG_IP_PNP_RARP=y
479# CONFIG_NET_IPIP is not set 484# CONFIG_NET_IPIP is not set
480# CONFIG_NET_IPGRE is not set 485# CONFIG_NET_IPGRE is not set
481CONFIG_IP_MROUTE=y 486CONFIG_IP_MROUTE=y
@@ -618,7 +623,6 @@ CONFIG_NET_SCHED=y
618# CONFIG_NET_SCH_HTB is not set 623# CONFIG_NET_SCH_HTB is not set
619# CONFIG_NET_SCH_HFSC is not set 624# CONFIG_NET_SCH_HFSC is not set
620# CONFIG_NET_SCH_PRIO is not set 625# CONFIG_NET_SCH_PRIO is not set
621# CONFIG_NET_SCH_RR is not set
622# CONFIG_NET_SCH_RED is not set 626# CONFIG_NET_SCH_RED is not set
623# CONFIG_NET_SCH_SFQ is not set 627# CONFIG_NET_SCH_SFQ is not set
624# CONFIG_NET_SCH_TEQL is not set 628# CONFIG_NET_SCH_TEQL is not set
@@ -680,28 +684,19 @@ CONFIG_FIB_RULES=y
680CONFIG_CFG80211=y 684CONFIG_CFG80211=y
681CONFIG_NL80211=y 685CONFIG_NL80211=y
682CONFIG_WIRELESS_EXT=y 686CONFIG_WIRELESS_EXT=y
687CONFIG_WIRELESS_EXT_SYSFS=y
683CONFIG_MAC80211=y 688CONFIG_MAC80211=y
684 689
685# 690#
686# Rate control algorithm selection 691# Rate control algorithm selection
687# 692#
693CONFIG_MAC80211_RC_PID=y
688CONFIG_MAC80211_RC_DEFAULT_PID=y 694CONFIG_MAC80211_RC_DEFAULT_PID=y
689# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
690
691#
692# Selecting 'y' for an algorithm will
693#
694
695#
696# build the algorithm into mac80211.
697#
698CONFIG_MAC80211_RC_DEFAULT="pid" 695CONFIG_MAC80211_RC_DEFAULT="pid"
699CONFIG_MAC80211_RC_PID=y
700# CONFIG_MAC80211_MESH is not set 696# CONFIG_MAC80211_MESH is not set
701CONFIG_MAC80211_LEDS=y 697CONFIG_MAC80211_LEDS=y
702# CONFIG_MAC80211_DEBUGFS is not set 698# CONFIG_MAC80211_DEBUGFS is not set
703# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set 699# CONFIG_MAC80211_DEBUG_MENU is not set
704# CONFIG_MAC80211_DEBUG is not set
705# CONFIG_IEEE80211 is not set 700# CONFIG_IEEE80211 is not set
706# CONFIG_RFKILL is not set 701# CONFIG_RFKILL is not set
707# CONFIG_NET_9P is not set 702# CONFIG_NET_9P is not set
@@ -717,6 +712,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
717CONFIG_STANDALONE=y 712CONFIG_STANDALONE=y
718CONFIG_PREVENT_FIRMWARE_BUILD=y 713CONFIG_PREVENT_FIRMWARE_BUILD=y
719CONFIG_FW_LOADER=y 714CONFIG_FW_LOADER=y
715CONFIG_FIRMWARE_IN_KERNEL=y
716CONFIG_EXTRA_FIRMWARE=""
720# CONFIG_DEBUG_DRIVER is not set 717# CONFIG_DEBUG_DRIVER is not set
721CONFIG_DEBUG_DEVRES=y 718CONFIG_DEBUG_DEVRES=y
722# CONFIG_SYS_HYPERVISOR is not set 719# CONFIG_SYS_HYPERVISOR is not set
@@ -749,6 +746,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
749# CONFIG_BLK_DEV_XIP is not set 746# CONFIG_BLK_DEV_XIP is not set
750# CONFIG_CDROM_PKTCDVD is not set 747# CONFIG_CDROM_PKTCDVD is not set
751# CONFIG_ATA_OVER_ETH is not set 748# CONFIG_ATA_OVER_ETH is not set
749# CONFIG_BLK_DEV_HD is not set
752CONFIG_MISC_DEVICES=y 750CONFIG_MISC_DEVICES=y
753# CONFIG_IBM_ASM is not set 751# CONFIG_IBM_ASM is not set
754# CONFIG_PHANTOM is not set 752# CONFIG_PHANTOM is not set
@@ -760,10 +758,12 @@ CONFIG_MISC_DEVICES=y
760# CONFIG_FUJITSU_LAPTOP is not set 758# CONFIG_FUJITSU_LAPTOP is not set
761# CONFIG_TC1100_WMI is not set 759# CONFIG_TC1100_WMI is not set
762# CONFIG_MSI_LAPTOP is not set 760# CONFIG_MSI_LAPTOP is not set
761# CONFIG_COMPAL_LAPTOP is not set
763# CONFIG_SONY_LAPTOP is not set 762# CONFIG_SONY_LAPTOP is not set
764# CONFIG_THINKPAD_ACPI is not set 763# CONFIG_THINKPAD_ACPI is not set
765# CONFIG_INTEL_MENLOW is not set 764# CONFIG_INTEL_MENLOW is not set
766# CONFIG_ENCLOSURE_SERVICES is not set 765# CONFIG_ENCLOSURE_SERVICES is not set
766# CONFIG_HP_ILO is not set
767CONFIG_HAVE_IDE=y 767CONFIG_HAVE_IDE=y
768# CONFIG_IDE is not set 768# CONFIG_IDE is not set
769 769
@@ -802,12 +802,13 @@ CONFIG_SCSI_WAIT_SCAN=m
802# 802#
803CONFIG_SCSI_SPI_ATTRS=y 803CONFIG_SCSI_SPI_ATTRS=y
804# CONFIG_SCSI_FC_ATTRS is not set 804# CONFIG_SCSI_FC_ATTRS is not set
805# CONFIG_SCSI_ISCSI_ATTRS is not set 805CONFIG_SCSI_ISCSI_ATTRS=y
806# CONFIG_SCSI_SAS_ATTRS is not set 806# CONFIG_SCSI_SAS_ATTRS is not set
807# CONFIG_SCSI_SAS_LIBSAS is not set 807# CONFIG_SCSI_SAS_LIBSAS is not set
808# CONFIG_SCSI_SRP_ATTRS is not set 808# CONFIG_SCSI_SRP_ATTRS is not set
809# CONFIG_SCSI_LOWLEVEL is not set 809# CONFIG_SCSI_LOWLEVEL is not set
810# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 810# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
811# CONFIG_SCSI_DH is not set
811CONFIG_ATA=y 812CONFIG_ATA=y
812# CONFIG_ATA_NONSTANDARD is not set 813# CONFIG_ATA_NONSTANDARD is not set
813CONFIG_ATA_ACPI=y 814CONFIG_ATA_ACPI=y
@@ -842,7 +843,7 @@ CONFIG_PATA_AMD=y
842# CONFIG_PATA_CS5536 is not set 843# CONFIG_PATA_CS5536 is not set
843# CONFIG_PATA_CYPRESS is not set 844# CONFIG_PATA_CYPRESS is not set
844# CONFIG_PATA_EFAR is not set 845# CONFIG_PATA_EFAR is not set
845# CONFIG_ATA_GENERIC is not set 846CONFIG_ATA_GENERIC=y
846# CONFIG_PATA_HPT366 is not set 847# CONFIG_PATA_HPT366 is not set
847# CONFIG_PATA_HPT37X is not set 848# CONFIG_PATA_HPT37X is not set
848# CONFIG_PATA_HPT3X2N is not set 849# CONFIG_PATA_HPT3X2N is not set
@@ -852,7 +853,7 @@ CONFIG_PATA_AMD=y
852# CONFIG_PATA_JMICRON is not set 853# CONFIG_PATA_JMICRON is not set
853# CONFIG_PATA_TRIFLEX is not set 854# CONFIG_PATA_TRIFLEX is not set
854# CONFIG_PATA_MARVELL is not set 855# CONFIG_PATA_MARVELL is not set
855# CONFIG_PATA_MPIIX is not set 856CONFIG_PATA_MPIIX=y
856CONFIG_PATA_OLDPIIX=y 857CONFIG_PATA_OLDPIIX=y
857# CONFIG_PATA_NETCELL is not set 858# CONFIG_PATA_NETCELL is not set
858# CONFIG_PATA_NINJA32 is not set 859# CONFIG_PATA_NINJA32 is not set
@@ -871,6 +872,7 @@ CONFIG_PATA_OLDPIIX=y
871# CONFIG_PATA_SIS is not set 872# CONFIG_PATA_SIS is not set
872# CONFIG_PATA_VIA is not set 873# CONFIG_PATA_VIA is not set
873# CONFIG_PATA_WINBOND is not set 874# CONFIG_PATA_WINBOND is not set
875CONFIG_PATA_SCH=y
874CONFIG_MD=y 876CONFIG_MD=y
875CONFIG_BLK_DEV_MD=y 877CONFIG_BLK_DEV_MD=y
876# CONFIG_MD_LINEAR is not set 878# CONFIG_MD_LINEAR is not set
@@ -894,13 +896,16 @@ CONFIG_DM_ZERO=y
894# 896#
895# IEEE 1394 (FireWire) support 897# IEEE 1394 (FireWire) support
896# 898#
899
900#
901# Enable only one of the two stacks, unless you know what you are doing
902#
897# CONFIG_FIREWIRE is not set 903# CONFIG_FIREWIRE is not set
898# CONFIG_IEEE1394 is not set 904# CONFIG_IEEE1394 is not set
899# CONFIG_I2O is not set 905# CONFIG_I2O is not set
900CONFIG_MACINTOSH_DRIVERS=y 906CONFIG_MACINTOSH_DRIVERS=y
901CONFIG_MAC_EMUMOUSEBTN=y 907CONFIG_MAC_EMUMOUSEBTN=y
902CONFIG_NETDEVICES=y 908CONFIG_NETDEVICES=y
903# CONFIG_NETDEVICES_MULTIQUEUE is not set
904# CONFIG_IFB is not set 909# CONFIG_IFB is not set
905# CONFIG_DUMMY is not set 910# CONFIG_DUMMY is not set
906# CONFIG_BONDING is not set 911# CONFIG_BONDING is not set
@@ -910,7 +915,23 @@ CONFIG_NETDEVICES=y
910# CONFIG_VETH is not set 915# CONFIG_VETH is not set
911# CONFIG_NET_SB1000 is not set 916# CONFIG_NET_SB1000 is not set
912# CONFIG_ARCNET is not set 917# CONFIG_ARCNET is not set
913# CONFIG_PHYLIB is not set 918CONFIG_PHYLIB=y
919
920#
921# MII PHY device drivers
922#
923# CONFIG_MARVELL_PHY is not set
924# CONFIG_DAVICOM_PHY is not set
925# CONFIG_QSEMI_PHY is not set
926# CONFIG_LXT_PHY is not set
927# CONFIG_CICADA_PHY is not set
928# CONFIG_VITESSE_PHY is not set
929# CONFIG_SMSC_PHY is not set
930# CONFIG_BROADCOM_PHY is not set
931# CONFIG_ICPLUS_PHY is not set
932# CONFIG_REALTEK_PHY is not set
933# CONFIG_FIXED_PHY is not set
934# CONFIG_MDIO_BITBANG is not set
914CONFIG_NET_ETHERNET=y 935CONFIG_NET_ETHERNET=y
915CONFIG_MII=y 936CONFIG_MII=y
916# CONFIG_HAPPYMEAL is not set 937# CONFIG_HAPPYMEAL is not set
@@ -943,10 +964,10 @@ CONFIG_FORCEDETH=y
943CONFIG_E100=y 964CONFIG_E100=y
944# CONFIG_FEALNX is not set 965# CONFIG_FEALNX is not set
945# CONFIG_NATSEMI is not set 966# CONFIG_NATSEMI is not set
946# CONFIG_NE2K_PCI is not set 967CONFIG_NE2K_PCI=y
947# CONFIG_8139CP is not set 968# CONFIG_8139CP is not set
948CONFIG_8139TOO=y 969CONFIG_8139TOO=y
949CONFIG_8139TOO_PIO=y 970# CONFIG_8139TOO_PIO is not set
950# CONFIG_8139TOO_TUNE_TWISTER is not set 971# CONFIG_8139TOO_TUNE_TWISTER is not set
951# CONFIG_8139TOO_8129 is not set 972# CONFIG_8139TOO_8129 is not set
952# CONFIG_8139_OLD_RX_RESET is not set 973# CONFIG_8139_OLD_RX_RESET is not set
@@ -961,25 +982,24 @@ CONFIG_NETDEV_1000=y
961# CONFIG_ACENIC is not set 982# CONFIG_ACENIC is not set
962# CONFIG_DL2K is not set 983# CONFIG_DL2K is not set
963CONFIG_E1000=y 984CONFIG_E1000=y
964# CONFIG_E1000_NAPI is not set
965# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 985# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
966# CONFIG_E1000E is not set 986CONFIG_E1000E=y
967# CONFIG_E1000E_ENABLED is not set
968# CONFIG_IP1000 is not set 987# CONFIG_IP1000 is not set
969# CONFIG_IGB is not set 988# CONFIG_IGB is not set
970# CONFIG_NS83820 is not set 989# CONFIG_NS83820 is not set
971# CONFIG_HAMACHI is not set 990# CONFIG_HAMACHI is not set
972# CONFIG_YELLOWFIN is not set 991# CONFIG_YELLOWFIN is not set
973# CONFIG_R8169 is not set 992CONFIG_R8169=y
974# CONFIG_SIS190 is not set 993# CONFIG_SIS190 is not set
975# CONFIG_SKGE is not set 994# CONFIG_SKGE is not set
976CONFIG_SKY2=y 995CONFIG_SKY2=y
977# CONFIG_SKY2_DEBUG is not set 996# CONFIG_SKY2_DEBUG is not set
978# CONFIG_VIA_VELOCITY is not set 997# CONFIG_VIA_VELOCITY is not set
979CONFIG_TIGON3=y 998CONFIG_TIGON3=y
980# CONFIG_BNX2 is not set 999CONFIG_BNX2=y
981# CONFIG_QLA3XXX is not set 1000# CONFIG_QLA3XXX is not set
982# CONFIG_ATL1 is not set 1001# CONFIG_ATL1 is not set
1002# CONFIG_ATL1E is not set
983CONFIG_NETDEV_10000=y 1003CONFIG_NETDEV_10000=y
984# CONFIG_CHELSIO_T1 is not set 1004# CONFIG_CHELSIO_T1 is not set
985# CONFIG_CHELSIO_T3 is not set 1005# CONFIG_CHELSIO_T3 is not set
@@ -1019,13 +1039,14 @@ CONFIG_WLAN_80211=y
1019# CONFIG_RTL8180 is not set 1039# CONFIG_RTL8180 is not set
1020# CONFIG_RTL8187 is not set 1040# CONFIG_RTL8187 is not set
1021# CONFIG_ADM8211 is not set 1041# CONFIG_ADM8211 is not set
1042# CONFIG_MAC80211_HWSIM is not set
1022# CONFIG_P54_COMMON is not set 1043# CONFIG_P54_COMMON is not set
1023CONFIG_ATH5K=y 1044CONFIG_ATH5K=y
1024# CONFIG_ATH5K_DEBUG is not set 1045# CONFIG_ATH5K_DEBUG is not set
1025# CONFIG_IWLWIFI is not set 1046# CONFIG_ATH9K is not set
1026# CONFIG_IWLCORE is not set 1047# CONFIG_IWLCORE is not set
1027# CONFIG_IWLWIFI_LEDS is not set 1048# CONFIG_IWLWIFI_LEDS is not set
1028# CONFIG_IWL4965 is not set 1049# CONFIG_IWLAGN is not set
1029# CONFIG_IWL3945 is not set 1050# CONFIG_IWL3945 is not set
1030# CONFIG_HOSTAP is not set 1051# CONFIG_HOSTAP is not set
1031# CONFIG_B43 is not set 1052# CONFIG_B43 is not set
@@ -1105,6 +1126,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
1105# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1126# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1106# CONFIG_MOUSE_SERIAL is not set 1127# CONFIG_MOUSE_SERIAL is not set
1107# CONFIG_MOUSE_APPLETOUCH is not set 1128# CONFIG_MOUSE_APPLETOUCH is not set
1129# CONFIG_MOUSE_BCM5974 is not set
1108# CONFIG_MOUSE_VSXXXAA is not set 1130# CONFIG_MOUSE_VSXXXAA is not set
1109CONFIG_INPUT_JOYSTICK=y 1131CONFIG_INPUT_JOYSTICK=y
1110# CONFIG_JOYSTICK_ANALOG is not set 1132# CONFIG_JOYSTICK_ANALOG is not set
@@ -1139,12 +1161,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
1139# CONFIG_TOUCHSCREEN_GUNZE is not set 1161# CONFIG_TOUCHSCREEN_GUNZE is not set
1140# CONFIG_TOUCHSCREEN_ELO is not set 1162# CONFIG_TOUCHSCREEN_ELO is not set
1141# CONFIG_TOUCHSCREEN_MTOUCH is not set 1163# CONFIG_TOUCHSCREEN_MTOUCH is not set
1164# CONFIG_TOUCHSCREEN_INEXIO is not set
1142# CONFIG_TOUCHSCREEN_MK712 is not set 1165# CONFIG_TOUCHSCREEN_MK712 is not set
1143# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1166# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1144# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1167# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1145# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1168# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1146# CONFIG_TOUCHSCREEN_UCB1400 is not set 1169# CONFIG_TOUCHSCREEN_UCB1400 is not set
1147# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1170# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1171# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1148CONFIG_INPUT_MISC=y 1172CONFIG_INPUT_MISC=y
1149# CONFIG_INPUT_PCSPKR is not set 1173# CONFIG_INPUT_PCSPKR is not set
1150# CONFIG_INPUT_APANEL is not set 1174# CONFIG_INPUT_APANEL is not set
@@ -1173,6 +1197,7 @@ CONFIG_SERIO_LIBPS2=y
1173# Character devices 1197# Character devices
1174# 1198#
1175CONFIG_VT=y 1199CONFIG_VT=y
1200CONFIG_CONSOLE_TRANSLATIONS=y
1176CONFIG_VT_CONSOLE=y 1201CONFIG_VT_CONSOLE=y
1177CONFIG_HW_CONSOLE=y 1202CONFIG_HW_CONSOLE=y
1178CONFIG_VT_HW_CONSOLE_BINDING=y 1203CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1223,8 +1248,8 @@ CONFIG_UNIX98_PTYS=y
1223# CONFIG_LEGACY_PTYS is not set 1248# CONFIG_LEGACY_PTYS is not set
1224# CONFIG_IPMI_HANDLER is not set 1249# CONFIG_IPMI_HANDLER is not set
1225CONFIG_HW_RANDOM=y 1250CONFIG_HW_RANDOM=y
1226# CONFIG_HW_RANDOM_INTEL is not set 1251CONFIG_HW_RANDOM_INTEL=y
1227# CONFIG_HW_RANDOM_AMD is not set 1252CONFIG_HW_RANDOM_AMD=y
1228CONFIG_HW_RANDOM_GEODE=y 1253CONFIG_HW_RANDOM_GEODE=y
1229CONFIG_HW_RANDOM_VIA=y 1254CONFIG_HW_RANDOM_VIA=y
1230CONFIG_NVRAM=y 1255CONFIG_NVRAM=y
@@ -1245,7 +1270,6 @@ CONFIG_NVRAM=y
1245# CONFIG_CS5535_GPIO is not set 1270# CONFIG_CS5535_GPIO is not set
1246# CONFIG_RAW_DRIVER is not set 1271# CONFIG_RAW_DRIVER is not set
1247CONFIG_HPET=y 1272CONFIG_HPET=y
1248# CONFIG_HPET_RTC_IRQ is not set
1249# CONFIG_HPET_MMAP is not set 1273# CONFIG_HPET_MMAP is not set
1250# CONFIG_HANGCHECK_TIMER is not set 1274# CONFIG_HANGCHECK_TIMER is not set
1251# CONFIG_TCG_TPM is not set 1275# CONFIG_TCG_TPM is not set
@@ -1254,43 +1278,64 @@ CONFIG_DEVPORT=y
1254CONFIG_I2C=y 1278CONFIG_I2C=y
1255CONFIG_I2C_BOARDINFO=y 1279CONFIG_I2C_BOARDINFO=y
1256# CONFIG_I2C_CHARDEV is not set 1280# CONFIG_I2C_CHARDEV is not set
1281CONFIG_I2C_HELPER_AUTO=y
1257 1282
1258# 1283#
1259# I2C Hardware Bus support 1284# I2C Hardware Bus support
1260# 1285#
1286
1287#
1288# PC SMBus host controller drivers
1289#
1261# CONFIG_I2C_ALI1535 is not set 1290# CONFIG_I2C_ALI1535 is not set
1262# CONFIG_I2C_ALI1563 is not set 1291# CONFIG_I2C_ALI1563 is not set
1263# CONFIG_I2C_ALI15X3 is not set 1292# CONFIG_I2C_ALI15X3 is not set
1264# CONFIG_I2C_AMD756 is not set 1293# CONFIG_I2C_AMD756 is not set
1265# CONFIG_I2C_AMD8111 is not set 1294# CONFIG_I2C_AMD8111 is not set
1266CONFIG_I2C_I801=y 1295CONFIG_I2C_I801=y
1267# CONFIG_I2C_I810 is not set 1296# CONFIG_I2C_ISCH is not set
1268# CONFIG_I2C_PIIX4 is not set 1297# CONFIG_I2C_PIIX4 is not set
1269# CONFIG_I2C_NFORCE2 is not set 1298# CONFIG_I2C_NFORCE2 is not set
1270# CONFIG_I2C_OCORES is not set
1271# CONFIG_I2C_PARPORT_LIGHT is not set
1272# CONFIG_I2C_PROSAVAGE is not set
1273# CONFIG_I2C_SAVAGE4 is not set
1274# CONFIG_I2C_SIMTEC is not set
1275# CONFIG_SCx200_ACB is not set
1276# CONFIG_I2C_SIS5595 is not set 1299# CONFIG_I2C_SIS5595 is not set
1277# CONFIG_I2C_SIS630 is not set 1300# CONFIG_I2C_SIS630 is not set
1278# CONFIG_I2C_SIS96X is not set 1301# CONFIG_I2C_SIS96X is not set
1279# CONFIG_I2C_TAOS_EVM is not set
1280# CONFIG_I2C_STUB is not set
1281# CONFIG_I2C_TINY_USB is not set
1282# CONFIG_I2C_VIA is not set 1302# CONFIG_I2C_VIA is not set
1283# CONFIG_I2C_VIAPRO is not set 1303# CONFIG_I2C_VIAPRO is not set
1304
1305#
1306# I2C system bus drivers (mostly embedded / system-on-chip)
1307#
1308# CONFIG_I2C_OCORES is not set
1309# CONFIG_I2C_SIMTEC is not set
1310
1311#
1312# External I2C/SMBus adapter drivers
1313#
1314# CONFIG_I2C_PARPORT_LIGHT is not set
1315# CONFIG_I2C_TAOS_EVM is not set
1316# CONFIG_I2C_TINY_USB is not set
1317
1318#
1319# Graphics adapter I2C/DDC channel drivers
1320#
1284# CONFIG_I2C_VOODOO3 is not set 1321# CONFIG_I2C_VOODOO3 is not set
1322
1323#
1324# Other I2C/SMBus bus drivers
1325#
1285# CONFIG_I2C_PCA_PLATFORM is not set 1326# CONFIG_I2C_PCA_PLATFORM is not set
1327# CONFIG_I2C_STUB is not set
1328# CONFIG_SCx200_ACB is not set
1286 1329
1287# 1330#
1288# Miscellaneous I2C Chip support 1331# Miscellaneous I2C Chip support
1289# 1332#
1290# CONFIG_DS1682 is not set 1333# CONFIG_DS1682 is not set
1334# CONFIG_AT24 is not set
1291# CONFIG_SENSORS_EEPROM is not set 1335# CONFIG_SENSORS_EEPROM is not set
1292# CONFIG_SENSORS_PCF8574 is not set 1336# CONFIG_SENSORS_PCF8574 is not set
1293# CONFIG_PCF8575 is not set 1337# CONFIG_PCF8575 is not set
1338# CONFIG_SENSORS_PCA9539 is not set
1294# CONFIG_SENSORS_PCF8591 is not set 1339# CONFIG_SENSORS_PCF8591 is not set
1295# CONFIG_SENSORS_MAX6875 is not set 1340# CONFIG_SENSORS_MAX6875 is not set
1296# CONFIG_SENSORS_TSL2550 is not set 1341# CONFIG_SENSORS_TSL2550 is not set
@@ -1299,6 +1344,8 @@ CONFIG_I2C_I801=y
1299# CONFIG_I2C_DEBUG_BUS is not set 1344# CONFIG_I2C_DEBUG_BUS is not set
1300# CONFIG_I2C_DEBUG_CHIP is not set 1345# CONFIG_I2C_DEBUG_CHIP is not set
1301# CONFIG_SPI is not set 1346# CONFIG_SPI is not set
1347CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1348# CONFIG_GPIOLIB is not set
1302# CONFIG_W1 is not set 1349# CONFIG_W1 is not set
1303CONFIG_POWER_SUPPLY=y 1350CONFIG_POWER_SUPPLY=y
1304# CONFIG_POWER_SUPPLY_DEBUG is not set 1351# CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1360,8 +1407,10 @@ CONFIG_SSB_POSSIBLE=y
1360# 1407#
1361# Multifunction device drivers 1408# Multifunction device drivers
1362# 1409#
1410# CONFIG_MFD_CORE is not set
1363# CONFIG_MFD_SM501 is not set 1411# CONFIG_MFD_SM501 is not set
1364# CONFIG_HTC_PASIC3 is not set 1412# CONFIG_HTC_PASIC3 is not set
1413# CONFIG_MFD_TMIO is not set
1365 1414
1366# 1415#
1367# Multimedia devices 1416# Multimedia devices
@@ -1372,6 +1421,7 @@ CONFIG_SSB_POSSIBLE=y
1372# 1421#
1373# CONFIG_VIDEO_DEV is not set 1422# CONFIG_VIDEO_DEV is not set
1374# CONFIG_DVB_CORE is not set 1423# CONFIG_DVB_CORE is not set
1424# CONFIG_VIDEO_MEDIA is not set
1375 1425
1376# 1426#
1377# Multimedia drivers 1427# Multimedia drivers
@@ -1418,7 +1468,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
1418# CONFIG_FB_SYS_IMAGEBLIT is not set 1468# CONFIG_FB_SYS_IMAGEBLIT is not set
1419# CONFIG_FB_FOREIGN_ENDIAN is not set 1469# CONFIG_FB_FOREIGN_ENDIAN is not set
1420# CONFIG_FB_SYS_FOPS is not set 1470# CONFIG_FB_SYS_FOPS is not set
1421CONFIG_FB_DEFERRED_IO=y
1422# CONFIG_FB_SVGALIB is not set 1471# CONFIG_FB_SVGALIB is not set
1423# CONFIG_FB_MACMODES is not set 1472# CONFIG_FB_MACMODES is not set
1424# CONFIG_FB_BACKLIGHT is not set 1473# CONFIG_FB_BACKLIGHT is not set
@@ -1463,6 +1512,7 @@ CONFIG_FB_EFI=y
1463# CONFIG_FB_TRIDENT is not set 1512# CONFIG_FB_TRIDENT is not set
1464# CONFIG_FB_ARK is not set 1513# CONFIG_FB_ARK is not set
1465# CONFIG_FB_PM3 is not set 1514# CONFIG_FB_PM3 is not set
1515# CONFIG_FB_CARMINE is not set
1466# CONFIG_FB_GEODE is not set 1516# CONFIG_FB_GEODE is not set
1467# CONFIG_FB_VIRTUAL is not set 1517# CONFIG_FB_VIRTUAL is not set
1468CONFIG_BACKLIGHT_LCD_SUPPORT=y 1518CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1470,6 +1520,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
1470CONFIG_BACKLIGHT_CLASS_DEVICE=y 1520CONFIG_BACKLIGHT_CLASS_DEVICE=y
1471# CONFIG_BACKLIGHT_CORGI is not set 1521# CONFIG_BACKLIGHT_CORGI is not set
1472# CONFIG_BACKLIGHT_PROGEAR is not set 1522# CONFIG_BACKLIGHT_PROGEAR is not set
1523# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1473 1524
1474# 1525#
1475# Display device support 1526# Display device support
@@ -1482,22 +1533,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1482CONFIG_VGA_CONSOLE=y 1533CONFIG_VGA_CONSOLE=y
1483CONFIG_VGACON_SOFT_SCROLLBACK=y 1534CONFIG_VGACON_SOFT_SCROLLBACK=y
1484CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1535CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1485CONFIG_VIDEO_SELECT=y
1486CONFIG_DUMMY_CONSOLE=y 1536CONFIG_DUMMY_CONSOLE=y
1487# CONFIG_FRAMEBUFFER_CONSOLE is not set 1537# CONFIG_FRAMEBUFFER_CONSOLE is not set
1488CONFIG_LOGO=y 1538CONFIG_LOGO=y
1489# CONFIG_LOGO_LINUX_MONO is not set 1539# CONFIG_LOGO_LINUX_MONO is not set
1490# CONFIG_LOGO_LINUX_VGA16 is not set 1540# CONFIG_LOGO_LINUX_VGA16 is not set
1491CONFIG_LOGO_LINUX_CLUT224=y 1541CONFIG_LOGO_LINUX_CLUT224=y
1492
1493#
1494# Sound
1495#
1496CONFIG_SOUND=y 1542CONFIG_SOUND=y
1497
1498#
1499# Advanced Linux Sound Architecture
1500#
1501CONFIG_SND=y 1543CONFIG_SND=y
1502CONFIG_SND_TIMER=y 1544CONFIG_SND_TIMER=y
1503CONFIG_SND_PCM=y 1545CONFIG_SND_PCM=y
@@ -1515,20 +1557,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
1515# CONFIG_SND_VERBOSE_PRINTK is not set 1557# CONFIG_SND_VERBOSE_PRINTK is not set
1516# CONFIG_SND_DEBUG is not set 1558# CONFIG_SND_DEBUG is not set
1517CONFIG_SND_VMASTER=y 1559CONFIG_SND_VMASTER=y
1518 1560CONFIG_SND_DRIVERS=y
1519#
1520# Generic devices
1521#
1522# CONFIG_SND_PCSP is not set 1561# CONFIG_SND_PCSP is not set
1523# CONFIG_SND_DUMMY is not set 1562# CONFIG_SND_DUMMY is not set
1524# CONFIG_SND_VIRMIDI is not set 1563# CONFIG_SND_VIRMIDI is not set
1525# CONFIG_SND_MTPAV is not set 1564# CONFIG_SND_MTPAV is not set
1526# CONFIG_SND_SERIAL_U16550 is not set 1565# CONFIG_SND_SERIAL_U16550 is not set
1527# CONFIG_SND_MPU401 is not set 1566# CONFIG_SND_MPU401 is not set
1528 1567CONFIG_SND_PCI=y
1529#
1530# PCI devices
1531#
1532# CONFIG_SND_AD1889 is not set 1568# CONFIG_SND_AD1889 is not set
1533# CONFIG_SND_ALS300 is not set 1569# CONFIG_SND_ALS300 is not set
1534# CONFIG_SND_ALS4000 is not set 1570# CONFIG_SND_ALS4000 is not set
@@ -1603,36 +1639,14 @@ CONFIG_SND_HDA_GENERIC=y
1603# CONFIG_SND_VIRTUOSO is not set 1639# CONFIG_SND_VIRTUOSO is not set
1604# CONFIG_SND_VX222 is not set 1640# CONFIG_SND_VX222 is not set
1605# CONFIG_SND_YMFPCI is not set 1641# CONFIG_SND_YMFPCI is not set
1606 1642CONFIG_SND_USB=y
1607#
1608# USB devices
1609#
1610# CONFIG_SND_USB_AUDIO is not set 1643# CONFIG_SND_USB_AUDIO is not set
1611# CONFIG_SND_USB_USX2Y is not set 1644# CONFIG_SND_USB_USX2Y is not set
1612# CONFIG_SND_USB_CAIAQ is not set 1645# CONFIG_SND_USB_CAIAQ is not set
1613 1646CONFIG_SND_PCMCIA=y
1614#
1615# PCMCIA devices
1616#
1617# CONFIG_SND_VXPOCKET is not set 1647# CONFIG_SND_VXPOCKET is not set
1618# CONFIG_SND_PDAUDIOCF is not set 1648# CONFIG_SND_PDAUDIOCF is not set
1619
1620#
1621# System on Chip audio support
1622#
1623# CONFIG_SND_SOC is not set 1649# CONFIG_SND_SOC is not set
1624
1625#
1626# ALSA SoC audio for Freescale SOCs
1627#
1628
1629#
1630# SoC Audio for the Texas Instruments OMAP
1631#
1632
1633#
1634# Open Sound System
1635#
1636# CONFIG_SOUND_PRIME is not set 1650# CONFIG_SOUND_PRIME is not set
1637CONFIG_HID_SUPPORT=y 1651CONFIG_HID_SUPPORT=y
1638CONFIG_HID=y 1652CONFIG_HID=y
@@ -1668,6 +1682,7 @@ CONFIG_USB_DEVICEFS=y
1668# CONFIG_USB_DYNAMIC_MINORS is not set 1682# CONFIG_USB_DYNAMIC_MINORS is not set
1669CONFIG_USB_SUSPEND=y 1683CONFIG_USB_SUSPEND=y
1670# CONFIG_USB_OTG is not set 1684# CONFIG_USB_OTG is not set
1685CONFIG_USB_MON=y
1671 1686
1672# 1687#
1673# USB Host Controller Drivers 1688# USB Host Controller Drivers
@@ -1691,6 +1706,7 @@ CONFIG_USB_UHCI_HCD=y
1691# 1706#
1692# CONFIG_USB_ACM is not set 1707# CONFIG_USB_ACM is not set
1693CONFIG_USB_PRINTER=y 1708CONFIG_USB_PRINTER=y
1709# CONFIG_USB_WDM is not set
1694 1710
1695# 1711#
1696# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1712# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1712,6 +1728,7 @@ CONFIG_USB_STORAGE=y
1712# CONFIG_USB_STORAGE_ALAUDA is not set 1728# CONFIG_USB_STORAGE_ALAUDA is not set
1713# CONFIG_USB_STORAGE_ONETOUCH is not set 1729# CONFIG_USB_STORAGE_ONETOUCH is not set
1714# CONFIG_USB_STORAGE_KARMA is not set 1730# CONFIG_USB_STORAGE_KARMA is not set
1731# CONFIG_USB_STORAGE_SIERRA is not set
1715# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1732# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1716CONFIG_USB_LIBUSUAL=y 1733CONFIG_USB_LIBUSUAL=y
1717 1734
@@ -1720,7 +1737,6 @@ CONFIG_USB_LIBUSUAL=y
1720# 1737#
1721# CONFIG_USB_MDC800 is not set 1738# CONFIG_USB_MDC800 is not set
1722# CONFIG_USB_MICROTEK is not set 1739# CONFIG_USB_MICROTEK is not set
1723CONFIG_USB_MON=y
1724 1740
1725# 1741#
1726# USB port drivers 1742# USB port drivers
@@ -1733,7 +1749,6 @@ CONFIG_USB_MON=y
1733# CONFIG_USB_EMI62 is not set 1749# CONFIG_USB_EMI62 is not set
1734# CONFIG_USB_EMI26 is not set 1750# CONFIG_USB_EMI26 is not set
1735# CONFIG_USB_ADUTUX is not set 1751# CONFIG_USB_ADUTUX is not set
1736# CONFIG_USB_AUERSWALD is not set
1737# CONFIG_USB_RIO500 is not set 1752# CONFIG_USB_RIO500 is not set
1738# CONFIG_USB_LEGOTOWER is not set 1753# CONFIG_USB_LEGOTOWER is not set
1739# CONFIG_USB_LCD is not set 1754# CONFIG_USB_LCD is not set
@@ -1750,6 +1765,7 @@ CONFIG_USB_MON=y
1750# CONFIG_USB_TRANCEVIBRATOR is not set 1765# CONFIG_USB_TRANCEVIBRATOR is not set
1751# CONFIG_USB_IOWARRIOR is not set 1766# CONFIG_USB_IOWARRIOR is not set
1752# CONFIG_USB_TEST is not set 1767# CONFIG_USB_TEST is not set
1768# CONFIG_USB_ISIGHTFW is not set
1753# CONFIG_USB_GADGET is not set 1769# CONFIG_USB_GADGET is not set
1754# CONFIG_MMC is not set 1770# CONFIG_MMC is not set
1755# CONFIG_MEMSTICK is not set 1771# CONFIG_MEMSTICK is not set
@@ -1759,7 +1775,9 @@ CONFIG_LEDS_CLASS=y
1759# 1775#
1760# LED drivers 1776# LED drivers
1761# 1777#
1778# CONFIG_LEDS_PCA9532 is not set
1762# CONFIG_LEDS_CLEVO_MAIL is not set 1779# CONFIG_LEDS_CLEVO_MAIL is not set
1780# CONFIG_LEDS_PCA955X is not set
1763 1781
1764# 1782#
1765# LED Triggers 1783# LED Triggers
@@ -1805,6 +1823,7 @@ CONFIG_RTC_INTF_DEV=y
1805# CONFIG_RTC_DRV_PCF8583 is not set 1823# CONFIG_RTC_DRV_PCF8583 is not set
1806# CONFIG_RTC_DRV_M41T80 is not set 1824# CONFIG_RTC_DRV_M41T80 is not set
1807# CONFIG_RTC_DRV_S35390A is not set 1825# CONFIG_RTC_DRV_S35390A is not set
1826# CONFIG_RTC_DRV_FM3130 is not set
1808 1827
1809# 1828#
1810# SPI RTC drivers 1829# SPI RTC drivers
@@ -1837,11 +1856,13 @@ CONFIG_DMADEVICES=y
1837# Firmware Drivers 1856# Firmware Drivers
1838# 1857#
1839# CONFIG_EDD is not set 1858# CONFIG_EDD is not set
1859CONFIG_FIRMWARE_MEMMAP=y
1840CONFIG_EFI_VARS=y 1860CONFIG_EFI_VARS=y
1841# CONFIG_DELL_RBU is not set 1861# CONFIG_DELL_RBU is not set
1842# CONFIG_DCDBAS is not set 1862# CONFIG_DCDBAS is not set
1843CONFIG_DMIID=y 1863CONFIG_DMIID=y
1844# CONFIG_ISCSI_IBFT_FIND is not set 1864CONFIG_ISCSI_IBFT_FIND=y
1865CONFIG_ISCSI_IBFT=y
1845 1866
1846# 1867#
1847# File systems 1868# File systems
@@ -1920,14 +1941,27 @@ CONFIG_HUGETLB_PAGE=y
1920# CONFIG_CRAMFS is not set 1941# CONFIG_CRAMFS is not set
1921# CONFIG_VXFS_FS is not set 1942# CONFIG_VXFS_FS is not set
1922# CONFIG_MINIX_FS is not set 1943# CONFIG_MINIX_FS is not set
1944# CONFIG_OMFS_FS is not set
1923# CONFIG_HPFS_FS is not set 1945# CONFIG_HPFS_FS is not set
1924# CONFIG_QNX4FS_FS is not set 1946# CONFIG_QNX4FS_FS is not set
1925# CONFIG_ROMFS_FS is not set 1947# CONFIG_ROMFS_FS is not set
1926# CONFIG_SYSV_FS is not set 1948# CONFIG_SYSV_FS is not set
1927# CONFIG_UFS_FS is not set 1949# CONFIG_UFS_FS is not set
1928CONFIG_NETWORK_FILESYSTEMS=y 1950CONFIG_NETWORK_FILESYSTEMS=y
1929# CONFIG_NFS_FS is not set 1951CONFIG_NFS_FS=y
1952CONFIG_NFS_V3=y
1953CONFIG_NFS_V3_ACL=y
1954CONFIG_NFS_V4=y
1955CONFIG_ROOT_NFS=y
1930# CONFIG_NFSD is not set 1956# CONFIG_NFSD is not set
1957CONFIG_LOCKD=y
1958CONFIG_LOCKD_V4=y
1959CONFIG_NFS_ACL_SUPPORT=y
1960CONFIG_NFS_COMMON=y
1961CONFIG_SUNRPC=y
1962CONFIG_SUNRPC_GSS=y
1963CONFIG_RPCSEC_GSS_KRB5=y
1964# CONFIG_RPCSEC_GSS_SPKM3 is not set
1931# CONFIG_SMB_FS is not set 1965# CONFIG_SMB_FS is not set
1932# CONFIG_CIFS is not set 1966# CONFIG_CIFS is not set
1933# CONFIG_NCP_FS is not set 1967# CONFIG_NCP_FS is not set
@@ -2001,9 +2035,9 @@ CONFIG_NLS_UTF8=y
2001# Kernel hacking 2035# Kernel hacking
2002# 2036#
2003CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2037CONFIG_TRACE_IRQFLAGS_SUPPORT=y
2004# CONFIG_PRINTK_TIME is not set 2038CONFIG_PRINTK_TIME=y
2005# CONFIG_ENABLE_WARN_DEPRECATED is not set 2039CONFIG_ENABLE_WARN_DEPRECATED=y
2006# CONFIG_ENABLE_MUST_CHECK is not set 2040CONFIG_ENABLE_MUST_CHECK=y
2007CONFIG_FRAME_WARN=2048 2041CONFIG_FRAME_WARN=2048
2008CONFIG_MAGIC_SYSRQ=y 2042CONFIG_MAGIC_SYSRQ=y
2009# CONFIG_UNUSED_SYMBOLS is not set 2043# CONFIG_UNUSED_SYMBOLS is not set
@@ -2033,6 +2067,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
2033# CONFIG_DEBUG_INFO is not set 2067# CONFIG_DEBUG_INFO is not set
2034# CONFIG_DEBUG_VM is not set 2068# CONFIG_DEBUG_VM is not set
2035# CONFIG_DEBUG_WRITECOUNT is not set 2069# CONFIG_DEBUG_WRITECOUNT is not set
2070CONFIG_DEBUG_MEMORY_INIT=y
2036# CONFIG_DEBUG_LIST is not set 2071# CONFIG_DEBUG_LIST is not set
2037# CONFIG_DEBUG_SG is not set 2072# CONFIG_DEBUG_SG is not set
2038CONFIG_FRAME_POINTER=y 2073CONFIG_FRAME_POINTER=y
@@ -2043,23 +2078,32 @@ CONFIG_FRAME_POINTER=y
2043# CONFIG_LKDTM is not set 2078# CONFIG_LKDTM is not set
2044# CONFIG_FAULT_INJECTION is not set 2079# CONFIG_FAULT_INJECTION is not set
2045# CONFIG_LATENCYTOP is not set 2080# CONFIG_LATENCYTOP is not set
2081CONFIG_SYSCTL_SYSCALL_CHECK=y
2082CONFIG_HAVE_FTRACE=y
2083CONFIG_HAVE_DYNAMIC_FTRACE=y
2084# CONFIG_FTRACE is not set
2085# CONFIG_IRQSOFF_TRACER is not set
2086# CONFIG_SYSPROF_TRACER is not set
2087# CONFIG_SCHED_TRACER is not set
2088# CONFIG_CONTEXT_SWITCH_TRACER is not set
2046CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2089CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2047# CONFIG_SAMPLES is not set 2090# CONFIG_SAMPLES is not set
2048# CONFIG_KGDB is not set
2049CONFIG_HAVE_ARCH_KGDB=y 2091CONFIG_HAVE_ARCH_KGDB=y
2050# CONFIG_NONPROMISC_DEVMEM is not set 2092# CONFIG_KGDB is not set
2093# CONFIG_STRICT_DEVMEM is not set
2094CONFIG_X86_VERBOSE_BOOTUP=y
2051CONFIG_EARLY_PRINTK=y 2095CONFIG_EARLY_PRINTK=y
2052CONFIG_DEBUG_STACKOVERFLOW=y 2096CONFIG_DEBUG_STACKOVERFLOW=y
2053CONFIG_DEBUG_STACK_USAGE=y 2097CONFIG_DEBUG_STACK_USAGE=y
2054# CONFIG_DEBUG_PAGEALLOC is not set 2098# CONFIG_DEBUG_PAGEALLOC is not set
2099# CONFIG_DEBUG_PER_CPU_MAPS is not set
2055# CONFIG_X86_PTDUMP is not set 2100# CONFIG_X86_PTDUMP is not set
2056CONFIG_DEBUG_RODATA=y 2101CONFIG_DEBUG_RODATA=y
2057# CONFIG_DEBUG_RODATA_TEST is not set 2102# CONFIG_DEBUG_RODATA_TEST is not set
2058CONFIG_DEBUG_NX_TEST=m 2103CONFIG_DEBUG_NX_TEST=m
2059# CONFIG_4KSTACKS is not set 2104# CONFIG_4KSTACKS is not set
2060CONFIG_X86_FIND_SMP_CONFIG=y
2061CONFIG_X86_MPPARSE=y
2062CONFIG_DOUBLEFAULT=y 2105CONFIG_DOUBLEFAULT=y
2106# CONFIG_MMIOTRACE is not set
2063CONFIG_IO_DELAY_TYPE_0X80=0 2107CONFIG_IO_DELAY_TYPE_0X80=0
2064CONFIG_IO_DELAY_TYPE_0XED=1 2108CONFIG_IO_DELAY_TYPE_0XED=1
2065CONFIG_IO_DELAY_TYPE_UDELAY=2 2109CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2071,6 +2115,7 @@ CONFIG_IO_DELAY_0X80=y
2071CONFIG_DEFAULT_IO_DELAY_TYPE=0 2115CONFIG_DEFAULT_IO_DELAY_TYPE=0
2072CONFIG_DEBUG_BOOT_PARAMS=y 2116CONFIG_DEBUG_BOOT_PARAMS=y
2073# CONFIG_CPA_DEBUG is not set 2117# CONFIG_CPA_DEBUG is not set
2118CONFIG_OPTIMIZE_INLINING=y
2074 2119
2075# 2120#
2076# Security options 2121# Security options
@@ -2080,7 +2125,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
2080CONFIG_SECURITY=y 2125CONFIG_SECURITY=y
2081CONFIG_SECURITY_NETWORK=y 2126CONFIG_SECURITY_NETWORK=y
2082# CONFIG_SECURITY_NETWORK_XFRM is not set 2127# CONFIG_SECURITY_NETWORK_XFRM is not set
2083CONFIG_SECURITY_CAPABILITIES=y
2084CONFIG_SECURITY_FILE_CAPABILITIES=y 2128CONFIG_SECURITY_FILE_CAPABILITIES=y
2085# CONFIG_SECURITY_ROOTPLUG is not set 2129# CONFIG_SECURITY_ROOTPLUG is not set
2086CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2130CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2141,6 +2185,10 @@ CONFIG_CRYPTO_HMAC=y
2141# CONFIG_CRYPTO_MD4 is not set 2185# CONFIG_CRYPTO_MD4 is not set
2142CONFIG_CRYPTO_MD5=y 2186CONFIG_CRYPTO_MD5=y
2143# CONFIG_CRYPTO_MICHAEL_MIC is not set 2187# CONFIG_CRYPTO_MICHAEL_MIC is not set
2188# CONFIG_CRYPTO_RMD128 is not set
2189# CONFIG_CRYPTO_RMD160 is not set
2190# CONFIG_CRYPTO_RMD256 is not set
2191# CONFIG_CRYPTO_RMD320 is not set
2144CONFIG_CRYPTO_SHA1=y 2192CONFIG_CRYPTO_SHA1=y
2145# CONFIG_CRYPTO_SHA256 is not set 2193# CONFIG_CRYPTO_SHA256 is not set
2146# CONFIG_CRYPTO_SHA512 is not set 2194# CONFIG_CRYPTO_SHA512 is not set
@@ -2151,7 +2199,7 @@ CONFIG_CRYPTO_SHA1=y
2151# Ciphers 2199# Ciphers
2152# 2200#
2153CONFIG_CRYPTO_AES=y 2201CONFIG_CRYPTO_AES=y
2154# CONFIG_CRYPTO_AES_586 is not set 2202CONFIG_CRYPTO_AES_586=y
2155# CONFIG_CRYPTO_ANUBIS is not set 2203# CONFIG_CRYPTO_ANUBIS is not set
2156CONFIG_CRYPTO_ARC4=y 2204CONFIG_CRYPTO_ARC4=y
2157# CONFIG_CRYPTO_BLOWFISH is not set 2205# CONFIG_CRYPTO_BLOWFISH is not set
@@ -2193,6 +2241,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
2193CONFIG_GENERIC_FIND_NEXT_BIT=y 2241CONFIG_GENERIC_FIND_NEXT_BIT=y
2194# CONFIG_CRC_CCITT is not set 2242# CONFIG_CRC_CCITT is not set
2195# CONFIG_CRC16 is not set 2243# CONFIG_CRC16 is not set
2244CONFIG_CRC_T10DIF=y
2196# CONFIG_CRC_ITU_T is not set 2245# CONFIG_CRC_ITU_T is not set
2197CONFIG_CRC32=y 2246CONFIG_CRC32=y
2198# CONFIG_CRC7 is not set 2247# CONFIG_CRC7 is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ae5124e064d4..f0a03d7a7d63 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -1,13 +1,13 @@
1# 1#
2# Automatically generated make config: don't edit 2# Automatically generated make config: don't edit
3# Linux kernel version: 2.6.26-rc1 3# Linux kernel version: 2.6.27-rc5
4# Sun May 4 19:59:57 2008 4# Wed Sep 3 17:13:39 2008
5# 5#
6CONFIG_64BIT=y 6CONFIG_64BIT=y
7# CONFIG_X86_32 is not set 7# CONFIG_X86_32 is not set
8CONFIG_X86_64=y 8CONFIG_X86_64=y
9CONFIG_X86=y 9CONFIG_X86=y
10CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig" 10CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
11# CONFIG_GENERIC_LOCKBREAK is not set 11# CONFIG_GENERIC_LOCKBREAK is not set
12CONFIG_GENERIC_TIME=y 12CONFIG_GENERIC_TIME=y
13CONFIG_GENERIC_CMOS_UPDATE=y 13CONFIG_GENERIC_CMOS_UPDATE=y
@@ -53,6 +53,7 @@ CONFIG_X86_HT=y
53CONFIG_X86_BIOS_REBOOT=y 53CONFIG_X86_BIOS_REBOOT=y
54CONFIG_X86_TRAMPOLINE=y 54CONFIG_X86_TRAMPOLINE=y
55# CONFIG_KTIME_SCALAR is not set 55# CONFIG_KTIME_SCALAR is not set
56CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
56 57
57# 58#
58# General setup 59# General setup
@@ -82,6 +83,7 @@ CONFIG_CGROUPS=y
82CONFIG_CGROUP_NS=y 83CONFIG_CGROUP_NS=y
83# CONFIG_CGROUP_DEVICE is not set 84# CONFIG_CGROUP_DEVICE is not set
84CONFIG_CPUSETS=y 85CONFIG_CPUSETS=y
86CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y
85CONFIG_GROUP_SCHED=y 87CONFIG_GROUP_SCHED=y
86CONFIG_FAIR_GROUP_SCHED=y 88CONFIG_FAIR_GROUP_SCHED=y
87# CONFIG_RT_GROUP_SCHED is not set 89# CONFIG_RT_GROUP_SCHED is not set
@@ -105,7 +107,6 @@ CONFIG_SYSCTL=y
105# CONFIG_EMBEDDED is not set 107# CONFIG_EMBEDDED is not set
106CONFIG_UID16=y 108CONFIG_UID16=y
107CONFIG_SYSCTL_SYSCALL=y 109CONFIG_SYSCTL_SYSCALL=y
108CONFIG_SYSCTL_SYSCALL_CHECK=y
109CONFIG_KALLSYMS=y 110CONFIG_KALLSYMS=y
110CONFIG_KALLSYMS_ALL=y 111CONFIG_KALLSYMS_ALL=y
111CONFIG_KALLSYMS_EXTRA_PASS=y 112CONFIG_KALLSYMS_EXTRA_PASS=y
@@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y
113CONFIG_PRINTK=y 114CONFIG_PRINTK=y
114CONFIG_BUG=y 115CONFIG_BUG=y
115CONFIG_ELF_CORE=y 116CONFIG_ELF_CORE=y
117CONFIG_PCSPKR_PLATFORM=y
116# CONFIG_COMPAT_BRK is not set 118# CONFIG_COMPAT_BRK is not set
117CONFIG_BASE_FULL=y 119CONFIG_BASE_FULL=y
118CONFIG_FUTEX=y 120CONFIG_FUTEX=y
@@ -132,25 +134,33 @@ CONFIG_MARKERS=y
132# CONFIG_OPROFILE is not set 134# CONFIG_OPROFILE is not set
133CONFIG_HAVE_OPROFILE=y 135CONFIG_HAVE_OPROFILE=y
134CONFIG_KPROBES=y 136CONFIG_KPROBES=y
137CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y
135CONFIG_KRETPROBES=y 138CONFIG_KRETPROBES=y
139CONFIG_HAVE_IOREMAP_PROT=y
136CONFIG_HAVE_KPROBES=y 140CONFIG_HAVE_KPROBES=y
137CONFIG_HAVE_KRETPROBES=y 141CONFIG_HAVE_KRETPROBES=y
142# CONFIG_HAVE_ARCH_TRACEHOOK is not set
138# CONFIG_HAVE_DMA_ATTRS is not set 143# CONFIG_HAVE_DMA_ATTRS is not set
144CONFIG_USE_GENERIC_SMP_HELPERS=y
145# CONFIG_HAVE_CLK is not set
139CONFIG_PROC_PAGE_MONITOR=y 146CONFIG_PROC_PAGE_MONITOR=y
147# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
140CONFIG_SLABINFO=y 148CONFIG_SLABINFO=y
141CONFIG_RT_MUTEXES=y 149CONFIG_RT_MUTEXES=y
142# CONFIG_TINY_SHMEM is not set 150# CONFIG_TINY_SHMEM is not set
143CONFIG_BASE_SMALL=0 151CONFIG_BASE_SMALL=0
144CONFIG_MODULES=y 152CONFIG_MODULES=y
153# CONFIG_MODULE_FORCE_LOAD is not set
145CONFIG_MODULE_UNLOAD=y 154CONFIG_MODULE_UNLOAD=y
146CONFIG_MODULE_FORCE_UNLOAD=y 155CONFIG_MODULE_FORCE_UNLOAD=y
147# CONFIG_MODVERSIONS is not set 156# CONFIG_MODVERSIONS is not set
148# CONFIG_MODULE_SRCVERSION_ALL is not set 157# CONFIG_MODULE_SRCVERSION_ALL is not set
149# CONFIG_KMOD is not set 158CONFIG_KMOD=y
150CONFIG_STOP_MACHINE=y 159CONFIG_STOP_MACHINE=y
151CONFIG_BLOCK=y 160CONFIG_BLOCK=y
152CONFIG_BLK_DEV_IO_TRACE=y 161CONFIG_BLK_DEV_IO_TRACE=y
153CONFIG_BLK_DEV_BSG=y 162CONFIG_BLK_DEV_BSG=y
163# CONFIG_BLK_DEV_INTEGRITY is not set
154CONFIG_BLOCK_COMPAT=y 164CONFIG_BLOCK_COMPAT=y
155 165
156# 166#
@@ -175,20 +185,15 @@ CONFIG_NO_HZ=y
175CONFIG_HIGH_RES_TIMERS=y 185CONFIG_HIGH_RES_TIMERS=y
176CONFIG_GENERIC_CLOCKEVENTS_BUILD=y 186CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
177CONFIG_SMP=y 187CONFIG_SMP=y
188CONFIG_X86_FIND_SMP_CONFIG=y
189CONFIG_X86_MPPARSE=y
178CONFIG_X86_PC=y 190CONFIG_X86_PC=y
179# CONFIG_X86_ELAN is not set 191# CONFIG_X86_ELAN is not set
180# CONFIG_X86_VOYAGER is not set 192# CONFIG_X86_VOYAGER is not set
181# CONFIG_X86_NUMAQ is not set
182# CONFIG_X86_SUMMIT is not set
183# CONFIG_X86_BIGSMP is not set
184# CONFIG_X86_VISWS is not set
185# CONFIG_X86_GENERICARCH is not set 193# CONFIG_X86_GENERICARCH is not set
186# CONFIG_X86_ES7000 is not set
187# CONFIG_X86_RDC321X is not set
188# CONFIG_X86_VSMP is not set 194# CONFIG_X86_VSMP is not set
189# CONFIG_PARAVIRT_GUEST is not set 195# CONFIG_PARAVIRT_GUEST is not set
190CONFIG_MEMTEST_BOOTPARAM=y 196# CONFIG_MEMTEST is not set
191CONFIG_MEMTEST_BOOTPARAM_VALUE=0
192# CONFIG_M386 is not set 197# CONFIG_M386 is not set
193# CONFIG_M486 is not set 198# CONFIG_M486 is not set
194# CONFIG_M586 is not set 199# CONFIG_M586 is not set
@@ -205,7 +210,6 @@ CONFIG_MEMTEST_BOOTPARAM_VALUE=0
205# CONFIG_MCRUSOE is not set 210# CONFIG_MCRUSOE is not set
206# CONFIG_MEFFICEON is not set 211# CONFIG_MEFFICEON is not set
207# CONFIG_MWINCHIPC6 is not set 212# CONFIG_MWINCHIPC6 is not set
208# CONFIG_MWINCHIP2 is not set
209# CONFIG_MWINCHIP3D is not set 213# CONFIG_MWINCHIP3D is not set
210# CONFIG_MGEODEGX1 is not set 214# CONFIG_MGEODEGX1 is not set
211# CONFIG_MGEODE_LX is not set 215# CONFIG_MGEODE_LX is not set
@@ -213,18 +217,16 @@ CONFIG_MEMTEST_BOOTPARAM_VALUE=0
213# CONFIG_MVIAC3_2 is not set 217# CONFIG_MVIAC3_2 is not set
214# CONFIG_MVIAC7 is not set 218# CONFIG_MVIAC7 is not set
215# CONFIG_MPSC is not set 219# CONFIG_MPSC is not set
216CONFIG_MCORE2=y 220# CONFIG_MCORE2 is not set
217# CONFIG_GENERIC_CPU is not set 221CONFIG_GENERIC_CPU=y
218CONFIG_X86_CPU=y 222CONFIG_X86_CPU=y
219CONFIG_X86_L1_CACHE_BYTES=64 223CONFIG_X86_L1_CACHE_BYTES=128
220CONFIG_X86_INTERNODE_CACHE_BYTES=64 224CONFIG_X86_INTERNODE_CACHE_BYTES=128
221CONFIG_X86_CMPXCHG=y 225CONFIG_X86_CMPXCHG=y
222CONFIG_X86_L1_CACHE_SHIFT=6 226CONFIG_X86_L1_CACHE_SHIFT=7
223CONFIG_X86_GOOD_APIC=y 227CONFIG_X86_WP_WORKS_OK=y
224CONFIG_X86_INTEL_USERCOPY=y
225CONFIG_X86_USE_PPRO_CHECKSUM=y
226CONFIG_X86_P6_NOP=y
227CONFIG_X86_TSC=y 228CONFIG_X86_TSC=y
229CONFIG_X86_CMPXCHG64=y
228CONFIG_X86_CMOV=y 230CONFIG_X86_CMOV=y
229CONFIG_X86_MINIMUM_CPU_FAMILY=64 231CONFIG_X86_MINIMUM_CPU_FAMILY=64
230CONFIG_X86_DEBUGCTLMSR=y 232CONFIG_X86_DEBUGCTLMSR=y
@@ -234,10 +236,11 @@ CONFIG_DMI=y
234CONFIG_GART_IOMMU=y 236CONFIG_GART_IOMMU=y
235CONFIG_CALGARY_IOMMU=y 237CONFIG_CALGARY_IOMMU=y
236CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y 238CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y
239CONFIG_AMD_IOMMU=y
237CONFIG_SWIOTLB=y 240CONFIG_SWIOTLB=y
238CONFIG_IOMMU_HELPER=y 241CONFIG_IOMMU_HELPER=y
239CONFIG_NR_CPUS=4 242CONFIG_NR_CPUS=64
240# CONFIG_SCHED_SMT is not set 243CONFIG_SCHED_SMT=y
241CONFIG_SCHED_MC=y 244CONFIG_SCHED_MC=y
242# CONFIG_PREEMPT_NONE is not set 245# CONFIG_PREEMPT_NONE is not set
243CONFIG_PREEMPT_VOLUNTARY=y 246CONFIG_PREEMPT_VOLUNTARY=y
@@ -246,7 +249,8 @@ CONFIG_X86_LOCAL_APIC=y
246CONFIG_X86_IO_APIC=y 249CONFIG_X86_IO_APIC=y
247# CONFIG_X86_MCE is not set 250# CONFIG_X86_MCE is not set
248# CONFIG_I8K is not set 251# CONFIG_I8K is not set
249# CONFIG_MICROCODE is not set 252CONFIG_MICROCODE=y
253CONFIG_MICROCODE_OLD_INTERFACE=y
250CONFIG_X86_MSR=y 254CONFIG_X86_MSR=y
251CONFIG_X86_CPUID=y 255CONFIG_X86_CPUID=y
252CONFIG_NUMA=y 256CONFIG_NUMA=y
@@ -281,7 +285,8 @@ CONFIG_ZONE_DMA_FLAG=1
281CONFIG_BOUNCE=y 285CONFIG_BOUNCE=y
282CONFIG_VIRT_TO_BUS=y 286CONFIG_VIRT_TO_BUS=y
283CONFIG_MTRR=y 287CONFIG_MTRR=y
284# CONFIG_X86_PAT is not set 288# CONFIG_MTRR_SANITIZER is not set
289CONFIG_X86_PAT=y
285CONFIG_EFI=y 290CONFIG_EFI=y
286CONFIG_SECCOMP=y 291CONFIG_SECCOMP=y
287# CONFIG_HZ_100 is not set 292# CONFIG_HZ_100 is not set
@@ -313,6 +318,7 @@ CONFIG_PM_TRACE_RTC=y
313CONFIG_PM_SLEEP_SMP=y 318CONFIG_PM_SLEEP_SMP=y
314CONFIG_PM_SLEEP=y 319CONFIG_PM_SLEEP=y
315CONFIG_SUSPEND=y 320CONFIG_SUSPEND=y
321# CONFIG_PM_TEST_SUSPEND is not set
316CONFIG_SUSPEND_FREEZER=y 322CONFIG_SUSPEND_FREEZER=y
317CONFIG_HIBERNATION=y 323CONFIG_HIBERNATION=y
318CONFIG_PM_STD_PARTITION="" 324CONFIG_PM_STD_PARTITION=""
@@ -339,6 +345,7 @@ CONFIG_ACPI_NUMA=y
339CONFIG_ACPI_BLACKLIST_YEAR=0 345CONFIG_ACPI_BLACKLIST_YEAR=0
340# CONFIG_ACPI_DEBUG is not set 346# CONFIG_ACPI_DEBUG is not set
341CONFIG_ACPI_EC=y 347CONFIG_ACPI_EC=y
348# CONFIG_ACPI_PCI_SLOT is not set
342CONFIG_ACPI_POWER=y 349CONFIG_ACPI_POWER=y
343CONFIG_ACPI_SYSTEM=y 350CONFIG_ACPI_SYSTEM=y
344CONFIG_X86_PM_TIMER=y 351CONFIG_X86_PM_TIMER=y
@@ -437,10 +444,6 @@ CONFIG_IA32_EMULATION=y
437CONFIG_COMPAT=y 444CONFIG_COMPAT=y
438CONFIG_COMPAT_FOR_U64_ALIGNMENT=y 445CONFIG_COMPAT_FOR_U64_ALIGNMENT=y
439CONFIG_SYSVIPC_COMPAT=y 446CONFIG_SYSVIPC_COMPAT=y
440
441#
442# Networking
443#
444CONFIG_NET=y 447CONFIG_NET=y
445 448
446# 449#
@@ -464,7 +467,10 @@ CONFIG_IP_FIB_HASH=y
464CONFIG_IP_MULTIPLE_TABLES=y 467CONFIG_IP_MULTIPLE_TABLES=y
465CONFIG_IP_ROUTE_MULTIPATH=y 468CONFIG_IP_ROUTE_MULTIPATH=y
466CONFIG_IP_ROUTE_VERBOSE=y 469CONFIG_IP_ROUTE_VERBOSE=y
467# CONFIG_IP_PNP is not set 470CONFIG_IP_PNP=y
471CONFIG_IP_PNP_DHCP=y
472CONFIG_IP_PNP_BOOTP=y
473CONFIG_IP_PNP_RARP=y
468# CONFIG_NET_IPIP is not set 474# CONFIG_NET_IPIP is not set
469# CONFIG_NET_IPGRE is not set 475# CONFIG_NET_IPGRE is not set
470CONFIG_IP_MROUTE=y 476CONFIG_IP_MROUTE=y
@@ -607,7 +613,6 @@ CONFIG_NET_SCHED=y
607# CONFIG_NET_SCH_HTB is not set 613# CONFIG_NET_SCH_HTB is not set
608# CONFIG_NET_SCH_HFSC is not set 614# CONFIG_NET_SCH_HFSC is not set
609# CONFIG_NET_SCH_PRIO is not set 615# CONFIG_NET_SCH_PRIO is not set
610# CONFIG_NET_SCH_RR is not set
611# CONFIG_NET_SCH_RED is not set 616# CONFIG_NET_SCH_RED is not set
612# CONFIG_NET_SCH_SFQ is not set 617# CONFIG_NET_SCH_SFQ is not set
613# CONFIG_NET_SCH_TEQL is not set 618# CONFIG_NET_SCH_TEQL is not set
@@ -669,28 +674,19 @@ CONFIG_FIB_RULES=y
669CONFIG_CFG80211=y 674CONFIG_CFG80211=y
670CONFIG_NL80211=y 675CONFIG_NL80211=y
671CONFIG_WIRELESS_EXT=y 676CONFIG_WIRELESS_EXT=y
677CONFIG_WIRELESS_EXT_SYSFS=y
672CONFIG_MAC80211=y 678CONFIG_MAC80211=y
673 679
674# 680#
675# Rate control algorithm selection 681# Rate control algorithm selection
676# 682#
683CONFIG_MAC80211_RC_PID=y
677CONFIG_MAC80211_RC_DEFAULT_PID=y 684CONFIG_MAC80211_RC_DEFAULT_PID=y
678# CONFIG_MAC80211_RC_DEFAULT_NONE is not set
679
680#
681# Selecting 'y' for an algorithm will
682#
683
684#
685# build the algorithm into mac80211.
686#
687CONFIG_MAC80211_RC_DEFAULT="pid" 685CONFIG_MAC80211_RC_DEFAULT="pid"
688CONFIG_MAC80211_RC_PID=y
689# CONFIG_MAC80211_MESH is not set 686# CONFIG_MAC80211_MESH is not set
690CONFIG_MAC80211_LEDS=y 687CONFIG_MAC80211_LEDS=y
691# CONFIG_MAC80211_DEBUGFS is not set 688# CONFIG_MAC80211_DEBUGFS is not set
692# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set 689# CONFIG_MAC80211_DEBUG_MENU is not set
693# CONFIG_MAC80211_DEBUG is not set
694# CONFIG_IEEE80211 is not set 690# CONFIG_IEEE80211 is not set
695# CONFIG_RFKILL is not set 691# CONFIG_RFKILL is not set
696# CONFIG_NET_9P is not set 692# CONFIG_NET_9P is not set
@@ -706,6 +702,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
706CONFIG_STANDALONE=y 702CONFIG_STANDALONE=y
707CONFIG_PREVENT_FIRMWARE_BUILD=y 703CONFIG_PREVENT_FIRMWARE_BUILD=y
708CONFIG_FW_LOADER=y 704CONFIG_FW_LOADER=y
705CONFIG_FIRMWARE_IN_KERNEL=y
706CONFIG_EXTRA_FIRMWARE=""
709# CONFIG_DEBUG_DRIVER is not set 707# CONFIG_DEBUG_DRIVER is not set
710CONFIG_DEBUG_DEVRES=y 708CONFIG_DEBUG_DEVRES=y
711# CONFIG_SYS_HYPERVISOR is not set 709# CONFIG_SYS_HYPERVISOR is not set
@@ -738,6 +736,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384
738# CONFIG_BLK_DEV_XIP is not set 736# CONFIG_BLK_DEV_XIP is not set
739# CONFIG_CDROM_PKTCDVD is not set 737# CONFIG_CDROM_PKTCDVD is not set
740# CONFIG_ATA_OVER_ETH is not set 738# CONFIG_ATA_OVER_ETH is not set
739# CONFIG_BLK_DEV_HD is not set
741CONFIG_MISC_DEVICES=y 740CONFIG_MISC_DEVICES=y
742# CONFIG_IBM_ASM is not set 741# CONFIG_IBM_ASM is not set
743# CONFIG_PHANTOM is not set 742# CONFIG_PHANTOM is not set
@@ -748,10 +747,14 @@ CONFIG_MISC_DEVICES=y
748# CONFIG_ASUS_LAPTOP is not set 747# CONFIG_ASUS_LAPTOP is not set
749# CONFIG_FUJITSU_LAPTOP is not set 748# CONFIG_FUJITSU_LAPTOP is not set
750# CONFIG_MSI_LAPTOP is not set 749# CONFIG_MSI_LAPTOP is not set
750# CONFIG_COMPAL_LAPTOP is not set
751# CONFIG_SONY_LAPTOP is not set 751# CONFIG_SONY_LAPTOP is not set
752# CONFIG_THINKPAD_ACPI is not set 752# CONFIG_THINKPAD_ACPI is not set
753# CONFIG_INTEL_MENLOW is not set 753# CONFIG_INTEL_MENLOW is not set
754# CONFIG_ENCLOSURE_SERVICES is not set 754# CONFIG_ENCLOSURE_SERVICES is not set
755# CONFIG_SGI_XP is not set
756# CONFIG_HP_ILO is not set
757# CONFIG_SGI_GRU is not set
755CONFIG_HAVE_IDE=y 758CONFIG_HAVE_IDE=y
756# CONFIG_IDE is not set 759# CONFIG_IDE is not set
757 760
@@ -790,12 +793,13 @@ CONFIG_SCSI_WAIT_SCAN=m
790# 793#
791CONFIG_SCSI_SPI_ATTRS=y 794CONFIG_SCSI_SPI_ATTRS=y
792# CONFIG_SCSI_FC_ATTRS is not set 795# CONFIG_SCSI_FC_ATTRS is not set
793# CONFIG_SCSI_ISCSI_ATTRS is not set 796CONFIG_SCSI_ISCSI_ATTRS=y
794# CONFIG_SCSI_SAS_ATTRS is not set 797# CONFIG_SCSI_SAS_ATTRS is not set
795# CONFIG_SCSI_SAS_LIBSAS is not set 798# CONFIG_SCSI_SAS_LIBSAS is not set
796# CONFIG_SCSI_SRP_ATTRS is not set 799# CONFIG_SCSI_SRP_ATTRS is not set
797# CONFIG_SCSI_LOWLEVEL is not set 800# CONFIG_SCSI_LOWLEVEL is not set
798# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set 801# CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
802# CONFIG_SCSI_DH is not set
799CONFIG_ATA=y 803CONFIG_ATA=y
800# CONFIG_ATA_NONSTANDARD is not set 804# CONFIG_ATA_NONSTANDARD is not set
801CONFIG_ATA_ACPI=y 805CONFIG_ATA_ACPI=y
@@ -857,6 +861,7 @@ CONFIG_PATA_OLDPIIX=y
857# CONFIG_PATA_SIS is not set 861# CONFIG_PATA_SIS is not set
858# CONFIG_PATA_VIA is not set 862# CONFIG_PATA_VIA is not set
859# CONFIG_PATA_WINBOND is not set 863# CONFIG_PATA_WINBOND is not set
864CONFIG_PATA_SCH=y
860CONFIG_MD=y 865CONFIG_MD=y
861CONFIG_BLK_DEV_MD=y 866CONFIG_BLK_DEV_MD=y
862# CONFIG_MD_LINEAR is not set 867# CONFIG_MD_LINEAR is not set
@@ -880,13 +885,16 @@ CONFIG_DM_ZERO=y
880# 885#
881# IEEE 1394 (FireWire) support 886# IEEE 1394 (FireWire) support
882# 887#
888
889#
890# Enable only one of the two stacks, unless you know what you are doing
891#
883# CONFIG_FIREWIRE is not set 892# CONFIG_FIREWIRE is not set
884# CONFIG_IEEE1394 is not set 893# CONFIG_IEEE1394 is not set
885# CONFIG_I2O is not set 894# CONFIG_I2O is not set
886CONFIG_MACINTOSH_DRIVERS=y 895CONFIG_MACINTOSH_DRIVERS=y
887CONFIG_MAC_EMUMOUSEBTN=y 896CONFIG_MAC_EMUMOUSEBTN=y
888CONFIG_NETDEVICES=y 897CONFIG_NETDEVICES=y
889# CONFIG_NETDEVICES_MULTIQUEUE is not set
890# CONFIG_IFB is not set 898# CONFIG_IFB is not set
891# CONFIG_DUMMY is not set 899# CONFIG_DUMMY is not set
892# CONFIG_BONDING is not set 900# CONFIG_BONDING is not set
@@ -896,7 +904,23 @@ CONFIG_NETDEVICES=y
896# CONFIG_VETH is not set 904# CONFIG_VETH is not set
897# CONFIG_NET_SB1000 is not set 905# CONFIG_NET_SB1000 is not set
898# CONFIG_ARCNET is not set 906# CONFIG_ARCNET is not set
899# CONFIG_PHYLIB is not set 907CONFIG_PHYLIB=y
908
909#
910# MII PHY device drivers
911#
912# CONFIG_MARVELL_PHY is not set
913# CONFIG_DAVICOM_PHY is not set
914# CONFIG_QSEMI_PHY is not set
915# CONFIG_LXT_PHY is not set
916# CONFIG_CICADA_PHY is not set
917# CONFIG_VITESSE_PHY is not set
918# CONFIG_SMSC_PHY is not set
919# CONFIG_BROADCOM_PHY is not set
920# CONFIG_ICPLUS_PHY is not set
921# CONFIG_REALTEK_PHY is not set
922# CONFIG_FIXED_PHY is not set
923# CONFIG_MDIO_BITBANG is not set
900CONFIG_NET_ETHERNET=y 924CONFIG_NET_ETHERNET=y
901CONFIG_MII=y 925CONFIG_MII=y
902# CONFIG_HAPPYMEAL is not set 926# CONFIG_HAPPYMEAL is not set
@@ -940,16 +964,15 @@ CONFIG_8139TOO_PIO=y
940# CONFIG_SIS900 is not set 964# CONFIG_SIS900 is not set
941# CONFIG_EPIC100 is not set 965# CONFIG_EPIC100 is not set
942# CONFIG_SUNDANCE is not set 966# CONFIG_SUNDANCE is not set
967# CONFIG_TLAN is not set
943# CONFIG_VIA_RHINE is not set 968# CONFIG_VIA_RHINE is not set
944# CONFIG_SC92031 is not set 969# CONFIG_SC92031 is not set
945CONFIG_NETDEV_1000=y 970CONFIG_NETDEV_1000=y
946# CONFIG_ACENIC is not set 971# CONFIG_ACENIC is not set
947# CONFIG_DL2K is not set 972# CONFIG_DL2K is not set
948CONFIG_E1000=y 973CONFIG_E1000=y
949# CONFIG_E1000_NAPI is not set
950# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set 974# CONFIG_E1000_DISABLE_PACKET_SPLIT is not set
951# CONFIG_E1000E is not set 975# CONFIG_E1000E is not set
952# CONFIG_E1000E_ENABLED is not set
953# CONFIG_IP1000 is not set 976# CONFIG_IP1000 is not set
954# CONFIG_IGB is not set 977# CONFIG_IGB is not set
955# CONFIG_NS83820 is not set 978# CONFIG_NS83820 is not set
@@ -965,6 +988,7 @@ CONFIG_TIGON3=y
965# CONFIG_BNX2 is not set 988# CONFIG_BNX2 is not set
966# CONFIG_QLA3XXX is not set 989# CONFIG_QLA3XXX is not set
967# CONFIG_ATL1 is not set 990# CONFIG_ATL1 is not set
991# CONFIG_ATL1E is not set
968CONFIG_NETDEV_10000=y 992CONFIG_NETDEV_10000=y
969# CONFIG_CHELSIO_T1 is not set 993# CONFIG_CHELSIO_T1 is not set
970# CONFIG_CHELSIO_T3 is not set 994# CONFIG_CHELSIO_T3 is not set
@@ -1003,13 +1027,14 @@ CONFIG_WLAN_80211=y
1003# CONFIG_RTL8180 is not set 1027# CONFIG_RTL8180 is not set
1004# CONFIG_RTL8187 is not set 1028# CONFIG_RTL8187 is not set
1005# CONFIG_ADM8211 is not set 1029# CONFIG_ADM8211 is not set
1030# CONFIG_MAC80211_HWSIM is not set
1006# CONFIG_P54_COMMON is not set 1031# CONFIG_P54_COMMON is not set
1007CONFIG_ATH5K=y 1032CONFIG_ATH5K=y
1008# CONFIG_ATH5K_DEBUG is not set 1033# CONFIG_ATH5K_DEBUG is not set
1009# CONFIG_IWLWIFI is not set 1034# CONFIG_ATH9K is not set
1010# CONFIG_IWLCORE is not set 1035# CONFIG_IWLCORE is not set
1011# CONFIG_IWLWIFI_LEDS is not set 1036# CONFIG_IWLWIFI_LEDS is not set
1012# CONFIG_IWL4965 is not set 1037# CONFIG_IWLAGN is not set
1013# CONFIG_IWL3945 is not set 1038# CONFIG_IWL3945 is not set
1014# CONFIG_HOSTAP is not set 1039# CONFIG_HOSTAP is not set
1015# CONFIG_B43 is not set 1040# CONFIG_B43 is not set
@@ -1088,6 +1113,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y
1088# CONFIG_MOUSE_PS2_TOUCHKIT is not set 1113# CONFIG_MOUSE_PS2_TOUCHKIT is not set
1089# CONFIG_MOUSE_SERIAL is not set 1114# CONFIG_MOUSE_SERIAL is not set
1090# CONFIG_MOUSE_APPLETOUCH is not set 1115# CONFIG_MOUSE_APPLETOUCH is not set
1116# CONFIG_MOUSE_BCM5974 is not set
1091# CONFIG_MOUSE_VSXXXAA is not set 1117# CONFIG_MOUSE_VSXXXAA is not set
1092CONFIG_INPUT_JOYSTICK=y 1118CONFIG_INPUT_JOYSTICK=y
1093# CONFIG_JOYSTICK_ANALOG is not set 1119# CONFIG_JOYSTICK_ANALOG is not set
@@ -1122,12 +1148,14 @@ CONFIG_INPUT_TOUCHSCREEN=y
1122# CONFIG_TOUCHSCREEN_GUNZE is not set 1148# CONFIG_TOUCHSCREEN_GUNZE is not set
1123# CONFIG_TOUCHSCREEN_ELO is not set 1149# CONFIG_TOUCHSCREEN_ELO is not set
1124# CONFIG_TOUCHSCREEN_MTOUCH is not set 1150# CONFIG_TOUCHSCREEN_MTOUCH is not set
1151# CONFIG_TOUCHSCREEN_INEXIO is not set
1125# CONFIG_TOUCHSCREEN_MK712 is not set 1152# CONFIG_TOUCHSCREEN_MK712 is not set
1126# CONFIG_TOUCHSCREEN_PENMOUNT is not set 1153# CONFIG_TOUCHSCREEN_PENMOUNT is not set
1127# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set 1154# CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set
1128# CONFIG_TOUCHSCREEN_TOUCHWIN is not set 1155# CONFIG_TOUCHSCREEN_TOUCHWIN is not set
1129# CONFIG_TOUCHSCREEN_UCB1400 is not set 1156# CONFIG_TOUCHSCREEN_UCB1400 is not set
1130# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set 1157# CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set
1158# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set
1131CONFIG_INPUT_MISC=y 1159CONFIG_INPUT_MISC=y
1132# CONFIG_INPUT_PCSPKR is not set 1160# CONFIG_INPUT_PCSPKR is not set
1133# CONFIG_INPUT_APANEL is not set 1161# CONFIG_INPUT_APANEL is not set
@@ -1155,6 +1183,7 @@ CONFIG_SERIO_LIBPS2=y
1155# Character devices 1183# Character devices
1156# 1184#
1157CONFIG_VT=y 1185CONFIG_VT=y
1186CONFIG_CONSOLE_TRANSLATIONS=y
1158CONFIG_VT_CONSOLE=y 1187CONFIG_VT_CONSOLE=y
1159CONFIG_HW_CONSOLE=y 1188CONFIG_HW_CONSOLE=y
1160CONFIG_VT_HW_CONSOLE_BINDING=y 1189CONFIG_VT_HW_CONSOLE_BINDING=y
@@ -1222,7 +1251,6 @@ CONFIG_NVRAM=y
1222# CONFIG_PC8736x_GPIO is not set 1251# CONFIG_PC8736x_GPIO is not set
1223# CONFIG_RAW_DRIVER is not set 1252# CONFIG_RAW_DRIVER is not set
1224CONFIG_HPET=y 1253CONFIG_HPET=y
1225# CONFIG_HPET_RTC_IRQ is not set
1226# CONFIG_HPET_MMAP is not set 1254# CONFIG_HPET_MMAP is not set
1227# CONFIG_HANGCHECK_TIMER is not set 1255# CONFIG_HANGCHECK_TIMER is not set
1228# CONFIG_TCG_TPM is not set 1256# CONFIG_TCG_TPM is not set
@@ -1231,42 +1259,63 @@ CONFIG_DEVPORT=y
1231CONFIG_I2C=y 1259CONFIG_I2C=y
1232CONFIG_I2C_BOARDINFO=y 1260CONFIG_I2C_BOARDINFO=y
1233# CONFIG_I2C_CHARDEV is not set 1261# CONFIG_I2C_CHARDEV is not set
1262CONFIG_I2C_HELPER_AUTO=y
1234 1263
1235# 1264#
1236# I2C Hardware Bus support 1265# I2C Hardware Bus support
1237# 1266#
1267
1268#
1269# PC SMBus host controller drivers
1270#
1238# CONFIG_I2C_ALI1535 is not set 1271# CONFIG_I2C_ALI1535 is not set
1239# CONFIG_I2C_ALI1563 is not set 1272# CONFIG_I2C_ALI1563 is not set
1240# CONFIG_I2C_ALI15X3 is not set 1273# CONFIG_I2C_ALI15X3 is not set
1241# CONFIG_I2C_AMD756 is not set 1274# CONFIG_I2C_AMD756 is not set
1242# CONFIG_I2C_AMD8111 is not set 1275# CONFIG_I2C_AMD8111 is not set
1243CONFIG_I2C_I801=y 1276CONFIG_I2C_I801=y
1244# CONFIG_I2C_I810 is not set 1277# CONFIG_I2C_ISCH is not set
1245# CONFIG_I2C_PIIX4 is not set 1278# CONFIG_I2C_PIIX4 is not set
1246# CONFIG_I2C_NFORCE2 is not set 1279# CONFIG_I2C_NFORCE2 is not set
1247# CONFIG_I2C_OCORES is not set
1248# CONFIG_I2C_PARPORT_LIGHT is not set
1249# CONFIG_I2C_PROSAVAGE is not set
1250# CONFIG_I2C_SAVAGE4 is not set
1251# CONFIG_I2C_SIMTEC is not set
1252# CONFIG_I2C_SIS5595 is not set 1280# CONFIG_I2C_SIS5595 is not set
1253# CONFIG_I2C_SIS630 is not set 1281# CONFIG_I2C_SIS630 is not set
1254# CONFIG_I2C_SIS96X is not set 1282# CONFIG_I2C_SIS96X is not set
1255# CONFIG_I2C_TAOS_EVM is not set
1256# CONFIG_I2C_STUB is not set
1257# CONFIG_I2C_TINY_USB is not set
1258# CONFIG_I2C_VIA is not set 1283# CONFIG_I2C_VIA is not set
1259# CONFIG_I2C_VIAPRO is not set 1284# CONFIG_I2C_VIAPRO is not set
1285
1286#
1287# I2C system bus drivers (mostly embedded / system-on-chip)
1288#
1289# CONFIG_I2C_OCORES is not set
1290# CONFIG_I2C_SIMTEC is not set
1291
1292#
1293# External I2C/SMBus adapter drivers
1294#
1295# CONFIG_I2C_PARPORT_LIGHT is not set
1296# CONFIG_I2C_TAOS_EVM is not set
1297# CONFIG_I2C_TINY_USB is not set
1298
1299#
1300# Graphics adapter I2C/DDC channel drivers
1301#
1260# CONFIG_I2C_VOODOO3 is not set 1302# CONFIG_I2C_VOODOO3 is not set
1303
1304#
1305# Other I2C/SMBus bus drivers
1306#
1261# CONFIG_I2C_PCA_PLATFORM is not set 1307# CONFIG_I2C_PCA_PLATFORM is not set
1308# CONFIG_I2C_STUB is not set
1262 1309
1263# 1310#
1264# Miscellaneous I2C Chip support 1311# Miscellaneous I2C Chip support
1265# 1312#
1266# CONFIG_DS1682 is not set 1313# CONFIG_DS1682 is not set
1314# CONFIG_AT24 is not set
1267# CONFIG_SENSORS_EEPROM is not set 1315# CONFIG_SENSORS_EEPROM is not set
1268# CONFIG_SENSORS_PCF8574 is not set 1316# CONFIG_SENSORS_PCF8574 is not set
1269# CONFIG_PCF8575 is not set 1317# CONFIG_PCF8575 is not set
1318# CONFIG_SENSORS_PCA9539 is not set
1270# CONFIG_SENSORS_PCF8591 is not set 1319# CONFIG_SENSORS_PCF8591 is not set
1271# CONFIG_SENSORS_MAX6875 is not set 1320# CONFIG_SENSORS_MAX6875 is not set
1272# CONFIG_SENSORS_TSL2550 is not set 1321# CONFIG_SENSORS_TSL2550 is not set
@@ -1275,6 +1324,8 @@ CONFIG_I2C_I801=y
1275# CONFIG_I2C_DEBUG_BUS is not set 1324# CONFIG_I2C_DEBUG_BUS is not set
1276# CONFIG_I2C_DEBUG_CHIP is not set 1325# CONFIG_I2C_DEBUG_CHIP is not set
1277# CONFIG_SPI is not set 1326# CONFIG_SPI is not set
1327CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y
1328# CONFIG_GPIOLIB is not set
1278# CONFIG_W1 is not set 1329# CONFIG_W1 is not set
1279CONFIG_POWER_SUPPLY=y 1330CONFIG_POWER_SUPPLY=y
1280# CONFIG_POWER_SUPPLY_DEBUG is not set 1331# CONFIG_POWER_SUPPLY_DEBUG is not set
@@ -1335,8 +1386,10 @@ CONFIG_SSB_POSSIBLE=y
1335# 1386#
1336# Multifunction device drivers 1387# Multifunction device drivers
1337# 1388#
1389# CONFIG_MFD_CORE is not set
1338# CONFIG_MFD_SM501 is not set 1390# CONFIG_MFD_SM501 is not set
1339# CONFIG_HTC_PASIC3 is not set 1391# CONFIG_HTC_PASIC3 is not set
1392# CONFIG_MFD_TMIO is not set
1340 1393
1341# 1394#
1342# Multimedia devices 1395# Multimedia devices
@@ -1347,6 +1400,7 @@ CONFIG_SSB_POSSIBLE=y
1347# 1400#
1348# CONFIG_VIDEO_DEV is not set 1401# CONFIG_VIDEO_DEV is not set
1349# CONFIG_DVB_CORE is not set 1402# CONFIG_DVB_CORE is not set
1403# CONFIG_VIDEO_MEDIA is not set
1350 1404
1351# 1405#
1352# Multimedia drivers 1406# Multimedia drivers
@@ -1387,7 +1441,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y
1387# CONFIG_FB_SYS_IMAGEBLIT is not set 1441# CONFIG_FB_SYS_IMAGEBLIT is not set
1388# CONFIG_FB_FOREIGN_ENDIAN is not set 1442# CONFIG_FB_FOREIGN_ENDIAN is not set
1389# CONFIG_FB_SYS_FOPS is not set 1443# CONFIG_FB_SYS_FOPS is not set
1390CONFIG_FB_DEFERRED_IO=y
1391# CONFIG_FB_SVGALIB is not set 1444# CONFIG_FB_SVGALIB is not set
1392# CONFIG_FB_MACMODES is not set 1445# CONFIG_FB_MACMODES is not set
1393# CONFIG_FB_BACKLIGHT is not set 1446# CONFIG_FB_BACKLIGHT is not set
@@ -1430,6 +1483,7 @@ CONFIG_FB_EFI=y
1430# CONFIG_FB_TRIDENT is not set 1483# CONFIG_FB_TRIDENT is not set
1431# CONFIG_FB_ARK is not set 1484# CONFIG_FB_ARK is not set
1432# CONFIG_FB_PM3 is not set 1485# CONFIG_FB_PM3 is not set
1486# CONFIG_FB_CARMINE is not set
1433# CONFIG_FB_GEODE is not set 1487# CONFIG_FB_GEODE is not set
1434# CONFIG_FB_VIRTUAL is not set 1488# CONFIG_FB_VIRTUAL is not set
1435CONFIG_BACKLIGHT_LCD_SUPPORT=y 1489CONFIG_BACKLIGHT_LCD_SUPPORT=y
@@ -1437,6 +1491,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y
1437CONFIG_BACKLIGHT_CLASS_DEVICE=y 1491CONFIG_BACKLIGHT_CLASS_DEVICE=y
1438# CONFIG_BACKLIGHT_CORGI is not set 1492# CONFIG_BACKLIGHT_CORGI is not set
1439# CONFIG_BACKLIGHT_PROGEAR is not set 1493# CONFIG_BACKLIGHT_PROGEAR is not set
1494# CONFIG_BACKLIGHT_MBP_NVIDIA is not set
1440 1495
1441# 1496#
1442# Display device support 1497# Display device support
@@ -1449,22 +1504,13 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y
1449CONFIG_VGA_CONSOLE=y 1504CONFIG_VGA_CONSOLE=y
1450CONFIG_VGACON_SOFT_SCROLLBACK=y 1505CONFIG_VGACON_SOFT_SCROLLBACK=y
1451CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 1506CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64
1452CONFIG_VIDEO_SELECT=y
1453CONFIG_DUMMY_CONSOLE=y 1507CONFIG_DUMMY_CONSOLE=y
1454# CONFIG_FRAMEBUFFER_CONSOLE is not set 1508# CONFIG_FRAMEBUFFER_CONSOLE is not set
1455CONFIG_LOGO=y 1509CONFIG_LOGO=y
1456# CONFIG_LOGO_LINUX_MONO is not set 1510# CONFIG_LOGO_LINUX_MONO is not set
1457# CONFIG_LOGO_LINUX_VGA16 is not set 1511# CONFIG_LOGO_LINUX_VGA16 is not set
1458CONFIG_LOGO_LINUX_CLUT224=y 1512CONFIG_LOGO_LINUX_CLUT224=y
1459
1460#
1461# Sound
1462#
1463CONFIG_SOUND=y 1513CONFIG_SOUND=y
1464
1465#
1466# Advanced Linux Sound Architecture
1467#
1468CONFIG_SND=y 1514CONFIG_SND=y
1469CONFIG_SND_TIMER=y 1515CONFIG_SND_TIMER=y
1470CONFIG_SND_PCM=y 1516CONFIG_SND_PCM=y
@@ -1482,20 +1528,14 @@ CONFIG_SND_VERBOSE_PROCFS=y
1482# CONFIG_SND_VERBOSE_PRINTK is not set 1528# CONFIG_SND_VERBOSE_PRINTK is not set
1483# CONFIG_SND_DEBUG is not set 1529# CONFIG_SND_DEBUG is not set
1484CONFIG_SND_VMASTER=y 1530CONFIG_SND_VMASTER=y
1485 1531CONFIG_SND_DRIVERS=y
1486#
1487# Generic devices
1488#
1489# CONFIG_SND_PCSP is not set 1532# CONFIG_SND_PCSP is not set
1490# CONFIG_SND_DUMMY is not set 1533# CONFIG_SND_DUMMY is not set
1491# CONFIG_SND_VIRMIDI is not set 1534# CONFIG_SND_VIRMIDI is not set
1492# CONFIG_SND_MTPAV is not set 1535# CONFIG_SND_MTPAV is not set
1493# CONFIG_SND_SERIAL_U16550 is not set 1536# CONFIG_SND_SERIAL_U16550 is not set
1494# CONFIG_SND_MPU401 is not set 1537# CONFIG_SND_MPU401 is not set
1495 1538CONFIG_SND_PCI=y
1496#
1497# PCI devices
1498#
1499# CONFIG_SND_AD1889 is not set 1539# CONFIG_SND_AD1889 is not set
1500# CONFIG_SND_ALS300 is not set 1540# CONFIG_SND_ALS300 is not set
1501# CONFIG_SND_ALS4000 is not set 1541# CONFIG_SND_ALS4000 is not set
@@ -1568,36 +1608,14 @@ CONFIG_SND_HDA_GENERIC=y
1568# CONFIG_SND_VIRTUOSO is not set 1608# CONFIG_SND_VIRTUOSO is not set
1569# CONFIG_SND_VX222 is not set 1609# CONFIG_SND_VX222 is not set
1570# CONFIG_SND_YMFPCI is not set 1610# CONFIG_SND_YMFPCI is not set
1571 1611CONFIG_SND_USB=y
1572#
1573# USB devices
1574#
1575# CONFIG_SND_USB_AUDIO is not set 1612# CONFIG_SND_USB_AUDIO is not set
1576# CONFIG_SND_USB_USX2Y is not set 1613# CONFIG_SND_USB_USX2Y is not set
1577# CONFIG_SND_USB_CAIAQ is not set 1614# CONFIG_SND_USB_CAIAQ is not set
1578 1615CONFIG_SND_PCMCIA=y
1579#
1580# PCMCIA devices
1581#
1582# CONFIG_SND_VXPOCKET is not set 1616# CONFIG_SND_VXPOCKET is not set
1583# CONFIG_SND_PDAUDIOCF is not set 1617# CONFIG_SND_PDAUDIOCF is not set
1584
1585#
1586# System on Chip audio support
1587#
1588# CONFIG_SND_SOC is not set 1618# CONFIG_SND_SOC is not set
1589
1590#
1591# ALSA SoC audio for Freescale SOCs
1592#
1593
1594#
1595# SoC Audio for the Texas Instruments OMAP
1596#
1597
1598#
1599# Open Sound System
1600#
1601# CONFIG_SOUND_PRIME is not set 1619# CONFIG_SOUND_PRIME is not set
1602CONFIG_HID_SUPPORT=y 1620CONFIG_HID_SUPPORT=y
1603CONFIG_HID=y 1621CONFIG_HID=y
@@ -1633,6 +1651,7 @@ CONFIG_USB_DEVICEFS=y
1633# CONFIG_USB_DYNAMIC_MINORS is not set 1651# CONFIG_USB_DYNAMIC_MINORS is not set
1634CONFIG_USB_SUSPEND=y 1652CONFIG_USB_SUSPEND=y
1635# CONFIG_USB_OTG is not set 1653# CONFIG_USB_OTG is not set
1654CONFIG_USB_MON=y
1636 1655
1637# 1656#
1638# USB Host Controller Drivers 1657# USB Host Controller Drivers
@@ -1656,6 +1675,7 @@ CONFIG_USB_UHCI_HCD=y
1656# 1675#
1657# CONFIG_USB_ACM is not set 1676# CONFIG_USB_ACM is not set
1658CONFIG_USB_PRINTER=y 1677CONFIG_USB_PRINTER=y
1678# CONFIG_USB_WDM is not set
1659 1679
1660# 1680#
1661# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' 1681# NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support'
@@ -1677,6 +1697,7 @@ CONFIG_USB_STORAGE=y
1677# CONFIG_USB_STORAGE_ALAUDA is not set 1697# CONFIG_USB_STORAGE_ALAUDA is not set
1678# CONFIG_USB_STORAGE_ONETOUCH is not set 1698# CONFIG_USB_STORAGE_ONETOUCH is not set
1679# CONFIG_USB_STORAGE_KARMA is not set 1699# CONFIG_USB_STORAGE_KARMA is not set
1700# CONFIG_USB_STORAGE_SIERRA is not set
1680# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set 1701# CONFIG_USB_STORAGE_CYPRESS_ATACB is not set
1681CONFIG_USB_LIBUSUAL=y 1702CONFIG_USB_LIBUSUAL=y
1682 1703
@@ -1685,7 +1706,6 @@ CONFIG_USB_LIBUSUAL=y
1685# 1706#
1686# CONFIG_USB_MDC800 is not set 1707# CONFIG_USB_MDC800 is not set
1687# CONFIG_USB_MICROTEK is not set 1708# CONFIG_USB_MICROTEK is not set
1688CONFIG_USB_MON=y
1689 1709
1690# 1710#
1691# USB port drivers 1711# USB port drivers
@@ -1698,7 +1718,6 @@ CONFIG_USB_MON=y
1698# CONFIG_USB_EMI62 is not set 1718# CONFIG_USB_EMI62 is not set
1699# CONFIG_USB_EMI26 is not set 1719# CONFIG_USB_EMI26 is not set
1700# CONFIG_USB_ADUTUX is not set 1720# CONFIG_USB_ADUTUX is not set
1701# CONFIG_USB_AUERSWALD is not set
1702# CONFIG_USB_RIO500 is not set 1721# CONFIG_USB_RIO500 is not set
1703# CONFIG_USB_LEGOTOWER is not set 1722# CONFIG_USB_LEGOTOWER is not set
1704# CONFIG_USB_LCD is not set 1723# CONFIG_USB_LCD is not set
@@ -1715,6 +1734,7 @@ CONFIG_USB_MON=y
1715# CONFIG_USB_TRANCEVIBRATOR is not set 1734# CONFIG_USB_TRANCEVIBRATOR is not set
1716# CONFIG_USB_IOWARRIOR is not set 1735# CONFIG_USB_IOWARRIOR is not set
1717# CONFIG_USB_TEST is not set 1736# CONFIG_USB_TEST is not set
1737# CONFIG_USB_ISIGHTFW is not set
1718# CONFIG_USB_GADGET is not set 1738# CONFIG_USB_GADGET is not set
1719# CONFIG_MMC is not set 1739# CONFIG_MMC is not set
1720# CONFIG_MEMSTICK is not set 1740# CONFIG_MEMSTICK is not set
@@ -1724,7 +1744,9 @@ CONFIG_LEDS_CLASS=y
1724# 1744#
1725# LED drivers 1745# LED drivers
1726# 1746#
1747# CONFIG_LEDS_PCA9532 is not set
1727# CONFIG_LEDS_CLEVO_MAIL is not set 1748# CONFIG_LEDS_CLEVO_MAIL is not set
1749# CONFIG_LEDS_PCA955X is not set
1728 1750
1729# 1751#
1730# LED Triggers 1752# LED Triggers
@@ -1770,6 +1792,7 @@ CONFIG_RTC_INTF_DEV=y
1770# CONFIG_RTC_DRV_PCF8583 is not set 1792# CONFIG_RTC_DRV_PCF8583 is not set
1771# CONFIG_RTC_DRV_M41T80 is not set 1793# CONFIG_RTC_DRV_M41T80 is not set
1772# CONFIG_RTC_DRV_S35390A is not set 1794# CONFIG_RTC_DRV_S35390A is not set
1795# CONFIG_RTC_DRV_FM3130 is not set
1773 1796
1774# 1797#
1775# SPI RTC drivers 1798# SPI RTC drivers
@@ -1802,11 +1825,13 @@ CONFIG_DMADEVICES=y
1802# Firmware Drivers 1825# Firmware Drivers
1803# 1826#
1804# CONFIG_EDD is not set 1827# CONFIG_EDD is not set
1828CONFIG_FIRMWARE_MEMMAP=y
1805CONFIG_EFI_VARS=y 1829CONFIG_EFI_VARS=y
1806# CONFIG_DELL_RBU is not set 1830# CONFIG_DELL_RBU is not set
1807# CONFIG_DCDBAS is not set 1831# CONFIG_DCDBAS is not set
1808CONFIG_DMIID=y 1832CONFIG_DMIID=y
1809# CONFIG_ISCSI_IBFT_FIND is not set 1833CONFIG_ISCSI_IBFT_FIND=y
1834CONFIG_ISCSI_IBFT=y
1810 1835
1811# 1836#
1812# File systems 1837# File systems
@@ -1886,14 +1911,27 @@ CONFIG_HUGETLB_PAGE=y
1886# CONFIG_CRAMFS is not set 1911# CONFIG_CRAMFS is not set
1887# CONFIG_VXFS_FS is not set 1912# CONFIG_VXFS_FS is not set
1888# CONFIG_MINIX_FS is not set 1913# CONFIG_MINIX_FS is not set
1914# CONFIG_OMFS_FS is not set
1889# CONFIG_HPFS_FS is not set 1915# CONFIG_HPFS_FS is not set
1890# CONFIG_QNX4FS_FS is not set 1916# CONFIG_QNX4FS_FS is not set
1891# CONFIG_ROMFS_FS is not set 1917# CONFIG_ROMFS_FS is not set
1892# CONFIG_SYSV_FS is not set 1918# CONFIG_SYSV_FS is not set
1893# CONFIG_UFS_FS is not set 1919# CONFIG_UFS_FS is not set
1894CONFIG_NETWORK_FILESYSTEMS=y 1920CONFIG_NETWORK_FILESYSTEMS=y
1895# CONFIG_NFS_FS is not set 1921CONFIG_NFS_FS=y
1922CONFIG_NFS_V3=y
1923CONFIG_NFS_V3_ACL=y
1924CONFIG_NFS_V4=y
1925CONFIG_ROOT_NFS=y
1896# CONFIG_NFSD is not set 1926# CONFIG_NFSD is not set
1927CONFIG_LOCKD=y
1928CONFIG_LOCKD_V4=y
1929CONFIG_NFS_ACL_SUPPORT=y
1930CONFIG_NFS_COMMON=y
1931CONFIG_SUNRPC=y
1932CONFIG_SUNRPC_GSS=y
1933CONFIG_RPCSEC_GSS_KRB5=y
1934# CONFIG_RPCSEC_GSS_SPKM3 is not set
1897# CONFIG_SMB_FS is not set 1935# CONFIG_SMB_FS is not set
1898# CONFIG_CIFS is not set 1936# CONFIG_CIFS is not set
1899# CONFIG_NCP_FS is not set 1937# CONFIG_NCP_FS is not set
@@ -1967,9 +2005,9 @@ CONFIG_NLS_UTF8=y
1967# Kernel hacking 2005# Kernel hacking
1968# 2006#
1969CONFIG_TRACE_IRQFLAGS_SUPPORT=y 2007CONFIG_TRACE_IRQFLAGS_SUPPORT=y
1970# CONFIG_PRINTK_TIME is not set 2008CONFIG_PRINTK_TIME=y
1971# CONFIG_ENABLE_WARN_DEPRECATED is not set 2009CONFIG_ENABLE_WARN_DEPRECATED=y
1972# CONFIG_ENABLE_MUST_CHECK is not set 2010CONFIG_ENABLE_MUST_CHECK=y
1973CONFIG_FRAME_WARN=2048 2011CONFIG_FRAME_WARN=2048
1974CONFIG_MAGIC_SYSRQ=y 2012CONFIG_MAGIC_SYSRQ=y
1975# CONFIG_UNUSED_SYMBOLS is not set 2013# CONFIG_UNUSED_SYMBOLS is not set
@@ -1998,6 +2036,7 @@ CONFIG_DEBUG_BUGVERBOSE=y
1998# CONFIG_DEBUG_INFO is not set 2036# CONFIG_DEBUG_INFO is not set
1999# CONFIG_DEBUG_VM is not set 2037# CONFIG_DEBUG_VM is not set
2000# CONFIG_DEBUG_WRITECOUNT is not set 2038# CONFIG_DEBUG_WRITECOUNT is not set
2039CONFIG_DEBUG_MEMORY_INIT=y
2001# CONFIG_DEBUG_LIST is not set 2040# CONFIG_DEBUG_LIST is not set
2002# CONFIG_DEBUG_SG is not set 2041# CONFIG_DEBUG_SG is not set
2003CONFIG_FRAME_POINTER=y 2042CONFIG_FRAME_POINTER=y
@@ -2008,11 +2047,20 @@ CONFIG_FRAME_POINTER=y
2008# CONFIG_LKDTM is not set 2047# CONFIG_LKDTM is not set
2009# CONFIG_FAULT_INJECTION is not set 2048# CONFIG_FAULT_INJECTION is not set
2010# CONFIG_LATENCYTOP is not set 2049# CONFIG_LATENCYTOP is not set
2050CONFIG_SYSCTL_SYSCALL_CHECK=y
2051CONFIG_HAVE_FTRACE=y
2052CONFIG_HAVE_DYNAMIC_FTRACE=y
2053# CONFIG_FTRACE is not set
2054# CONFIG_IRQSOFF_TRACER is not set
2055# CONFIG_SYSPROF_TRACER is not set
2056# CONFIG_SCHED_TRACER is not set
2057# CONFIG_CONTEXT_SWITCH_TRACER is not set
2011CONFIG_PROVIDE_OHCI1394_DMA_INIT=y 2058CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
2012# CONFIG_SAMPLES is not set 2059# CONFIG_SAMPLES is not set
2013# CONFIG_KGDB is not set
2014CONFIG_HAVE_ARCH_KGDB=y 2060CONFIG_HAVE_ARCH_KGDB=y
2015# CONFIG_NONPROMISC_DEVMEM is not set 2061# CONFIG_KGDB is not set
2062# CONFIG_STRICT_DEVMEM is not set
2063CONFIG_X86_VERBOSE_BOOTUP=y
2016CONFIG_EARLY_PRINTK=y 2064CONFIG_EARLY_PRINTK=y
2017CONFIG_DEBUG_STACKOVERFLOW=y 2065CONFIG_DEBUG_STACKOVERFLOW=y
2018CONFIG_DEBUG_STACK_USAGE=y 2066CONFIG_DEBUG_STACK_USAGE=y
@@ -2023,8 +2071,8 @@ CONFIG_DEBUG_RODATA=y
2023# CONFIG_DIRECT_GBPAGES is not set 2071# CONFIG_DIRECT_GBPAGES is not set
2024# CONFIG_DEBUG_RODATA_TEST is not set 2072# CONFIG_DEBUG_RODATA_TEST is not set
2025CONFIG_DEBUG_NX_TEST=m 2073CONFIG_DEBUG_NX_TEST=m
2026CONFIG_X86_MPPARSE=y
2027# CONFIG_IOMMU_DEBUG is not set 2074# CONFIG_IOMMU_DEBUG is not set
2075# CONFIG_MMIOTRACE is not set
2028CONFIG_IO_DELAY_TYPE_0X80=0 2076CONFIG_IO_DELAY_TYPE_0X80=0
2029CONFIG_IO_DELAY_TYPE_0XED=1 2077CONFIG_IO_DELAY_TYPE_0XED=1
2030CONFIG_IO_DELAY_TYPE_UDELAY=2 2078CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2036,6 +2084,7 @@ CONFIG_IO_DELAY_0X80=y
2036CONFIG_DEFAULT_IO_DELAY_TYPE=0 2084CONFIG_DEFAULT_IO_DELAY_TYPE=0
2037CONFIG_DEBUG_BOOT_PARAMS=y 2085CONFIG_DEBUG_BOOT_PARAMS=y
2038# CONFIG_CPA_DEBUG is not set 2086# CONFIG_CPA_DEBUG is not set
2087CONFIG_OPTIMIZE_INLINING=y
2039 2088
2040# 2089#
2041# Security options 2090# Security options
@@ -2045,7 +2094,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y
2045CONFIG_SECURITY=y 2094CONFIG_SECURITY=y
2046CONFIG_SECURITY_NETWORK=y 2095CONFIG_SECURITY_NETWORK=y
2047# CONFIG_SECURITY_NETWORK_XFRM is not set 2096# CONFIG_SECURITY_NETWORK_XFRM is not set
2048CONFIG_SECURITY_CAPABILITIES=y
2049CONFIG_SECURITY_FILE_CAPABILITIES=y 2097CONFIG_SECURITY_FILE_CAPABILITIES=y
2050# CONFIG_SECURITY_ROOTPLUG is not set 2098# CONFIG_SECURITY_ROOTPLUG is not set
2051CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 2099CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536
@@ -2106,6 +2154,10 @@ CONFIG_CRYPTO_HMAC=y
2106# CONFIG_CRYPTO_MD4 is not set 2154# CONFIG_CRYPTO_MD4 is not set
2107CONFIG_CRYPTO_MD5=y 2155CONFIG_CRYPTO_MD5=y
2108# CONFIG_CRYPTO_MICHAEL_MIC is not set 2156# CONFIG_CRYPTO_MICHAEL_MIC is not set
2157# CONFIG_CRYPTO_RMD128 is not set
2158# CONFIG_CRYPTO_RMD160 is not set
2159# CONFIG_CRYPTO_RMD256 is not set
2160# CONFIG_CRYPTO_RMD320 is not set
2109CONFIG_CRYPTO_SHA1=y 2161CONFIG_CRYPTO_SHA1=y
2110# CONFIG_CRYPTO_SHA256 is not set 2162# CONFIG_CRYPTO_SHA256 is not set
2111# CONFIG_CRYPTO_SHA512 is not set 2163# CONFIG_CRYPTO_SHA512 is not set
@@ -2155,6 +2207,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y
2155CONFIG_GENERIC_FIND_NEXT_BIT=y 2207CONFIG_GENERIC_FIND_NEXT_BIT=y
2156# CONFIG_CRC_CCITT is not set 2208# CONFIG_CRC_CCITT is not set
2157# CONFIG_CRC16 is not set 2209# CONFIG_CRC16 is not set
2210CONFIG_CRC_T10DIF=y
2158# CONFIG_CRC_ITU_T is not set 2211# CONFIG_CRC_ITU_T is not set
2159CONFIG_CRC32=y 2212CONFIG_CRC32=y
2160# CONFIG_CRC7 is not set 2213# CONFIG_CRC7 is not set
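
Editorial note on the defconfig hunk above: it re-enables the in-kernel NFS client (v3/v4 via CONFIG_NFS_FS and friends), lockd, SUNRPC with Kerberos GSS, and root-over-NFS. As a hedged illustration only (the server address and export path below are placeholders, not taken from the commit), a root filesystem mounted over NFS built on these options also needs kernel-level IP autoconfiguration and matching boot parameters:

# illustrative fragment; option names as enabled in the hunk above,
# plus IP autoconfiguration, which nfsroot depends on
CONFIG_NFS_FS=y
CONFIG_NFS_V3=y
CONFIG_ROOT_NFS=y
CONFIG_IP_PNP=y
CONFIG_IP_PNP_DHCP=y

# illustrative boot command line (server and export are placeholders)
root=/dev/nfs nfsroot=192.168.1.1:/export/rootfs ip=dhcp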
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 3874c2de5403..903de4aa5094 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 10obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 11obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
12 12
13obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
14
13aes-i586-y := aes-i586-asm_32.o aes_glue.o 15aes-i586-y := aes-i586-asm_32.o aes_glue.o
14twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o 16twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o
15salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o 17salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o
diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c
new file mode 100644
index 000000000000..070afc5b6c94
--- /dev/null
+++ b/arch/x86/crypto/crc32c-intel.c
@@ -0,0 +1,197 @@
1/*
2 * Use the hardware-provided CRC32 instruction to accelerate CRC32C computation.
3 * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE)
4 * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at:
5 * http://www.intel.com/products/processor/manuals/
6 * Intel(R) 64 and IA-32 Architectures Software Developer's Manual
7 * Volume 2A: Instruction Set Reference, A-M
8 *
9 * Copyright (c) 2008 Austin Zhang <austin_zhang@linux.intel.com>
10 * Copyright (c) 2008 Kent Liu <kent.liu@intel.com>
11 *
12 * This program is free software; you can redistribute it and/or modify it
13 * under the terms of the GNU General Public License as published by the Free
14 * Software Foundation; either version 2 of the License, or (at your option)
15 * any later version.
16 *
17 */
18#include <linux/init.h>
19#include <linux/module.h>
20#include <linux/string.h>
21#include <linux/kernel.h>
22#include <crypto/internal/hash.h>
23
24#include <asm/cpufeature.h>
25
26#define CHKSUM_BLOCK_SIZE 1
27#define CHKSUM_DIGEST_SIZE 4
28
29#define SCALE_F sizeof(unsigned long)
30
31#ifdef CONFIG_X86_64
32#define REX_PRE "0x48, "
33#else
34#define REX_PRE
35#endif
36
37static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length)
38{
39 while (length--) {
40 __asm__ __volatile__(
41 ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1"
42 :"=S"(crc)
43 :"0"(crc), "c"(*data)
44 );
45 data++;
46 }
47
48 return crc;
49}
50
51static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len)
52{
53 unsigned int iquotient = len / SCALE_F;
54 unsigned int iremainder = len % SCALE_F;
55 unsigned long *ptmp = (unsigned long *)p;
56
57 while (iquotient--) {
58 __asm__ __volatile__(
59 ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;"
60 :"=S"(crc)
61 :"0"(crc), "c"(*ptmp)
62 );
63 ptmp++;
64 }
65
66 if (iremainder)
67 crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp,
68 iremainder);
69
70 return crc;
71}
72
73/*
74 * Setting the seed allows arbitrary accumulators and flexible XOR policy
75 * If your algorithm starts with ~0, then XOR with ~0 before you set
76 * the seed.
77 */
78static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key,
79 unsigned int keylen)
80{
81 u32 *mctx = crypto_ahash_ctx(hash);
82
83 if (keylen != sizeof(u32)) {
84 crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN);
85 return -EINVAL;
86 }
87 *mctx = le32_to_cpup((__le32 *)key);
88 return 0;
89}
90
91static int crc32c_intel_init(struct ahash_request *req)
92{
93 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
94 u32 *crcp = ahash_request_ctx(req);
95
96 *crcp = *mctx;
97
98 return 0;
99}
100
101static int crc32c_intel_update(struct ahash_request *req)
102{
103 struct crypto_hash_walk walk;
104 u32 *crcp = ahash_request_ctx(req);
105 u32 crc = *crcp;
106 int nbytes;
107
108 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
109 nbytes = crypto_hash_walk_done(&walk, 0))
110 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
111
112 *crcp = crc;
113 return 0;
114}
115
116static int crc32c_intel_final(struct ahash_request *req)
117{
118 u32 *crcp = ahash_request_ctx(req);
119
120 *(__le32 *)req->result = ~cpu_to_le32p(crcp);
121 return 0;
122}
123
124static int crc32c_intel_digest(struct ahash_request *req)
125{
126 struct crypto_hash_walk walk;
127 u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req));
128 u32 crc = *mctx;
129 int nbytes;
130
131 for (nbytes = crypto_hash_walk_first(req, &walk); nbytes;
132 nbytes = crypto_hash_walk_done(&walk, 0))
133 crc = crc32c_intel_le_hw(crc, walk.data, nbytes);
134
135 *(__le32 *)req->result = ~cpu_to_le32(crc);
136 return 0;
137}
138
139static int crc32c_intel_cra_init(struct crypto_tfm *tfm)
140{
141 u32 *key = crypto_tfm_ctx(tfm);
142
143 *key = ~0;
144
145 tfm->crt_ahash.reqsize = sizeof(u32);
146
147 return 0;
148}
149
150static struct crypto_alg alg = {
151 .cra_name = "crc32c",
152 .cra_driver_name = "crc32c-intel",
153 .cra_priority = 200,
154 .cra_flags = CRYPTO_ALG_TYPE_AHASH,
155 .cra_blocksize = CHKSUM_BLOCK_SIZE,
156 .cra_alignmask = 3,
157 .cra_ctxsize = sizeof(u32),
158 .cra_module = THIS_MODULE,
159 .cra_list = LIST_HEAD_INIT(alg.cra_list),
160 .cra_init = crc32c_intel_cra_init,
161 .cra_type = &crypto_ahash_type,
162 .cra_u = {
163 .ahash = {
164 .digestsize = CHKSUM_DIGEST_SIZE,
165 .setkey = crc32c_intel_setkey,
166 .init = crc32c_intel_init,
167 .update = crc32c_intel_update,
168 .final = crc32c_intel_final,
169 .digest = crc32c_intel_digest,
170 }
171 }
172};
173
174
175static int __init crc32c_intel_mod_init(void)
176{
177 if (cpu_has_xmm4_2)
178 return crypto_register_alg(&alg);
179 else
180 return -ENODEV;
181}
182
183static void __exit crc32c_intel_mod_fini(void)
184{
185 crypto_unregister_alg(&alg);
186}
187
188module_init(crc32c_intel_mod_init);
189module_exit(crc32c_intel_mod_fini);
190
191MODULE_AUTHOR("Austin Zhang <austin.zhang@intel.com>, Kent Liu <kent.liu@intel.com>");
192MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware.");
193MODULE_LICENSE("GPL");
194
195MODULE_ALIAS("crc32c");
196MODULE_ALIAS("crc32c-intel");
197
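
Editorial note on the new crc32c-intel.c above: it emits the SSE4.2 CRC32 opcode as raw .byte sequences because the assemblers of the era could not be assumed to know the mnemonic. As a hedged userspace sketch only (not the kernel code path, and x86-64 only because of the 64-bit intrinsic), the same instruction is reachable through the compiler intrinsics in <nmmintrin.h>; the driver's word-then-byte split and the usual ~0 seed/finalize convention are mirrored here:

/* Hedged illustration only: userspace CRC32C via SSE4.2 intrinsics,
 * not the kernel crypto_ahash driver above.  Build: gcc -O2 -msse4.2 */
#include <nmmintrin.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static uint32_t crc32c_sse42(uint32_t crc, const unsigned char *p, size_t len)
{
	/* 8 bytes at a time, then a byte tail -- same split as the driver */
	while (len >= sizeof(uint64_t)) {
		uint64_t v;
		memcpy(&v, p, sizeof(v));
		crc = (uint32_t)_mm_crc32_u64(crc, v);
		p   += sizeof(v);
		len -= sizeof(v);
	}
	while (len--)
		crc = _mm_crc32_u8(crc, *p++);
	return crc;
}

/* usage: seed with ~0 and invert the result, matching crc32c convention */
/* uint32_t digest = ~crc32c_sse42(~0u, buf, buflen); */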
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 58cccb6483b0..127ec3f07214 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump)
85 dump->regs.ax = regs->ax; 85 dump->regs.ax = regs->ax;
86 dump->regs.ds = current->thread.ds; 86 dump->regs.ds = current->thread.ds;
87 dump->regs.es = current->thread.es; 87 dump->regs.es = current->thread.es;
88 asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; 88 savesegment(fs, fs);
89 asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; 89 dump->regs.fs = fs;
90 savesegment(gs, gs);
91 dump->regs.gs = gs;
90 dump->regs.orig_ax = regs->orig_ax; 92 dump->regs.orig_ax = regs->orig_ax;
91 dump->regs.ip = regs->ip; 93 dump->regs.ip = regs->ip;
92 dump->regs.cs = regs->cs; 94 dump->regs.cs = regs->cs;
@@ -430,8 +432,9 @@ beyond_if:
430 current->mm->start_stack = 432 current->mm->start_stack =
431 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); 433 (unsigned long)create_aout_tables((char __user *)bprm->p, bprm);
432 /* start thread */ 434 /* start thread */
433 asm volatile("movl %0,%%fs" :: "r" (0)); \ 435 loadsegment(fs, 0);
434 asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); 436 loadsegment(ds, __USER32_DS);
437 loadsegment(es, __USER32_DS);
435 load_gs_index(0); 438 load_gs_index(0);
436 (regs)->ip = ex.a_entry; 439 (regs)->ip = ex.a_entry;
437 (regs)->sp = current->mm->start_stack; 440 (regs)->sp = current->mm->start_stack;
@@ -441,12 +444,6 @@ beyond_if:
441 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 444 regs->r8 = regs->r9 = regs->r10 = regs->r11 =
442 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; 445 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
443 set_fs(USER_DS); 446 set_fs(USER_DS);
444 if (unlikely(current->ptrace & PT_PTRACED)) {
445 if (current->ptrace & PT_TRACE_EXEC)
446 ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP);
447 else
448 send_sig(SIGTRAP, current, 0);
449 }
450 return 0; 447 return 0;
451} 448}
452 449
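
Editorial note on the hunk above: the open-coded "movl %%fs,%0" / "movl %0,%%fs" sequences give way to the savesegment()/loadsegment() helpers (defined at this point in <asm/system.h>). As a rough, hedged sketch of what they boil down to (names below are illustrative, not the in-tree definitions; the real loadsegment() additionally carries an exception-table fixup so a bogus selector falls back to the null selector instead of oopsing):

/* Simplified sketches only -- not the in-tree definitions. */
#define savesegment_sketch(seg, value) \
	asm("mov %%" #seg ",%0" : "=r" (value) : : "memory")

#define loadsegment_sketch(seg, value) \
	asm volatile("mov %0,%%" #seg : : "r" (value) : "memory")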
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index cb3856a18c85..4bc02b23674b 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -36,6 +36,11 @@
36 36
37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 37#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
38 38
39#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \
40 X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \
41 X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \
42 X86_EFLAGS_CF)
43
39asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); 44asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset);
40void signal_fault(struct pt_regs *regs, void __user *frame, char *where); 45void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
41 46
@@ -174,9 +179,10 @@ struct sigframe
174 u32 pretcode; 179 u32 pretcode;
175 int sig; 180 int sig;
176 struct sigcontext_ia32 sc; 181 struct sigcontext_ia32 sc;
177 struct _fpstate_ia32 fpstate; 182 struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */
178 unsigned int extramask[_COMPAT_NSIG_WORDS-1]; 183 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
179 char retcode[8]; 184 char retcode[8];
185 /* fp state follows here */
180}; 186};
181 187
182struct rt_sigframe 188struct rt_sigframe
@@ -187,8 +193,8 @@ struct rt_sigframe
187 u32 puc; 193 u32 puc;
188 compat_siginfo_t info; 194 compat_siginfo_t info;
189 struct ucontext_ia32 uc; 195 struct ucontext_ia32 uc;
190 struct _fpstate_ia32 fpstate;
191 char retcode[8]; 196 char retcode[8];
197 /* fp state follows here */
192}; 198};
193 199
194#define COPY(x) { \ 200#define COPY(x) { \
@@ -201,7 +207,7 @@ struct rt_sigframe
201 { unsigned int cur; \ 207 { unsigned int cur; \
202 unsigned short pre; \ 208 unsigned short pre; \
203 err |= __get_user(pre, &sc->seg); \ 209 err |= __get_user(pre, &sc->seg); \
204 asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ 210 savesegment(seg, cur); \
205 pre |= mask; \ 211 pre |= mask; \
206 if (pre != cur) loadsegment(seg, pre); } 212 if (pre != cur) loadsegment(seg, pre); }
207 213
@@ -210,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
210 unsigned int *peax) 216 unsigned int *peax)
211{ 217{
212 unsigned int tmpflags, gs, oldgs, err = 0; 218 unsigned int tmpflags, gs, oldgs, err = 0;
213 struct _fpstate_ia32 __user *buf; 219 void __user *buf;
214 u32 tmp; 220 u32 tmp;
215 221
216 /* Always make any pending restarted system calls return -EINTR */ 222 /* Always make any pending restarted system calls return -EINTR */
@@ -230,7 +236,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
230 */ 236 */
231 err |= __get_user(gs, &sc->gs); 237 err |= __get_user(gs, &sc->gs);
232 gs |= 3; 238 gs |= 3;
233 asm("movl %%gs,%0" : "=r" (oldgs)); 239 savesegment(gs, oldgs);
234 if (gs != oldgs) 240 if (gs != oldgs)
235 load_gs_index(gs); 241 load_gs_index(gs);
236 242
@@ -248,32 +254,18 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
248 regs->ss |= 3; 254 regs->ss |= 3;
249 255
250 err |= __get_user(tmpflags, &sc->flags); 256 err |= __get_user(tmpflags, &sc->flags);
251 regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); 257 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
252 /* disable syscall checks */ 258 /* disable syscall checks */
253 regs->orig_ax = -1; 259 regs->orig_ax = -1;
254 260
255 err |= __get_user(tmp, &sc->fpstate); 261 err |= __get_user(tmp, &sc->fpstate);
256 buf = compat_ptr(tmp); 262 buf = compat_ptr(tmp);
257 if (buf) { 263 err |= restore_i387_xstate_ia32(buf);
258 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
259 goto badframe;
260 err |= restore_i387_ia32(buf);
261 } else {
262 struct task_struct *me = current;
263
264 if (used_math()) {
265 clear_fpu(me);
266 clear_used_math();
267 }
268 }
269 264
270 err |= __get_user(tmp, &sc->ax); 265 err |= __get_user(tmp, &sc->ax);
271 *peax = tmp; 266 *peax = tmp;
272 267
273 return err; 268 return err;
274
275badframe:
276 return 1;
277} 269}
278 270
279asmlinkage long sys32_sigreturn(struct pt_regs *regs) 271asmlinkage long sys32_sigreturn(struct pt_regs *regs)
@@ -345,46 +337,42 @@ badframe:
345 */ 337 */
346 338
347static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, 339static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
348 struct _fpstate_ia32 __user *fpstate, 340 void __user *fpstate,
349 struct pt_regs *regs, unsigned int mask) 341 struct pt_regs *regs, unsigned int mask)
350{ 342{
351 int tmp, err = 0; 343 int tmp, err = 0;
352 344
353 tmp = 0; 345 savesegment(gs, tmp);
354 __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp));
355 err |= __put_user(tmp, (unsigned int __user *)&sc->gs); 346 err |= __put_user(tmp, (unsigned int __user *)&sc->gs);
356 __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); 347 savesegment(fs, tmp);
357 err |= __put_user(tmp, (unsigned int __user *)&sc->fs); 348 err |= __put_user(tmp, (unsigned int __user *)&sc->fs);
358 __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); 349 savesegment(ds, tmp);
359 err |= __put_user(tmp, (unsigned int __user *)&sc->ds); 350 err |= __put_user(tmp, (unsigned int __user *)&sc->ds);
360 __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); 351 savesegment(es, tmp);
361 err |= __put_user(tmp, (unsigned int __user *)&sc->es); 352 err |= __put_user(tmp, (unsigned int __user *)&sc->es);
362 353
363 err |= __put_user((u32)regs->di, &sc->di); 354 err |= __put_user(regs->di, &sc->di);
364 err |= __put_user((u32)regs->si, &sc->si); 355 err |= __put_user(regs->si, &sc->si);
365 err |= __put_user((u32)regs->bp, &sc->bp); 356 err |= __put_user(regs->bp, &sc->bp);
366 err |= __put_user((u32)regs->sp, &sc->sp); 357 err |= __put_user(regs->sp, &sc->sp);
367 err |= __put_user((u32)regs->bx, &sc->bx); 358 err |= __put_user(regs->bx, &sc->bx);
368 err |= __put_user((u32)regs->dx, &sc->dx); 359 err |= __put_user(regs->dx, &sc->dx);
369 err |= __put_user((u32)regs->cx, &sc->cx); 360 err |= __put_user(regs->cx, &sc->cx);
370 err |= __put_user((u32)regs->ax, &sc->ax); 361 err |= __put_user(regs->ax, &sc->ax);
371 err |= __put_user((u32)regs->cs, &sc->cs); 362 err |= __put_user(regs->cs, &sc->cs);
372 err |= __put_user((u32)regs->ss, &sc->ss); 363 err |= __put_user(regs->ss, &sc->ss);
373 err |= __put_user(current->thread.trap_no, &sc->trapno); 364 err |= __put_user(current->thread.trap_no, &sc->trapno);
374 err |= __put_user(current->thread.error_code, &sc->err); 365 err |= __put_user(current->thread.error_code, &sc->err);
375 err |= __put_user((u32)regs->ip, &sc->ip); 366 err |= __put_user(regs->ip, &sc->ip);
376 err |= __put_user((u32)regs->flags, &sc->flags); 367 err |= __put_user(regs->flags, &sc->flags);
377 err |= __put_user((u32)regs->sp, &sc->sp_at_signal); 368 err |= __put_user(regs->sp, &sc->sp_at_signal);
378 369
379 tmp = save_i387_ia32(fpstate); 370 tmp = save_i387_xstate_ia32(fpstate);
380 if (tmp < 0) 371 if (tmp < 0)
381 err = -EFAULT; 372 err = -EFAULT;
382 else { 373 else
383 clear_used_math();
384 stts();
385 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), 374 err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL),
386 &sc->fpstate); 375 &sc->fpstate);
387 }
388 376
389 /* non-iBCS2 extensions.. */ 377 /* non-iBCS2 extensions.. */
390 err |= __put_user(mask, &sc->oldmask); 378 err |= __put_user(mask, &sc->oldmask);
@@ -397,7 +385,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,
397 * Determine which stack to use.. 385 * Determine which stack to use..
398 */ 386 */
399static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, 387static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
400 size_t frame_size) 388 size_t frame_size,
389 void **fpstate)
401{ 390{
402 unsigned long sp; 391 unsigned long sp;
403 392
@@ -416,6 +405,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,
416 ka->sa.sa_restorer) 405 ka->sa.sa_restorer)
417 sp = (unsigned long) ka->sa.sa_restorer; 406 sp = (unsigned long) ka->sa.sa_restorer;
418 407
408 if (used_math()) {
409 sp = sp - sig_xstate_ia32_size;
410 *fpstate = (struct _fpstate_ia32 *) sp;
411 }
412
419 sp -= frame_size; 413 sp -= frame_size;
420 /* Align the stack pointer according to the i386 ABI, 414 /* Align the stack pointer according to the i386 ABI,
421 * i.e. so that on function entry ((sp + 4) & 15) == 0. */ 415 * i.e. so that on function entry ((sp + 4) & 15) == 0. */
@@ -429,6 +423,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
429 struct sigframe __user *frame; 423 struct sigframe __user *frame;
430 void __user *restorer; 424 void __user *restorer;
431 int err = 0; 425 int err = 0;
426 void __user *fpstate = NULL;
432 427
433 /* copy_to_user optimizes that into a single 8 byte store */ 428 /* copy_to_user optimizes that into a single 8 byte store */
434 static const struct { 429 static const struct {
@@ -443,25 +438,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
443 0, 438 0,
444 }; 439 };
445 440
446 frame = get_sigframe(ka, regs, sizeof(*frame)); 441 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
447 442
448 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 443 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
449 goto give_sigsegv; 444 return -EFAULT;
450 445
451 err |= __put_user(sig, &frame->sig); 446 if (__put_user(sig, &frame->sig))
452 if (err) 447 return -EFAULT;
453 goto give_sigsegv;
454 448
455 err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, 449 if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
456 set->sig[0]); 450 return -EFAULT;
457 if (err)
458 goto give_sigsegv;
459 451
460 if (_COMPAT_NSIG_WORDS > 1) { 452 if (_COMPAT_NSIG_WORDS > 1) {
461 err |= __copy_to_user(frame->extramask, &set->sig[1], 453 if (__copy_to_user(frame->extramask, &set->sig[1],
462 sizeof(frame->extramask)); 454 sizeof(frame->extramask)))
463 if (err) 455 return -EFAULT;
464 goto give_sigsegv;
465 } 456 }
466 457
467 if (ka->sa.sa_flags & SA_RESTORER) { 458 if (ka->sa.sa_flags & SA_RESTORER) {
@@ -482,7 +473,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
482 */ 473 */
483 err |= __copy_to_user(frame->retcode, &code, 8); 474 err |= __copy_to_user(frame->retcode, &code, 8);
484 if (err) 475 if (err)
485 goto give_sigsegv; 476 return -EFAULT;
486 477
487 /* Set up registers for signal handler */ 478 /* Set up registers for signal handler */
488 regs->sp = (unsigned long) frame; 479 regs->sp = (unsigned long) frame;
@@ -493,8 +484,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
493 regs->dx = 0; 484 regs->dx = 0;
494 regs->cx = 0; 485 regs->cx = 0;
495 486
496 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 487 loadsegment(ds, __USER32_DS);
497 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 488 loadsegment(es, __USER32_DS);
498 489
499 regs->cs = __USER32_CS; 490 regs->cs = __USER32_CS;
500 regs->ss = __USER32_DS; 491 regs->ss = __USER32_DS;
@@ -505,19 +496,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,
505#endif 496#endif
506 497
507 return 0; 498 return 0;
508
509give_sigsegv:
510 force_sigsegv(sig, current);
511 return -EFAULT;
512} 499}
513 500
514int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 501int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
515 compat_sigset_t *set, struct pt_regs *regs) 502 compat_sigset_t *set, struct pt_regs *regs)
516{ 503{
517 struct rt_sigframe __user *frame; 504 struct rt_sigframe __user *frame;
518 struct exec_domain *ed = current_thread_info()->exec_domain;
519 void __user *restorer; 505 void __user *restorer;
520 int err = 0; 506 int err = 0;
507 void __user *fpstate = NULL;
521 508
522 /* __copy_to_user optimizes that into a single 8 byte store */ 509 /* __copy_to_user optimizes that into a single 8 byte store */
523 static const struct { 510 static const struct {
@@ -533,31 +520,33 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
533 0, 520 0,
534 }; 521 };
535 522
536 frame = get_sigframe(ka, regs, sizeof(*frame)); 523 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
537 524
538 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 525 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
539 goto give_sigsegv; 526 return -EFAULT;
540 527
541 err |= __put_user((ed && ed->signal_invmap && sig < 32 528 err |= __put_user(sig, &frame->sig);
542 ? ed->signal_invmap[sig] : sig), &frame->sig);
543 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); 529 err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo);
544 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); 530 err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc);
545 err |= copy_siginfo_to_user32(&frame->info, info); 531 err |= copy_siginfo_to_user32(&frame->info, info);
546 if (err) 532 if (err)
547 goto give_sigsegv; 533 return -EFAULT;
548 534
549 /* Create the ucontext. */ 535 /* Create the ucontext. */
550 err |= __put_user(0, &frame->uc.uc_flags); 536 if (cpu_has_xsave)
537 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
538 else
539 err |= __put_user(0, &frame->uc.uc_flags);
551 err |= __put_user(0, &frame->uc.uc_link); 540 err |= __put_user(0, &frame->uc.uc_link);
552 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 541 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
553 err |= __put_user(sas_ss_flags(regs->sp), 542 err |= __put_user(sas_ss_flags(regs->sp),
554 &frame->uc.uc_stack.ss_flags); 543 &frame->uc.uc_stack.ss_flags);
555 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 544 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
556 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 545 err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
557 regs, set->sig[0]); 546 regs, set->sig[0]);
558 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 547 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
559 if (err) 548 if (err)
560 goto give_sigsegv; 549 return -EFAULT;
561 550
562 if (ka->sa.sa_flags & SA_RESTORER) 551 if (ka->sa.sa_flags & SA_RESTORER)
563 restorer = ka->sa.sa_restorer; 552 restorer = ka->sa.sa_restorer;
@@ -572,7 +561,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
572 */ 561 */
573 err |= __copy_to_user(frame->retcode, &code, 8); 562 err |= __copy_to_user(frame->retcode, &code, 8);
574 if (err) 563 if (err)
575 goto give_sigsegv; 564 return -EFAULT;
576 565
577 /* Set up registers for signal handler */ 566 /* Set up registers for signal handler */
578 regs->sp = (unsigned long) frame; 567 regs->sp = (unsigned long) frame;
@@ -588,8 +577,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
588 regs->dx = (unsigned long) &frame->info; 577 regs->dx = (unsigned long) &frame->info;
589 regs->cx = (unsigned long) &frame->uc; 578 regs->cx = (unsigned long) &frame->uc;
590 579
591 asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); 580 loadsegment(ds, __USER32_DS);
592 asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); 581 loadsegment(es, __USER32_DS);
593 582
594 regs->cs = __USER32_CS; 583 regs->cs = __USER32_CS;
595 regs->ss = __USER32_DS; 584 regs->ss = __USER32_DS;
@@ -600,8 +589,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
600#endif 589#endif
601 590
602 return 0; 591 return 0;
603
604give_sigsegv:
605 force_sigsegv(sig, current);
606 return -EFAULT;
607} 592}
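
Editorial note on the ia32_signal.c hunks above: the magic mask 0x40DD5 previously applied when restoring EFLAGS is replaced by the named FIX_EFLAGS definition. A quick arithmetic check (editorial, using the architectural EFLAGS bit positions) confirms the two masks are identical, so the change is purely cosmetic:

/* Editorial compile-time check: the named mask equals the old 0x40DD5.
 * Bit values are the architectural EFLAGS positions (CF=bit0 ... AC=bit18). */
#define X86_EFLAGS_CF 0x00000001
#define X86_EFLAGS_PF 0x00000004
#define X86_EFLAGS_AF 0x00000010
#define X86_EFLAGS_ZF 0x00000040
#define X86_EFLAGS_SF 0x00000080
#define X86_EFLAGS_TF 0x00000100
#define X86_EFLAGS_DF 0x00000400
#define X86_EFLAGS_OF 0x00000800
#define X86_EFLAGS_AC 0x00040000

#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | X86_EFLAGS_DF | \
		    X86_EFLAGS_TF | X86_EFLAGS_SF | X86_EFLAGS_ZF | \
		    X86_EFLAGS_AF | X86_EFLAGS_PF | X86_EFLAGS_CF)

typedef char fix_eflags_is_0x40dd5[(FIX_EFLAGS == 0x40DD5) ? 1 : -1];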
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 20371d0635e4..256b00b61892 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -15,6 +15,16 @@
15#include <asm/irqflags.h> 15#include <asm/irqflags.h>
16#include <linux/linkage.h> 16#include <linux/linkage.h>
17 17
18/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
19#include <linux/elf-em.h>
20#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
21#define __AUDIT_ARCH_LE 0x40000000
22
23#ifndef CONFIG_AUDITSYSCALL
24#define sysexit_audit int_ret_from_sys_call
25#define sysretl_audit int_ret_from_sys_call
26#endif
27
18#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
19 29
20 .macro IA32_ARG_FIXUP noebp=0 30 .macro IA32_ARG_FIXUP noebp=0
@@ -29,24 +39,27 @@
29 .endm 39 .endm
30 40
31 /* clobbers %eax */ 41 /* clobbers %eax */
32 .macro CLEAR_RREGS 42 .macro CLEAR_RREGS _r9=rax
33 xorl %eax,%eax 43 xorl %eax,%eax
34 movq %rax,R11(%rsp) 44 movq %rax,R11(%rsp)
35 movq %rax,R10(%rsp) 45 movq %rax,R10(%rsp)
36 movq %rax,R9(%rsp) 46 movq %\_r9,R9(%rsp)
37 movq %rax,R8(%rsp) 47 movq %rax,R8(%rsp)
38 .endm 48 .endm
39 49
40 .macro LOAD_ARGS32 offset 50 /*
41 movl \offset(%rsp),%r11d 51 * Reload arg registers from stack in case ptrace changed them.
42 movl \offset+8(%rsp),%r10d 52 * We don't reload %eax because syscall_trace_enter() returned
53 * the value it wants us to use in the table lookup.
54 */
55 .macro LOAD_ARGS32 offset, _r9=0
56 .if \_r9
43 movl \offset+16(%rsp),%r9d 57 movl \offset+16(%rsp),%r9d
44 movl \offset+24(%rsp),%r8d 58 .endif
45 movl \offset+40(%rsp),%ecx 59 movl \offset+40(%rsp),%ecx
46 movl \offset+48(%rsp),%edx 60 movl \offset+48(%rsp),%edx
47 movl \offset+56(%rsp),%esi 61 movl \offset+56(%rsp),%esi
48 movl \offset+64(%rsp),%edi 62 movl \offset+64(%rsp),%edi
49 movl \offset+72(%rsp),%eax
50 .endm 63 .endm
51 64
52 .macro CFI_STARTPROC32 simple 65 .macro CFI_STARTPROC32 simple
@@ -131,27 +144,28 @@ ENTRY(ia32_sysenter_target)
131 SAVE_ARGS 0,0,1 144 SAVE_ARGS 0,0,1
132 /* no need to do an access_ok check here because rbp has been 145 /* no need to do an access_ok check here because rbp has been
133 32bit zero extended */ 146 32bit zero extended */
1341: movl (%rbp),%r9d 1471: movl (%rbp),%ebp
135 .section __ex_table,"a" 148 .section __ex_table,"a"
136 .quad 1b,ia32_badarg 149 .quad 1b,ia32_badarg
137 .previous 150 .previous
138 GET_THREAD_INFO(%r10) 151 GET_THREAD_INFO(%r10)
139 orl $TS_COMPAT,TI_status(%r10) 152 orl $TS_COMPAT,TI_status(%r10)
140 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 153 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
141 TI_flags(%r10)
142 CFI_REMEMBER_STATE 154 CFI_REMEMBER_STATE
143 jnz sysenter_tracesys 155 jnz sysenter_tracesys
144sysenter_do_call:
145 cmpl $(IA32_NR_syscalls-1),%eax 156 cmpl $(IA32_NR_syscalls-1),%eax
146 ja ia32_badsys 157 ja ia32_badsys
147 IA32_ARG_FIXUP 1 158sysenter_do_call:
159 IA32_ARG_FIXUP
160sysenter_dispatch:
148 call *ia32_sys_call_table(,%rax,8) 161 call *ia32_sys_call_table(,%rax,8)
149 movq %rax,RAX-ARGOFFSET(%rsp) 162 movq %rax,RAX-ARGOFFSET(%rsp)
150 GET_THREAD_INFO(%r10) 163 GET_THREAD_INFO(%r10)
151 DISABLE_INTERRUPTS(CLBR_NONE) 164 DISABLE_INTERRUPTS(CLBR_NONE)
152 TRACE_IRQS_OFF 165 TRACE_IRQS_OFF
153 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 166 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
154 jnz int_ret_from_sys_call 167 jnz sysexit_audit
168sysexit_from_sys_call:
155 andl $~TS_COMPAT,TI_status(%r10) 169 andl $~TS_COMPAT,TI_status(%r10)
156 /* clear IF, that popfq doesn't enable interrupts early */ 170 /* clear IF, that popfq doesn't enable interrupts early */
157 andl $~0x200,EFLAGS-R11(%rsp) 171 andl $~0x200,EFLAGS-R11(%rsp)
@@ -167,18 +181,69 @@ sysenter_do_call:
167 TRACE_IRQS_ON 181 TRACE_IRQS_ON
168 ENABLE_INTERRUPTS_SYSEXIT32 182 ENABLE_INTERRUPTS_SYSEXIT32
169 183
170sysenter_tracesys: 184#ifdef CONFIG_AUDITSYSCALL
185 .macro auditsys_entry_common
186 movl %esi,%r9d /* 6th arg: 4th syscall arg */
187 movl %edx,%r8d /* 5th arg: 3rd syscall arg */
188 /* (already in %ecx) 4th arg: 2nd syscall arg */
189 movl %ebx,%edx /* 3rd arg: 1st syscall arg */
190 movl %eax,%esi /* 2nd arg: syscall number */
191 movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */
192 call audit_syscall_entry
193 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
194 cmpl $(IA32_NR_syscalls-1),%eax
195 ja ia32_badsys
196 movl %ebx,%edi /* reload 1st syscall arg */
197 movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
198 movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
199 movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
200 movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
201 .endm
202
203 .macro auditsys_exit exit,ebpsave=RBP
204 testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
205 jnz int_ret_from_sys_call
206 TRACE_IRQS_ON
207 sti
208 movl %eax,%esi /* second arg, syscall return value */
209 cmpl $0,%eax /* is it < 0? */
210 setl %al /* 1 if so, 0 if not */
211 movzbl %al,%edi /* zero-extend that into %edi */
212 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
213 call audit_syscall_exit
214 GET_THREAD_INFO(%r10)
215 movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */
216 movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */
217 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
218 cli
219 TRACE_IRQS_OFF
220 testl %edi,TI_flags(%r10)
221 jnz int_with_check
222 jmp \exit
223 .endm
224
225sysenter_auditsys:
171 CFI_RESTORE_STATE 226 CFI_RESTORE_STATE
172 xchgl %r9d,%ebp 227 auditsys_entry_common
228 movl %ebp,%r9d /* reload 6th syscall arg */
229 jmp sysenter_dispatch
230
231sysexit_audit:
232 auditsys_exit sysexit_from_sys_call
233#endif
234
235sysenter_tracesys:
236#ifdef CONFIG_AUDITSYSCALL
237 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
238 jz sysenter_auditsys
239#endif
173 SAVE_REST 240 SAVE_REST
174 CLEAR_RREGS 241 CLEAR_RREGS
175 movq %r9,R9(%rsp)
176 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ 242 movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
177 movq %rsp,%rdi /* &pt_regs -> arg1 */ 243 movq %rsp,%rdi /* &pt_regs -> arg1 */
178 call syscall_trace_enter 244 call syscall_trace_enter
179 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 245 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
180 RESTORE_REST 246 RESTORE_REST
181 xchgl %ebp,%r9d
182 cmpl $(IA32_NR_syscalls-1),%eax 247 cmpl $(IA32_NR_syscalls-1),%eax
183 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ 248 ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
184 jmp sysenter_do_call 249 jmp sysenter_do_call
@@ -242,21 +307,22 @@ ENTRY(ia32_cstar_target)
242 .previous 307 .previous
243 GET_THREAD_INFO(%r10) 308 GET_THREAD_INFO(%r10)
244 orl $TS_COMPAT,TI_status(%r10) 309 orl $TS_COMPAT,TI_status(%r10)
245 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 310 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
246 TI_flags(%r10)
247 CFI_REMEMBER_STATE 311 CFI_REMEMBER_STATE
248 jnz cstar_tracesys 312 jnz cstar_tracesys
249cstar_do_call:
250 cmpl $IA32_NR_syscalls-1,%eax 313 cmpl $IA32_NR_syscalls-1,%eax
251 ja ia32_badsys 314 ja ia32_badsys
315cstar_do_call:
252 IA32_ARG_FIXUP 1 316 IA32_ARG_FIXUP 1
317cstar_dispatch:
253 call *ia32_sys_call_table(,%rax,8) 318 call *ia32_sys_call_table(,%rax,8)
254 movq %rax,RAX-ARGOFFSET(%rsp) 319 movq %rax,RAX-ARGOFFSET(%rsp)
255 GET_THREAD_INFO(%r10) 320 GET_THREAD_INFO(%r10)
256 DISABLE_INTERRUPTS(CLBR_NONE) 321 DISABLE_INTERRUPTS(CLBR_NONE)
257 TRACE_IRQS_OFF 322 TRACE_IRQS_OFF
258 testl $_TIF_ALLWORK_MASK,TI_flags(%r10) 323 testl $_TIF_ALLWORK_MASK,TI_flags(%r10)
259 jnz int_ret_from_sys_call 324 jnz sysretl_audit
325sysretl_from_sys_call:
260 andl $~TS_COMPAT,TI_status(%r10) 326 andl $~TS_COMPAT,TI_status(%r10)
261 RESTORE_ARGS 1,-ARG_SKIP,1,1,1 327 RESTORE_ARGS 1,-ARG_SKIP,1,1,1
262 movl RIP-ARGOFFSET(%rsp),%ecx 328 movl RIP-ARGOFFSET(%rsp),%ecx
@@ -268,19 +334,32 @@ cstar_do_call:
268 CFI_RESTORE rsp 334 CFI_RESTORE rsp
269 USERGS_SYSRET32 335 USERGS_SYSRET32
270 336
271cstar_tracesys: 337#ifdef CONFIG_AUDITSYSCALL
338cstar_auditsys:
272 CFI_RESTORE_STATE 339 CFI_RESTORE_STATE
340 movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
341 auditsys_entry_common
342 movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
343 jmp cstar_dispatch
344
345sysretl_audit:
346 auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */
347#endif
348
349cstar_tracesys:
350#ifdef CONFIG_AUDITSYSCALL
351 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10)
352 jz cstar_auditsys
353#endif
273 xchgl %r9d,%ebp 354 xchgl %r9d,%ebp
274 SAVE_REST 355 SAVE_REST
275 CLEAR_RREGS 356 CLEAR_RREGS r9
276 movq %r9,R9(%rsp)
277 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 357 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
278 movq %rsp,%rdi /* &pt_regs -> arg1 */ 358 movq %rsp,%rdi /* &pt_regs -> arg1 */
279 call syscall_trace_enter 359 call syscall_trace_enter
280 LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ 360 LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
281 RESTORE_REST 361 RESTORE_REST
282 xchgl %ebp,%r9d 362 xchgl %ebp,%r9d
283 movl RSP-ARGOFFSET(%rsp), %r8d
284 cmpl $(IA32_NR_syscalls-1),%eax 363 cmpl $(IA32_NR_syscalls-1),%eax
285 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ 364 ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
286 jmp cstar_do_call 365 jmp cstar_do_call
@@ -321,6 +400,7 @@ ENTRY(ia32_syscall)
321 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ 400 /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/
322 /*CFI_REL_OFFSET cs,CS-RIP*/ 401 /*CFI_REL_OFFSET cs,CS-RIP*/
323 CFI_REL_OFFSET rip,RIP-RIP 402 CFI_REL_OFFSET rip,RIP-RIP
403 PARAVIRT_ADJUST_EXCEPTION_FRAME
324 SWAPGS 404 SWAPGS
325 /* 405 /*
326 * No need to follow this irqs on/off section: the syscall 406 * No need to follow this irqs on/off section: the syscall
@@ -336,8 +416,7 @@ ENTRY(ia32_syscall)
336 SAVE_ARGS 0,0,1 416 SAVE_ARGS 0,0,1
337 GET_THREAD_INFO(%r10) 417 GET_THREAD_INFO(%r10)
338 orl $TS_COMPAT,TI_status(%r10) 418 orl $TS_COMPAT,TI_status(%r10)
339 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 419 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10)
340 TI_flags(%r10)
341 jnz ia32_tracesys 420 jnz ia32_tracesys
342ia32_do_syscall: 421ia32_do_syscall:
343 cmpl $(IA32_NR_syscalls-1),%eax 422 cmpl $(IA32_NR_syscalls-1),%eax
@@ -492,8 +571,8 @@ ia32_sys_call_table:
492 .quad compat_sys_setrlimit /* 75 */ 571 .quad compat_sys_setrlimit /* 75 */
493 .quad compat_sys_old_getrlimit /* old_getrlimit */ 572 .quad compat_sys_old_getrlimit /* old_getrlimit */
494 .quad compat_sys_getrusage 573 .quad compat_sys_getrusage
495 .quad sys32_gettimeofday 574 .quad compat_sys_gettimeofday
496 .quad sys32_settimeofday 575 .quad compat_sys_settimeofday
497 .quad sys_getgroups16 /* 80 */ 576 .quad sys_getgroups16 /* 80 */
498 .quad sys_setgroups16 577 .quad sys_setgroups16
499 .quad sys32_old_select 578 .quad sys32_old_select
@@ -741,4 +820,10 @@ ia32_sys_call_table:
741 .quad sys32_fallocate 820 .quad sys32_fallocate
742 .quad compat_sys_timerfd_settime /* 325 */ 821 .quad compat_sys_timerfd_settime /* 325 */
743 .quad compat_sys_timerfd_gettime 822 .quad compat_sys_timerfd_gettime
823 .quad compat_sys_signalfd4
824 .quad sys_eventfd2
825 .quad sys_epoll_create1
826 .quad sys_dup3 /* 330 */
827 .quad sys_pipe2
828 .quad sys_inotify_init1
744ia32_syscall_end: 829ia32_syscall_end:
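
Editorial note on the syscall-table additions above (signalfd4, eventfd2, epoll_create1, dup3, pipe2, inotify_init1): these wire the new flags-taking system calls into the 32-bit compat table. A hedged userspace illustration of one of them follows (glibc wrapper name assumed; nothing here is from the commit itself):

/* Hedged illustration: a 32-bit program using pipe2(), which the table
 * above now routes to sys_pipe2, to get a close-on-exec pipe atomically. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int make_cloexec_pipe(int fds[2])
{
	return pipe2(fds, O_CLOEXEC);	/* 0 on success, -1 on error */
}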
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index f00afdf61e67..2e09dcd3c0a6 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -49,41 +49,6 @@
49 49
50#define AA(__x) ((unsigned long)(__x)) 50#define AA(__x) ((unsigned long)(__x))
51 51
52int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf)
53{
54 compat_ino_t ino;
55
56 typeof(ubuf->st_uid) uid = 0;
57 typeof(ubuf->st_gid) gid = 0;
58 SET_UID(uid, kbuf->uid);
59 SET_GID(gid, kbuf->gid);
60 if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev))
61 return -EOVERFLOW;
62 if (kbuf->size >= 0x7fffffff)
63 return -EOVERFLOW;
64 ino = kbuf->ino;
65 if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino)
66 return -EOVERFLOW;
67 if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) ||
68 __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) ||
69 __put_user(ino, &ubuf->st_ino) ||
70 __put_user(kbuf->mode, &ubuf->st_mode) ||
71 __put_user(kbuf->nlink, &ubuf->st_nlink) ||
72 __put_user(uid, &ubuf->st_uid) ||
73 __put_user(gid, &ubuf->st_gid) ||
74 __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) ||
75 __put_user(kbuf->size, &ubuf->st_size) ||
76 __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) ||
77 __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) ||
78 __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) ||
79 __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) ||
80 __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) ||
81 __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) ||
82 __put_user(kbuf->blksize, &ubuf->st_blksize) ||
83 __put_user(kbuf->blocks, &ubuf->st_blocks))
84 return -EFAULT;
85 return 0;
86}
87 52
88asmlinkage long sys32_truncate64(char __user *filename, 53asmlinkage long sys32_truncate64(char __user *filename,
89 unsigned long offset_low, 54 unsigned long offset_low,
@@ -238,7 +203,7 @@ asmlinkage long sys32_pipe(int __user *fd)
238 int retval; 203 int retval;
239 int fds[2]; 204 int fds[2];
240 205
241 retval = do_pipe(fds); 206 retval = do_pipe_flags(fds, 0);
242 if (retval) 207 if (retval)
243 goto out; 208 goto out;
244 if (copy_to_user(fd, fds, sizeof(fds))) 209 if (copy_to_user(fd, fds, sizeof(fds)))
@@ -402,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set,
402 return 0; 367 return 0;
403} 368}
404 369
405static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i)
406{
407 int err = -EFAULT;
408
409 if (access_ok(VERIFY_READ, i, sizeof(*i))) {
410 err = __get_user(o->tv_sec, &i->tv_sec);
411 err |= __get_user(o->tv_usec, &i->tv_usec);
412 }
413 return err;
414}
415
416static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i)
417{
418 int err = -EFAULT;
419
420 if (access_ok(VERIFY_WRITE, o, sizeof(*o))) {
421 err = __put_user(i->tv_sec, &o->tv_sec);
422 err |= __put_user(i->tv_usec, &o->tv_usec);
423 }
424 return err;
425}
426
427asmlinkage long sys32_alarm(unsigned int seconds) 370asmlinkage long sys32_alarm(unsigned int seconds)
428{ 371{
429 return alarm_setitimer(seconds); 372 return alarm_setitimer(seconds);
430} 373}
431 374
432/*
433 * Translations due to time_t size differences. Which affects all
434 * sorts of things, like timeval and itimerval.
435 */
436asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv,
437 struct timezone __user *tz)
438{
439 if (tv) {
440 struct timeval ktv;
441
442 do_gettimeofday(&ktv);
443 if (put_tv32(tv, &ktv))
444 return -EFAULT;
445 }
446 if (tz) {
447 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
448 return -EFAULT;
449 }
450 return 0;
451}
452
453asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv,
454 struct timezone __user *tz)
455{
456 struct timeval ktv;
457 struct timespec kts;
458 struct timezone ktz;
459
460 if (tv) {
461 if (get_tv32(&ktv, tv))
462 return -EFAULT;
463 kts.tv_sec = ktv.tv_sec;
464 kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC;
465 }
466 if (tz) {
467 if (copy_from_user(&ktz, tz, sizeof(ktz)))
468 return -EFAULT;
469 }
470
471 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
472}
473
474struct sel_arg_struct { 375struct sel_arg_struct {
475 unsigned int n; 376 unsigned int n;
476 unsigned int inp; 377 unsigned int inp;
@@ -556,15 +457,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
556 return ret; 457 return ret;
557} 458}
558 459
559/* These are here just in case some old ia32 binary calls it. */
560asmlinkage long sys32_pause(void)
561{
562 current->state = TASK_INTERRUPTIBLE;
563 schedule();
564 return -ERESTARTNOHAND;
565}
566
567
568#ifdef CONFIG_SYSCTL_SYSCALL 460#ifdef CONFIG_SYSCTL_SYSCALL
569struct sysctl_ia32 { 461struct sysctl_ia32 {
570 unsigned int name; 462 unsigned int name;
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
new file mode 100644
index 000000000000..4a8e80cdcfa5
--- /dev/null
+++ b/arch/x86/include/asm/Kbuild
@@ -0,0 +1,24 @@
1include include/asm-generic/Kbuild.asm
2
3header-y += boot.h
4header-y += bootparam.h
5header-y += debugreg.h
6header-y += ldt.h
7header-y += msr-index.h
8header-y += prctl.h
9header-y += ptrace-abi.h
10header-y += sigcontext32.h
11header-y += ucontext.h
12header-y += processor-flags.h
13
14unifdef-y += e820.h
15unifdef-y += ist.h
16unifdef-y += mce.h
17unifdef-y += msr.h
18unifdef-y += mtrr.h
19unifdef-y += posix_types_32.h
20unifdef-y += posix_types_64.h
21unifdef-y += unistd_32.h
22unifdef-y += unistd_64.h
23unifdef-y += vm86.h
24unifdef-y += vsyscall.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
new file mode 100644
index 000000000000..37822206083e
--- /dev/null
+++ b/arch/x86/include/asm/a.out-core.h
@@ -0,0 +1,73 @@
1/* a.out coredump register dumper
2 *
3 * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public Licence
8 * as published by the Free Software Foundation; either version
9 * 2 of the Licence, or (at your option) any later version.
10 */
11
12#ifndef _ASM_X86_A_OUT_CORE_H
13#define _ASM_X86_A_OUT_CORE_H
14
15#ifdef __KERNEL__
16#ifdef CONFIG_X86_32
17
18#include <linux/user.h>
19#include <linux/elfcore.h>
20
21/*
22 * fill in the user structure for an a.out core dump
23 */
24static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
25{
26 u16 gs;
27
28/* changed the size calculations - should hopefully work better. lbt */
29 dump->magic = CMAGIC;
30 dump->start_code = 0;
31 dump->start_stack = regs->sp & ~(PAGE_SIZE - 1);
32 dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT;
33 dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1)))
34 >> PAGE_SHIFT;
35 dump->u_dsize -= dump->u_tsize;
36 dump->u_ssize = 0;
37 dump->u_debugreg[0] = current->thread.debugreg0;
38 dump->u_debugreg[1] = current->thread.debugreg1;
39 dump->u_debugreg[2] = current->thread.debugreg2;
40 dump->u_debugreg[3] = current->thread.debugreg3;
41 dump->u_debugreg[4] = 0;
42 dump->u_debugreg[5] = 0;
43 dump->u_debugreg[6] = current->thread.debugreg6;
44 dump->u_debugreg[7] = current->thread.debugreg7;
45
46 if (dump->start_stack < TASK_SIZE)
47 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
48 >> PAGE_SHIFT;
49
50 dump->regs.bx = regs->bx;
51 dump->regs.cx = regs->cx;
52 dump->regs.dx = regs->dx;
53 dump->regs.si = regs->si;
54 dump->regs.di = regs->di;
55 dump->regs.bp = regs->bp;
56 dump->regs.ax = regs->ax;
57 dump->regs.ds = (u16)regs->ds;
58 dump->regs.es = (u16)regs->es;
59 dump->regs.fs = (u16)regs->fs;
60 savesegment(gs, gs);
61 dump->regs.orig_ax = regs->orig_ax;
62 dump->regs.ip = regs->ip;
63 dump->regs.cs = (u16)regs->cs;
64 dump->regs.flags = regs->flags;
65 dump->regs.sp = regs->sp;
66 dump->regs.ss = (u16)regs->ss;
67
68 dump->u_fpvalid = dump_fpu(regs, &dump->i387);
69}
70
71#endif /* CONFIG_X86_32 */
72#endif /* __KERNEL__ */
73#endif /* _ASM_X86_A_OUT_CORE_H */
diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h
new file mode 100644
index 000000000000..4684f97a5bbd
--- /dev/null
+++ b/arch/x86/include/asm/a.out.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_A_OUT_H
2#define _ASM_X86_A_OUT_H
3
4struct exec
5{
6 unsigned int a_info; /* Use macros N_MAGIC, etc for access */
7 unsigned a_text; /* length of text, in bytes */
8 unsigned a_data; /* length of data, in bytes */
9 unsigned a_bss; /* length of uninitialized data area for file, in bytes */
10 unsigned a_syms; /* length of symbol table data in file, in bytes */
11 unsigned a_entry; /* start address */
12 unsigned a_trsize; /* length of relocation info for text, in bytes */
13 unsigned a_drsize; /* length of relocation info for data, in bytes */
14};
15
16#define N_TRSIZE(a) ((a).a_trsize)
17#define N_DRSIZE(a) ((a).a_drsize)
18#define N_SYMSIZE(a) ((a).a_syms)
19
20#endif /* _ASM_X86_A_OUT_H */
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
new file mode 100644
index 000000000000..8d676d8ecde9
--- /dev/null
+++ b/arch/x86/include/asm/acpi.h
@@ -0,0 +1,178 @@
1#ifndef _ASM_X86_ACPI_H
2#define _ASM_X86_ACPI_H
3
4/*
5 * Copyright (C) 2001 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2001 Patrick Mochel <mochel@osdl.org>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23 *
24 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 */
26#include <acpi/pdc_intel.h>
27
28#include <asm/numa.h>
29#include <asm/processor.h>
30#include <asm/mmu.h>
31#include <asm/mpspec.h>
32
33#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long
35
36/*
37 * Calling conventions:
38 *
39 * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads)
40 * ACPI_EXTERNAL_XFACE - External ACPI interfaces
41 * ACPI_INTERNAL_XFACE - Internal ACPI interfaces
42 * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces
43 */
44#define ACPI_SYSTEM_XFACE
45#define ACPI_EXTERNAL_XFACE
46#define ACPI_INTERNAL_XFACE
47#define ACPI_INTERNAL_VAR_XFACE
48
49/* Asm macros */
50
51#define ACPI_ASM_MACROS
52#define BREAKPOINT3
53#define ACPI_DISABLE_IRQS() local_irq_disable()
54#define ACPI_ENABLE_IRQS() local_irq_enable()
55#define ACPI_FLUSH_CPU_CACHE() wbinvd()
56
57int __acpi_acquire_global_lock(unsigned int *lock);
58int __acpi_release_global_lock(unsigned int *lock);
59
60#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \
61 ((Acq) = __acpi_acquire_global_lock(&facs->global_lock))
62
63#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \
64 ((Acq) = __acpi_release_global_lock(&facs->global_lock))
65
66/*
67 * Math helper asm macros
68 */
69#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \
70 asm("divl %2;" \
71 : "=a"(q32), "=d"(r32) \
72 : "r"(d32), \
73 "0"(n_lo), "1"(n_hi))
74
75
76#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \
77 asm("shrl $1,%2 ;" \
78 "rcrl $1,%3;" \
79 : "=r"(n_hi), "=r"(n_lo) \
80 : "0"(n_hi), "1"(n_lo))
81
82#ifdef CONFIG_ACPI
83extern int acpi_lapic;
84extern int acpi_ioapic;
85extern int acpi_noirq;
86extern int acpi_strict;
87extern int acpi_disabled;
88extern int acpi_ht;
89extern int acpi_pci_disabled;
90extern int acpi_skip_timer_override;
91extern int acpi_use_timer_override;
92
93extern u8 acpi_sci_flags;
94extern int acpi_sci_override_gsi;
95void acpi_pic_sci_set_trigger(unsigned int, u16);
96
97static inline void disable_acpi(void)
98{
99 acpi_disabled = 1;
100 acpi_ht = 0;
101 acpi_pci_disabled = 1;
102 acpi_noirq = 1;
103}
104
105/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */
106#define FIX_ACPI_PAGES 4
107
108extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq);
109
110static inline void acpi_noirq_set(void) { acpi_noirq = 1; }
111static inline void acpi_disable_pci(void)
112{
113 acpi_pci_disabled = 1;
114 acpi_noirq_set();
115}
116extern int acpi_irq_balance_set(char *str);
117
118/* routines for saving/restoring kernel state */
119extern int acpi_save_state_mem(void);
120extern void acpi_restore_state_mem(void);
121
122extern unsigned long acpi_wakeup_address;
123
124/* early initialization routine */
125extern void acpi_reserve_bootmem(void);
126
127/*
128 * Check if the CPU can handle C2 and deeper
129 */
130static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
131{
132 /*
133 * Early models (<=5) of AMD Opterons are not supposed to go into
134 * C2 state.
135 *
136 * Steppings 0x0A and later are good
137 */
138 if (boot_cpu_data.x86 == 0x0F &&
139 boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
140 boot_cpu_data.x86_model <= 0x05 &&
141 boot_cpu_data.x86_mask < 0x0A)
142 return 1;
143 else if (boot_cpu_has(X86_FEATURE_AMDC1E))
144 return 1;
145 else
146 return max_cstate;
147}
148
149#else /* !CONFIG_ACPI */
150
151#define acpi_lapic 0
152#define acpi_ioapic 0
153static inline void acpi_noirq_set(void) { }
154static inline void acpi_disable_pci(void) { }
155static inline void disable_acpi(void) { }
156
157#endif /* !CONFIG_ACPI */
158
159#define ARCH_HAS_POWER_INIT 1
160
161struct bootnode;
162
163#ifdef CONFIG_ACPI_NUMA
164extern int acpi_numa;
165extern int acpi_scan_nodes(unsigned long start, unsigned long end);
166#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
167extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
168 int num_nodes);
169#else
170static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
171 int num_nodes)
172{
173}
174#endif
175
176#define acpi_unlazy_tlb(x) leave_mm(x)
177
178#endif /* _ASM_X86_ACPI_H */
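
ACPI_DIV_64_BY_32() above wraps a single divl instruction that divides the 64-bit value formed by n_hi:n_lo by a 32-bit divisor, leaving the quotient in q32 and the remainder in r32 (ACPI_SHIFT_RIGHT_64() likewise shifts the n_hi:n_lo pair right by one bit). A minimal user-space sketch of the equivalent arithmetic; div_64_by_32() and the test values are illustrative, not part of the header:

#include <stdint.h>
#include <stdio.h>

/* Portable equivalent of what the divl-based macro computes. */
static void div_64_by_32(uint32_t n_hi, uint32_t n_lo, uint32_t d32,
			 uint32_t *q32, uint32_t *r32)
{
	uint64_t n = ((uint64_t)n_hi << 32) | n_lo;

	*q32 = (uint32_t)(n / d32);	/* quotient, as divl leaves it in EAX */
	*r32 = (uint32_t)(n % d32);	/* remainder, as divl leaves it in EDX */
}

int main(void)
{
	uint32_t q, r;

	div_64_by_32(0x1, 0x0, 1000, &q, &r);	/* 2^32 / 1000 */
	printf("q=%u r=%u\n", q, r);		/* prints q=4294967 r=296 */
	return 0;
}

Note that divl raises a divide fault if the quotient does not fit in 32 bits, so callers of the macro are expected to guarantee that it does.
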
diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h
new file mode 100644
index 000000000000..9825cd64c9b6
--- /dev/null
+++ b/arch/x86/include/asm/agp.h
@@ -0,0 +1,35 @@
1#ifndef _ASM_X86_AGP_H
2#define _ASM_X86_AGP_H
3
4#include <asm/pgtable.h>
5#include <asm/cacheflush.h>
6
7/*
8 * Functions to keep the agpgart mappings coherent with the MMU. The
9 * GART gives the CPU a physical alias of pages in memory. The alias
10 * region is mapped uncacheable. Make sure there are no conflicting
11 * mappings with different cacheability attributes for the same
12 * page. This avoids data corruption on some CPUs.
13 */
14
15#define map_page_into_agp(page) set_pages_uc(page, 1)
16#define unmap_page_from_agp(page) set_pages_wb(page, 1)
17
18/*
19 * Could use CLFLUSH here if the cpu supports it. But then it would
20 * need to be called for each cacheline of the whole page so it may
21 * not be worth it. Would need a page for it.
22 */
23#define flush_agp_cache() wbinvd()
24
25/* Convert a physical address to an address suitable for the GART. */
26#define phys_to_gart(x) (x)
27#define gart_to_phys(x) (x)
28
29/* GATT allocation. Returns/accepts GATT kernel virtual address. */
30#define alloc_gatt_pages(order) \
31 ((char *)__get_free_pages(GFP_KERNEL, (order)))
32#define free_gatt_pages(table, order) \
33 free_pages((unsigned long)(table), (order))
34
35#endif /* _ASM_X86_AGP_H */
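
A brief, hedged sketch of how a GART driver might use the GATT helpers above (the example_gatt and example_order names are invented for illustration):

#include <asm/agp.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static char *example_gatt;	/* hypothetical GATT buffer */

static int example_gatt_alloc(unsigned int example_order)
{
	/* Returns a kernel virtual address; the macro expands to
	 * __get_free_pages(GFP_KERNEL, example_order). */
	example_gatt = alloc_gatt_pages(example_order);
	if (!example_gatt)
		return -ENOMEM;
	return 0;
}

static void example_gatt_free(unsigned int example_order)
{
	free_gatt_pages(example_gatt, example_order);
	example_gatt = NULL;
}
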
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
new file mode 100644
index 000000000000..e2077d343c33
--- /dev/null
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -0,0 +1,22 @@
1#ifdef __ASSEMBLY__
2
3#ifdef CONFIG_X86_32
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8
9#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX
111: lock
12 .section .smp_locks,"a"
13 .align 4
14 X86_ALIGN 1b
15 .previous
16 .endm
17#else
18 .macro LOCK_PREFIX
19 .endm
20#endif
21
22#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
new file mode 100644
index 000000000000..f6aa18eadf71
--- /dev/null
+++ b/arch/x86/include/asm/alternative.h
@@ -0,0 +1,183 @@
1#ifndef _ASM_X86_ALTERNATIVE_H
2#define _ASM_X86_ALTERNATIVE_H
3
4#include <linux/types.h>
5#include <linux/stddef.h>
6#include <asm/asm.h>
7
8/*
9 * Alternative inline assembly for SMP.
10 *
11 * The LOCK_PREFIX macro defined here replaces the LOCK and
12 * LOCK_PREFIX macros used everywhere in the source tree.
13 *
14 * SMP alternatives use the same data structures as the other
15 * alternatives and the X86_FEATURE_UP flag to indicate the case of a
16 * UP system running an SMP kernel. The existing apply_alternatives()
17 * works fine for patching a SMP kernel for UP.
18 *
19 * The SMP alternative tables can be kept after boot and contain both
20 * UP and SMP versions of the instructions to allow switching back to
21 * SMP at runtime, when hotplugging in a new CPU, which is especially
22 * useful in virtualized environments.
23 *
24 * The very common lock prefix is handled as a special case in a
25 * separate table which is a pure address list without replacement ptr
26 * and size information. That keeps the table sizes small.
27 */
28
29#ifdef CONFIG_SMP
30#define LOCK_PREFIX \
31 ".section .smp_locks,\"a\"\n" \
32 _ASM_ALIGN "\n" \
33 _ASM_PTR "661f\n" /* address */ \
34 ".previous\n" \
35 "661:\n\tlock; "
36
37#else /* ! CONFIG_SMP */
38#define LOCK_PREFIX ""
39#endif
40
41/* This must be included *after* the definition of LOCK_PREFIX */
42#include <asm/cpufeature.h>
43
44struct alt_instr {
45 u8 *instr; /* original instruction */
46 u8 *replacement;
47 u8 cpuid; /* cpuid bit set for replacement */
48 u8 instrlen; /* length of original instruction */
49 u8 replacementlen; /* length of new instruction, <= instrlen */
50 u8 pad1;
51#ifdef CONFIG_X86_64
52 u32 pad2;
53#endif
54};
55
56extern void alternative_instructions(void);
57extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
58
59struct module;
60
61#ifdef CONFIG_SMP
62extern void alternatives_smp_module_add(struct module *mod, char *name,
63 void *locks, void *locks_end,
64 void *text, void *text_end);
65extern void alternatives_smp_module_del(struct module *mod);
66extern void alternatives_smp_switch(int smp);
67#else
68static inline void alternatives_smp_module_add(struct module *mod, char *name,
69 void *locks, void *locks_end,
70 void *text, void *text_end) {}
71static inline void alternatives_smp_module_del(struct module *mod) {}
72static inline void alternatives_smp_switch(int smp) {}
73#endif /* CONFIG_SMP */
74
75const unsigned char *const *find_nop_table(void);
76
77/*
78 * Alternative instructions for different CPU types or capabilities.
79 *
80 * This allows optimized instructions to be used even in generic binary
81 * kernels.
82 *
83 * The length of oldinstr must be greater than or equal to the length of
84 * newinstr; it can be padded with nops as needed.
85 *
86 * For non-barrier-like inlines please define new variants
87 * without volatile and the memory clobber.
88 */
89#define alternative(oldinstr, newinstr, feature) \
90 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
91 ".section .altinstructions,\"a\"\n" \
92 _ASM_ALIGN "\n" \
93 _ASM_PTR "661b\n" /* label */ \
94 _ASM_PTR "663f\n" /* new instruction */ \
95 " .byte %c0\n" /* feature bit */ \
96 " .byte 662b-661b\n" /* sourcelen */ \
97 " .byte 664f-663f\n" /* replacementlen */ \
98 ".previous\n" \
99 ".section .altinstr_replacement,\"ax\"\n" \
100 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
101 ".previous" :: "i" (feature) : "memory")
102
103/*
104 * Alternative inline assembly with input.
105 *
106 * Peculiarities:
107 * No memory clobber here.
108 * Argument numbers start with 1.
109 * It is best to use fixed-size constraints (like (%1) ... "r").
110 * If you use variable-sized constraints like "m" or "g" in the
111 * replacement, make sure to pad to the worst-case length.
112 */
113#define alternative_input(oldinstr, newinstr, feature, input...) \
114 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
115 ".section .altinstructions,\"a\"\n" \
116 _ASM_ALIGN "\n" \
117 _ASM_PTR "661b\n" /* label */ \
118 _ASM_PTR "663f\n" /* new instruction */ \
119 " .byte %c0\n" /* feature bit */ \
120 " .byte 662b-661b\n" /* sourcelen */ \
121 " .byte 664f-663f\n" /* replacementlen */ \
122 ".previous\n" \
123 ".section .altinstr_replacement,\"ax\"\n" \
124 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
125 ".previous" :: "i" (feature), ##input)
126
127/* Like alternative_input, but with a single output argument */
128#define alternative_io(oldinstr, newinstr, feature, output, input...) \
129 asm volatile ("661:\n\t" oldinstr "\n662:\n" \
130 ".section .altinstructions,\"a\"\n" \
131 _ASM_ALIGN "\n" \
132 _ASM_PTR "661b\n" /* label */ \
133 _ASM_PTR "663f\n" /* new instruction */ \
134 " .byte %c[feat]\n" /* feature bit */ \
135 " .byte 662b-661b\n" /* sourcelen */ \
136 " .byte 664f-663f\n" /* replacementlen */ \
137 ".previous\n" \
138 ".section .altinstr_replacement,\"ax\"\n" \
139 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
140 ".previous" : output : [feat] "i" (feature), ##input)
141
142/*
143 * Use this macro if you need more than one output parameter
144 * in alternative_io.
145 */
146#define ASM_OUTPUT2(a, b) a, b
147
148struct paravirt_patch_site;
149#ifdef CONFIG_PARAVIRT
150void apply_paravirt(struct paravirt_patch_site *start,
151 struct paravirt_patch_site *end);
152#else
153static inline void apply_paravirt(struct paravirt_patch_site *start,
154 struct paravirt_patch_site *end)
155{}
156#define __parainstructions NULL
157#define __parainstructions_end NULL
158#endif
159
160extern void add_nops(void *insns, unsigned int len);
161
162/*
163 * Clear and restore the kernel write-protection flag on the local CPU.
164 * Allows the kernel to edit read-only pages.
165 * Side-effect: any interrupt handler running between save and restore will have
166 * the ability to write to read-only pages.
167 *
168 * Warning:
169 * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and
170 * no thread can be preempted in the instructions being modified (no iret to an
171 * invalid instruction possible) or if the instructions are changed from a
172 * consistent state to another consistent state atomically.
173 * More care must be taken when modifying code in the SMP case because of
174 * Intel's errata.
175 * On the local CPU you need to be protected against NMI or MCE handlers seeing an
176 * inconsistent instruction while you patch.
177 * The _early version expects the memory to already be RW.
178 */
179
180extern void *text_poke(void *addr, const void *opcode, size_t len);
181extern void *text_poke_early(void *addr, const void *opcode, size_t len);
182
183#endif /* _ASM_X86_ALTERNATIVE_H */
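
Two small usage sketches for the macros above (the example_* functions are illustrative): LOCK_PREFIX is simply pasted in front of the instruction that must be atomic on SMP, and alternative() substitutes a better sequence when the boot CPU has the named feature; the second example mirrors how the 32-bit kernel implements its memory barrier.

#include <asm/alternative.h>
#include <asm/cpufeature.h>

/* On an SMP kernel this emits "lock; incl"; the SMP-alternatives code can
 * later patch the lock prefix out when only one CPU is online. */
static inline void example_locked_inc(int *p)
{
	asm volatile(LOCK_PREFIX "incl %0" : "+m" (*p));
}

/* Use MFENCE when the CPU advertises SSE2, otherwise fall back to a locked
 * add to the stack; the shorter replacement is padded with nops. */
static inline void example_memory_barrier(void)
{
	alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2);
}
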
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
new file mode 100644
index 000000000000..f712344329bc
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -0,0 +1,35 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_H
21#define _ASM_X86_AMD_IOMMU_H
22
23#include <linux/irqreturn.h>
24
25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void);
27extern int amd_iommu_init_dma_ops(void);
28extern void amd_iommu_detect(void);
29extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
30#else
31static inline int amd_iommu_init(void) { return -ENODEV; }
32static inline void amd_iommu_detect(void) { }
33#endif
34
35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
new file mode 100644
index 000000000000..1a30c0440c6b
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -0,0 +1,404 @@
1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com>
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License version 2 as published
8 * by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18 */
19
20#ifndef _ASM_X86_AMD_IOMMU_TYPES_H
21#define _ASM_X86_AMD_IOMMU_TYPES_H
22
23#include <linux/types.h>
24#include <linux/list.h>
25#include <linux/spinlock.h>
26
27/*
28 * some size calculation constants
29 */
30#define DEV_TABLE_ENTRY_SIZE 32
31#define ALIAS_TABLE_ENTRY_SIZE 2
32#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *))
33
34/* Length of the MMIO region for the AMD IOMMU */
35#define MMIO_REGION_LENGTH 0x4000
36
37/* Capability offsets used by the driver */
38#define MMIO_CAP_HDR_OFFSET 0x00
39#define MMIO_RANGE_OFFSET 0x0c
40#define MMIO_MISC_OFFSET 0x10
41
42/* Masks, shifts and macros to parse the device range capability */
43#define MMIO_RANGE_LD_MASK 0xff000000
44#define MMIO_RANGE_FD_MASK 0x00ff0000
45#define MMIO_RANGE_BUS_MASK 0x0000ff00
46#define MMIO_RANGE_LD_SHIFT 24
47#define MMIO_RANGE_FD_SHIFT 16
48#define MMIO_RANGE_BUS_SHIFT 8
49#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT)
50#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT)
51#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT)
52#define MMIO_MSI_NUM(x) ((x) & 0x1f)
53
54/* Flag masks for the AMD IOMMU exclusion range */
55#define MMIO_EXCL_ENABLE_MASK 0x01ULL
56#define MMIO_EXCL_ALLOW_MASK 0x02ULL
57
58/* Used offsets into the MMIO space */
59#define MMIO_DEV_TABLE_OFFSET 0x0000
60#define MMIO_CMD_BUF_OFFSET 0x0008
61#define MMIO_EVT_BUF_OFFSET 0x0010
62#define MMIO_CONTROL_OFFSET 0x0018
63#define MMIO_EXCL_BASE_OFFSET 0x0020
64#define MMIO_EXCL_LIMIT_OFFSET 0x0028
65#define MMIO_CMD_HEAD_OFFSET 0x2000
66#define MMIO_CMD_TAIL_OFFSET 0x2008
67#define MMIO_EVT_HEAD_OFFSET 0x2010
68#define MMIO_EVT_TAIL_OFFSET 0x2018
69#define MMIO_STATUS_OFFSET 0x2020
70
71/* MMIO status bits */
72#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
73
74/* event logging constants */
75#define EVENT_ENTRY_SIZE 0x10
76#define EVENT_TYPE_SHIFT 28
77#define EVENT_TYPE_MASK 0xf
78#define EVENT_TYPE_ILL_DEV 0x1
79#define EVENT_TYPE_IO_FAULT 0x2
80#define EVENT_TYPE_DEV_TAB_ERR 0x3
81#define EVENT_TYPE_PAGE_TAB_ERR 0x4
82#define EVENT_TYPE_ILL_CMD 0x5
83#define EVENT_TYPE_CMD_HARD_ERR 0x6
84#define EVENT_TYPE_IOTLB_INV_TO 0x7
85#define EVENT_TYPE_INV_DEV_REQ 0x8
86#define EVENT_DEVID_MASK 0xffff
87#define EVENT_DEVID_SHIFT 0
88#define EVENT_DOMID_MASK 0xffff
89#define EVENT_DOMID_SHIFT 0
90#define EVENT_FLAGS_MASK 0xfff
91#define EVENT_FLAGS_SHIFT 0x10
92
93/* feature control bits */
94#define CONTROL_IOMMU_EN 0x00ULL
95#define CONTROL_HT_TUN_EN 0x01ULL
96#define CONTROL_EVT_LOG_EN 0x02ULL
97#define CONTROL_EVT_INT_EN 0x03ULL
98#define CONTROL_COMWAIT_EN 0x04ULL
99#define CONTROL_PASSPW_EN 0x08ULL
100#define CONTROL_RESPASSPW_EN 0x09ULL
101#define CONTROL_COHERENT_EN 0x0aULL
102#define CONTROL_ISOC_EN 0x0bULL
103#define CONTROL_CMDBUF_EN 0x0cULL
104#define CONTROL_PPFLOG_EN 0x0dULL
105#define CONTROL_PPFINT_EN 0x0eULL
106
107/* command specific defines */
108#define CMD_COMPL_WAIT 0x01
109#define CMD_INV_DEV_ENTRY 0x02
110#define CMD_INV_IOMMU_PAGES 0x03
111
112#define CMD_COMPL_WAIT_STORE_MASK 0x01
113#define CMD_COMPL_WAIT_INT_MASK 0x02
114#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01
115#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02
116
117#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL
118
119/* macros and definitions for device table entries */
120#define DEV_ENTRY_VALID 0x00
121#define DEV_ENTRY_TRANSLATION 0x01
122#define DEV_ENTRY_IR 0x3d
123#define DEV_ENTRY_IW 0x3e
124#define DEV_ENTRY_NO_PAGE_FAULT 0x62
125#define DEV_ENTRY_EX 0x67
126#define DEV_ENTRY_SYSMGT1 0x68
127#define DEV_ENTRY_SYSMGT2 0x69
128#define DEV_ENTRY_INIT_PASS 0xb8
129#define DEV_ENTRY_EINT_PASS 0xb9
130#define DEV_ENTRY_NMI_PASS 0xba
131#define DEV_ENTRY_LINT0_PASS 0xbe
132#define DEV_ENTRY_LINT1_PASS 0xbf
133#define DEV_ENTRY_MODE_MASK 0x07
134#define DEV_ENTRY_MODE_SHIFT 0x09
135
136/* constants to configure the command buffer */
137#define CMD_BUFFER_SIZE 8192
138#define CMD_BUFFER_ENTRIES 512
139#define MMIO_CMD_SIZE_SHIFT 56
140#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
141
142/* constants for event buffer handling */
143#define EVT_BUFFER_SIZE 8192 /* 512 entries */
144#define EVT_LEN_MASK (0x9ULL << 56)
145
146#define PAGE_MODE_1_LEVEL 0x01
147#define PAGE_MODE_2_LEVEL 0x02
148#define PAGE_MODE_3_LEVEL 0x03
149
150#define IOMMU_PDE_NL_0 0x000ULL
151#define IOMMU_PDE_NL_1 0x200ULL
152#define IOMMU_PDE_NL_2 0x400ULL
153#define IOMMU_PDE_NL_3 0x600ULL
154
155#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL)
156#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL)
157#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL)
158
159#define IOMMU_MAP_SIZE_L1 (1ULL << 21)
160#define IOMMU_MAP_SIZE_L2 (1ULL << 30)
161#define IOMMU_MAP_SIZE_L3 (1ULL << 39)
162
163#define IOMMU_PTE_P (1ULL << 0)
164#define IOMMU_PTE_TV (1ULL << 1)
165#define IOMMU_PTE_U (1ULL << 59)
166#define IOMMU_PTE_FC (1ULL << 60)
167#define IOMMU_PTE_IR (1ULL << 61)
168#define IOMMU_PTE_IW (1ULL << 62)
169
170#define IOMMU_L1_PDE(address) \
171 ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
172#define IOMMU_L2_PDE(address) \
173 ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW)
174
175#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
176#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
177#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
178#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07)
179
180#define IOMMU_PROT_MASK 0x03
181#define IOMMU_PROT_IR 0x01
182#define IOMMU_PROT_IW 0x02
183
184/* IOMMU capabilities */
185#define IOMMU_CAP_IOTLB 24
186#define IOMMU_CAP_NPCACHE 26
187
188#define MAX_DOMAIN_ID 65536
189
190/* FIXME: move this macro to <linux/pci.h> */
191#define PCI_BUS(x) (((x) >> 8) & 0xff)
192
193/*
194 * This structure contains generic data for IOMMU protection domains
195 * independent of their use.
196 */
197struct protection_domain {
198 spinlock_t lock; /* mostly used to lock the page table*/
199 u16 id; /* the domain id written to the device table */
200 int mode; /* paging mode (0-6 levels) */
201 u64 *pt_root; /* page table root pointer */
202 void *priv; /* private data */
203};
204
205/*
206 * Data container for a dma_ops specific protection domain
207 */
208struct dma_ops_domain {
209 struct list_head list;
210
211 /* generic protection domain information */
212 struct protection_domain domain;
213
214 /* size of the aperture for the mappings */
215 unsigned long aperture_size;
216
217 /* address we start to search for free addresses */
218 unsigned long next_bit;
219
220 /* address allocation bitmap */
221 unsigned long *bitmap;
222
223 /*
224 * Array of PTE pages for the aperture. In this array we save all the
225 * leaf pages of the domain page table used for the aperture. This way
226 * we don't need to walk the page table to find a specific PTE. We can
227 * just calculate its address in constant time.
228 */
229 u64 **pte_pages;
230
231 /* This will be set to true when TLB needs to be flushed */
232 bool need_flush;
233
234 /*
235 * if this is a preallocated domain, keep the device for which it was
236 * preallocated in this variable
237 */
238 u16 target_dev;
239};
240
241/*
242 * Structure where we save information about one hardware AMD IOMMU in the
243 * system.
244 */
245struct amd_iommu {
246 struct list_head list;
247
248 /* locks the accesses to the hardware */
249 spinlock_t lock;
250
251 /* Pointer to PCI device of this IOMMU */
252 struct pci_dev *dev;
253
254 /*
255 * Capability pointer. There could be more than one IOMMU per PCI
256 * device function if there is more than one AMD IOMMU capability
257 * pointer.
258 */
259 u16 cap_ptr;
260
261 /* physical address of MMIO space */
262 u64 mmio_phys;
263 /* virtual address of MMIO space */
264 u8 *mmio_base;
265
266 /* capabilities of that IOMMU read from ACPI */
267 u32 cap;
268
269 /* pci domain of this IOMMU */
270 u16 pci_seg;
271
272 /* first device this IOMMU handles. read from PCI */
273 u16 first_device;
274 /* last device this IOMMU handles. read from PCI */
275 u16 last_device;
276
277 /* start of exclusion range of that IOMMU */
278 u64 exclusion_start;
279 /* length of exclusion range of that IOMMU */
280 u64 exclusion_length;
281
282 /* command buffer virtual address */
283 u8 *cmd_buf;
284 /* size of command buffer */
285 u32 cmd_buf_size;
286
287 /* event buffer virtual address */
288 u8 *evt_buf;
289 /* size of event buffer */
290 u32 evt_buf_size;
291 /* MSI number for event interrupt */
292 u16 evt_msi_num;
293
294 /* if one, we need to send a completion wait command */
295 int need_sync;
296
297 /* true if interrupts for this IOMMU are already enabled */
298 bool int_enabled;
299
300 /* default dma_ops domain for that IOMMU */
301 struct dma_ops_domain *default_dom;
302};
303
304/*
305 * List with all IOMMUs in the system. This list is not locked because it is
306 * only written and read at driver initialization or suspend time
307 */
308extern struct list_head amd_iommu_list;
309
310/*
311 * Structure defining one entry in the device table
312 */
313struct dev_table_entry {
314 u32 data[8];
315};
316
317/*
318 * One entry for unity mappings parsed out of the ACPI table.
319 */
320struct unity_map_entry {
321 struct list_head list;
322
323 /* starting device id this entry is used for (including) */
324 u16 devid_start;
325 /* end device id this entry is used for (including) */
326 u16 devid_end;
327
328 /* start address to unity map (including) */
329 u64 address_start;
330 /* end address to unity map (including) */
331 u64 address_end;
332
333 /* required protection */
334 int prot;
335};
336
337/*
338 * List of all unity mappings. It is not locked because at runtime it is only
339 * read. It is created at ACPI table parsing time.
340 */
341extern struct list_head amd_iommu_unity_map;
342
343/*
344 * Data structures for device handling
345 */
346
347/*
348 * Device table used by hardware. Read and write accesses by software are
349 * locked with the amd_iommu_pd_table lock.
350 */
351extern struct dev_table_entry *amd_iommu_dev_table;
352
353/*
354 * Alias table mapping requestor ids to device ids. Not locked because it is
355 * only read at runtime.
356 */
357extern u16 *amd_iommu_alias_table;
358
359/*
360 * Reverse lookup table to find the IOMMU which translates a specific device.
361 */
362extern struct amd_iommu **amd_iommu_rlookup_table;
363
364/* size of the dma_ops aperture as power of 2 */
365extern unsigned amd_iommu_aperture_order;
366
367/* largest PCI device id we expect translation requests for */
368extern u16 amd_iommu_last_bdf;
369
370/* data structures for protection domain handling */
371extern struct protection_domain **amd_iommu_pd_table;
372
373/* allocation bitmap for domain ids */
374extern unsigned long *amd_iommu_pd_alloc_bitmap;
375
376/* will be 1 if device isolation is enabled */
377extern int amd_iommu_isolate;
378
379/*
380 * If true, the addresses will be flushed at unmap time, not when
381 * they are reused.
382 */
383extern bool amd_iommu_unmap_flush;
384
385/* takes a PCI device id and prints it out in a readable form */
386static inline void print_devid(u16 devid, int nl)
387{
388 int bus = devid >> 8;
389 int dev = devid >> 3 & 0x1f;
390 int fn = devid & 0x07;
391
392 printk("%02x:%02x.%x", bus, dev, fn);
393 if (nl)
394 printk("\n");
395}
396
397/* takes bus and device/function and returns the device id
398 * FIXME: should that be in generic PCI code? */
399static inline u16 calc_devid(u8 bus, u8 devfn)
400{
401 return (((u16)bus) << 8) | devfn;
402}
403
404#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
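
A short sketch of how the device-id helpers at the end of this header fit together (example_devid() and its use of a struct pci_dev are illustrative):

#include <linux/pci.h>
#include <asm/amd_iommu_types.h>

/* The IOMMU's 16-bit device id is just the bus number in the high byte and
 * devfn in the low byte, so it can be built from and split back into the
 * usual PCI bus:device.function notation. */
static u16 example_devid(struct pci_dev *pdev)
{
	u16 devid = calc_devid(pdev->bus->number, pdev->devfn);

	print_devid(devid, 1);		/* e.g. "00:18.3" plus a newline */
	return devid;
}
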
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
new file mode 100644
index 000000000000..3b1510b4fc57
--- /dev/null
+++ b/arch/x86/include/asm/apic.h
@@ -0,0 +1,199 @@
1#ifndef _ASM_X86_APIC_H
2#define _ASM_X86_APIC_H
3
4#include <linux/pm.h>
5#include <linux/delay.h>
6
7#include <asm/alternative.h>
8#include <asm/fixmap.h>
9#include <asm/apicdef.h>
10#include <asm/processor.h>
11#include <asm/system.h>
12#include <asm/cpufeature.h>
13#include <asm/msr.h>
14
15#define ARCH_APICTIMER_STOPS_ON_C3 1
16
17/*
18 * Debugging macros
19 */
20#define APIC_QUIET 0
21#define APIC_VERBOSE 1
22#define APIC_DEBUG 2
23
24/*
25 * Define the default level of output to be very little.
26 * This can be turned up by using apic=verbose for more
27 * information and apic=debug for _lots_ of information.
28 * apic_verbosity is defined in apic.c
29 */
30#define apic_printk(v, s, a...) do { \
31 if ((v) <= apic_verbosity) \
32 printk(s, ##a); \
33 } while (0)
34
35
36extern void generic_apic_probe(void);
37
38#ifdef CONFIG_X86_LOCAL_APIC
39
40extern unsigned int apic_verbosity;
41extern int local_apic_timer_c2_ok;
42
43extern int disable_apic;
44/*
45 * Basic functions accessing APICs.
46 */
47#ifdef CONFIG_PARAVIRT
48#include <asm/paravirt.h>
49#else
50#define setup_boot_clock setup_boot_APIC_clock
51#define setup_secondary_clock setup_secondary_APIC_clock
52#endif
53
54extern int is_vsmp_box(void);
55extern void xapic_wait_icr_idle(void);
56extern u32 safe_xapic_wait_icr_idle(void);
57extern u64 xapic_icr_read(void);
58extern void xapic_icr_write(u32, u32);
59extern int setup_profiling_timer(unsigned int);
60
61static inline void native_apic_mem_write(u32 reg, u32 v)
62{
63 volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
64
65 alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP,
66 ASM_OUTPUT2("=r" (v), "=m" (*addr)),
67 ASM_OUTPUT2("0" (v), "m" (*addr)));
68}
69
70static inline u32 native_apic_mem_read(u32 reg)
71{
72 return *((volatile u32 *)(APIC_BASE + reg));
73}
74
75static inline void native_apic_msr_write(u32 reg, u32 v)
76{
77 if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR ||
78 reg == APIC_LVR)
79 return;
80
81 wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0);
82}
83
84static inline u32 native_apic_msr_read(u32 reg)
85{
86 u32 low, high;
87
88 if (reg == APIC_DFR)
89 return -1;
90
91 rdmsr(APIC_BASE_MSR + (reg >> 4), low, high);
92 return low;
93}
94
95#ifndef CONFIG_X86_32
96extern int x2apic, x2apic_preenabled;
97extern void check_x2apic(void);
98extern void enable_x2apic(void);
99extern void enable_IR_x2apic(void);
100extern void x2apic_icr_write(u32 low, u32 id);
101static inline int x2apic_enabled(void)
102{
103 int msr, msr2;
104
105 if (!cpu_has_x2apic)
106 return 0;
107
108 rdmsr(MSR_IA32_APICBASE, msr, msr2);
109 if (msr & X2APIC_ENABLE)
110 return 1;
111 return 0;
112}
113#else
114#define x2apic_enabled() 0
115#endif
116
117struct apic_ops {
118 u32 (*read)(u32 reg);
119 void (*write)(u32 reg, u32 v);
120 u64 (*icr_read)(void);
121 void (*icr_write)(u32 low, u32 high);
122 void (*wait_icr_idle)(void);
123 u32 (*safe_wait_icr_idle)(void);
124};
125
126extern struct apic_ops *apic_ops;
127
128#define apic_read (apic_ops->read)
129#define apic_write (apic_ops->write)
130#define apic_icr_read (apic_ops->icr_read)
131#define apic_icr_write (apic_ops->icr_write)
132#define apic_wait_icr_idle (apic_ops->wait_icr_idle)
133#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle)
134
135extern int get_physical_broadcast(void);
136
137#ifdef CONFIG_X86_64
138static inline void ack_x2APIC_irq(void)
139{
140 /* Docs say use 0 for future compatibility */
141 native_apic_msr_write(APIC_EOI, 0);
142}
143#endif
144
145
146static inline void ack_APIC_irq(void)
147{
148 /*
149 * ack_APIC_irq() actually gets compiled as a single instruction
150 * ... yummie.
151 */
152
153 /* Docs say use 0 for future compatibility */
154 apic_write(APIC_EOI, 0);
155}
156
157extern int lapic_get_maxlvt(void);
158extern void clear_local_APIC(void);
159extern void connect_bsp_APIC(void);
160extern void disconnect_bsp_APIC(int virt_wire_setup);
161extern void disable_local_APIC(void);
162extern void lapic_shutdown(void);
163extern int verify_local_APIC(void);
164extern void cache_APIC_registers(void);
165extern void sync_Arb_IDs(void);
166extern void init_bsp_APIC(void);
167extern void setup_local_APIC(void);
168extern void end_local_APIC_setup(void);
169extern void init_apic_mappings(void);
170extern void setup_boot_APIC_clock(void);
171extern void setup_secondary_APIC_clock(void);
172extern int APIC_init_uniprocessor(void);
173extern void enable_NMI_through_LVT0(void);
174
175/*
176 * On 32bit this is mach-xxx local
177 */
178#ifdef CONFIG_X86_64
179extern void early_init_lapic_mapping(void);
180extern int apic_is_clustered_box(void);
181#else
182static inline int apic_is_clustered_box(void)
183{
184 return 0;
185}
186#endif
187
188extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask);
189extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
190
191
192#else /* !CONFIG_X86_LOCAL_APIC */
193static inline void lapic_shutdown(void) { }
194#define local_apic_timer_c2_ok 1
195static inline void init_apic_mappings(void) { }
196
197#endif /* !CONFIG_X86_LOCAL_APIC */
198
199#endif /* _ASM_X86_APIC_H */
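
A small usage sketch of the apic_ops indirection above (example_show_lapic() is an invented name): callers always go through apic_read()/apic_write(), so the same code works whether apic_ops points at the memory-mapped xAPIC accessors or the MSR-based x2APIC ones.

#include <asm/apic.h>
#include <asm/apicdef.h>

static void example_show_lapic(void)
{
	u32 id = apic_read(APIC_ID);	/* dispatches via apic_ops->read */
	u32 lvr = apic_read(APIC_LVR);

	apic_printk(APIC_VERBOSE, "APIC ID register 0x%08x, version 0x%02x\n",
		    id, GET_APIC_VERSION(lvr));
}
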
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
new file mode 100644
index 000000000000..63134e31e8b9
--- /dev/null
+++ b/arch/x86/include/asm/apicdef.h
@@ -0,0 +1,417 @@
1#ifndef _ASM_X86_APICDEF_H
2#define _ASM_X86_APICDEF_H
3
4/*
5 * Constants for various Intel APICs. (local APIC, IOAPIC, etc.)
6 *
7 * Alan Cox <Alan.Cox@linux.org>, 1995.
8 * Ingo Molnar <mingo@redhat.com>, 1999, 2000
9 */
10
11#define APIC_DEFAULT_PHYS_BASE 0xfee00000
12
13#define APIC_ID 0x20
14
15#define APIC_LVR 0x30
16#define APIC_LVR_MASK 0xFF00FF
17#define GET_APIC_VERSION(x) ((x) & 0xFFu)
18#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu)
19#ifdef CONFIG_X86_32
20# define APIC_INTEGRATED(x) ((x) & 0xF0u)
21#else
22# define APIC_INTEGRATED(x) (1)
23#endif
24#define APIC_XAPIC(x) ((x) >= 0x14)
25#define APIC_TASKPRI 0x80
26#define APIC_TPRI_MASK 0xFFu
27#define APIC_ARBPRI 0x90
28#define APIC_ARBPRI_MASK 0xFFu
29#define APIC_PROCPRI 0xA0
30#define APIC_EOI 0xB0
31#define APIC_EIO_ACK 0x0
32#define APIC_RRR 0xC0
33#define APIC_LDR 0xD0
34#define APIC_LDR_MASK (0xFFu << 24)
35#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu)
36#define SET_APIC_LOGICAL_ID(x) (((x) << 24))
37#define APIC_ALL_CPUS 0xFFu
38#define APIC_DFR 0xE0
39#define APIC_DFR_CLUSTER 0x0FFFFFFFul
40#define APIC_DFR_FLAT 0xFFFFFFFFul
41#define APIC_SPIV 0xF0
42#define APIC_SPIV_FOCUS_DISABLED (1 << 9)
43#define APIC_SPIV_APIC_ENABLED (1 << 8)
44#define APIC_ISR 0x100
45#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */
46#define APIC_TMR 0x180
47#define APIC_IRR 0x200
48#define APIC_ESR 0x280
49#define APIC_ESR_SEND_CS 0x00001
50#define APIC_ESR_RECV_CS 0x00002
51#define APIC_ESR_SEND_ACC 0x00004
52#define APIC_ESR_RECV_ACC 0x00008
53#define APIC_ESR_SENDILL 0x00020
54#define APIC_ESR_RECVILL 0x00040
55#define APIC_ESR_ILLREGA 0x00080
56#define APIC_ICR 0x300
57#define APIC_DEST_SELF 0x40000
58#define APIC_DEST_ALLINC 0x80000
59#define APIC_DEST_ALLBUT 0xC0000
60#define APIC_ICR_RR_MASK 0x30000
61#define APIC_ICR_RR_INVALID 0x00000
62#define APIC_ICR_RR_INPROG 0x10000
63#define APIC_ICR_RR_VALID 0x20000
64#define APIC_INT_LEVELTRIG 0x08000
65#define APIC_INT_ASSERT 0x04000
66#define APIC_ICR_BUSY 0x01000
67#define APIC_DEST_LOGICAL 0x00800
68#define APIC_DEST_PHYSICAL 0x00000
69#define APIC_DM_FIXED 0x00000
70#define APIC_DM_LOWEST 0x00100
71#define APIC_DM_SMI 0x00200
72#define APIC_DM_REMRD 0x00300
73#define APIC_DM_NMI 0x00400
74#define APIC_DM_INIT 0x00500
75#define APIC_DM_STARTUP 0x00600
76#define APIC_DM_EXTINT 0x00700
77#define APIC_VECTOR_MASK 0x000FF
78#define APIC_ICR2 0x310
79#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF)
80#define SET_APIC_DEST_FIELD(x) ((x) << 24)
81#define APIC_LVTT 0x320
82#define APIC_LVTTHMR 0x330
83#define APIC_LVTPC 0x340
84#define APIC_LVT0 0x350
85#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18)
86#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3)
87#define SET_APIC_TIMER_BASE(x) (((x) << 18))
88#define APIC_TIMER_BASE_CLKIN 0x0
89#define APIC_TIMER_BASE_TMBASE 0x1
90#define APIC_TIMER_BASE_DIV 0x2
91#define APIC_LVT_TIMER_PERIODIC (1 << 17)
92#define APIC_LVT_MASKED (1 << 16)
93#define APIC_LVT_LEVEL_TRIGGER (1 << 15)
94#define APIC_LVT_REMOTE_IRR (1 << 14)
95#define APIC_INPUT_POLARITY (1 << 13)
96#define APIC_SEND_PENDING (1 << 12)
97#define APIC_MODE_MASK 0x700
98#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7)
99#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8))
100#define APIC_MODE_FIXED 0x0
101#define APIC_MODE_NMI 0x4
102#define APIC_MODE_EXTINT 0x7
103#define APIC_LVT1 0x360
104#define APIC_LVTERR 0x370
105#define APIC_TMICT 0x380
106#define APIC_TMCCT 0x390
107#define APIC_TDCR 0x3E0
108#define APIC_SELF_IPI 0x3F0
109#define APIC_TDR_DIV_TMBASE (1 << 2)
110#define APIC_TDR_DIV_1 0xB
111#define APIC_TDR_DIV_2 0x0
112#define APIC_TDR_DIV_4 0x1
113#define APIC_TDR_DIV_8 0x2
114#define APIC_TDR_DIV_16 0x3
115#define APIC_TDR_DIV_32 0x8
116#define APIC_TDR_DIV_64 0x9
117#define APIC_TDR_DIV_128 0xA
118#define APIC_EILVT0 0x500
119#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
120#define APIC_EILVT_NR_AMD_10H 4
121#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
122#define APIC_EILVT_MSG_FIX 0x0
123#define APIC_EILVT_MSG_SMI 0x2
124#define APIC_EILVT_MSG_NMI 0x4
125#define APIC_EILVT_MSG_EXT 0x7
126#define APIC_EILVT_MASKED (1 << 16)
127#define APIC_EILVT1 0x510
128#define APIC_EILVT2 0x520
129#define APIC_EILVT3 0x530
130
131#define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
132#define APIC_BASE_MSR 0x800
133#define X2APIC_ENABLE (1UL << 10)
134
135#ifdef CONFIG_X86_32
136# define MAX_IO_APICS 64
137#else
138# define MAX_IO_APICS 128
139# define MAX_LOCAL_APIC 32768
140#endif
141
142/*
143 * All x86-64 systems are xAPIC compatible.
144 * In the following, "apicid" is a physical APIC ID.
145 */
146#define XAPIC_DEST_CPUS_SHIFT 4
147#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
148#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
149#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
150#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT)
151#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK)
152#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT)
153
154/*
155 * the local APIC register structure, memory mapped. Not terribly well
156 * tested, but we might eventually use this one in the future - the
157 * reason we cannot use it right now is the P5 APIC: it has an
158 * erratum whereby it can only take 32-bit reads and writes, not 8-bit ones ...
159 */
160#define u32 unsigned int
161
162struct local_apic {
163
164/*000*/ struct { u32 __reserved[4]; } __reserved_01;
165
166/*010*/ struct { u32 __reserved[4]; } __reserved_02;
167
168/*020*/ struct { /* APIC ID Register */
169 u32 __reserved_1 : 24,
170 phys_apic_id : 4,
171 __reserved_2 : 4;
172 u32 __reserved[3];
173 } id;
174
175/*030*/ const
176 struct { /* APIC Version Register */
177 u32 version : 8,
178 __reserved_1 : 8,
179 max_lvt : 8,
180 __reserved_2 : 8;
181 u32 __reserved[3];
182 } version;
183
184/*040*/ struct { u32 __reserved[4]; } __reserved_03;
185
186/*050*/ struct { u32 __reserved[4]; } __reserved_04;
187
188/*060*/ struct { u32 __reserved[4]; } __reserved_05;
189
190/*070*/ struct { u32 __reserved[4]; } __reserved_06;
191
192/*080*/ struct { /* Task Priority Register */
193 u32 priority : 8,
194 __reserved_1 : 24;
195 u32 __reserved_2[3];
196 } tpr;
197
198/*090*/ const
199 struct { /* Arbitration Priority Register */
200 u32 priority : 8,
201 __reserved_1 : 24;
202 u32 __reserved_2[3];
203 } apr;
204
205/*0A0*/ const
206 struct { /* Processor Priority Register */
207 u32 priority : 8,
208 __reserved_1 : 24;
209 u32 __reserved_2[3];
210 } ppr;
211
212/*0B0*/ struct { /* End Of Interrupt Register */
213 u32 eoi;
214 u32 __reserved[3];
215 } eoi;
216
217/*0C0*/ struct { u32 __reserved[4]; } __reserved_07;
218
219/*0D0*/ struct { /* Logical Destination Register */
220 u32 __reserved_1 : 24,
221 logical_dest : 8;
222 u32 __reserved_2[3];
223 } ldr;
224
225/*0E0*/ struct { /* Destination Format Register */
226 u32 __reserved_1 : 28,
227 model : 4;
228 u32 __reserved_2[3];
229 } dfr;
230
231/*0F0*/ struct { /* Spurious Interrupt Vector Register */
232 u32 spurious_vector : 8,
233 apic_enabled : 1,
234 focus_cpu : 1,
235 __reserved_2 : 22;
236 u32 __reserved_3[3];
237 } svr;
238
239/*100*/ struct { /* In Service Register */
240/*170*/ u32 bitfield;
241 u32 __reserved[3];
242 } isr [8];
243
244/*180*/ struct { /* Trigger Mode Register */
245/*1F0*/ u32 bitfield;
246 u32 __reserved[3];
247 } tmr [8];
248
249/*200*/ struct { /* Interrupt Request Register */
250/*270*/ u32 bitfield;
251 u32 __reserved[3];
252 } irr [8];
253
254/*280*/ union { /* Error Status Register */
255 struct {
256 u32 send_cs_error : 1,
257 receive_cs_error : 1,
258 send_accept_error : 1,
259 receive_accept_error : 1,
260 __reserved_1 : 1,
261 send_illegal_vector : 1,
262 receive_illegal_vector : 1,
263 illegal_register_address : 1,
264 __reserved_2 : 24;
265 u32 __reserved_3[3];
266 } error_bits;
267 struct {
268 u32 errors;
269 u32 __reserved_3[3];
270 } all_errors;
271 } esr;
272
273/*290*/ struct { u32 __reserved[4]; } __reserved_08;
274
275/*2A0*/ struct { u32 __reserved[4]; } __reserved_09;
276
277/*2B0*/ struct { u32 __reserved[4]; } __reserved_10;
278
279/*2C0*/ struct { u32 __reserved[4]; } __reserved_11;
280
281/*2D0*/ struct { u32 __reserved[4]; } __reserved_12;
282
283/*2E0*/ struct { u32 __reserved[4]; } __reserved_13;
284
285/*2F0*/ struct { u32 __reserved[4]; } __reserved_14;
286
287/*300*/ struct { /* Interrupt Command Register 1 */
288 u32 vector : 8,
289 delivery_mode : 3,
290 destination_mode : 1,
291 delivery_status : 1,
292 __reserved_1 : 1,
293 level : 1,
294 trigger : 1,
295 __reserved_2 : 2,
296 shorthand : 2,
297 __reserved_3 : 12;
298 u32 __reserved_4[3];
299 } icr1;
300
301/*310*/ struct { /* Interrupt Command Register 2 */
302 union {
303 u32 __reserved_1 : 24,
304 phys_dest : 4,
305 __reserved_2 : 4;
306 u32 __reserved_3 : 24,
307 logical_dest : 8;
308 } dest;
309 u32 __reserved_4[3];
310 } icr2;
311
312/*320*/ struct { /* LVT - Timer */
313 u32 vector : 8,
314 __reserved_1 : 4,
315 delivery_status : 1,
316 __reserved_2 : 3,
317 mask : 1,
318 timer_mode : 1,
319 __reserved_3 : 14;
320 u32 __reserved_4[3];
321 } lvt_timer;
322
323/*330*/ struct { /* LVT - Thermal Sensor */
324 u32 vector : 8,
325 delivery_mode : 3,
326 __reserved_1 : 1,
327 delivery_status : 1,
328 __reserved_2 : 3,
329 mask : 1,
330 __reserved_3 : 15;
331 u32 __reserved_4[3];
332 } lvt_thermal;
333
334/*340*/ struct { /* LVT - Performance Counter */
335 u32 vector : 8,
336 delivery_mode : 3,
337 __reserved_1 : 1,
338 delivery_status : 1,
339 __reserved_2 : 3,
340 mask : 1,
341 __reserved_3 : 15;
342 u32 __reserved_4[3];
343 } lvt_pc;
344
345/*350*/ struct { /* LVT - LINT0 */
346 u32 vector : 8,
347 delivery_mode : 3,
348 __reserved_1 : 1,
349 delivery_status : 1,
350 polarity : 1,
351 remote_irr : 1,
352 trigger : 1,
353 mask : 1,
354 __reserved_2 : 15;
355 u32 __reserved_3[3];
356 } lvt_lint0;
357
358/*360*/ struct { /* LVT - LINT1 */
359 u32 vector : 8,
360 delivery_mode : 3,
361 __reserved_1 : 1,
362 delivery_status : 1,
363 polarity : 1,
364 remote_irr : 1,
365 trigger : 1,
366 mask : 1,
367 __reserved_2 : 15;
368 u32 __reserved_3[3];
369 } lvt_lint1;
370
371/*370*/ struct { /* LVT - Error */
372 u32 vector : 8,
373 __reserved_1 : 4,
374 delivery_status : 1,
375 __reserved_2 : 3,
376 mask : 1,
377 __reserved_3 : 15;
378 u32 __reserved_4[3];
379 } lvt_error;
380
381/*380*/ struct { /* Timer Initial Count Register */
382 u32 initial_count;
383 u32 __reserved_2[3];
384 } timer_icr;
385
386/*390*/ const
387 struct { /* Timer Current Count Register */
388 u32 curr_count;
389 u32 __reserved_2[3];
390 } timer_ccr;
391
392/*3A0*/ struct { u32 __reserved[4]; } __reserved_16;
393
394/*3B0*/ struct { u32 __reserved[4]; } __reserved_17;
395
396/*3C0*/ struct { u32 __reserved[4]; } __reserved_18;
397
398/*3D0*/ struct { u32 __reserved[4]; } __reserved_19;
399
400/*3E0*/ struct { /* Timer Divide Configuration Register */
401 u32 divisor : 4,
402 __reserved_1 : 28;
403 u32 __reserved_2[3];
404 } timer_dcr;
405
406/*3F0*/ struct { u32 __reserved[4]; } __reserved_20;
407
408} __attribute__ ((packed));
409
410#undef u32
411
412#ifdef CONFIG_X86_32
413 #define BAD_APICID 0xFFu
414#else
415 #define BAD_APICID 0xFFFFu
416#endif
417#endif /* _ASM_X86_APICDEF_H */
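
For illustration, the version-register accessors above are typically combined like this (example_report_lapic_version() is an invented name, not part of the header):

#include <linux/kernel.h>
#include <asm/apic.h>
#include <asm/apicdef.h>

static void example_report_lapic_version(void)
{
	u32 lvr = apic_read(APIC_LVR) & APIC_LVR_MASK;
	u32 version = GET_APIC_VERSION(lvr);

	printk(KERN_INFO "local APIC version 0x%02x, %u LVT entries (%s)\n",
	       version, GET_APIC_MAXLVT(lvr) + 1,
	       APIC_INTEGRATED(version) ? "integrated" : "external 82489DX");
}
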
diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h
new file mode 100644
index 000000000000..cbd4957838a6
--- /dev/null
+++ b/arch/x86/include/asm/arch_hooks.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_ARCH_HOOKS_H
2#define _ASM_X86_ARCH_HOOKS_H
3
4#include <linux/interrupt.h>
5
6/*
7 * linux/include/asm/arch_hooks.h
8 *
9 * define the architecture specific hooks
10 */
11
12/* these aren't arch hooks, they are generic routines
13 * that can be used by the hooks */
14extern void init_ISA_irqs(void);
15extern irqreturn_t timer_interrupt(int irq, void *dev_id);
16
17/* these are the defined hooks */
18extern void intr_init_hook(void);
19extern void pre_intr_init_hook(void);
20extern void pre_setup_arch_hook(void);
21extern void trap_init_hook(void);
22extern void pre_time_init_hook(void);
23extern void time_init_hook(void);
24extern void mca_nmi_hook(void);
25
26#endif /* _ASM_X86_ARCH_HOOKS_H */
diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h
new file mode 100644
index 000000000000..56be78f582f0
--- /dev/null
+++ b/arch/x86/include/asm/asm.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_ASM_H
2#define _ASM_X86_ASM_H
3
4#ifdef __ASSEMBLY__
5# define __ASM_FORM(x) x
6# define __ASM_EX_SEC .section __ex_table
7#else
8# define __ASM_FORM(x) " " #x " "
9# define __ASM_EX_SEC " .section __ex_table,\"a\"\n"
10#endif
11
12#ifdef CONFIG_X86_32
13# define __ASM_SEL(a,b) __ASM_FORM(a)
14#else
15# define __ASM_SEL(a,b) __ASM_FORM(b)
16#endif
17
18#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q)
19#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg)
20
21#define _ASM_PTR __ASM_SEL(.long, .quad)
22#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8)
23
24#define _ASM_MOV __ASM_SIZE(mov)
25#define _ASM_INC __ASM_SIZE(inc)
26#define _ASM_DEC __ASM_SIZE(dec)
27#define _ASM_ADD __ASM_SIZE(add)
28#define _ASM_SUB __ASM_SIZE(sub)
29#define _ASM_XADD __ASM_SIZE(xadd)
30
31#define _ASM_AX __ASM_REG(ax)
32#define _ASM_BX __ASM_REG(bx)
33#define _ASM_CX __ASM_REG(cx)
34#define _ASM_DX __ASM_REG(dx)
35#define _ASM_SP __ASM_REG(sp)
36#define _ASM_BP __ASM_REG(bp)
37#define _ASM_SI __ASM_REG(si)
38#define _ASM_DI __ASM_REG(di)
39
40/* Exception table entry */
41# define _ASM_EXTABLE(from,to) \
42 __ASM_EX_SEC \
43 _ASM_ALIGN "\n" \
44 _ASM_PTR #from "," #to "\n" \
45 " .previous\n"
46
47#endif /* _ASM_X86_ASM_H */
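
_ASM_EXTABLE() records a "may fault here, resume there" pair in the __ex_table section so the page-fault handler can recover instead of oopsing. Below is a hedged sketch modeled on the kernel's get_user helpers; example_get_user_int() itself is illustrative and simplified (the real routines also check access_ok() and handle several operand sizes).

#include <linux/compiler.h>
#include <linux/errno.h>
#include <asm/asm.h>

/* Returns 0 and fills *result, or -EFAULT if the user address faults. */
static inline int example_get_user_int(int *result, const int __user *uaddr)
{
	int ret = 0;
	int val;

	asm volatile("1:	movl %2,%1\n"
		     "2:\n"
		     ".section .fixup,\"ax\"\n"
		     "3:	mov %3,%0\n"
		     "	xorl %1,%1\n"
		     "	jmp 2b\n"
		     ".previous\n"
		     _ASM_EXTABLE(1b, 3b)	/* fault at 1 -> resume at 3 */
		     : "=r" (ret), "=r" (val)
		     : "m" (*uaddr), "i" (-EFAULT), "0" (ret));
	*result = val;
	return ret;
}
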
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
new file mode 100644
index 000000000000..4e1b8873c474
--- /dev/null
+++ b/arch/x86/include/asm/atomic.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "atomic_32.h"
3#else
4# include "atomic_64.h"
5#endif
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
new file mode 100644
index 000000000000..ad5b9f6ecddf
--- /dev/null
+++ b/arch/x86/include/asm/atomic_32.h
@@ -0,0 +1,259 @@
1#ifndef _ASM_X86_ATOMIC_32_H
2#define _ASM_X86_ATOMIC_32_H
3
4#include <linux/compiler.h>
5#include <asm/processor.h>
6#include <asm/cmpxchg.h>
7
8/*
9 * Atomic operations that C can't guarantee us. Useful for
10 * resource counting etc..
11 */
12
13/*
14 * Make sure gcc doesn't try to be clever and move things around
15 * on us. We need to use _exactly_ the address the user gave us,
16 * not some alias that contains the same information.
17 */
18typedef struct {
19 int counter;
20} atomic_t;
21
22#define ATOMIC_INIT(i) { (i) }
23
24/**
25 * atomic_read - read atomic variable
26 * @v: pointer of type atomic_t
27 *
28 * Atomically reads the value of @v.
29 */
30#define atomic_read(v) ((v)->counter)
31
32/**
33 * atomic_set - set atomic variable
34 * @v: pointer of type atomic_t
35 * @i: required value
36 *
37 * Atomically sets the value of @v to @i.
38 */
39#define atomic_set(v, i) (((v)->counter) = (i))
40
41/**
42 * atomic_add - add integer to atomic variable
43 * @i: integer value to add
44 * @v: pointer of type atomic_t
45 *
46 * Atomically adds @i to @v.
47 */
48static inline void atomic_add(int i, atomic_t *v)
49{
50 asm volatile(LOCK_PREFIX "addl %1,%0"
51 : "+m" (v->counter)
52 : "ir" (i));
53}
54
55/**
56 * atomic_sub - subtract integer from atomic variable
57 * @i: integer value to subtract
58 * @v: pointer of type atomic_t
59 *
60 * Atomically subtracts @i from @v.
61 */
62static inline void atomic_sub(int i, atomic_t *v)
63{
64 asm volatile(LOCK_PREFIX "subl %1,%0"
65 : "+m" (v->counter)
66 : "ir" (i));
67}
68
69/**
70 * atomic_sub_and_test - subtract value from variable and test result
71 * @i: integer value to subtract
72 * @v: pointer of type atomic_t
73 *
74 * Atomically subtracts @i from @v and returns
75 * true if the result is zero, or false for all
76 * other cases.
77 */
78static inline int atomic_sub_and_test(int i, atomic_t *v)
79{
80 unsigned char c;
81
82 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
83 : "+m" (v->counter), "=qm" (c)
84 : "ir" (i) : "memory");
85 return c;
86}
87
88/**
89 * atomic_inc - increment atomic variable
90 * @v: pointer of type atomic_t
91 *
92 * Atomically increments @v by 1.
93 */
94static inline void atomic_inc(atomic_t *v)
95{
96 asm volatile(LOCK_PREFIX "incl %0"
97 : "+m" (v->counter));
98}
99
100/**
101 * atomic_dec - decrement atomic variable
102 * @v: pointer of type atomic_t
103 *
104 * Atomically decrements @v by 1.
105 */
106static inline void atomic_dec(atomic_t *v)
107{
108 asm volatile(LOCK_PREFIX "decl %0"
109 : "+m" (v->counter));
110}
111
112/**
113 * atomic_dec_and_test - decrement and test
114 * @v: pointer of type atomic_t
115 *
116 * Atomically decrements @v by 1 and
117 * returns true if the result is 0, or false for all other
118 * cases.
119 */
120static inline int atomic_dec_and_test(atomic_t *v)
121{
122 unsigned char c;
123
124 asm volatile(LOCK_PREFIX "decl %0; sete %1"
125 : "+m" (v->counter), "=qm" (c)
126 : : "memory");
127 return c != 0;
128}
129
130/**
131 * atomic_inc_and_test - increment and test
132 * @v: pointer of type atomic_t
133 *
134 * Atomically increments @v by 1
135 * and returns true if the result is zero, or false for all
136 * other cases.
137 */
138static inline int atomic_inc_and_test(atomic_t *v)
139{
140 unsigned char c;
141
142 asm volatile(LOCK_PREFIX "incl %0; sete %1"
143 : "+m" (v->counter), "=qm" (c)
144 : : "memory");
145 return c != 0;
146}
147
148/**
149 * atomic_add_negative - add and test if negative
150 * @v: pointer of type atomic_t
151 * @i: integer value to add
152 *
153 * Atomically adds @i to @v and returns true
154 * if the result is negative, or false when
155 * result is greater than or equal to zero.
156 */
157static inline int atomic_add_negative(int i, atomic_t *v)
158{
159 unsigned char c;
160
161 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
162 : "+m" (v->counter), "=qm" (c)
163 : "ir" (i) : "memory");
164 return c;
165}
166
167/**
168 * atomic_add_return - add integer and return
169 * @v: pointer of type atomic_t
170 * @i: integer value to add
171 *
172 * Atomically adds @i to @v and returns @i + @v
173 */
174static inline int atomic_add_return(int i, atomic_t *v)
175{
176 int __i;
177#ifdef CONFIG_M386
178 unsigned long flags;
179 if (unlikely(boot_cpu_data.x86 <= 3))
180 goto no_xadd;
181#endif
182 /* Modern 486+ processor */
183 __i = i;
184 asm volatile(LOCK_PREFIX "xaddl %0, %1"
185 : "+r" (i), "+m" (v->counter)
186 : : "memory");
187 return i + __i;
188
189#ifdef CONFIG_M386
190no_xadd: /* Legacy 386 processor */
191 local_irq_save(flags);
192 __i = atomic_read(v);
193 atomic_set(v, i + __i);
194 local_irq_restore(flags);
195 return i + __i;
196#endif
197}
198
199/**
200 * atomic_sub_return - subtract integer and return
201 * @v: pointer of type atomic_t
202 * @i: integer value to subtract
203 *
204 * Atomically subtracts @i from @v and returns @v - @i
205 */
206static inline int atomic_sub_return(int i, atomic_t *v)
207{
208 return atomic_add_return(-i, v);
209}
210
211#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
212#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
213
214/**
215 * atomic_add_unless - add unless the number is already a given value
216 * @v: pointer of type atomic_t
217 * @a: the amount to add to v...
218 * @u: ...unless v is equal to u.
219 *
220 * Atomically adds @a to @v, so long as @v was not already @u.
221 * Returns non-zero if @v was not @u, and zero otherwise.
222 */
223static inline int atomic_add_unless(atomic_t *v, int a, int u)
224{
225 int c, old;
226 c = atomic_read(v);
227 for (;;) {
228 if (unlikely(c == (u)))
229 break;
230 old = atomic_cmpxchg((v), c, c + (a));
231 if (likely(old == c))
232 break;
233 c = old;
234 }
235 return c != (u);
236}
237
238#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
239
240#define atomic_inc_return(v) (atomic_add_return(1, v))
241#define atomic_dec_return(v) (atomic_sub_return(1, v))
242
243/* These are x86-specific, used by some header files */
244#define atomic_clear_mask(mask, addr) \
245 asm volatile(LOCK_PREFIX "andl %0,%1" \
246 : : "r" (~(mask)), "m" (*(addr)) : "memory")
247
248#define atomic_set_mask(mask, addr) \
249 asm volatile(LOCK_PREFIX "orl %0,%1" \
250 : : "r" (mask), "m" (*(addr)) : "memory")
251
252/* Atomic operations are already serializing on x86 */
253#define smp_mb__before_atomic_dec() barrier()
254#define smp_mb__after_atomic_dec() barrier()
255#define smp_mb__before_atomic_inc() barrier()
256#define smp_mb__after_atomic_inc() barrier()
257
258#include <asm-generic/atomic.h>
259#endif /* _ASM_X86_ATOMIC_32_H */
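
A short usage sketch of the atomic_t API above, showing the common "take a reference only while the count is non-zero, free on the last put" pattern (struct example_obj and the two functions are illustrative):

#include <asm/atomic.h>
#include <linux/slab.h>

struct example_obj {
	atomic_t refcount;
	/* payload ... */
};

/* Fails (returns 0) if the object is already on its way to being freed. */
static int example_obj_get(struct example_obj *obj)
{
	return atomic_inc_not_zero(&obj->refcount);
}

/* Exactly one caller observes the 1 -> 0 transition and frees the object. */
static void example_obj_put(struct example_obj *obj)
{
	if (atomic_dec_and_test(&obj->refcount))
		kfree(obj);
}
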
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
new file mode 100644
index 000000000000..279d2a731f3f
--- /dev/null
+++ b/arch/x86/include/asm/atomic_64.h
@@ -0,0 +1,473 @@
1#ifndef _ASM_X86_ATOMIC_64_H
2#define _ASM_X86_ATOMIC_64_H
3
4#include <asm/alternative.h>
5#include <asm/cmpxchg.h>
6
7/* atomic_t should be a 32-bit signed type */
8
9/*
10 * Atomic operations that C can't guarantee us. Useful for
11 * resource counting etc..
12 */
13
14/*
15 * Make sure gcc doesn't try to be clever and move things around
16 * on us. We need to use _exactly_ the address the user gave us,
17 * not some alias that contains the same information.
18 */
19typedef struct {
20 int counter;
21} atomic_t;
22
23#define ATOMIC_INIT(i) { (i) }
24
25/**
26 * atomic_read - read atomic variable
27 * @v: pointer of type atomic_t
28 *
29 * Atomically reads the value of @v.
30 */
31#define atomic_read(v) ((v)->counter)
32
33/**
34 * atomic_set - set atomic variable
35 * @v: pointer of type atomic_t
36 * @i: required value
37 *
38 * Atomically sets the value of @v to @i.
39 */
40#define atomic_set(v, i) (((v)->counter) = (i))
41
42/**
43 * atomic_add - add integer to atomic variable
44 * @i: integer value to add
45 * @v: pointer of type atomic_t
46 *
47 * Atomically adds @i to @v.
48 */
49static inline void atomic_add(int i, atomic_t *v)
50{
51 asm volatile(LOCK_PREFIX "addl %1,%0"
52 : "=m" (v->counter)
53 : "ir" (i), "m" (v->counter));
54}
55
56/**
57 * atomic_sub - subtract integer from atomic variable
58 * @i: integer value to subtract
59 * @v: pointer of type atomic_t
60 *
61 * Atomically subtracts @i from @v.
62 */
63static inline void atomic_sub(int i, atomic_t *v)
64{
65 asm volatile(LOCK_PREFIX "subl %1,%0"
66 : "=m" (v->counter)
67 : "ir" (i), "m" (v->counter));
68}
69
70/**
71 * atomic_sub_and_test - subtract value from variable and test result
72 * @i: integer value to subtract
73 * @v: pointer of type atomic_t
74 *
75 * Atomically subtracts @i from @v and returns
76 * true if the result is zero, or false for all
77 * other cases.
78 */
79static inline int atomic_sub_and_test(int i, atomic_t *v)
80{
81 unsigned char c;
82
83 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
84 : "=m" (v->counter), "=qm" (c)
85 : "ir" (i), "m" (v->counter) : "memory");
86 return c;
87}
88
89/**
90 * atomic_inc - increment atomic variable
91 * @v: pointer of type atomic_t
92 *
93 * Atomically increments @v by 1.
94 */
95static inline void atomic_inc(atomic_t *v)
96{
97 asm volatile(LOCK_PREFIX "incl %0"
98 : "=m" (v->counter)
99 : "m" (v->counter));
100}
101
102/**
103 * atomic_dec - decrement atomic variable
104 * @v: pointer of type atomic_t
105 *
106 * Atomically decrements @v by 1.
107 */
108static inline void atomic_dec(atomic_t *v)
109{
110 asm volatile(LOCK_PREFIX "decl %0"
111 : "=m" (v->counter)
112 : "m" (v->counter));
113}
114
115/**
116 * atomic_dec_and_test - decrement and test
117 * @v: pointer of type atomic_t
118 *
119 * Atomically decrements @v by 1 and
120 * returns true if the result is 0, or false for all other
121 * cases.
122 */
123static inline int atomic_dec_and_test(atomic_t *v)
124{
125 unsigned char c;
126
127 asm volatile(LOCK_PREFIX "decl %0; sete %1"
128 : "=m" (v->counter), "=qm" (c)
129 : "m" (v->counter) : "memory");
130 return c != 0;
131}
132
133/**
134 * atomic_inc_and_test - increment and test
135 * @v: pointer of type atomic_t
136 *
137 * Atomically increments @v by 1
138 * and returns true if the result is zero, or false for all
139 * other cases.
140 */
141static inline int atomic_inc_and_test(atomic_t *v)
142{
143 unsigned char c;
144
145 asm volatile(LOCK_PREFIX "incl %0; sete %1"
146 : "=m" (v->counter), "=qm" (c)
147 : "m" (v->counter) : "memory");
148 return c != 0;
149}
150
151/**
152 * atomic_add_negative - add and test if negative
153 * @i: integer value to add
154 * @v: pointer of type atomic_t
155 *
156 * Atomically adds @i to @v and returns true
157 * if the result is negative, or false when
158 * result is greater than or equal to zero.
159 */
160static inline int atomic_add_negative(int i, atomic_t *v)
161{
162 unsigned char c;
163
164 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
165 : "=m" (v->counter), "=qm" (c)
166 : "ir" (i), "m" (v->counter) : "memory");
167 return c;
168}
169
170/**
171 * atomic_add_return - add and return
172 * @i: integer value to add
173 * @v: pointer of type atomic_t
174 *
175 * Atomically adds @i to @v and returns @i + @v
176 */
177static inline int atomic_add_return(int i, atomic_t *v)
178{
179 int __i = i;
180 asm volatile(LOCK_PREFIX "xaddl %0, %1"
181 : "+r" (i), "+m" (v->counter)
182 : : "memory");
183 return i + __i;
184}
185
186static inline int atomic_sub_return(int i, atomic_t *v)
187{
188 return atomic_add_return(-i, v);
189}
190
191#define atomic_inc_return(v) (atomic_add_return(1, v))
192#define atomic_dec_return(v) (atomic_sub_return(1, v))
193
194/* A 64-bit atomic type */
195
196typedef struct {
197 long counter;
198} atomic64_t;
199
200#define ATOMIC64_INIT(i) { (i) }
201
202/**
203 * atomic64_read - read atomic64 variable
204 * @v: pointer of type atomic64_t
205 *
206 * Atomically reads the value of @v.
207 * Doesn't imply a read memory barrier.
208 */
209#define atomic64_read(v) ((v)->counter)
210
211/**
212 * atomic64_set - set atomic64 variable
213 * @v: pointer to type atomic64_t
214 * @i: required value
215 *
216 * Atomically sets the value of @v to @i.
217 */
218#define atomic64_set(v, i) (((v)->counter) = (i))
219
220/**
221 * atomic64_add - add integer to atomic64 variable
222 * @i: integer value to add
223 * @v: pointer to type atomic64_t
224 *
225 * Atomically adds @i to @v.
226 */
227static inline void atomic64_add(long i, atomic64_t *v)
228{
229 asm volatile(LOCK_PREFIX "addq %1,%0"
230 : "=m" (v->counter)
231 : "er" (i), "m" (v->counter));
232}
233
234/**
235 * atomic64_sub - subtract integer from atomic64 variable
236 * @i: integer value to subtract
237 * @v: pointer to type atomic64_t
238 *
239 * Atomically subtracts @i from @v.
240 */
241static inline void atomic64_sub(long i, atomic64_t *v)
242{
243 asm volatile(LOCK_PREFIX "subq %1,%0"
244 : "=m" (v->counter)
245 : "er" (i), "m" (v->counter));
246}
247
248/**
249 * atomic64_sub_and_test - subtract value from variable and test result
250 * @i: integer value to subtract
251 * @v: pointer to type atomic64_t
252 *
253 * Atomically subtracts @i from @v and returns
254 * true if the result is zero, or false for all
255 * other cases.
256 */
257static inline int atomic64_sub_and_test(long i, atomic64_t *v)
258{
259 unsigned char c;
260
261 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
262 : "=m" (v->counter), "=qm" (c)
263 : "er" (i), "m" (v->counter) : "memory");
264 return c;
265}
266
267/**
268 * atomic64_inc - increment atomic64 variable
269 * @v: pointer to type atomic64_t
270 *
271 * Atomically increments @v by 1.
272 */
273static inline void atomic64_inc(atomic64_t *v)
274{
275 asm volatile(LOCK_PREFIX "incq %0"
276 : "=m" (v->counter)
277 : "m" (v->counter));
278}
279
280/**
281 * atomic64_dec - decrement atomic64 variable
282 * @v: pointer to type atomic64_t
283 *
284 * Atomically decrements @v by 1.
285 */
286static inline void atomic64_dec(atomic64_t *v)
287{
288 asm volatile(LOCK_PREFIX "decq %0"
289 : "=m" (v->counter)
290 : "m" (v->counter));
291}
292
293/**
294 * atomic64_dec_and_test - decrement and test
295 * @v: pointer to type atomic64_t
296 *
297 * Atomically decrements @v by 1 and
298 * returns true if the result is 0, or false for all other
299 * cases.
300 */
301static inline int atomic64_dec_and_test(atomic64_t *v)
302{
303 unsigned char c;
304
305 asm volatile(LOCK_PREFIX "decq %0; sete %1"
306 : "=m" (v->counter), "=qm" (c)
307 : "m" (v->counter) : "memory");
308 return c != 0;
309}
310
311/**
312 * atomic64_inc_and_test - increment and test
313 * @v: pointer to type atomic64_t
314 *
315 * Atomically increments @v by 1
316 * and returns true if the result is zero, or false for all
317 * other cases.
318 */
319static inline int atomic64_inc_and_test(atomic64_t *v)
320{
321 unsigned char c;
322
323 asm volatile(LOCK_PREFIX "incq %0; sete %1"
324 : "=m" (v->counter), "=qm" (c)
325 : "m" (v->counter) : "memory");
326 return c != 0;
327}
328
329/**
330 * atomic64_add_negative - add and test if negative
331 * @i: integer value to add
332 * @v: pointer to type atomic64_t
333 *
334 * Atomically adds @i to @v and returns true
335 * if the result is negative, or false when
336 * result is greater than or equal to zero.
337 */
338static inline int atomic64_add_negative(long i, atomic64_t *v)
339{
340 unsigned char c;
341
342 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
343 : "=m" (v->counter), "=qm" (c)
344 : "er" (i), "m" (v->counter) : "memory");
345 return c;
346}
347
348/**
349 * atomic64_add_return - add and return
350 * @i: integer value to add
351 * @v: pointer to type atomic64_t
352 *
353 * Atomically adds @i to @v and returns @i + @v
354 */
355static inline long atomic64_add_return(long i, atomic64_t *v)
356{
357 long __i = i;
358 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
359 : "+r" (i), "+m" (v->counter)
360 : : "memory");
361 return i + __i;
362}
363
364static inline long atomic64_sub_return(long i, atomic64_t *v)
365{
366 return atomic64_add_return(-i, v);
367}
368
369#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
370#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
371
372#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
373#define atomic64_xchg(v, new) (xchg(&((v)->counter), new))
374
375#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new)))
376#define atomic_xchg(v, new) (xchg(&((v)->counter), (new)))
377
378/**
379 * atomic_add_unless - add unless the number is a given value
380 * @v: pointer of type atomic_t
381 * @a: the amount to add to v...
382 * @u: ...unless v is equal to u.
383 *
384 * Atomically adds @a to @v, so long as it was not @u.
385 * Returns non-zero if @v was not @u, and zero otherwise.
386 */
387static inline int atomic_add_unless(atomic_t *v, int a, int u)
388{
389 int c, old;
390 c = atomic_read(v);
391 for (;;) {
392 if (unlikely(c == (u)))
393 break;
394 old = atomic_cmpxchg((v), c, c + (a));
395 if (likely(old == c))
396 break;
397 c = old;
398 }
399 return c != (u);
400}
401
402#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
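atomic_add_unless() packages the usual cmpxchg retry loop, and atomic_inc_not_zero() is its most common use: taking a reference only while the object is still live. A sketch under that assumption (struct and function names are hypothetical):

/* A lookup path may only take a new reference if the refcount has not
 * already dropped to zero (i.e. the object is not being torn down). */
struct cache_entry {
	atomic_t refcnt;
	/* ... payload ... */
};

static struct cache_entry *cache_entry_get(struct cache_entry *e)
{
	if (e && !atomic_inc_not_zero(&e->refcnt))
		return NULL;	/* lost the race against the final put */
	return e;
}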
403
404/**
405 * atomic64_add_unless - add unless the number is a given value
406 * @v: pointer of type atomic64_t
407 * @a: the amount to add to v...
408 * @u: ...unless v is equal to u.
409 *
410 * Atomically adds @a to @v, so long as it was not @u.
411 * Returns non-zero if @v was not @u, and zero otherwise.
412 */
413static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
414{
415 long c, old;
416 c = atomic64_read(v);
417 for (;;) {
418 if (unlikely(c == (u)))
419 break;
420 old = atomic64_cmpxchg((v), c, c + (a));
421 if (likely(old == c))
422 break;
423 c = old;
424 }
425 return c != (u);
426}
427
428/**
429 * atomic_inc_short - increment of a short integer
430 * @v: pointer to type short int
431 *
432 * Atomically adds 1 to @v
433 * Returns the new value of @v
434 */
435static inline short int atomic_inc_short(short int *v)
436{
437 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
438 return *v;
439}
440
441/**
442 * atomic_or_long - OR of two long integers
443 * @v1: pointer to type unsigned long
444 * @v2: pointer to type unsigned long
445 *
446 * Atomically ORs @v2 into the value pointed to by @v1.
447 * The result is stored in @v1; nothing is returned.
448 */
449static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
450{
451 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
452}
453
454#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
455
456/* These are x86-specific, used by some header files */
457#define atomic_clear_mask(mask, addr) \
458 asm volatile(LOCK_PREFIX "andl %0,%1" \
459 : : "r" (~(mask)), "m" (*(addr)) : "memory")
460
461#define atomic_set_mask(mask, addr) \
462 asm volatile(LOCK_PREFIX "orl %0,%1" \
463 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
464 : "memory")
465
466/* Atomic operations are already serializing on x86 */
467#define smp_mb__before_atomic_dec() barrier()
468#define smp_mb__after_atomic_dec() barrier()
469#define smp_mb__before_atomic_inc() barrier()
470#define smp_mb__after_atomic_inc() barrier()
471
472#include <asm-generic/atomic.h>
473#endif /* _ASM_X86_ATOMIC_64_H */
diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h
new file mode 100644
index 000000000000..1316b4c35425
--- /dev/null
+++ b/arch/x86/include/asm/auxvec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_AUXVEC_H
2#define _ASM_X86_AUXVEC_H
3/*
4 * Architecture-neutral AT_ values are in 0-17; leave some room
5 * for more of them and start the x86-specific ones at 32.
6 */
7#ifdef __i386__
8#define AT_SYSINFO 32
9#endif
10#define AT_SYSINFO_EHDR 33
11
12#endif /* _ASM_X86_AUXVEC_H */
diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h
new file mode 100644
index 000000000000..1d9543b9d358
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apic.h
@@ -0,0 +1,139 @@
1#ifndef __ASM_MACH_APIC_H
2#define __ASM_MACH_APIC_H
3
4#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu))
5#define esr_disable (1)
6
7static inline int apic_id_registered(void)
8{
9 return (1);
10}
11
12static inline cpumask_t target_cpus(void)
13{
14#ifdef CONFIG_SMP
15 return cpu_online_map;
16#else
17 return cpumask_of_cpu(0);
18#endif
19}
20
21#undef APIC_DEST_LOGICAL
22#define APIC_DEST_LOGICAL 0
23#define APIC_DFR_VALUE (APIC_DFR_FLAT)
24#define INT_DELIVERY_MODE (dest_Fixed)
25#define INT_DEST_MODE (0) /* phys delivery to target proc */
26#define NO_BALANCE_IRQ (0)
27#define WAKE_SECONDARY_VIA_INIT
28
29
30static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
31{
32 return (0);
33}
34
35static inline unsigned long check_apicid_present(int bit)
36{
37 return (1);
38}
39
40static inline unsigned long calculate_ldr(int cpu)
41{
42 unsigned long val, id;
43 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
44 id = xapic_phys_to_log_apicid(cpu);
45 val |= SET_APIC_LOGICAL_ID(id);
46 return val;
47}
48
49/*
50 * Set up the logical destination ID.
51 *
52 * Intel recommends to set DFR, LDR and TPR before enabling
53 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
54 * document number 292116). So here it goes...
55 */
56static inline void init_apic_ldr(void)
57{
58 unsigned long val;
59 int cpu = smp_processor_id();
60
61 apic_write(APIC_DFR, APIC_DFR_VALUE);
62 val = calculate_ldr(cpu);
63 apic_write(APIC_LDR, val);
64}
65
66static inline void setup_apic_routing(void)
67{
68 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
69 "Physflat", nr_ioapics);
70}
71
72static inline int multi_timer_check(int apic, int irq)
73{
74 return (0);
75}
76
77static inline int apicid_to_node(int logical_apicid)
78{
79 return apicid_2_node[hard_smp_processor_id()];
80}
81
82static inline int cpu_present_to_apicid(int mps_cpu)
83{
84 if (mps_cpu < NR_CPUS)
85 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
86
87 return BAD_APICID;
88}
89
90static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
91{
92 return physid_mask_of_physid(phys_apicid);
93}
94
95extern u8 cpu_2_logical_apicid[];
96/* Mapping from cpu number to logical apicid */
97static inline int cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= NR_CPUS)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
105{
106 /* For clustered we don't have a good way to do this yet - hack */
107 return physids_promote(0xFFL);
108}
109
110static inline void setup_portio_remap(void)
111{
112}
113
114static inline void enable_apic_mode(void)
115{
116}
117
118static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
119{
120 return (1);
121}
122
123/* As we are using single CPU as destination, pick only one CPU here */
124static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
125{
126 int cpu;
127 int apicid;
128
129 cpu = first_cpu(cpumask);
130 apicid = cpu_to_logical_apicid(cpu);
131 return apicid;
132}
133
134static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
135{
136 return cpuid_apic >> index_msb;
137}
138
139#endif /* __ASM_MACH_APIC_H */
diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h
new file mode 100644
index 000000000000..392c3f5ef2fe
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_MACH_APICDEF_H
2#define __ASM_MACH_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h
new file mode 100644
index 000000000000..9404c535b7ec
--- /dev/null
+++ b/arch/x86/include/asm/bigsmp/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_MACH_IPI_H
2#define __ASM_MACH_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_MACH_IPI_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
new file mode 100644
index 000000000000..3c7521063d3f
--- /dev/null
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -0,0 +1,36 @@
1#ifndef _ASM_X86_BIOS_EBDA_H
2#define _ASM_X86_BIOS_EBDA_H
3
4#include <asm/io.h>
5
6/*
7 * there is a real-mode segmented pointer pointing to the
8 * 4K EBDA area at 0x40E.
9 */
10static inline unsigned int get_bios_ebda(void)
11{
12 unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
13 address <<= 4;
14 return address; /* 0 means none */
15}
16
17void reserve_ebda_region(void);
18
19#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
20/*
21 * This is obviously not a great place for this, but we want to be
22 * able to scatter it around anywhere in the kernel.
23 */
24void check_for_bios_corruption(void);
25void start_periodic_check_for_corruption(void);
26#else
27static inline void check_for_bios_corruption(void)
28{
29}
30
31static inline void start_periodic_check_for_corruption(void)
32{
33}
34#endif
35
36#endif /* _ASM_X86_BIOS_EBDA_H */
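As a worked example of the conversion get_bios_ebda() performs (values illustrative only): if the 16-bit word at physical address 0x40E holds the real-mode segment 0x9FC0, shifting it left by 4 yields 0x9FC00, the EBDA's linear address just below 640 KiB; a stored value of 0 means no EBDA is present.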
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
new file mode 100644
index 000000000000..360010322711
--- /dev/null
+++ b/arch/x86/include/asm/bitops.h
@@ -0,0 +1,451 @@
1#ifndef _ASM_X86_BITOPS_H
2#define _ASM_X86_BITOPS_H
3
4/*
5 * Copyright 1992, Linus Torvalds.
6 */
7
8#ifndef _LINUX_BITOPS_H
9#error only <linux/bitops.h> can be included directly
10#endif
11
12#include <linux/compiler.h>
13#include <asm/alternative.h>
14
15/*
16 * These have to be done with inline assembly: that way the bit-setting
17 * is guaranteed to be atomic. All bit operations return 0 if the bit
18 * was cleared before the operation and != 0 if it was not.
19 *
20 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
21 */
22
23#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1)
24/* Technically wrong, but this avoids compilation errors on some gcc
25 versions. */
26#define BITOP_ADDR(x) "=m" (*(volatile long *) (x))
27#else
28#define BITOP_ADDR(x) "+m" (*(volatile long *) (x))
29#endif
30
31#define ADDR BITOP_ADDR(addr)
32
33/*
34 * We do the locked ops that don't return the old value as
35 * a mask operation on a byte.
36 */
37#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr))
38#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3))
39#define CONST_MASK(nr) (1 << ((nr) & 7))
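To see what the constant-@nr fast path in set_bit()/clear_bit() below compiles down to, a worked expansion (illustrative): for nr = 10, CONST_MASK_ADDR(10, addr) selects the byte at addr + (10 >> 3) = addr + 1 and CONST_MASK(10) is 1 << (10 & 7) = 0x04, so the whole operation becomes a single locked orb (or andb with the inverted mask) on that one byte.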
40
41/**
42 * set_bit - Atomically set a bit in memory
43 * @nr: the bit to set
44 * @addr: the address to start counting from
45 *
46 * This function is atomic and may not be reordered. See __set_bit()
47 * if you do not require the atomic guarantees.
48 *
49 * Note: there are no guarantees that this function will not be reordered
50 * on non-x86 architectures, so if you are writing portable code,
51 * make sure not to rely on its reordering guarantees.
52 *
53 * Note that @nr may be almost arbitrarily large; this function is not
54 * restricted to acting on a single-word quantity.
55 */
56static inline void set_bit(unsigned int nr, volatile unsigned long *addr)
57{
58 if (IS_IMMEDIATE(nr)) {
59 asm volatile(LOCK_PREFIX "orb %1,%0"
60 : CONST_MASK_ADDR(nr, addr)
61 : "iq" ((u8)CONST_MASK(nr))
62 : "memory");
63 } else {
64 asm volatile(LOCK_PREFIX "bts %1,%0"
65 : BITOP_ADDR(addr) : "Ir" (nr) : "memory");
66 }
67}
68
69/**
70 * __set_bit - Set a bit in memory
71 * @nr: the bit to set
72 * @addr: the address to start counting from
73 *
74 * Unlike set_bit(), this function is non-atomic and may be reordered.
75 * If it's called on the same region of memory simultaneously, the effect
76 * may be that only one operation succeeds.
77 */
78static inline void __set_bit(int nr, volatile unsigned long *addr)
79{
80 asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory");
81}
82
83/**
84 * clear_bit - Clears a bit in memory
85 * @nr: Bit to clear
86 * @addr: Address to start counting from
87 *
88 * clear_bit() is atomic and may not be reordered. However, it does
89 * not contain a memory barrier, so if it is used for locking purposes,
90 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
91 * in order to ensure changes are visible on other processors.
92 */
93static inline void clear_bit(int nr, volatile unsigned long *addr)
94{
95 if (IS_IMMEDIATE(nr)) {
96 asm volatile(LOCK_PREFIX "andb %1,%0"
97 : CONST_MASK_ADDR(nr, addr)
98 : "iq" ((u8)~CONST_MASK(nr)));
99 } else {
100 asm volatile(LOCK_PREFIX "btr %1,%0"
101 : BITOP_ADDR(addr)
102 : "Ir" (nr));
103 }
104}
105
106/*
107 * clear_bit_unlock - Clears a bit in memory
108 * @nr: Bit to clear
109 * @addr: Address to start counting from
110 *
111 * clear_bit() is atomic and implies release semantics before the memory
112 * operation. It can be used for an unlock.
113 */
114static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
115{
116 barrier();
117 clear_bit(nr, addr);
118}
119
120static inline void __clear_bit(int nr, volatile unsigned long *addr)
121{
122 asm volatile("btr %1,%0" : ADDR : "Ir" (nr));
123}
124
125/*
126 * __clear_bit_unlock - Clears a bit in memory
127 * @nr: Bit to clear
128 * @addr: Address to start counting from
129 *
130 * __clear_bit() is non-atomic and implies release semantics before the memory
131 * operation. It can be used for an unlock if no other CPUs can concurrently
132 * modify other bits in the word.
133 *
134 * No memory barrier is required here, because x86 cannot reorder stores past
135 * older loads. Same principle as spin_unlock.
136 */
137static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr)
138{
139 barrier();
140 __clear_bit(nr, addr);
141}
142
143#define smp_mb__before_clear_bit() barrier()
144#define smp_mb__after_clear_bit() barrier()
145
146/**
147 * __change_bit - Toggle a bit in memory
148 * @nr: the bit to change
149 * @addr: the address to start counting from
150 *
151 * Unlike change_bit(), this function is non-atomic and may be reordered.
152 * If it's called on the same region of memory simultaneously, the effect
153 * may be that only one operation succeeds.
154 */
155static inline void __change_bit(int nr, volatile unsigned long *addr)
156{
157 asm volatile("btc %1,%0" : ADDR : "Ir" (nr));
158}
159
160/**
161 * change_bit - Toggle a bit in memory
162 * @nr: Bit to change
163 * @addr: Address to start counting from
164 *
165 * change_bit() is atomic and may not be reordered.
166 * Note that @nr may be almost arbitrarily large; this function is not
167 * restricted to acting on a single-word quantity.
168 */
169static inline void change_bit(int nr, volatile unsigned long *addr)
170{
171 asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr));
172}
173
174/**
175 * test_and_set_bit - Set a bit and return its old value
176 * @nr: Bit to set
177 * @addr: Address to count from
178 *
179 * This operation is atomic and cannot be reordered.
180 * It also implies a memory barrier.
181 */
182static inline int test_and_set_bit(int nr, volatile unsigned long *addr)
183{
184 int oldbit;
185
186 asm volatile(LOCK_PREFIX "bts %2,%1\n\t"
187 "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
188
189 return oldbit;
190}
191
192/**
193 * test_and_set_bit_lock - Set a bit and return its old value for lock
194 * @nr: Bit to set
195 * @addr: Address to count from
196 *
197 * This is the same as test_and_set_bit on x86.
198 */
199static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr)
200{
201 return test_and_set_bit(nr, addr);
202}
203
204/**
205 * __test_and_set_bit - Set a bit and return its old value
206 * @nr: Bit to set
207 * @addr: Address to count from
208 *
209 * This operation is non-atomic and can be reordered.
210 * If two instances of this operation race, one can appear to succeed
211 * but actually fail. You must protect multiple accesses with a lock.
212 */
213static inline int __test_and_set_bit(int nr, volatile unsigned long *addr)
214{
215 int oldbit;
216
217 asm("bts %2,%1\n\t"
218 "sbb %0,%0"
219 : "=r" (oldbit), ADDR
220 : "Ir" (nr));
221 return oldbit;
222}
223
224/**
225 * test_and_clear_bit - Clear a bit and return its old value
226 * @nr: Bit to clear
227 * @addr: Address to count from
228 *
229 * This operation is atomic and cannot be reordered.
230 * It also implies a memory barrier.
231 */
232static inline int test_and_clear_bit(int nr, volatile unsigned long *addr)
233{
234 int oldbit;
235
236 asm volatile(LOCK_PREFIX "btr %2,%1\n\t"
237 "sbb %0,%0"
238 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
239
240 return oldbit;
241}
242
243/**
244 * __test_and_clear_bit - Clear a bit and return its old value
245 * @nr: Bit to clear
246 * @addr: Address to count from
247 *
248 * This operation is non-atomic and can be reordered.
249 * If two instances of this operation race, one can appear to succeed
250 * but actually fail. You must protect multiple accesses with a lock.
251 */
252static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr)
253{
254 int oldbit;
255
256 asm volatile("btr %2,%1\n\t"
257 "sbb %0,%0"
258 : "=r" (oldbit), ADDR
259 : "Ir" (nr));
260 return oldbit;
261}
262
263/* WARNING: non-atomic and it can be reordered! */
264static inline int __test_and_change_bit(int nr, volatile unsigned long *addr)
265{
266 int oldbit;
267
268 asm volatile("btc %2,%1\n\t"
269 "sbb %0,%0"
270 : "=r" (oldbit), ADDR
271 : "Ir" (nr) : "memory");
272
273 return oldbit;
274}
275
276/**
277 * test_and_change_bit - Change a bit and return its old value
278 * @nr: Bit to change
279 * @addr: Address to count from
280 *
281 * This operation is atomic and cannot be reordered.
282 * It also implies a memory barrier.
283 */
284static inline int test_and_change_bit(int nr, volatile unsigned long *addr)
285{
286 int oldbit;
287
288 asm volatile(LOCK_PREFIX "btc %2,%1\n\t"
289 "sbb %0,%0"
290 : "=r" (oldbit), ADDR : "Ir" (nr) : "memory");
291
292 return oldbit;
293}
294
295static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
296{
297 return ((1UL << (nr % BITS_PER_LONG)) &
298 (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0;
299}
300
301static inline int variable_test_bit(int nr, volatile const unsigned long *addr)
302{
303 int oldbit;
304
305 asm volatile("bt %2,%1\n\t"
306 "sbb %0,%0"
307 : "=r" (oldbit)
308 : "m" (*(unsigned long *)addr), "Ir" (nr));
309
310 return oldbit;
311}
312
313#if 0 /* Fool kernel-doc since it doesn't do macros yet */
314/**
315 * test_bit - Determine whether a bit is set
316 * @nr: bit number to test
317 * @addr: Address to start counting from
318 */
319static int test_bit(int nr, const volatile unsigned long *addr);
320#endif
321
322#define test_bit(nr, addr) \
323 (__builtin_constant_p((nr)) \
324 ? constant_test_bit((nr), (addr)) \
325 : variable_test_bit((nr), (addr)))
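A minimal sketch of how the atomic bit operations above are typically combined on a shared flags word (the flag name, the variable and the helpers are hypothetical, not part of this header):

/* Single-owner claim on a device, protected only by the bit ops. */
#define DEV_BUSY	0	/* bit number within dev_flags */

static unsigned long dev_flags;

static int dev_try_claim(void)
{
	/* old bit was 0: we are the new owner */
	return !test_and_set_bit(DEV_BUSY, &dev_flags);
}

static void dev_release(void)
{
	clear_bit(DEV_BUSY, &dev_flags);
}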
326
327/**
328 * __ffs - find first set bit in word
329 * @word: The word to search
330 *
331 * Undefined if no bit exists, so code should check against 0 first.
332 */
333static inline unsigned long __ffs(unsigned long word)
334{
335 asm("bsf %1,%0"
336 : "=r" (word)
337 : "rm" (word));
338 return word;
339}
340
341/**
342 * ffz - find first zero bit in word
343 * @word: The word to search
344 *
345 * Undefined if no zero exists, so code should check against ~0UL first.
346 */
347static inline unsigned long ffz(unsigned long word)
348{
349 asm("bsf %1,%0"
350 : "=r" (word)
351 : "r" (~word));
352 return word;
353}
354
355/*
356 * __fls: find last set bit in word
357 * @word: The word to search
358 *
359 * Undefined if no set bit exists, so code should check against 0 first.
360 */
361static inline unsigned long __fls(unsigned long word)
362{
363 asm("bsr %1,%0"
364 : "=r" (word)
365 : "rm" (word));
366 return word;
367}
368
369#ifdef __KERNEL__
370/**
371 * ffs - find first set bit in word
372 * @x: the word to search
373 *
374 * This is defined the same way as the libc and compiler builtin ffs
375 * routines, therefore differs in spirit from the other bitops.
376 *
377 * ffs(value) returns 0 if value is 0 or the position of the first
378 * set bit if value is nonzero. The first (least significant) bit
379 * is at position 1.
380 */
381static inline int ffs(int x)
382{
383 int r;
384#ifdef CONFIG_X86_CMOV
385 asm("bsfl %1,%0\n\t"
386 "cmovzl %2,%0"
387 : "=r" (r) : "rm" (x), "r" (-1));
388#else
389 asm("bsfl %1,%0\n\t"
390 "jnz 1f\n\t"
391 "movl $-1,%0\n"
392 "1:" : "=r" (r) : "rm" (x));
393#endif
394 return r + 1;
395}
396
397/**
398 * fls - find last set bit in word
399 * @x: the word to search
400 *
401 * This is defined in a similar way as the libc and compiler builtin
402 * ffs, but returns the position of the most significant set bit.
403 *
404 * fls(value) returns 0 if value is 0 or the position of the last
405 * set bit if value is nonzero. The last (most significant) bit is
406 * at position 32.
407 */
408static inline int fls(int x)
409{
410 int r;
411#ifdef CONFIG_X86_CMOV
412 asm("bsrl %1,%0\n\t"
413 "cmovzl %2,%0"
414 : "=&r" (r) : "rm" (x), "rm" (-1));
415#else
416 asm("bsrl %1,%0\n\t"
417 "jnz 1f\n\t"
418 "movl $-1,%0\n"
419 "1:" : "=r" (r) : "rm" (x));
420#endif
421 return r + 1;
422}
423#endif /* __KERNEL__ */
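A few concrete values help keep the search helpers apart (worked examples): for x = 0x18 (binary 11000), __ffs(x) = 3 and ffs(x) = 4, while __fls(x) = 4 and fls(x) = 5; ffs(0) and fls(0) return 0, whereas __ffs(0), __fls(0) and ffz(~0UL) are undefined and callers must test for those inputs first.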
424
425#undef ADDR
426
427#ifdef __KERNEL__
428
429#include <asm-generic/bitops/sched.h>
430
431#define ARCH_HAS_FAST_MULTIPLIER 1
432
433#include <asm-generic/bitops/hweight.h>
434
435#endif /* __KERNEL__ */
436
437#include <asm-generic/bitops/fls64.h>
438
439#ifdef __KERNEL__
440
441#include <asm-generic/bitops/ext2-non-atomic.h>
442
443#define ext2_set_bit_atomic(lock, nr, addr) \
444 test_and_set_bit((nr), (unsigned long *)(addr))
445#define ext2_clear_bit_atomic(lock, nr, addr) \
446 test_and_clear_bit((nr), (unsigned long *)(addr))
447
448#include <asm-generic/bitops/minix.h>
449
450#endif /* __KERNEL__ */
451#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
new file mode 100644
index 000000000000..dd61616cb73d
--- /dev/null
+++ b/arch/x86/include/asm/boot.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_BOOT_H
2#define _ASM_X86_BOOT_H
3
4/* Don't touch these, unless you really know what you're doing. */
5#define DEF_SYSSEG 0x1000
6#define DEF_SYSSIZE 0x7F00
7
8/* Internal svga startup constants */
9#define NORMAL_VGA 0xffff /* 80x25 mode */
10#define EXTENDED_VGA 0xfffe /* 80x50 mode */
11#define ASK_VGA 0xfffd /* ask for it at bootup */
12
13/* Physical address where kernel should be loaded. */
14#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
15 + (CONFIG_PHYSICAL_ALIGN - 1)) \
16 & ~(CONFIG_PHYSICAL_ALIGN - 1))
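A worked example of the rounding above (configuration values illustrative): with CONFIG_PHYSICAL_START = 0x100000 and CONFIG_PHYSICAL_ALIGN = 0x200000, the expression evaluates to (0x100000 + 0x1FFFFF) & ~0x1FFFFF = 0x200000, i.e. the requested start address rounded up to the next alignment boundary.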
17
18#ifdef CONFIG_X86_64
19#define BOOT_HEAP_SIZE 0x7000
20#define BOOT_STACK_SIZE 0x4000
21#else
22#define BOOT_HEAP_SIZE 0x4000
23#define BOOT_STACK_SIZE 0x1000
24#endif
25
26#endif /* _ASM_X86_BOOT_H */
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
new file mode 100644
index 000000000000..433adaebf9b6
--- /dev/null
+++ b/arch/x86/include/asm/bootparam.h
@@ -0,0 +1,111 @@
1#ifndef _ASM_X86_BOOTPARAM_H
2#define _ASM_X86_BOOTPARAM_H
3
4#include <linux/types.h>
5#include <linux/screen_info.h>
6#include <linux/apm_bios.h>
7#include <linux/edd.h>
8#include <asm/e820.h>
9#include <asm/ist.h>
10#include <video/edid.h>
11
12/* setup data types */
13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1
15
16/* extensible setup data list node */
17struct setup_data {
18 __u64 next;
19 __u32 type;
20 __u32 len;
21 __u8 data[0];
22};
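The setup_data nodes form a singly linked list threaded through physical addresses, starting at boot_params.hdr.setup_data. A sketch of walking it, assuming the nodes are already directly addressable (real code must map each physical address first, e.g. with early_ioremap(); the function name is hypothetical):

static void walk_setup_data(__u64 pa_data)
{
	while (pa_data) {
		struct setup_data *data;

		data = (struct setup_data *)(unsigned long)pa_data;
		switch (data->type) {
		case SETUP_E820_EXT:
			/* data->len bytes of extra e820 entries in data->data */
			break;
		default:
			break;
		}
		pa_data = data->next;
	}
}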
23
24struct setup_header {
25 __u8 setup_sects;
26 __u16 root_flags;
27 __u32 syssize;
28 __u16 ram_size;
29#define RAMDISK_IMAGE_START_MASK 0x07FF
30#define RAMDISK_PROMPT_FLAG 0x8000
31#define RAMDISK_LOAD_FLAG 0x4000
32 __u16 vid_mode;
33 __u16 root_dev;
34 __u16 boot_flag;
35 __u16 jump;
36 __u32 header;
37 __u16 version;
38 __u32 realmode_swtch;
39 __u16 start_sys;
40 __u16 kernel_version;
41 __u8 type_of_loader;
42 __u8 loadflags;
43#define LOADED_HIGH (1<<0)
44#define QUIET_FLAG (1<<5)
45#define KEEP_SEGMENTS (1<<6)
46#define CAN_USE_HEAP (1<<7)
47 __u16 setup_move_size;
48 __u32 code32_start;
49 __u32 ramdisk_image;
50 __u32 ramdisk_size;
51 __u32 bootsect_kludge;
52 __u16 heap_end_ptr;
53 __u16 _pad1;
54 __u32 cmd_line_ptr;
55 __u32 initrd_addr_max;
56 __u32 kernel_alignment;
57 __u8 relocatable_kernel;
58 __u8 _pad2[3];
59 __u32 cmdline_size;
60 __u32 hardware_subarch;
61 __u64 hardware_subarch_data;
62 __u32 payload_offset;
63 __u32 payload_length;
64 __u64 setup_data;
65} __attribute__((packed));
66
67struct sys_desc_table {
68 __u16 length;
69 __u8 table[14];
70};
71
72struct efi_info {
73 __u32 efi_loader_signature;
74 __u32 efi_systab;
75 __u32 efi_memdesc_size;
76 __u32 efi_memdesc_version;
77 __u32 efi_memmap;
78 __u32 efi_memmap_size;
79 __u32 efi_systab_hi;
80 __u32 efi_memmap_hi;
81};
82
83/* The so-called "zeropage" */
84struct boot_params {
85 struct screen_info screen_info; /* 0x000 */
86 struct apm_bios_info apm_bios_info; /* 0x040 */
87 __u8 _pad2[12]; /* 0x054 */
88 struct ist_info ist_info; /* 0x060 */
89 __u8 _pad3[16]; /* 0x070 */
90 __u8 hd0_info[16]; /* obsolete! */ /* 0x080 */
91 __u8 hd1_info[16]; /* obsolete! */ /* 0x090 */
92 struct sys_desc_table sys_desc_table; /* 0x0a0 */
93 __u8 _pad4[144]; /* 0x0b0 */
94 struct edid_info edid_info; /* 0x140 */
95 struct efi_info efi_info; /* 0x1c0 */
96 __u32 alt_mem_k; /* 0x1e0 */
97 __u32 scratch; /* Scratch field! */ /* 0x1e4 */
98 __u8 e820_entries; /* 0x1e8 */
99 __u8 eddbuf_entries; /* 0x1e9 */
100 __u8 edd_mbr_sig_buf_entries; /* 0x1ea */
101 __u8 _pad6[6]; /* 0x1eb */
102 struct setup_header hdr; /* setup header */ /* 0x1f1 */
103 __u8 _pad7[0x290-0x1f1-sizeof(struct setup_header)];
104 __u32 edd_mbr_sig_buffer[EDD_MBR_SIG_MAX]; /* 0x290 */
105 struct e820entry e820_map[E820MAX]; /* 0x2d0 */
106 __u8 _pad8[48]; /* 0xcd0 */
107 struct edd_info eddbuf[EDDMAXNR]; /* 0xd00 */
108 __u8 _pad9[276]; /* 0xeec */
109} __attribute__((packed));
110
111#endif /* _ASM_X86_BOOTPARAM_H */
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
new file mode 100644
index 000000000000..3def2065fcea
--- /dev/null
+++ b/arch/x86/include/asm/bug.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_BUG_H
2#define _ASM_X86_BUG_H
3
4#ifdef CONFIG_BUG
5#define HAVE_ARCH_BUG
6
7#ifdef CONFIG_DEBUG_BUGVERBOSE
8
9#ifdef CONFIG_X86_32
10# define __BUG_C0 "2:\t.long 1b, %c0\n"
11#else
12# define __BUG_C0 "2:\t.quad 1b, %c0\n"
13#endif
14
15#define BUG() \
16do { \
17 asm volatile("1:\tud2\n" \
18 ".pushsection __bug_table,\"a\"\n" \
19 __BUG_C0 \
20 "\t.word %c1, 0\n" \
21 "\t.org 2b+%c2\n" \
22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \
26} while (0)
27
28#else
29#define BUG() \
30do { \
31 asm volatile("ud2"); \
32 for (;;) ; \
33} while (0)
34#endif
35
36#endif /* CONFIG_BUG */
37
38#include <asm-generic/bug.h>
39#endif /* _ASM_X86_BUG_H */
diff --git a/arch/x86/include/asm/bugs.h b/arch/x86/include/asm/bugs.h
new file mode 100644
index 000000000000..08abf639075f
--- /dev/null
+++ b/arch/x86/include/asm/bugs.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_BUGS_H
2#define _ASM_X86_BUGS_H
3
4extern void check_bugs(void);
5
6#if defined(CONFIG_CPU_SUP_INTEL) && defined(CONFIG_X86_32)
7int ppro_with_ram_bug(void);
8#else
9static inline int ppro_with_ram_bug(void) { return 0; }
10#endif
11
12#endif /* _ASM_X86_BUGS_H */
diff --git a/arch/x86/include/asm/byteorder.h b/arch/x86/include/asm/byteorder.h
new file mode 100644
index 000000000000..e02ae2d89acf
--- /dev/null
+++ b/arch/x86/include/asm/byteorder.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_BYTEORDER_H
2#define _ASM_X86_BYTEORDER_H
3
4#include <asm/types.h>
5#include <linux/compiler.h>
6
7#ifdef __GNUC__
8
9#ifdef __i386__
10
11static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
12{
13#ifdef CONFIG_X86_BSWAP
14 asm("bswap %0" : "=r" (x) : "0" (x));
15#else
16 asm("xchgb %b0,%h0\n\t" /* swap lower bytes */
17 "rorl $16,%0\n\t" /* swap words */
18 "xchgb %b0,%h0" /* swap higher bytes */
19 : "=q" (x)
20 : "0" (x));
21#endif
22 return x;
23}
24
25static inline __attribute_const__ __u64 ___arch__swab64(__u64 val)
26{
27 union {
28 struct {
29 __u32 a;
30 __u32 b;
31 } s;
32 __u64 u;
33 } v;
34 v.u = val;
35#ifdef CONFIG_X86_BSWAP
36 asm("bswapl %0 ; bswapl %1 ; xchgl %0,%1"
37 : "=r" (v.s.a), "=r" (v.s.b)
38 : "0" (v.s.a), "1" (v.s.b));
39#else
40 v.s.a = ___arch__swab32(v.s.a);
41 v.s.b = ___arch__swab32(v.s.b);
42 asm("xchgl %0,%1"
43 : "=r" (v.s.a), "=r" (v.s.b)
44 : "0" (v.s.a), "1" (v.s.b));
45#endif
46 return v.u;
47}
48
49#else /* __i386__ */
50
51static inline __attribute_const__ __u64 ___arch__swab64(__u64 x)
52{
53 asm("bswapq %0"
54 : "=r" (x)
55 : "0" (x));
56 return x;
57}
58
59static inline __attribute_const__ __u32 ___arch__swab32(__u32 x)
60{
61 asm("bswapl %0"
62 : "=r" (x)
63 : "0" (x));
64 return x;
65}
66
67#endif
68
69/* Do not define swab16. GCC is smart enough to recognize the "C" version and
70 convert it into a rotation or exchange. */
71
72#define __arch__swab64(x) ___arch__swab64(x)
73#define __arch__swab32(x) ___arch__swab32(x)
74
75#define __BYTEORDER_HAS_U64__
76
77#endif /* __GNUC__ */
78
79#include <linux/byteorder/little_endian.h>
80
81#endif /* _ASM_X86_BYTEORDER_H */
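As worked examples of what the helpers above compute: ___arch__swab32(0x12345678) is 0x78563412, and ___arch__swab64(0x0123456789abcdef) is 0xefcdab8967452301; on 386-class CPUs without BSWAP the same 32-bit result is produced by the xchgb/rorl sequence.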
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
new file mode 100644
index 000000000000..5d367caa0e36
--- /dev/null
+++ b/arch/x86/include/asm/cache.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_CACHE_H
2#define _ASM_X86_CACHE_H
3
4/* L1 cache line size */
5#define L1_CACHE_SHIFT (CONFIG_X86_L1_CACHE_SHIFT)
6#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
7
8#define __read_mostly __attribute__((__section__(".data.read_mostly")))
9
10#ifdef CONFIG_X86_VSMP
11/* vSMP Internode cacheline shift */
12#define INTERNODE_CACHE_SHIFT (12)
13#ifdef CONFIG_SMP
14#define __cacheline_aligned_in_smp \
15 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \
16 __attribute__((__section__(".data.page_aligned")))
17#endif
18#endif
19
20#endif /* _ASM_X86_CACHE_H */
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
new file mode 100644
index 000000000000..2f8466540fb5
--- /dev/null
+++ b/arch/x86/include/asm/cacheflush.h
@@ -0,0 +1,118 @@
1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H
3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the Intel. */
8#define flush_cache_all() do { } while (0)
9#define flush_cache_mm(mm) do { } while (0)
10#define flush_cache_dup_mm(mm) do { } while (0)
11#define flush_cache_range(vma, start, end) do { } while (0)
12#define flush_cache_page(vma, vmaddr, pfn) do { } while (0)
13#define flush_dcache_page(page) do { } while (0)
14#define flush_dcache_mmap_lock(mapping) do { } while (0)
15#define flush_dcache_mmap_unlock(mapping) do { } while (0)
16#define flush_icache_range(start, end) do { } while (0)
17#define flush_icache_page(vma, pg) do { } while (0)
18#define flush_icache_user_range(vma, pg, adr, len) do { } while (0)
19#define flush_cache_vmap(start, end) do { } while (0)
20#define flush_cache_vunmap(start, end) do { } while (0)
21
22#define copy_to_user_page(vma, page, vaddr, dst, src, len) \
23 memcpy((dst), (src), (len))
24#define copy_from_user_page(vma, page, vaddr, dst, src, len) \
25 memcpy((dst), (src), (len))
26
27#define PG_non_WB PG_arch_1
28PAGEFLAG(NonWB, non_WB)
29
30/*
31 * The set_memory_* API can be used to change various attributes of a virtual
32 * address range. The attributes include:
33 * Cacheability : UnCached, WriteCombining, WriteBack
34 * Executability : eXecutable, NoteXecutable
35 * Read/Write : ReadOnly, ReadWrite
36 * Presence : NotPresent
37 *
38 * Within a category, the attributes are mutually exclusive.
39 *
40 * The implementation of this API will take care of various aspects that
41 * are associated with changing such attributes, such as:
42 * - Flushing TLBs
43 * - Flushing CPU caches
44 * - Making sure aliases of the memory behind the mapping don't violate
45 * coherency rules as defined by the CPU in the system.
46 *
47 * What this API does not do:
48 * - Provide exclusion between various callers - including callers that
49 * operate on other mappings of the same physical page
50 * - Restore default attributes when a page is freed
51 * - Guarantee that mappings other than the requested one are
52 * in any state, other than that these do not violate rules for
53 * the CPU you have. Do not depend on any effects on other mappings;
54 * CPUs other than the one you have may have more relaxed rules.
55 * The caller is required to take care of these.
56 */
57
58int _set_memory_uc(unsigned long addr, int numpages);
59int _set_memory_wc(unsigned long addr, int numpages);
60int _set_memory_wb(unsigned long addr, int numpages);
61int set_memory_uc(unsigned long addr, int numpages);
62int set_memory_wc(unsigned long addr, int numpages);
63int set_memory_wb(unsigned long addr, int numpages);
64int set_memory_x(unsigned long addr, int numpages);
65int set_memory_nx(unsigned long addr, int numpages);
66int set_memory_ro(unsigned long addr, int numpages);
67int set_memory_rw(unsigned long addr, int numpages);
68int set_memory_np(unsigned long addr, int numpages);
69int set_memory_4k(unsigned long addr, int numpages);
70
71int set_memory_array_uc(unsigned long *addr, int addrinarray);
72int set_memory_array_wb(unsigned long *addr, int addrinarray);
73
74/*
75 * For legacy compatibility with the old APIs, a few functions
76 * are provided that work on a "struct page".
77 * These functions operate ONLY on the 1:1 kernel mapping of the
78 * memory that the struct page represents, and internally just
79 * call the set_memory_* function. See the description of the
80 * set_memory_* function for more details on conventions.
81 *
82 * These APIs should be considered *deprecated* and are likely going to
83 * be removed in the future.
84 * The reason for this is the implicit operation on the 1:1 mapping only,
85 * making this not a generally useful API.
86 *
87 * Specifically, many users of the old APIs had a virtual address,
88 * called virt_to_page() or vmalloc_to_page() on that address to
89 * get a struct page* that the old API required.
90 * To convert these cases, use set_memory_*() on the original
91 * virtual address, do not use these functions.
92 */
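A minimal sketch of the intended calling convention (function names and the use case are hypothetical): attributes are changed on the virtual address of the mapping actually in use, and the caller restores them before the memory is reused or freed.

/* Share a kernel buffer with a device that requires uncached access,
 * then restore the default caching before the pages are reused. */
static int buf_make_uncached(unsigned long addr, int numpages)
{
	return set_memory_uc(addr, numpages);
}

static void buf_release(unsigned long addr, int numpages)
{
	set_memory_wb(addr, numpages);	/* restore attributes before reuse */
}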
93
94int set_pages_uc(struct page *page, int numpages);
95int set_pages_wb(struct page *page, int numpages);
96int set_pages_x(struct page *page, int numpages);
97int set_pages_nx(struct page *page, int numpages);
98int set_pages_ro(struct page *page, int numpages);
99int set_pages_rw(struct page *page, int numpages);
100
101
102void clflush_cache_range(void *addr, unsigned int size);
103
104#ifdef CONFIG_DEBUG_RODATA
105void mark_rodata_ro(void);
106extern const int rodata_test_data;
107#endif
108
109#ifdef CONFIG_DEBUG_RODATA_TEST
110int rodata_test(void);
111#else
112static inline int rodata_test(void)
113{
114 return 0;
115}
116#endif
117
118#endif /* _ASM_X86_CACHEFLUSH_H */
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
new file mode 100644
index 000000000000..b03bedb62aa7
--- /dev/null
+++ b/arch/x86/include/asm/calgary.h
@@ -0,0 +1,72 @@
1/*
2 * Derived from include/asm-powerpc/iommu.h
3 *
4 * Copyright IBM Corporation, 2006-2007
5 *
6 * Author: Jon Mason <jdmason@us.ibm.com>
7 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _ASM_X86_CALGARY_H
25#define _ASM_X86_CALGARY_H
26
27#include <linux/spinlock.h>
28#include <linux/device.h>
29#include <linux/dma-mapping.h>
30#include <linux/timer.h>
31#include <asm/types.h>
32
33struct iommu_table {
34 struct cal_chipset_ops *chip_ops; /* chipset specific funcs */
35 unsigned long it_base; /* mapped address of tce table */
36 unsigned long it_hint; /* Hint for next alloc */
37 unsigned long *it_map; /* A simple allocation bitmap for now */
38 void __iomem *bbar; /* Bridge BAR */
39 u64 tar_val; /* Table Address Register */
40 struct timer_list watchdog_timer;
41 spinlock_t it_lock; /* Protects it_map */
42 unsigned int it_size; /* Size of iommu table in entries */
43 unsigned char it_busno; /* Bus number this table belongs to */
44};
45
46struct cal_chipset_ops {
47 void (*handle_quirks)(struct iommu_table *tbl, struct pci_dev *dev);
48 void (*tce_cache_blast)(struct iommu_table *tbl);
49 void (*dump_error_regs)(struct iommu_table *tbl);
50};
51
52#define TCE_TABLE_SIZE_UNSPECIFIED ~0
53#define TCE_TABLE_SIZE_64K 0
54#define TCE_TABLE_SIZE_128K 1
55#define TCE_TABLE_SIZE_256K 2
56#define TCE_TABLE_SIZE_512K 3
57#define TCE_TABLE_SIZE_1M 4
58#define TCE_TABLE_SIZE_2M 5
59#define TCE_TABLE_SIZE_4M 6
60#define TCE_TABLE_SIZE_8M 7
61
62extern int use_calgary;
63
64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void);
67#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; }
70#endif
71
72#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
new file mode 100644
index 000000000000..2bc162e0ec6e
--- /dev/null
+++ b/arch/x86/include/asm/calling.h
@@ -0,0 +1,170 @@
1/*
2 * Some macros to handle stack frames in assembly.
3 */
4
5#define R15 0
6#define R14 8
7#define R13 16
8#define R12 24
9#define RBP 32
10#define RBX 40
11
12/* arguments: interrupts/non-tracing syscalls only save up to here */
13#define R11 48
14#define R10 56
15#define R9 64
16#define R8 72
17#define RAX 80
18#define RCX 88
19#define RDX 96
20#define RSI 104
21#define RDI 112
22#define ORIG_RAX 120 /* + error_code */
23/* end of arguments */
24
25/* cpu exception frame or undefined in case of fast syscall. */
26#define RIP 128
27#define CS 136
28#define EFLAGS 144
29#define RSP 152
30#define SS 160
31
32#define ARGOFFSET R11
33#define SWFRAME ORIG_RAX
34
35 .macro SAVE_ARGS addskip=0, norcx=0, nor891011=0
36 subq $9*8+\addskip, %rsp
37 CFI_ADJUST_CFA_OFFSET 9*8+\addskip
38 movq %rdi, 8*8(%rsp)
39 CFI_REL_OFFSET rdi, 8*8
40 movq %rsi, 7*8(%rsp)
41 CFI_REL_OFFSET rsi, 7*8
42 movq %rdx, 6*8(%rsp)
43 CFI_REL_OFFSET rdx, 6*8
44 .if \norcx
45 .else
46 movq %rcx, 5*8(%rsp)
47 CFI_REL_OFFSET rcx, 5*8
48 .endif
49 movq %rax, 4*8(%rsp)
50 CFI_REL_OFFSET rax, 4*8
51 .if \nor891011
52 .else
53 movq %r8, 3*8(%rsp)
54 CFI_REL_OFFSET r8, 3*8
55 movq %r9, 2*8(%rsp)
56 CFI_REL_OFFSET r9, 2*8
57 movq %r10, 1*8(%rsp)
58 CFI_REL_OFFSET r10, 1*8
59 movq %r11, (%rsp)
60 CFI_REL_OFFSET r11, 0*8
61 .endif
62 .endm
63
64#define ARG_SKIP 9*8
65
66 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
67 skipr8910=0, skiprdx=0
68 .if \skipr11
69 .else
70 movq (%rsp), %r11
71 CFI_RESTORE r11
72 .endif
73 .if \skipr8910
74 .else
75 movq 1*8(%rsp), %r10
76 CFI_RESTORE r10
77 movq 2*8(%rsp), %r9
78 CFI_RESTORE r9
79 movq 3*8(%rsp), %r8
80 CFI_RESTORE r8
81 .endif
82 .if \skiprax
83 .else
84 movq 4*8(%rsp), %rax
85 CFI_RESTORE rax
86 .endif
87 .if \skiprcx
88 .else
89 movq 5*8(%rsp), %rcx
90 CFI_RESTORE rcx
91 .endif
92 .if \skiprdx
93 .else
94 movq 6*8(%rsp), %rdx
95 CFI_RESTORE rdx
96 .endif
97 movq 7*8(%rsp), %rsi
98 CFI_RESTORE rsi
99 movq 8*8(%rsp), %rdi
100 CFI_RESTORE rdi
101 .if ARG_SKIP+\addskip > 0
102 addq $ARG_SKIP+\addskip, %rsp
103 CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
104 .endif
105 .endm
106
107 .macro LOAD_ARGS offset, skiprax=0
108 movq \offset(%rsp), %r11
109 movq \offset+8(%rsp), %r10
110 movq \offset+16(%rsp), %r9
111 movq \offset+24(%rsp), %r8
112 movq \offset+40(%rsp), %rcx
113 movq \offset+48(%rsp), %rdx
114 movq \offset+56(%rsp), %rsi
115 movq \offset+64(%rsp), %rdi
116 .if \skiprax
117 .else
118 movq \offset+72(%rsp), %rax
119 .endif
120 .endm
121
122#define REST_SKIP 6*8
123
124 .macro SAVE_REST
125 subq $REST_SKIP, %rsp
126 CFI_ADJUST_CFA_OFFSET REST_SKIP
127 movq %rbx, 5*8(%rsp)
128 CFI_REL_OFFSET rbx, 5*8
129 movq %rbp, 4*8(%rsp)
130 CFI_REL_OFFSET rbp, 4*8
131 movq %r12, 3*8(%rsp)
132 CFI_REL_OFFSET r12, 3*8
133 movq %r13, 2*8(%rsp)
134 CFI_REL_OFFSET r13, 2*8
135 movq %r14, 1*8(%rsp)
136 CFI_REL_OFFSET r14, 1*8
137 movq %r15, (%rsp)
138 CFI_REL_OFFSET r15, 0*8
139 .endm
140
141 .macro RESTORE_REST
142 movq (%rsp), %r15
143 CFI_RESTORE r15
144 movq 1*8(%rsp), %r14
145 CFI_RESTORE r14
146 movq 2*8(%rsp), %r13
147 CFI_RESTORE r13
148 movq 3*8(%rsp), %r12
149 CFI_RESTORE r12
150 movq 4*8(%rsp), %rbp
151 CFI_RESTORE rbp
152 movq 5*8(%rsp), %rbx
153 CFI_RESTORE rbx
154 addq $REST_SKIP, %rsp
155 CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
156 .endm
157
158 .macro SAVE_ALL
159 SAVE_ARGS
160 SAVE_REST
161 .endm
162
163 .macro RESTORE_ALL addskip=0
164 RESTORE_REST
165 RESTORE_ARGS 0, \addskip
166 .endm
167
168 .macro icebp
169 .byte 0xf1
170 .endm
diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h
new file mode 100644
index 000000000000..848850fd7d62
--- /dev/null
+++ b/arch/x86/include/asm/checksum.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "checksum_32.h"
3#else
4# include "checksum_64.h"
5#endif
diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h
new file mode 100644
index 000000000000..7c5ef8b14d92
--- /dev/null
+++ b/arch/x86/include/asm/checksum_32.h
@@ -0,0 +1,189 @@
1#ifndef _ASM_X86_CHECKSUM_32_H
2#define _ASM_X86_CHECKSUM_32_H
3
4#include <linux/in6.h>
5
6#include <asm/uaccess.h>
7
8/*
9 * computes the checksum of a memory block at buff, length len,
10 * and adds in "sum" (32-bit)
11 *
12 * returns a 32-bit number suitable for feeding into itself
13 * or csum_tcpudp_magic
14 *
15 * this function must be called with even lengths, except
16 * for the last fragment, which may be odd
17 *
18 * it's best to have buff aligned on a 32-bit boundary
19 */
20asmlinkage __wsum csum_partial(const void *buff, int len, __wsum sum);
21
22/*
23 * the same as csum_partial, but copies from src while it
24 * checksums, and handles user-space pointer exceptions correctly, when needed.
25 *
26 * here even more important to align src and dst on a 32-bit (or even
27 * better 64-bit) boundary
28 */
29
30asmlinkage __wsum csum_partial_copy_generic(const void *src, void *dst,
31 int len, __wsum sum,
32 int *src_err_ptr, int *dst_err_ptr);
33
34/*
35 * Note: when you get a NULL pointer exception here this means someone
36 * passed in an incorrect kernel address to one of these functions.
37 *
38 * If you use these functions directly please don't forget the
39 * access_ok().
40 */
41static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst,
42 int len, __wsum sum)
43{
44 return csum_partial_copy_generic(src, dst, len, sum, NULL, NULL);
45}
46
47static inline __wsum csum_partial_copy_from_user(const void __user *src,
48 void *dst,
49 int len, __wsum sum,
50 int *err_ptr)
51{
52 might_sleep();
53 return csum_partial_copy_generic((__force void *)src, dst,
54 len, sum, err_ptr, NULL);
55}
56
57/*
58 * This is a version of ip_compute_csum() optimized for IP headers,
59 * which always checksum on 4 octet boundaries.
60 *
61 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
62 * Arnt Gulbrandsen.
63 */
64static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
65{
66 unsigned int sum;
67
68 asm volatile("movl (%1), %0 ;\n"
69 "subl $4, %2 ;\n"
70 "jbe 2f ;\n"
71 "addl 4(%1), %0 ;\n"
72 "adcl 8(%1), %0 ;\n"
73 "adcl 12(%1), %0;\n"
74 "1: adcl 16(%1), %0 ;\n"
75 "lea 4(%1), %1 ;\n"
76 "decl %2 ;\n"
77 "jne 1b ;\n"
78 "adcl $0, %0 ;\n"
79 "movl %0, %2 ;\n"
80 "shrl $16, %0 ;\n"
81 "addw %w2, %w0 ;\n"
82 "adcl $0, %0 ;\n"
83 "notl %0 ;\n"
84 "2: ;\n"
85 /* Since the input registers which are loaded with iph and ihl
86 are modified, we must also specify them as outputs, or gcc
87 will assume they contain their original values. */
88 : "=r" (sum), "=r" (iph), "=r" (ihl)
89 : "1" (iph), "2" (ihl)
90 : "memory");
91 return (__force __sum16)sum;
92}
93
94/*
95 * Fold a partial checksum
96 */
97
98static inline __sum16 csum_fold(__wsum sum)
99{
100 asm("addl %1, %0 ;\n"
101 "adcl $0xffff, %0 ;\n"
102 : "=r" (sum)
103 : "r" ((__force u32)sum << 16),
104 "0" ((__force u32)sum & 0xffff0000));
105 return (__force __sum16)(~(__force u32)sum >> 16);
106}
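The asm above performs the usual end-around-carry fold; a plain-C rendering of the same arithmetic, given only as an illustrative sketch (the helper name is not part of this header):

static inline __sum16 csum_fold_ref(__wsum sum)
{
	u32 s = (__force u32)sum;

	s = (s & 0xffff) + (s >> 16);	/* add the two 16-bit halves */
	s += s >> 16;			/* fold the carry that may result */
	return (__force __sum16)~s;	/* truncate to 16 bits and invert */
}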
107
108static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr,
109 unsigned short len,
110 unsigned short proto,
111 __wsum sum)
112{
113 asm("addl %1, %0 ;\n"
114 "adcl %2, %0 ;\n"
115 "adcl %3, %0 ;\n"
116 "adcl $0, %0 ;\n"
117 : "=r" (sum)
118 : "g" (daddr), "g"(saddr),
119 "g" ((len + proto) << 8), "0" (sum));
120 return sum;
121}
122
123/*
124 * computes the checksum of the TCP/UDP pseudo-header
125 * returns a 16-bit checksum, already complemented
126 */
127static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
128 unsigned short len,
129 unsigned short proto,
130 __wsum sum)
131{
132 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
133}
134
135/*
136 * this routine is used for miscellaneous IP-like checksums, mainly
137 * in icmp.c
138 */
139
140static inline __sum16 ip_compute_csum(const void *buff, int len)
141{
142 return csum_fold(csum_partial(buff, len, 0));
143}
144
145#define _HAVE_ARCH_IPV6_CSUM
146static inline __sum16 csum_ipv6_magic(const struct in6_addr *saddr,
147 const struct in6_addr *daddr,
148 __u32 len, unsigned short proto,
149 __wsum sum)
150{
151 asm("addl 0(%1), %0 ;\n"
152 "adcl 4(%1), %0 ;\n"
153 "adcl 8(%1), %0 ;\n"
154 "adcl 12(%1), %0 ;\n"
155 "adcl 0(%2), %0 ;\n"
156 "adcl 4(%2), %0 ;\n"
157 "adcl 8(%2), %0 ;\n"
158 "adcl 12(%2), %0 ;\n"
159 "adcl %3, %0 ;\n"
160 "adcl %4, %0 ;\n"
161 "adcl $0, %0 ;\n"
162 : "=&r" (sum)
163 : "r" (saddr), "r" (daddr),
164 "r" (htonl(len)), "r" (htonl(proto)), "0" (sum));
165
166 return csum_fold(sum);
167}
168
169/*
170 * Copy and checksum to user
171 */
172#define HAVE_CSUM_COPY_USER
173static inline __wsum csum_and_copy_to_user(const void *src,
174 void __user *dst,
175 int len, __wsum sum,
176 int *err_ptr)
177{
178 might_sleep();
179 if (access_ok(VERIFY_WRITE, dst, len))
180 return csum_partial_copy_generic(src, (__force void *)dst,
181 len, sum, NULL, err_ptr);
182
183 if (len)
184 *err_ptr = -EFAULT;
185
186 return (__force __wsum)-1; /* invalid checksum */
187}
188
189#endif /* _ASM_X86_CHECKSUM_32_H */
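Putting the pieces together, a sketch of checksumming a TCP segment that sits in one flat buffer (no sk_buff involved; IPPROTO_TCP comes from <linux/in.h>, which this header does not include, and the function name is hypothetical):

static __sum16 tcp_csum_flat(__be32 saddr, __be32 daddr,
			     const void *seg, unsigned short len)
{
	__wsum sum = csum_partial(seg, len, 0);

	return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, sum);
}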
diff --git a/arch/x86/include/asm/checksum_64.h b/arch/x86/include/asm/checksum_64.h
new file mode 100644
index 000000000000..9bfdc41629ec
--- /dev/null
+++ b/arch/x86/include/asm/checksum_64.h
@@ -0,0 +1,191 @@
1#ifndef _ASM_X86_CHECKSUM_64_H
2#define _ASM_X86_CHECKSUM_64_H
3
4/*
5 * Checksums for x86-64
6 * Copyright 2002 by Andi Kleen, SuSE Labs
7 * with some code from asm-x86/checksum.h
8 */
9
10#include <linux/compiler.h>
11#include <asm/uaccess.h>
12#include <asm/byteorder.h>
13
14/**
15 * csum_fold - Fold and invert a 32bit checksum.
16 * @sum: 32bit unfolded sum
17 *
18 * Fold a 32bit running checksum to 16bit and invert it. This is usually
19 * the last step before putting a checksum into a packet.
20 * Make sure not to mix with 64bit checksums.
21 */
22static inline __sum16 csum_fold(__wsum sum)
23{
24 asm(" addl %1,%0\n"
25 " adcl $0xffff,%0"
26 : "=r" (sum)
27 : "r" ((__force u32)sum << 16),
28 "0" ((__force u32)sum & 0xffff0000));
29 return (__force __sum16)(~(__force u32)sum >> 16);
30}
31
32/*
33 * This is a version of ip_compute_csum() optimized for IP headers,
34 * which always checksum on 4 octet boundaries.
35 *
36 * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by
37 * Arnt Gulbrandsen.
38 */
39
40/**
41 * ip_fast_csum - Compute the IPv4 header checksum efficiently.
42 * @iph: ipv4 header
43 * @ihl: length of header / 4
44 */
45static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
46{
47 unsigned int sum;
48
49 asm(" movl (%1), %0\n"
50 " subl $4, %2\n"
51 " jbe 2f\n"
52 " addl 4(%1), %0\n"
53 " adcl 8(%1), %0\n"
54 " adcl 12(%1), %0\n"
55 "1: adcl 16(%1), %0\n"
56 " lea 4(%1), %1\n"
57 " decl %2\n"
58 " jne 1b\n"
59 " adcl $0, %0\n"
60 " movl %0, %2\n"
61 " shrl $16, %0\n"
62 " addw %w2, %w0\n"
63 " adcl $0, %0\n"
64 " notl %0\n"
65 "2:"
66 /* Since the input registers which are loaded with iph and ihl
67 are modified, we must also specify them as outputs, or gcc
68 will assume they contain their original values. */
69 : "=r" (sum), "=r" (iph), "=r" (ihl)
70 : "1" (iph), "2" (ihl)
71 : "memory");
72 return (__force __sum16)sum;
73}
74
75/**
76 * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum.
77 * @saddr: source address
78 * @daddr: destination address
79 * @len: length of packet
80 * @proto: ip protocol of packet
81 * @sum: initial sum to be added in (32bit unfolded)
82 *
83 * Returns the pseudo header checksum of the input data. Result is
84 * 32bit unfolded.
85 */
86static inline __wsum
87csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
88 unsigned short proto, __wsum sum)
89{
90 asm(" addl %1, %0\n"
91 " adcl %2, %0\n"
92 " adcl %3, %0\n"
93 " adcl $0, %0\n"
94 : "=r" (sum)
95 : "g" (daddr), "g" (saddr),
96 "g" ((len + proto)<<8), "0" (sum));
97 return sum;
98}
99
100
101/**
102 * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum.
103 * @saddr: source address
104 * @daddr: destination address
105 * @len: length of packet
106 * @proto: ip protocol of packet
107 * @sum: initial sum to be added in (32bit unfolded)
108 *
109 * Returns the 16bit pseudo header checksum of the input data, already
110 * complemented and ready to be filled in.
111 */
112static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr,
113 unsigned short len,
114 unsigned short proto, __wsum sum)
115{
116 return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum));
117}
118
119/**
120 * csum_partial - Compute an internet checksum.
121 * @buff: buffer to be checksummed
122 * @len: length of buffer.
123 * @sum: initial sum to be added in (32bit unfolded)
124 *
125 * Returns the 32bit unfolded internet checksum of the buffer.
126 * Before filling it in, it needs to be csum_fold()'ed.
127 * buff should be aligned to a 64bit boundary if possible.
128 */
129extern __wsum csum_partial(const void *buff, int len, __wsum sum);
130
131#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1
132#define HAVE_CSUM_COPY_USER 1
133
134
135/* Do not call this directly. Use the wrappers below */
136extern __wsum csum_partial_copy_generic(const void *src, const void *dst,
137 int len, __wsum sum,
138 int *src_err_ptr, int *dst_err_ptr);
139
140
141extern __wsum csum_partial_copy_from_user(const void __user *src, void *dst,
142 int len, __wsum isum, int *errp);
143extern __wsum csum_partial_copy_to_user(const void *src, void __user *dst,
144 int len, __wsum isum, int *errp);
145extern __wsum csum_partial_copy_nocheck(const void *src, void *dst,
146 int len, __wsum sum);
147
148/* Old names. To be removed. */
149#define csum_and_copy_to_user csum_partial_copy_to_user
150#define csum_and_copy_from_user csum_partial_copy_from_user
151
152/**
153 * ip_compute_csum - Compute a 16bit IP checksum.
154 * @buff: buffer address.
155 * @len: length of buffer.
156 *
157 * Returns the 16bit folded/inverted checksum of the passed buffer.
158 * Ready to fill in.
159 */
160extern __sum16 ip_compute_csum(const void *buff, int len);
161
162/**
163 * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header.
164 * @saddr: source address
165 * @daddr: destination address
166 * @len: length of packet
167 * @proto: protocol of packet
168 * @sum: initial sum (32bit unfolded) to be added in
169 *
170 * Computes an IPv6 pseudo header checksum. This sum is added into the
171 * checksum of UDP/TCP packets and contains some network layer information.
172 * Returns the unfolded 32bit checksum.
173 */
174
175struct in6_addr;
176
177#define _HAVE_ARCH_IPV6_CSUM 1
178extern __sum16
179csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr,
180 __u32 len, unsigned short proto, __wsum sum);
181
182static inline unsigned add32_with_carry(unsigned a, unsigned b)
183{
184 asm("addl %2,%0\n\t"
185 "adcl $0,%0"
186 : "=r" (a)
187 : "0" (a), "r" (b));
188 return a;
189}
190
191#endif /* _ASM_X86_CHECKSUM_64_H */
diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h
new file mode 100644
index 000000000000..a460fa088d4c
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "cmpxchg_32.h"
3#else
4# include "cmpxchg_64.h"
5#endif
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
new file mode 100644
index 000000000000..82ceb788a981
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -0,0 +1,344 @@
1#ifndef _ASM_X86_CMPXCHG_32_H
2#define _ASM_X86_CMPXCHG_32_H
3
4#include <linux/bitops.h> /* for LOCK_PREFIX */
5
6/*
7 * Note: if you use set_64bit(), __cmpxchg64(), or their variants,
8 * you need to test for the feature in boot_cpu_data.
9 */
10
11#define xchg(ptr, v) \
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr))))
13
14struct __xchg_dummy {
15 unsigned long a[100];
16};
17#define __xg(x) ((struct __xchg_dummy *)(x))
18
19/*
20 * The semantics of CMPXCHG8B are a bit strange; this is why
21 * there is a loop and the loading of %%eax and %%edx has to
22 * be inside. This inlines well in most cases, the cached
23 * cost is around 38 cycles. (In the future we might want
24 * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that
25 * might have an implicit FPU-save as a cost, so it's not
26 * clear which path to go.)
27 *
28 * cmpxchg8b must be used with the lock prefix here to allow
29 * the instruction to be executed atomically, see page 3-102
30 * of the instruction set reference 24319102.pdf. We need
31 * the reader side to see the coherent 64bit value.
32 */
33static inline void __set_64bit(unsigned long long *ptr,
34 unsigned int low, unsigned int high)
35{
36 asm volatile("\n1:\t"
37 "movl (%0), %%eax\n\t"
38 "movl 4(%0), %%edx\n\t"
39 LOCK_PREFIX "cmpxchg8b (%0)\n\t"
40 "jnz 1b"
41 : /* no outputs */
42 : "D"(ptr),
43 "b"(low),
44 "c"(high)
45 : "ax", "dx", "memory");
46}
47
48static inline void __set_64bit_constant(unsigned long long *ptr,
49 unsigned long long value)
50{
51 __set_64bit(ptr, (unsigned int)value, (unsigned int)(value >> 32));
52}
53
54#define ll_low(x) *(((unsigned int *)&(x)) + 0)
55#define ll_high(x) *(((unsigned int *)&(x)) + 1)
56
57static inline void __set_64bit_var(unsigned long long *ptr,
58 unsigned long long value)
59{
60 __set_64bit(ptr, ll_low(value), ll_high(value));
61}
62
63#define set_64bit(ptr, value) \
64 (__builtin_constant_p((value)) \
65 ? __set_64bit_constant((ptr), (value)) \
66 : __set_64bit_var((ptr), (value)))
67
68#define _set_64bit(ptr, value) \
69 (__builtin_constant_p(value) \
70 ? __set_64bit(ptr, (unsigned int)(value), \
71 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73
74/*
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has a side effect, so the volatile attribute is necessary;
77 * strictly speaking the asm is under-described, since *ptr is also an output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104
105/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD.
109 */
110
111#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
115 (unsigned long)(n), \
116 sizeof(*(ptr))))
117#define sync_cmpxchg(ptr, o, n) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
119 (unsigned long)(n), \
120 sizeof(*(ptr))))
121#define cmpxchg_local(ptr, o, n) \
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif
126
127#ifdef CONFIG_X86_CMPXCHG64
128#define cmpxchg64(ptr, o, n) \
129 ((__typeof__(*(ptr)))__cmpxchg64((ptr), (unsigned long long)(o), \
130 (unsigned long long)(n)))
131#define cmpxchg64_local(ptr, o, n) \
132 ((__typeof__(*(ptr)))__cmpxchg64_local((ptr), (unsigned long long)(o), \
133 (unsigned long long)(n)))
134#endif
135
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old,
226 unsigned long long new)
227{
228 unsigned long long prev;
229 asm volatile(LOCK_PREFIX "cmpxchg8b %3"
230 : "=A"(prev)
231 : "b"((unsigned long)new),
232 "c"((unsigned long)(new >> 32)),
233 "m"(*__xg(ptr)),
234 "0"(old)
235 : "memory");
236 return prev;
237}
238
239static inline unsigned long long __cmpxchg64_local(volatile void *ptr,
240 unsigned long long old,
241 unsigned long long new)
242{
243 unsigned long long prev;
244 asm volatile("cmpxchg8b %3"
245 : "=A"(prev)
246 : "b"((unsigned long)new),
247 "c"((unsigned long)(new >> 32)),
248 "m"(*__xg(ptr)),
249 "0"(old)
250 : "memory");
251 return prev;
252}
253
254#ifndef CONFIG_X86_CMPXCHG
255/*
256 * Building a kernel capable of running on an 80386. It may be necessary to
257 * simulate the cmpxchg on the 80386 CPU. For that purpose we define
258 * a function for each of the sizes we support.
259 */
260
261extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8);
262extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16);
263extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32);
264
265static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old,
266 unsigned long new, int size)
267{
268 switch (size) {
269 case 1:
270 return cmpxchg_386_u8(ptr, old, new);
271 case 2:
272 return cmpxchg_386_u16(ptr, old, new);
273 case 4:
274 return cmpxchg_386_u32(ptr, old, new);
275 }
276 return old;
277}
278
279#define cmpxchg(ptr, o, n) \
280({ \
281 __typeof__(*(ptr)) __ret; \
282 if (likely(boot_cpu_data.x86 > 3)) \
283 __ret = (__typeof__(*(ptr)))__cmpxchg((ptr), \
284 (unsigned long)(o), (unsigned long)(n), \
285 sizeof(*(ptr))); \
286 else \
287 __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
288 (unsigned long)(o), (unsigned long)(n), \
289 sizeof(*(ptr))); \
290 __ret; \
291})
292#define cmpxchg_local(ptr, o, n) \
293({ \
294 __typeof__(*(ptr)) __ret; \
295 if (likely(boot_cpu_data.x86 > 3)) \
296 __ret = (__typeof__(*(ptr)))__cmpxchg_local((ptr), \
297 (unsigned long)(o), (unsigned long)(n), \
298 sizeof(*(ptr))); \
299 else \
300 __ret = (__typeof__(*(ptr)))cmpxchg_386((ptr), \
301 (unsigned long)(o), (unsigned long)(n), \
302 sizeof(*(ptr))); \
303 __ret; \
304})
305#endif
306
307#ifndef CONFIG_X86_CMPXCHG64
308/*
309 * Building a kernel capable of running on an 80386 or 80486. It may be necessary
310 * to simulate the cmpxchg8b on the 80386 and 80486 CPUs.
311 */
312
313extern unsigned long long cmpxchg_486_u64(volatile void *, u64, u64);
314
315#define cmpxchg64(ptr, o, n) \
316({ \
317 __typeof__(*(ptr)) __ret; \
318 if (likely(boot_cpu_data.x86 > 4)) \
319 __ret = (__typeof__(*(ptr)))__cmpxchg64((ptr), \
320 (unsigned long long)(o), \
321 (unsigned long long)(n)); \
322 else \
323 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
324 (unsigned long long)(o), \
325 (unsigned long long)(n)); \
326 __ret; \
327})
328#define cmpxchg64_local(ptr, o, n) \
329({ \
330 __typeof__(*(ptr)) __ret; \
331 if (likely(boot_cpu_data.x86 > 4)) \
332 __ret = (__typeof__(*(ptr)))__cmpxchg64_local((ptr), \
333 (unsigned long long)(o), \
334 (unsigned long long)(n)); \
335 else \
336 __ret = (__typeof__(*(ptr)))cmpxchg_486_u64((ptr), \
337 (unsigned long long)(o), \
338 (unsigned long long)(n)); \
339 __ret; \
340})
341
342#endif
343
344#endif /* _ASM_X86_CMPXCHG_32_H */
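Not part of the patch: a minimal sketch of how the cmpxchg() defined above is typically consumed, i.e. a compare-and-swap retry loop. The structure and helper names are hypothetical; only cmpxchg() comes from this header.

#include <asm/cmpxchg.h>

struct sample_counter {
	unsigned long val;
};

/* Hypothetical helper: atomically add @delta and return the new value. */
static inline unsigned long sample_counter_add(struct sample_counter *c,
					       unsigned long delta)
{
	unsigned long old, new;

	do {
		old = c->val;		/* snapshot the current value        */
		new = old + delta;	/* value we would like to publish    */
		/* retry if another CPU updated c->val since the snapshot */
	} while (cmpxchg(&c->val, old, new) != old);

	return new;
}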
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
new file mode 100644
index 000000000000..52de72e0de8c
--- /dev/null
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -0,0 +1,185 @@
1#ifndef _ASM_X86_CMPXCHG_64_H
2#define _ASM_X86_CMPXCHG_64_H
3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5
6#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
7 (ptr), sizeof(*(ptr))))
8
9#define __xg(x) ((volatile long *)(x))
10
11static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
12{
13 *ptr = val;
14}
15
16#define _set_64bit set_64bit
17
18/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
20 * Note 2: xchg has a side effect, so the volatile attribute is necessary;
21 * strictly speaking the asm is under-described, since *ptr is also an output argument. --ANK
22 */
23static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
24 int size)
25{
26 switch (size) {
27 case 1:
28 asm volatile("xchgb %b0,%1"
29 : "=q" (x)
30 : "m" (*__xg(ptr)), "0" (x)
31 : "memory");
32 break;
33 case 2:
34 asm volatile("xchgw %w0,%1"
35 : "=r" (x)
36 : "m" (*__xg(ptr)), "0" (x)
37 : "memory");
38 break;
39 case 4:
40 asm volatile("xchgl %k0,%1"
41 : "=r" (x)
42 : "m" (*__xg(ptr)), "0" (x)
43 : "memory");
44 break;
45 case 8:
46 asm volatile("xchgq %0,%1"
47 : "=r" (x)
48 : "m" (*__xg(ptr)), "0" (x)
49 : "memory");
50 break;
51 }
52 return x;
53}
54
55/*
56 * Atomic compare and exchange. Compare OLD with MEM, if identical,
57 * store NEW in MEM. Return the initial value in MEM. Success is
58 * indicated by comparing RETURN with OLD.
59 */
60
61#define __HAVE_ARCH_CMPXCHG 1
62
63static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
64 unsigned long new, int size)
65{
66 unsigned long prev;
67 switch (size) {
68 case 1:
69 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
70 : "=a"(prev)
71 : "q"(new), "m"(*__xg(ptr)), "0"(old)
72 : "memory");
73 return prev;
74 case 2:
75 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
76 : "=a"(prev)
77 : "r"(new), "m"(*__xg(ptr)), "0"(old)
78 : "memory");
79 return prev;
80 case 4:
81 asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
82 : "=a"(prev)
83 : "r"(new), "m"(*__xg(ptr)), "0"(old)
84 : "memory");
85 return prev;
86 case 8:
87 asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
88 : "=a"(prev)
89 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90 : "memory");
91 return prev;
92 }
93 return old;
94}
95
96/*
97 * Always use locked operations when touching memory shared with a
98 * hypervisor, since the system may be SMP even if the guest kernel
99 * isn't.
100 */
101static inline unsigned long __sync_cmpxchg(volatile void *ptr,
102 unsigned long old,
103 unsigned long new, int size)
104{
105 unsigned long prev;
106 switch (size) {
107 case 1:
108 asm volatile("lock; cmpxchgb %b1,%2"
109 : "=a"(prev)
110 : "q"(new), "m"(*__xg(ptr)), "0"(old)
111 : "memory");
112 return prev;
113 case 2:
114 asm volatile("lock; cmpxchgw %w1,%2"
115 : "=a"(prev)
116 : "r"(new), "m"(*__xg(ptr)), "0"(old)
117 : "memory");
118 return prev;
119 case 4:
120 asm volatile("lock; cmpxchgl %1,%2"
121 : "=a"(prev)
122 : "r"(new), "m"(*__xg(ptr)), "0"(old)
123 : "memory");
124 return prev;
125 }
126 return old;
127}
128
129static inline unsigned long __cmpxchg_local(volatile void *ptr,
130 unsigned long old,
131 unsigned long new, int size)
132{
133 unsigned long prev;
134 switch (size) {
135 case 1:
136 asm volatile("cmpxchgb %b1,%2"
137 : "=a"(prev)
138 : "q"(new), "m"(*__xg(ptr)), "0"(old)
139 : "memory");
140 return prev;
141 case 2:
142 asm volatile("cmpxchgw %w1,%2"
143 : "=a"(prev)
144 : "r"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 4:
148 asm volatile("cmpxchgl %k1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 8:
154 asm volatile("cmpxchgq %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163#define cmpxchg(ptr, o, n) \
164 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
165 (unsigned long)(n), sizeof(*(ptr))))
166#define cmpxchg64(ptr, o, n) \
167({ \
168 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
169 cmpxchg((ptr), (o), (n)); \
170})
171#define cmpxchg_local(ptr, o, n) \
172 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
173 (unsigned long)(n), \
174 sizeof(*(ptr))))
175#define sync_cmpxchg(ptr, o, n) \
176 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
177 (unsigned long)(n), \
178 sizeof(*(ptr))))
179#define cmpxchg64_local(ptr, o, n) \
180({ \
181 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
182 cmpxchg_local((ptr), (o), (n)); \
183})
184
185#endif /* _ASM_X86_CMPXCHG_64_H */
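Not part of the patch: on 64-bit, cmpxchg64() above is just cmpxchg() plus a BUILD_BUG_ON() that the operand really is 8 bytes wide, so 64-bit fields are updated exactly like longs. A minimal sketch with hypothetical names:

#include <linux/types.h>
#include <asm/cmpxchg.h>

struct sample_seq {
	u64 stamp;
};

/* Hypothetical helper: publish @new only if the stamp still equals @old. */
static inline bool sample_seq_advance(struct sample_seq *s, u64 old, u64 new)
{
	return cmpxchg64(&s->stamp, old, new) == old;
}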
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
new file mode 100644
index 000000000000..9a9c7bdc923d
--- /dev/null
+++ b/arch/x86/include/asm/compat.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_COMPAT_H
2#define _ASM_X86_COMPAT_H
3
4/*
5 * Architecture specific compatibility types
6 */
7#include <linux/types.h>
8#include <linux/sched.h>
9#include <asm/user32.h>
10
11#define COMPAT_USER_HZ 100
12
13typedef u32 compat_size_t;
14typedef s32 compat_ssize_t;
15typedef s32 compat_time_t;
16typedef s32 compat_clock_t;
17typedef s32 compat_pid_t;
18typedef u16 __compat_uid_t;
19typedef u16 __compat_gid_t;
20typedef u32 __compat_uid32_t;
21typedef u32 __compat_gid32_t;
22typedef u16 compat_mode_t;
23typedef u32 compat_ino_t;
24typedef u16 compat_dev_t;
25typedef s32 compat_off_t;
26typedef s64 compat_loff_t;
27typedef u16 compat_nlink_t;
28typedef u16 compat_ipc_pid_t;
29typedef s32 compat_daddr_t;
30typedef u32 compat_caddr_t;
31typedef __kernel_fsid_t compat_fsid_t;
32typedef s32 compat_timer_t;
33typedef s32 compat_key_t;
34
35typedef s32 compat_int_t;
36typedef s32 compat_long_t;
37typedef s64 __attribute__((aligned(4))) compat_s64;
38typedef u32 compat_uint_t;
39typedef u32 compat_ulong_t;
40typedef u64 __attribute__((aligned(4))) compat_u64;
41
42struct compat_timespec {
43 compat_time_t tv_sec;
44 s32 tv_nsec;
45};
46
47struct compat_timeval {
48 compat_time_t tv_sec;
49 s32 tv_usec;
50};
51
52struct compat_stat {
53 compat_dev_t st_dev;
54 u16 __pad1;
55 compat_ino_t st_ino;
56 compat_mode_t st_mode;
57 compat_nlink_t st_nlink;
58 __compat_uid_t st_uid;
59 __compat_gid_t st_gid;
60 compat_dev_t st_rdev;
61 u16 __pad2;
62 u32 st_size;
63 u32 st_blksize;
64 u32 st_blocks;
65 u32 st_atime;
66 u32 st_atime_nsec;
67 u32 st_mtime;
68 u32 st_mtime_nsec;
69 u32 st_ctime;
70 u32 st_ctime_nsec;
71 u32 __unused4;
72 u32 __unused5;
73};
74
75struct compat_flock {
76 short l_type;
77 short l_whence;
78 compat_off_t l_start;
79 compat_off_t l_len;
80 compat_pid_t l_pid;
81};
82
83#define F_GETLK64 12 /* using 'struct flock64' */
84#define F_SETLK64 13
85#define F_SETLKW64 14
86
87/*
88 * IA32 uses 4 byte alignment for 64 bit quantities,
89 * so we need to pack this structure.
90 */
91struct compat_flock64 {
92 short l_type;
93 short l_whence;
94 compat_loff_t l_start;
95 compat_loff_t l_len;
96 compat_pid_t l_pid;
97} __attribute__((packed));
98
99struct compat_statfs {
100 int f_type;
101 int f_bsize;
102 int f_blocks;
103 int f_bfree;
104 int f_bavail;
105 int f_files;
106 int f_ffree;
107 compat_fsid_t f_fsid;
108 int f_namelen; /* SunOS ignores this field. */
109 int f_frsize;
110 int f_spare[5];
111};
112
113#define COMPAT_RLIM_OLD_INFINITY 0x7fffffff
114#define COMPAT_RLIM_INFINITY 0xffffffff
115
116typedef u32 compat_old_sigset_t; /* at least 32 bits */
117
118#define _COMPAT_NSIG 64
119#define _COMPAT_NSIG_BPW 32
120
121typedef u32 compat_sigset_word;
122
123#define COMPAT_OFF_T_MAX 0x7fffffff
124#define COMPAT_LOFF_T_MAX 0x7fffffffffffffffL
125
126struct compat_ipc64_perm {
127 compat_key_t key;
128 __compat_uid32_t uid;
129 __compat_gid32_t gid;
130 __compat_uid32_t cuid;
131 __compat_gid32_t cgid;
132 unsigned short mode;
133 unsigned short __pad1;
134 unsigned short seq;
135 unsigned short __pad2;
136 compat_ulong_t unused1;
137 compat_ulong_t unused2;
138};
139
140struct compat_semid64_ds {
141 struct compat_ipc64_perm sem_perm;
142 compat_time_t sem_otime;
143 compat_ulong_t __unused1;
144 compat_time_t sem_ctime;
145 compat_ulong_t __unused2;
146 compat_ulong_t sem_nsems;
147 compat_ulong_t __unused3;
148 compat_ulong_t __unused4;
149};
150
151struct compat_msqid64_ds {
152 struct compat_ipc64_perm msg_perm;
153 compat_time_t msg_stime;
154 compat_ulong_t __unused1;
155 compat_time_t msg_rtime;
156 compat_ulong_t __unused2;
157 compat_time_t msg_ctime;
158 compat_ulong_t __unused3;
159 compat_ulong_t msg_cbytes;
160 compat_ulong_t msg_qnum;
161 compat_ulong_t msg_qbytes;
162 compat_pid_t msg_lspid;
163 compat_pid_t msg_lrpid;
164 compat_ulong_t __unused4;
165 compat_ulong_t __unused5;
166};
167
168struct compat_shmid64_ds {
169 struct compat_ipc64_perm shm_perm;
170 compat_size_t shm_segsz;
171 compat_time_t shm_atime;
172 compat_ulong_t __unused1;
173 compat_time_t shm_dtime;
174 compat_ulong_t __unused2;
175 compat_time_t shm_ctime;
176 compat_ulong_t __unused3;
177 compat_pid_t shm_cpid;
178 compat_pid_t shm_lpid;
179 compat_ulong_t shm_nattch;
180 compat_ulong_t __unused4;
181 compat_ulong_t __unused5;
182};
183
184/*
185 * The type of struct elf_prstatus.pr_reg in compatible core dumps.
186 */
187typedef struct user_regs_struct32 compat_elf_gregset_t;
188
189/*
190 * A pointer passed in from user mode. This should not
191 * be used for syscall parameters, just declare them
192 * as pointers because the syscall entry code will have
193 * appropriately converted them already.
194 */
195typedef u32 compat_uptr_t;
196
197static inline void __user *compat_ptr(compat_uptr_t uptr)
198{
199 return (void __user *)(unsigned long)uptr;
200}
201
202static inline compat_uptr_t ptr_to_compat(void __user *uptr)
203{
204 return (u32)(unsigned long)uptr;
205}
206
207static inline void __user *compat_alloc_user_space(long len)
208{
209 struct pt_regs *regs = task_pt_regs(current);
210 return (void __user *)regs->sp - len;
211}
212
213static inline int is_compat_task(void)
214{
215 return current_thread_info()->status & TS_COMPAT;
216}
217
218#endif /* _ASM_X86_COMPAT_H */
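Not part of the patch: a minimal sketch of how compat_ptr() and the compat_* types above are used when a 64-bit kernel services a request from a 32-bit task. The request structure and handler are hypothetical.

#include <linux/compat.h>
#include <linux/uaccess.h>
#include <linux/errno.h>

struct sample_compat_req {
	compat_uptr_t buf;	/* 32-bit user pointer                  */
	compat_size_t len;	/* 32-bit length                        */
};

/* Hypothetical compat handler: zero the user buffer described by @ureq. */
static long sample_compat_clear(struct sample_compat_req __user *ureq)
{
	struct sample_compat_req req;
	void __user *buf;

	if (copy_from_user(&req, ureq, sizeof(req)))
		return -EFAULT;

	buf = compat_ptr(req.buf);	/* widen before use in kernel code */

	if (clear_user(buf, req.len))
		return -EFAULT;
	return req.len;
}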
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
new file mode 100644
index 000000000000..bae482df6039
--- /dev/null
+++ b/arch/x86/include/asm/cpu.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_CPU_H
2#define _ASM_X86_CPU_H
3
4#include <linux/device.h>
5#include <linux/cpu.h>
6#include <linux/topology.h>
7#include <linux/nodemask.h>
8#include <linux/percpu.h>
9
10struct x86_cpu {
11 struct cpu cpu;
12};
13
14#ifdef CONFIG_HOTPLUG_CPU
15extern int arch_register_cpu(int num);
16extern void arch_unregister_cpu(int);
17#endif
18
19DECLARE_PER_CPU(int, cpu_state);
20#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
new file mode 100644
index 000000000000..f73e95d75b45
--- /dev/null
+++ b/arch/x86/include/asm/cpufeature.h
@@ -0,0 +1,271 @@
1/*
2 * Defines x86 CPU feature bits
3 */
4#ifndef _ASM_X86_CPUFEATURE_H
5#define _ASM_X86_CPUFEATURE_H
6
7#include <asm/required-features.h>
8
9#define NCAPINTS 9 /* N 32-bit words worth of info */
10
11/*
12 * Note: If the comment begins with a quoted string, that string is used
13 * in /proc/cpuinfo instead of the macro name. If the string is "",
14 * this feature bit is not displayed in /proc/cpuinfo at all.
15 */
16
17/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
18#define X86_FEATURE_FPU (0*32+ 0) /* Onboard FPU */
19#define X86_FEATURE_VME (0*32+ 1) /* Virtual Mode Extensions */
20#define X86_FEATURE_DE (0*32+ 2) /* Debugging Extensions */
21#define X86_FEATURE_PSE (0*32+ 3) /* Page Size Extensions */
22#define X86_FEATURE_TSC (0*32+ 4) /* Time Stamp Counter */
23#define X86_FEATURE_MSR (0*32+ 5) /* Model-Specific Registers */
24#define X86_FEATURE_PAE (0*32+ 6) /* Physical Address Extensions */
25#define X86_FEATURE_MCE (0*32+ 7) /* Machine Check Architecture */
26#define X86_FEATURE_CX8 (0*32+ 8) /* CMPXCHG8 instruction */
27#define X86_FEATURE_APIC (0*32+ 9) /* Onboard APIC */
28#define X86_FEATURE_SEP (0*32+11) /* SYSENTER/SYSEXIT */
29#define X86_FEATURE_MTRR (0*32+12) /* Memory Type Range Registers */
30#define X86_FEATURE_PGE (0*32+13) /* Page Global Enable */
31#define X86_FEATURE_MCA (0*32+14) /* Machine Check Architecture */
32#define X86_FEATURE_CMOV (0*32+15) /* CMOV instructions */
33 /* (plus FCMOVcc, FCOMI with FPU) */
34#define X86_FEATURE_PAT (0*32+16) /* Page Attribute Table */
35#define X86_FEATURE_PSE36 (0*32+17) /* 36-bit PSEs */
36#define X86_FEATURE_PN (0*32+18) /* Processor serial number */
37#define X86_FEATURE_CLFLSH (0*32+19) /* "clflush" CLFLUSH instruction */
38#define X86_FEATURE_DS (0*32+21) /* "dts" Debug Store */
39#define X86_FEATURE_ACPI (0*32+22) /* ACPI via MSR */
40#define X86_FEATURE_MMX (0*32+23) /* Multimedia Extensions */
41#define X86_FEATURE_FXSR (0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
42#define X86_FEATURE_XMM (0*32+25) /* "sse" */
43#define X86_FEATURE_XMM2 (0*32+26) /* "sse2" */
44#define X86_FEATURE_SELFSNOOP (0*32+27) /* "ss" CPU self snoop */
45#define X86_FEATURE_HT (0*32+28) /* Hyper-Threading */
46#define X86_FEATURE_ACC (0*32+29) /* "tm" Automatic clock control */
47#define X86_FEATURE_IA64 (0*32+30) /* IA-64 processor */
48#define X86_FEATURE_PBE (0*32+31) /* Pending Break Enable */
49
50/* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
51/* Don't duplicate feature flags which are redundant with Intel! */
52#define X86_FEATURE_SYSCALL (1*32+11) /* SYSCALL/SYSRET */
53#define X86_FEATURE_MP (1*32+19) /* MP Capable. */
54#define X86_FEATURE_NX (1*32+20) /* Execute Disable */
55#define X86_FEATURE_MMXEXT (1*32+22) /* AMD MMX extensions */
56#define X86_FEATURE_FXSR_OPT (1*32+25) /* FXSAVE/FXRSTOR optimizations */
57#define X86_FEATURE_GBPAGES (1*32+26) /* "pdpe1gb" GB pages */
58#define X86_FEATURE_RDTSCP (1*32+27) /* RDTSCP */
59#define X86_FEATURE_LM (1*32+29) /* Long Mode (x86-64) */
60#define X86_FEATURE_3DNOWEXT (1*32+30) /* AMD 3DNow! extensions */
61#define X86_FEATURE_3DNOW (1*32+31) /* 3DNow! */
62
63/* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
64#define X86_FEATURE_RECOVERY (2*32+ 0) /* CPU in recovery mode */
65#define X86_FEATURE_LONGRUN (2*32+ 1) /* Longrun power control */
66#define X86_FEATURE_LRTI (2*32+ 3) /* LongRun table interface */
67
68/* Other features, Linux-defined mapping, word 3 */
69/* This range is used for feature bits which conflict or are synthesized */
70#define X86_FEATURE_CXMMX (3*32+ 0) /* Cyrix MMX extensions */
71#define X86_FEATURE_K6_MTRR (3*32+ 1) /* AMD K6 nonstandard MTRRs */
72#define X86_FEATURE_CYRIX_ARR (3*32+ 2) /* Cyrix ARRs (= MTRRs) */
73#define X86_FEATURE_CENTAUR_MCR (3*32+ 3) /* Centaur MCRs (= MTRRs) */
74/* cpu types for specific tunings: */
75#define X86_FEATURE_K8 (3*32+ 4) /* "" Opteron, Athlon64 */
76#define X86_FEATURE_K7 (3*32+ 5) /* "" Athlon */
77#define X86_FEATURE_P3 (3*32+ 6) /* "" P3 */
78#define X86_FEATURE_P4 (3*32+ 7) /* "" P4 */
79#define X86_FEATURE_CONSTANT_TSC (3*32+ 8) /* TSC ticks at a constant rate */
80#define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */
81#define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* "" FXSAVE leaks FOP/FIP/FOP */
82#define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */
83#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
84#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */
85#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */
86#define X86_FEATURE_SYSCALL32 (3*32+14) /* "" syscall in ia32 userspace */
87#define X86_FEATURE_SYSENTER32 (3*32+15) /* "" sysenter in ia32 userspace */
88#define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well */
89#define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* "" Mfence synchronizes RDTSC */
90#define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* "" Lfence synchronizes RDTSC */
91#define X86_FEATURE_11AP (3*32+19) /* "" Bad local APIC aka 11AP */
92#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */
93#define X86_FEATURE_AMDC1E (3*32+21) /* AMD C1E detected */
94#define X86_FEATURE_XTOPOLOGY (3*32+22) /* cpu topology enum extensions */
95
96/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
97#define X86_FEATURE_XMM3 (4*32+ 0) /* "pni" SSE-3 */
98#define X86_FEATURE_PCLMULQDQ (4*32+ 1) /* PCLMULQDQ instruction */
99#define X86_FEATURE_DTES64 (4*32+ 2) /* 64-bit Debug Store */
100#define X86_FEATURE_MWAIT (4*32+ 3) /* "monitor" Monitor/Mwait support */
101#define X86_FEATURE_DSCPL (4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
102#define X86_FEATURE_VMX (4*32+ 5) /* Hardware virtualization */
103#define X86_FEATURE_SMX (4*32+ 6) /* Safer mode */
104#define X86_FEATURE_EST (4*32+ 7) /* Enhanced SpeedStep */
105#define X86_FEATURE_TM2 (4*32+ 8) /* Thermal Monitor 2 */
106#define X86_FEATURE_SSSE3 (4*32+ 9) /* Supplemental SSE-3 */
107#define X86_FEATURE_CID (4*32+10) /* Context ID */
108#define X86_FEATURE_FMA (4*32+12) /* Fused multiply-add */
109#define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */
110#define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */
111#define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */
112#define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */
113#define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */
114#define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */
115#define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */
116#define X86_FEATURE_AES (4*32+25) /* AES instructions */
117#define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
118#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
119#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
120
121/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
122#define X86_FEATURE_XSTORE (5*32+ 2) /* "rng" RNG present (xstore) */
123#define X86_FEATURE_XSTORE_EN (5*32+ 3) /* "rng_en" RNG enabled */
124#define X86_FEATURE_XCRYPT (5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
125#define X86_FEATURE_XCRYPT_EN (5*32+ 7) /* "ace_en" on-CPU crypto enabled */
126#define X86_FEATURE_ACE2 (5*32+ 8) /* Advanced Cryptography Engine v2 */
127#define X86_FEATURE_ACE2_EN (5*32+ 9) /* ACE v2 enabled */
128#define X86_FEATURE_PHE (5*32+10) /* PadLock Hash Engine */
129#define X86_FEATURE_PHE_EN (5*32+11) /* PHE enabled */
130#define X86_FEATURE_PMM (5*32+12) /* PadLock Montgomery Multiplier */
131#define X86_FEATURE_PMM_EN (5*32+13) /* PMM enabled */
132
133/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
134#define X86_FEATURE_LAHF_LM (6*32+ 0) /* LAHF/SAHF in long mode */
135#define X86_FEATURE_CMP_LEGACY (6*32+ 1) /* If yes HyperThreading not valid */
136#define X86_FEATURE_SVM (6*32+ 2) /* Secure virtual machine */
137#define X86_FEATURE_EXTAPIC (6*32+ 3) /* Extended APIC space */
138#define X86_FEATURE_CR8_LEGACY (6*32+ 4) /* CR8 in 32-bit mode */
139#define X86_FEATURE_ABM (6*32+ 5) /* Advanced bit manipulation */
140#define X86_FEATURE_SSE4A (6*32+ 6) /* SSE-4A */
141#define X86_FEATURE_MISALIGNSSE (6*32+ 7) /* Misaligned SSE mode */
142#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
143#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
144#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
145#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
146#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
147#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
148
149/*
150 * Auxiliary flags: Linux defined - For features scattered in various
151 * CPUID levels like 0x6, 0xA etc
152 */
153#define X86_FEATURE_IDA (7*32+ 0) /* Intel Dynamic Acceleration */
154
155/* Virtualization flags: Linux defined */
156#define X86_FEATURE_TPR_SHADOW (8*32+ 0) /* Intel TPR Shadow */
157#define X86_FEATURE_VNMI (8*32+ 1) /* Intel Virtual NMI */
158#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
159#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
160#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
161
162#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
163
164#include <linux/bitops.h>
165
166extern const char * const x86_cap_flags[NCAPINTS*32];
167extern const char * const x86_power_flags[32];
168
169#define test_cpu_cap(c, bit) \
170 test_bit(bit, (unsigned long *)((c)->x86_capability))
171
172#define cpu_has(c, bit) \
173 (__builtin_constant_p(bit) && \
174 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
175 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
176 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
177 (((bit)>>5)==3 && (1UL<<((bit)&31) & REQUIRED_MASK3)) || \
178 (((bit)>>5)==4 && (1UL<<((bit)&31) & REQUIRED_MASK4)) || \
179 (((bit)>>5)==5 && (1UL<<((bit)&31) & REQUIRED_MASK5)) || \
180 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
181 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) ) \
182 ? 1 : \
183 test_cpu_cap(c, bit))
184
185#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
186
187#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
188#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability))
189#define setup_clear_cpu_cap(bit) do { \
190 clear_cpu_cap(&boot_cpu_data, bit); \
191 set_bit(bit, (unsigned long *)cleared_cpu_caps); \
192} while (0)
193#define setup_force_cpu_cap(bit) do { \
194 set_cpu_cap(&boot_cpu_data, bit); \
195 clear_bit(bit, (unsigned long *)cleared_cpu_caps); \
196} while (0)
197
198#define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU)
199#define cpu_has_vme boot_cpu_has(X86_FEATURE_VME)
200#define cpu_has_de boot_cpu_has(X86_FEATURE_DE)
201#define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE)
202#define cpu_has_tsc boot_cpu_has(X86_FEATURE_TSC)
203#define cpu_has_pae boot_cpu_has(X86_FEATURE_PAE)
204#define cpu_has_pge boot_cpu_has(X86_FEATURE_PGE)
205#define cpu_has_apic boot_cpu_has(X86_FEATURE_APIC)
206#define cpu_has_sep boot_cpu_has(X86_FEATURE_SEP)
207#define cpu_has_mtrr boot_cpu_has(X86_FEATURE_MTRR)
208#define cpu_has_mmx boot_cpu_has(X86_FEATURE_MMX)
209#define cpu_has_fxsr boot_cpu_has(X86_FEATURE_FXSR)
210#define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM)
211#define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2)
212#define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3)
213#define cpu_has_ht boot_cpu_has(X86_FEATURE_HT)
214#define cpu_has_mp boot_cpu_has(X86_FEATURE_MP)
215#define cpu_has_nx boot_cpu_has(X86_FEATURE_NX)
216#define cpu_has_k6_mtrr boot_cpu_has(X86_FEATURE_K6_MTRR)
217#define cpu_has_cyrix_arr boot_cpu_has(X86_FEATURE_CYRIX_ARR)
218#define cpu_has_centaur_mcr boot_cpu_has(X86_FEATURE_CENTAUR_MCR)
219#define cpu_has_xstore boot_cpu_has(X86_FEATURE_XSTORE)
220#define cpu_has_xstore_enabled boot_cpu_has(X86_FEATURE_XSTORE_EN)
221#define cpu_has_xcrypt boot_cpu_has(X86_FEATURE_XCRYPT)
222#define cpu_has_xcrypt_enabled boot_cpu_has(X86_FEATURE_XCRYPT_EN)
223#define cpu_has_ace2 boot_cpu_has(X86_FEATURE_ACE2)
224#define cpu_has_ace2_enabled boot_cpu_has(X86_FEATURE_ACE2_EN)
225#define cpu_has_phe boot_cpu_has(X86_FEATURE_PHE)
226#define cpu_has_phe_enabled boot_cpu_has(X86_FEATURE_PHE_EN)
227#define cpu_has_pmm boot_cpu_has(X86_FEATURE_PMM)
228#define cpu_has_pmm_enabled boot_cpu_has(X86_FEATURE_PMM_EN)
229#define cpu_has_ds boot_cpu_has(X86_FEATURE_DS)
230#define cpu_has_pebs boot_cpu_has(X86_FEATURE_PEBS)
231#define cpu_has_clflush boot_cpu_has(X86_FEATURE_CLFLSH)
232#define cpu_has_bts boot_cpu_has(X86_FEATURE_BTS)
233#define cpu_has_gbpages boot_cpu_has(X86_FEATURE_GBPAGES)
234#define cpu_has_arch_perfmon boot_cpu_has(X86_FEATURE_ARCH_PERFMON)
235#define cpu_has_pat boot_cpu_has(X86_FEATURE_PAT)
236#define cpu_has_xmm4_1 boot_cpu_has(X86_FEATURE_XMM4_1)
237#define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2)
238#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
239#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
240
241#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
242# define cpu_has_invlpg 1
243#else
244# define cpu_has_invlpg (boot_cpu_data.x86 > 3)
245#endif
246
247#ifdef CONFIG_X86_64
248
249#undef cpu_has_vme
250#define cpu_has_vme 0
251
252#undef cpu_has_pae
253#define cpu_has_pae ___BUG___
254
255#undef cpu_has_mp
256#define cpu_has_mp 1
257
258#undef cpu_has_k6_mtrr
259#define cpu_has_k6_mtrr 0
260
261#undef cpu_has_cyrix_arr
262#define cpu_has_cyrix_arr 0
263
264#undef cpu_has_centaur_mcr
265#define cpu_has_centaur_mcr 0
266
267#endif /* CONFIG_X86_64 */
268
269#endif /* defined(__KERNEL__) && !defined(__ASSEMBLY__) */
270
271#endif /* _ASM_X86_CPUFEATURE_H */
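Not part of the patch: a minimal sketch of consuming these macros. When a feature bit is guaranteed by the REQUIRED_MASK* words, cpu_has() collapses to a constant 1 at compile time and the fallback branch disappears. The function name is hypothetical.

#include <linux/kernel.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>

/* Hypothetical setup hook: pick a copy routine based on CPU features. */
static void sample_pick_copy_routine(void)
{
	if (boot_cpu_has(X86_FEATURE_XMM2))
		printk(KERN_INFO "sample: using SSE2 copy\n");
	else if (cpu_has_mmx)
		printk(KERN_INFO "sample: using MMX copy\n");
	else
		printk(KERN_INFO "sample: using integer copy\n");
}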
diff --git a/arch/x86/include/asm/cputime.h b/arch/x86/include/asm/cputime.h
new file mode 100644
index 000000000000..6d68ad7e0ea3
--- /dev/null
+++ b/arch/x86/include/asm/cputime.h
@@ -0,0 +1 @@
#include <asm-generic/cputime.h>
diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
new file mode 100644
index 000000000000..0930b4f8d672
--- /dev/null
+++ b/arch/x86/include/asm/current.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_CURRENT_H
2#define _ASM_X86_CURRENT_H
3
4#ifdef CONFIG_X86_32
5#include <linux/compiler.h>
6#include <asm/percpu.h>
7
8struct task_struct;
9
10DECLARE_PER_CPU(struct task_struct *, current_task);
11static __always_inline struct task_struct *get_current(void)
12{
13 return x86_read_percpu(current_task);
14}
15
16#else /* X86_32 */
17
18#ifndef __ASSEMBLY__
19#include <asm/pda.h>
20
21struct task_struct;
22
23static __always_inline struct task_struct *get_current(void)
24{
25 return read_pda(pcurrent);
26}
27
28#else /* __ASSEMBLY__ */
29
30#include <asm/asm-offsets.h>
31#define GET_CURRENT(reg) movq %gs:(pda_pcurrent),reg
32
33#endif /* __ASSEMBLY__ */
34
35#endif /* X86_32 */
36
37#define current get_current()
38
39#endif /* _ASM_X86_CURRENT_H */
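Not part of the patch: `current` expands to get_current() above, so any process-context code can treat it as a pointer to the running task. A one-function sketch with a hypothetical name:

#include <linux/kernel.h>
#include <linux/sched.h>

/* Hypothetical helper: log which task invoked us. */
static void sample_report_caller(void)
{
	printk(KERN_DEBUG "sample: called by %s (pid %d)\n",
	       current->comm, task_pid_nr(current));
}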
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
new file mode 100644
index 000000000000..3ea6f37be9e2
--- /dev/null
+++ b/arch/x86/include/asm/debugreg.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_DEBUGREG_H
2#define _ASM_X86_DEBUGREG_H
3
4
5/* Define the register numbers for the specific debug registers.
6   Registers 0-3 contain the addresses we wish to trap on */
7#define DR_FIRSTADDR 0 /* u_debugreg[DR_FIRSTADDR] */
8#define DR_LASTADDR 3 /* u_debugreg[DR_LASTADDR] */
9
10#define DR_STATUS 6 /* u_debugreg[DR_STATUS] */
11#define DR_CONTROL 7 /* u_debugreg[DR_CONTROL] */
12
13/* Define a few things for the status register. We can use this to determine
14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */
16
17#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */
20#define DR_TRAP3 (0x8) /* db3 */
21
22#define DR_STEP (0x4000) /* single-step */
23#define DR_SWITCH (0x8000) /* task switch */
24
25/* Now define a bunch of things for manipulating the control register.
26 The top two bytes of the control register consist of 4 fields of 4
27 bits - each field corresponds to one of the four debug registers,
28 and indicates what types of access we trap on, and how large the data
29 field is that we are looking at */
30
31#define DR_CONTROL_SHIFT 16 /* Skip this many bits in ctl register */
32#define DR_CONTROL_SIZE 4 /* 4 control bits per register */
33
34#define DR_RW_EXECUTE (0x0) /* Settings for the access types to trap on */
35#define DR_RW_WRITE (0x1)
36#define DR_RW_READ (0x3)
37
38#define DR_LEN_1 (0x0) /* Settings for data length to trap on */
39#define DR_LEN_2 (0x4)
40#define DR_LEN_4 (0xC)
41#define DR_LEN_8 (0x8)
42
43/* The low byte of the control register determines which registers are
44   enabled. There are 4 fields of two bits. One bit is "local", meaning
45   that the processor will reset the bit after a task switch, and the other
46   is global, meaning that we have to explicitly reset the bit. With Linux,
47   you can use either one, since we explicitly zero the register when we enter
48   kernel mode. */
49
50#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
51#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
52#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
53
54#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
55#define DR_GLOBAL_ENABLE_MASK (0xAA) /* Set global bits for all 4 regs */
56
57/* The second byte of the control register has a few special things.
58 We can slow the instruction pipeline for instructions coming via the
59 gdt or the ldt if we want to. I am not sure why this is an advantage */
60
61#ifdef __i386__
62#define DR_CONTROL_RESERVED (0xFC00) /* Reserved by Intel */
63#else
64#define DR_CONTROL_RESERVED (0xFFFFFFFF0000FC00UL) /* Reserved */
65#endif
66
67#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
68#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
69
70#endif /* _ASM_X86_DEBUGREG_H */
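Not part of the patch: a worked example of how the DR7 field macros above combine. Arming debug register 0 as a 4-byte write watchpoint needs its local-enable bit in the low byte plus an R/W|LEN nibble in the upper half. The helper name is hypothetical.

#include <asm/debugreg.h>

/* Hypothetical helper: DR7 value for a 4-byte write watchpoint in DR0. */
static unsigned long sample_dr7_write_watch0(void)
{
	unsigned long dr7 = 0;

	/* local-enable bit for register 0: 2 enable bits per register */
	dr7 |= 1UL << (DR_LOCAL_ENABLE_SHIFT + 0 * DR_ENABLE_SIZE);

	/* R/W type and length for register 0: 4 control bits per register */
	dr7 |= (unsigned long)(DR_RW_WRITE | DR_LEN_4) <<
	       (DR_CONTROL_SHIFT + 0 * DR_CONTROL_SIZE);

	return dr7;	/* 0x000d0001 */
}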
diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
new file mode 100644
index 000000000000..409a649204aa
--- /dev/null
+++ b/arch/x86/include/asm/delay.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_DELAY_H
2#define _ASM_X86_DELAY_H
3
4/*
5 * Copyright (C) 1993 Linus Torvalds
6 *
7 * Delay routines calling functions in arch/x86/lib/delay.c
8 */
9
10/* Undefined functions to get compile-time errors */
11extern void __bad_udelay(void);
12extern void __bad_ndelay(void);
13
14extern void __udelay(unsigned long usecs);
15extern void __ndelay(unsigned long nsecs);
16extern void __const_udelay(unsigned long xloops);
17extern void __delay(unsigned long loops);
18
19/* 0x10c7 is 2**32 / 1000000 (rounded up) */
20#define udelay(n) (__builtin_constant_p(n) ? \
21 ((n) > 20000 ? __bad_udelay() : __const_udelay((n) * 0x10c7ul)) : \
22 __udelay(n))
23
24/* 0x5 is 2**32 / 1000000000 (rounded up) */
25#define ndelay(n) (__builtin_constant_p(n) ? \
26 ((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
27 __ndelay(n))
28
29void use_tsc_delay(void);
30
31#endif /* _ASM_X86_DELAY_H */
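Not part of the patch: the 0x10c7 above is ceil(2^32 / 10^6) = 4295, which pre-scales a constant microsecond count for the fixed-point arithmetic inside __const_udelay(); 0x5 is the same rounding of 2^32 / 10^9. Typical driver usage, with a hypothetical device register:

#include <linux/delay.h>
#include <linux/io.h>

/* Hypothetical helper: pulse a device reset line for ~10 microseconds. */
static void sample_reset_pulse(void __iomem *reg)
{
	writeb(0x01, reg);	/* assert reset                              */
	udelay(10);		/* constant arg: __const_udelay(10 * 0x10c7) */
	writeb(0x00, reg);	/* deassert                                  */
}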
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
new file mode 100644
index 000000000000..e6b82b17b072
--- /dev/null
+++ b/arch/x86/include/asm/desc.h
@@ -0,0 +1,409 @@
1#ifndef _ASM_X86_DESC_H
2#define _ASM_X86_DESC_H
3
4#ifndef __ASSEMBLY__
5#include <asm/desc_defs.h>
6#include <asm/ldt.h>
7#include <asm/mmu.h>
8#include <linux/smp.h>
9
10static inline void fill_ldt(struct desc_struct *desc,
11 const struct user_desc *info)
12{
13 desc->limit0 = info->limit & 0x0ffff;
14 desc->base0 = info->base_addr & 0x0000ffff;
15
16 desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
17 desc->type = (info->read_exec_only ^ 1) << 1;
18 desc->type |= info->contents << 2;
19 desc->s = 1;
20 desc->dpl = 0x3;
21 desc->p = info->seg_not_present ^ 1;
22 desc->limit = (info->limit & 0xf0000) >> 16;
23 desc->avl = info->useable;
24 desc->d = info->seg_32bit;
25 desc->g = info->limit_in_pages;
26 desc->base2 = (info->base_addr & 0xff000000) >> 24;
27 /*
28 * Don't allow setting of the lm bit. It is useless anyway
29 * because 64bit system calls require __USER_CS:
30 */
31 desc->l = 0;
32}
33
34extern struct desc_ptr idt_descr;
35extern gate_desc idt_table[];
36
37struct gdt_page {
38 struct desc_struct gdt[GDT_ENTRIES];
39} __attribute__((aligned(PAGE_SIZE)));
40DECLARE_PER_CPU(struct gdt_page, gdt_page);
41
42static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
43{
44 return per_cpu(gdt_page, cpu).gdt;
45}
46
47#ifdef CONFIG_X86_64
48
49static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
50 unsigned dpl, unsigned ist, unsigned seg)
51{
52 gate->offset_low = PTR_LOW(func);
53 gate->segment = __KERNEL_CS;
54 gate->ist = ist;
55 gate->p = 1;
56 gate->dpl = dpl;
57 gate->zero0 = 0;
58 gate->zero1 = 0;
59 gate->type = type;
60 gate->offset_middle = PTR_MIDDLE(func);
61 gate->offset_high = PTR_HIGH(func);
62}
63
64#else
65static inline void pack_gate(gate_desc *gate, unsigned char type,
66 unsigned long base, unsigned dpl, unsigned flags,
67 unsigned short seg)
68{
69 gate->a = (seg << 16) | (base & 0xffff);
70 gate->b = (base & 0xffff0000) |
71 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
72}
73
74#endif
75
76static inline int desc_empty(const void *ptr)
77{
78 const u32 *desc = ptr;
79 return !(desc[0] | desc[1]);
80}
81
82#ifdef CONFIG_PARAVIRT
83#include <asm/paravirt.h>
84#else
85#define load_TR_desc() native_load_tr_desc()
86#define load_gdt(dtr) native_load_gdt(dtr)
87#define load_idt(dtr) native_load_idt(dtr)
88#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
89#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
90
91#define store_gdt(dtr) native_store_gdt(dtr)
92#define store_idt(dtr) native_store_idt(dtr)
93#define store_tr(tr) (tr = native_store_tr())
94#define store_ldt(ldt) asm("sldt %0":"=m" (ldt))
95
96#define load_TLS(t, cpu) native_load_tls(t, cpu)
97#define set_ldt native_set_ldt
98
99#define write_ldt_entry(dt, entry, desc) \
100 native_write_ldt_entry(dt, entry, desc)
101#define write_gdt_entry(dt, entry, desc, type) \
102 native_write_gdt_entry(dt, entry, desc, type)
103#define write_idt_entry(dt, entry, g) \
104 native_write_idt_entry(dt, entry, g)
105
106static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
107{
108}
109
110static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
111{
112}
113#endif /* CONFIG_PARAVIRT */
114
115static inline void native_write_idt_entry(gate_desc *idt, int entry,
116 const gate_desc *gate)
117{
118 memcpy(&idt[entry], gate, sizeof(*gate));
119}
120
121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry,
122 const void *desc)
123{
124 memcpy(&ldt[entry], desc, 8);
125}
126
127static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry,
128 const void *desc, int type)
129{
130 unsigned int size;
131 switch (type) {
132 case DESC_TSS:
133 size = sizeof(tss_desc);
134 break;
135 case DESC_LDT:
136 size = sizeof(ldt_desc);
137 break;
138 default:
139 size = sizeof(struct desc_struct);
140 break;
141 }
142 memcpy(&gdt[entry], desc, size);
143}
144
145static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
146 unsigned long limit, unsigned char type,
147 unsigned char flags)
148{
149 desc->a = ((base & 0xffff) << 16) | (limit & 0xffff);
150 desc->b = (base & 0xff000000) | ((base & 0xff0000) >> 16) |
151 (limit & 0x000f0000) | ((type & 0xff) << 8) |
152 ((flags & 0xf) << 20);
153 desc->p = 1;
154}
155
156
157static inline void set_tssldt_descriptor(void *d, unsigned long addr,
158 unsigned type, unsigned size)
159{
160#ifdef CONFIG_X86_64
161 struct ldttss_desc64 *desc = d;
162 memset(desc, 0, sizeof(*desc));
163 desc->limit0 = size & 0xFFFF;
164 desc->base0 = PTR_LOW(addr);
165 desc->base1 = PTR_MIDDLE(addr) & 0xFF;
166 desc->type = type;
167 desc->p = 1;
168 desc->limit1 = (size >> 16) & 0xF;
169 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
170 desc->base3 = PTR_HIGH(addr);
171#else
172 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
173#endif
174}
175
176static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr)
177{
178 struct desc_struct *d = get_cpu_gdt_table(cpu);
179 tss_desc tss;
180
181 /*
182 * sizeof(unsigned long) coming from an extra "long" at the end
183 * of the iobitmap. See tss_struct definition in processor.h
184 *
185 * -1? seg base+limit should be pointing to the address of the
186 * last valid byte
187 */
188 set_tssldt_descriptor(&tss, (unsigned long)addr, DESC_TSS,
189 IO_BITMAP_OFFSET + IO_BITMAP_BYTES +
190 sizeof(unsigned long) - 1);
191 write_gdt_entry(d, entry, &tss, DESC_TSS);
192}
193
194#define set_tss_desc(cpu, addr) __set_tss_desc(cpu, GDT_ENTRY_TSS, addr)
195
196static inline void native_set_ldt(const void *addr, unsigned int entries)
197{
198 if (likely(entries == 0))
199 asm volatile("lldt %w0"::"q" (0));
200 else {
201 unsigned cpu = smp_processor_id();
202 ldt_desc ldt;
203
204 set_tssldt_descriptor(&ldt, (unsigned long)addr, DESC_LDT,
205 entries * LDT_ENTRY_SIZE - 1);
206 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT,
207 &ldt, DESC_LDT);
208 asm volatile("lldt %w0"::"q" (GDT_ENTRY_LDT*8));
209 }
210}
211
212static inline void native_load_tr_desc(void)
213{
214 asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8));
215}
216
217static inline void native_load_gdt(const struct desc_ptr *dtr)
218{
219 asm volatile("lgdt %0"::"m" (*dtr));
220}
221
222static inline void native_load_idt(const struct desc_ptr *dtr)
223{
224 asm volatile("lidt %0"::"m" (*dtr));
225}
226
227static inline void native_store_gdt(struct desc_ptr *dtr)
228{
229 asm volatile("sgdt %0":"=m" (*dtr));
230}
231
232static inline void native_store_idt(struct desc_ptr *dtr)
233{
234 asm volatile("sidt %0":"=m" (*dtr));
235}
236
237static inline unsigned long native_store_tr(void)
238{
239 unsigned long tr;
240 asm volatile("str %0":"=r" (tr));
241 return tr;
242}
243
244static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
245{
246 unsigned int i;
247 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
248
249 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
250 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
251}
252
253#define _LDT_empty(info) \
254 ((info)->base_addr == 0 && \
255 (info)->limit == 0 && \
256 (info)->contents == 0 && \
257 (info)->read_exec_only == 1 && \
258 (info)->seg_32bit == 0 && \
259 (info)->limit_in_pages == 0 && \
260 (info)->seg_not_present == 1 && \
261 (info)->useable == 0)
262
263#ifdef CONFIG_X86_64
264#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0))
265#else
266#define LDT_empty(info) (_LDT_empty(info))
267#endif
268
269static inline void clear_LDT(void)
270{
271 set_ldt(NULL, 0);
272}
273
274/*
275 * load one particular LDT into the current CPU
276 */
277static inline void load_LDT_nolock(mm_context_t *pc)
278{
279 set_ldt(pc->ldt, pc->size);
280}
281
282static inline void load_LDT(mm_context_t *pc)
283{
284 preempt_disable();
285 load_LDT_nolock(pc);
286 preempt_enable();
287}
288
289static inline unsigned long get_desc_base(const struct desc_struct *desc)
290{
291 return desc->base0 | ((desc->base1) << 16) | ((desc->base2) << 24);
292}
293
294static inline unsigned long get_desc_limit(const struct desc_struct *desc)
295{
296 return desc->limit0 | (desc->limit << 16);
297}
298
299static inline void _set_gate(int gate, unsigned type, void *addr,
300 unsigned dpl, unsigned ist, unsigned seg)
301{
302 gate_desc s;
303 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
304 /*
305 * does not need to be atomic because it is only done once at
306 * setup time
307 */
308 write_idt_entry(idt_table, gate, &s);
309}
310
311/*
312 * This needs to use 'idt_table' rather than 'idt', and
313 * thus use the _nonmapped_ version of the IDT, as the
314 * Pentium F0 0F bugfix can have resulted in the mapped
315 * IDT being write-protected.
316 */
317static inline void set_intr_gate(unsigned int n, void *addr)
318{
319 BUG_ON((unsigned)n > 0xFF);
320 _set_gate(n, GATE_INTERRUPT, addr, 0, 0, __KERNEL_CS);
321}
322
323#define SYS_VECTOR_FREE 0
324#define SYS_VECTOR_ALLOCED 1
325
326extern int first_system_vector;
327extern char system_vectors[];
328
329static inline void alloc_system_vector(int vector)
330{
331 if (system_vectors[vector] == SYS_VECTOR_FREE) {
332 system_vectors[vector] = SYS_VECTOR_ALLOCED;
333 if (first_system_vector > vector)
334 first_system_vector = vector;
335 } else
336 BUG();
337}
338
339static inline void alloc_intr_gate(unsigned int n, void *addr)
340{
341 alloc_system_vector(n);
342 set_intr_gate(n, addr);
343}
344
345/*
346 * This routine sets up an interrupt gate at descriptor privilege level 3.
347 */
348static inline void set_system_intr_gate(unsigned int n, void *addr)
349{
350 BUG_ON((unsigned)n > 0xFF);
351 _set_gate(n, GATE_INTERRUPT, addr, 0x3, 0, __KERNEL_CS);
352}
353
354static inline void set_system_trap_gate(unsigned int n, void *addr)
355{
356 BUG_ON((unsigned)n > 0xFF);
357 _set_gate(n, GATE_TRAP, addr, 0x3, 0, __KERNEL_CS);
358}
359
360static inline void set_trap_gate(unsigned int n, void *addr)
361{
362 BUG_ON((unsigned)n > 0xFF);
363 _set_gate(n, GATE_TRAP, addr, 0, 0, __KERNEL_CS);
364}
365
366static inline void set_task_gate(unsigned int n, unsigned int gdt_entry)
367{
368 BUG_ON((unsigned)n > 0xFF);
369 _set_gate(n, GATE_TASK, (void *)0, 0, 0, (gdt_entry<<3));
370}
371
372static inline void set_intr_gate_ist(int n, void *addr, unsigned ist)
373{
374 BUG_ON((unsigned)n > 0xFF);
375 _set_gate(n, GATE_INTERRUPT, addr, 0, ist, __KERNEL_CS);
376}
377
378static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
379{
380 BUG_ON((unsigned)n > 0xFF);
381 _set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
382}
383
384#else
385/*
386 * GET_DESC_BASE reads the descriptor base of the specified segment.
387 *
388 * Args:
389 * idx - descriptor index
390 * gdt - GDT pointer
391 * base - 32bit register to which the base will be written
392 * lo_w - lo word of the "base" register
393 * lo_b - lo byte of the "base" register
394 * hi_b - hi byte of the low word of the "base" register
395 *
396 * Example:
397 * GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah)
398 * Will read the base address of GDT_ENTRY_ESPFIX_SS and put it into %eax.
399 */
400#define GET_DESC_BASE(idx, gdt, base, lo_w, lo_b, hi_b) \
401 movb idx * 8 + 4(gdt), lo_b; \
402 movb idx * 8 + 7(gdt), hi_b; \
403 shll $16, base; \
404 movw idx * 8 + 2(gdt), lo_w;
405
406
407#endif /* __ASSEMBLY__ */
408
409#endif /* _ASM_X86_DESC_H */
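Not part of the patch: a minimal sketch of how the gate helpers above are used to populate the IDT during early setup. The vector numbers and the assembly entry points are hypothetical.

#include <linux/init.h>
#include <asm/desc.h>

extern void sample_device_interrupt(void);	/* hypothetical asm stub */
extern void sample_service_entry(void);		/* hypothetical asm stub */

static void __init sample_install_gates(void)
{
	/* DPL 0 interrupt gate: reachable from hardware/kernel only */
	set_intr_gate(0x41, sample_device_interrupt);

	/* DPL 3 interrupt gate: may be invoked by int $0x82 from user mode */
	set_system_intr_gate(0x82, sample_service_entry);
}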
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
new file mode 100644
index 000000000000..a6adefa28b94
--- /dev/null
+++ b/arch/x86/include/asm/desc_defs.h
@@ -0,0 +1,95 @@
1/* Written 2000 by Andi Kleen */
2#ifndef _ASM_X86_DESC_DEFS_H
3#define _ASM_X86_DESC_DEFS_H
4
5/*
6 * Segment descriptor structure definitions, usable from both x86_64 and i386
7 * archs.
8 */
9
10#ifndef __ASSEMBLY__
11
12#include <linux/types.h>
13
14/*
15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b accessors, and doing this allows us to do it
18 * incrementally. We keep the signature as a struct, rather than a union,
19 * so we can get rid of it transparently in the future -- glommer
20 */
21/* 8 byte segment descriptor */
22struct desc_struct {
23 union {
24 struct {
25 unsigned int a;
26 unsigned int b;
27 };
28 struct {
29 u16 limit0;
30 u16 base0;
31 unsigned base1: 8, type: 4, s: 1, dpl: 2, p: 1;
32 unsigned limit: 4, avl: 1, l: 1, d: 1, g: 1, base2: 8;
33 };
34 };
35} __attribute__((packed));
36
37enum {
38 GATE_INTERRUPT = 0xE,
39 GATE_TRAP = 0xF,
40 GATE_CALL = 0xC,
41 GATE_TASK = 0x5,
42};
43
44/* 16byte gate */
45struct gate_struct64 {
46 u16 offset_low;
47 u16 segment;
48 unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1;
49 u16 offset_middle;
50 u32 offset_high;
51 u32 zero1;
52} __attribute__((packed));
53
54#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF)
55#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF)
56#define PTR_HIGH(x) ((unsigned long long)(x) >> 32)
57
58enum {
59 DESC_TSS = 0x9,
60 DESC_LDT = 0x2,
61 DESCTYPE_S = 0x10, /* !system */
62};
63
64/* LDT or TSS descriptor in the GDT. 16 bytes. */
65struct ldttss_desc64 {
66 u16 limit0;
67 u16 base0;
68 unsigned base1 : 8, type : 5, dpl : 2, p : 1;
69 unsigned limit1 : 4, zero0 : 3, g : 1, base2 : 8;
70 u32 base3;
71 u32 zero1;
72} __attribute__((packed));
73
74#ifdef CONFIG_X86_64
75typedef struct gate_struct64 gate_desc;
76typedef struct ldttss_desc64 ldt_desc;
77typedef struct ldttss_desc64 tss_desc;
78#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32))
79#define gate_segment(g) ((g).segment)
80#else
81typedef struct desc_struct gate_desc;
82typedef struct desc_struct ldt_desc;
83typedef struct desc_struct tss_desc;
84#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff))
85#define gate_segment(g) ((g).a >> 16)
86#endif
87
88struct desc_ptr {
89 unsigned short size;
90 unsigned long address;
91} __attribute__((packed)) ;
92
93#endif /* !__ASSEMBLY__ */
94
95#endif /* _ASM_X86_DESC_DEFS_H */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
new file mode 100644
index 000000000000..3c034f48fdb0
--- /dev/null
+++ b/arch/x86/include/asm/device.h
@@ -0,0 +1,16 @@
1#ifndef _ASM_X86_DEVICE_H
2#define _ASM_X86_DEVICE_H
3
4struct dev_archdata {
5#ifdef CONFIG_ACPI
6 void *acpi_handle;
7#endif
8#ifdef CONFIG_X86_64
9	struct dma_mapping_ops *dma_ops;
10#endif
11#ifdef CONFIG_DMAR
12 void *iommu; /* hook for IOMMU specific extension */
13#endif
14};
15
16#endif /* _ASM_X86_DEVICE_H */
diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h
new file mode 100644
index 000000000000..9a2d644c08ef
--- /dev/null
+++ b/arch/x86/include/asm/div64.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_DIV64_H
2#define _ASM_X86_DIV64_H
3
4#ifdef CONFIG_X86_32
5
6#include <linux/types.h>
7
8/*
9 * do_div() is NOT a C function. It wants to return
10 * two values (the quotient and the remainder), but
11 * since that doesn't work very well in C, what it
12 * does is:
13 *
14 * - modifies the 64-bit dividend _in_place_
15 * - returns the 32-bit remainder
16 *
17 * This ends up being the most efficient "calling
18 * convention" on x86.
19 */
20#define do_div(n, base) \
21({ \
22 unsigned long __upper, __low, __high, __mod, __base; \
23 __base = (base); \
24 asm("":"=a" (__low), "=d" (__high) : "A" (n)); \
25 __upper = __high; \
26 if (__high) { \
27 __upper = __high % (__base); \
28 __high = __high / (__base); \
29 } \
30 asm("divl %2":"=a" (__low), "=d" (__mod) \
31 : "rm" (__base), "0" (__low), "1" (__upper)); \
32 asm("":"=A" (n) : "a" (__low), "d" (__high)); \
33 __mod; \
34})
35
36static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder)
37{
38 union {
39 u64 v64;
40 u32 v32[2];
41 } d = { dividend };
42 u32 upper;
43
44 upper = d.v32[1];
45 d.v32[1] = 0;
46 if (upper >= divisor) {
47 d.v32[1] = upper / divisor;
48 upper %= divisor;
49 }
50 asm ("divl %2" : "=a" (d.v32[0]), "=d" (*remainder) :
51 "rm" (divisor), "0" (d.v32[0]), "1" (upper));
52 return d.v64;
53}
54#define div_u64_rem div_u64_rem
55
56#else
57# include <asm-generic/div64.h>
58#endif /* CONFIG_X86_32 */
59
60#endif /* _ASM_X86_DIV64_H */
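Not part of the patch: do_div() above modifies its 64-bit dividend in place and returns the 32-bit remainder, so it must be handed an lvalue copy. A minimal sketch with a hypothetical name:

#include <linux/types.h>
#include <asm/div64.h>

/* Hypothetical helper: split nanoseconds into milliseconds + remainder. */
static u64 sample_ns_to_ms(u64 ns, u32 *rem_ns)
{
	u64 ms = ns;			/* copy: do_div() overwrites it */

	*rem_ns = do_div(ms, 1000000);	/* ms now holds ns / 10^6       */
	return ms;
}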
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
new file mode 100644
index 000000000000..4a5397bfce27
--- /dev/null
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -0,0 +1,308 @@
1#ifndef _ASM_X86_DMA_MAPPING_H
2#define _ASM_X86_DMA_MAPPING_H
3
4/*
5 * IOMMU interface. See Documentation/DMA-mapping.txt and DMA-API.txt for
6 * documentation.
7 */
8
9#include <linux/scatterlist.h>
10#include <asm/io.h>
11#include <asm/swiotlb.h>
12#include <asm-generic/dma-coherent.h>
13
14extern dma_addr_t bad_dma_address;
15extern int iommu_merge;
16extern struct device x86_dma_fallback_dev;
17extern int panic_on_overflow;
18
19struct dma_mapping_ops {
20 int (*mapping_error)(struct device *dev,
21 dma_addr_t dma_addr);
22 void* (*alloc_coherent)(struct device *dev, size_t size,
23 dma_addr_t *dma_handle, gfp_t gfp);
24 void (*free_coherent)(struct device *dev, size_t size,
25 void *vaddr, dma_addr_t dma_handle);
26 dma_addr_t (*map_single)(struct device *hwdev, phys_addr_t ptr,
27 size_t size, int direction);
28 void (*unmap_single)(struct device *dev, dma_addr_t addr,
29 size_t size, int direction);
30 void (*sync_single_for_cpu)(struct device *hwdev,
31 dma_addr_t dma_handle, size_t size,
32 int direction);
33 void (*sync_single_for_device)(struct device *hwdev,
34 dma_addr_t dma_handle, size_t size,
35 int direction);
36 void (*sync_single_range_for_cpu)(struct device *hwdev,
37 dma_addr_t dma_handle, unsigned long offset,
38 size_t size, int direction);
39 void (*sync_single_range_for_device)(struct device *hwdev,
40 dma_addr_t dma_handle, unsigned long offset,
41 size_t size, int direction);
42 void (*sync_sg_for_cpu)(struct device *hwdev,
43 struct scatterlist *sg, int nelems,
44 int direction);
45 void (*sync_sg_for_device)(struct device *hwdev,
46 struct scatterlist *sg, int nelems,
47 int direction);
48 int (*map_sg)(struct device *hwdev, struct scatterlist *sg,
49 int nents, int direction);
50 void (*unmap_sg)(struct device *hwdev,
51 struct scatterlist *sg, int nents,
52 int direction);
53 int (*dma_supported)(struct device *hwdev, u64 mask);
54 int is_phys;
55};
56
57extern struct dma_mapping_ops *dma_ops;
58
59static inline struct dma_mapping_ops *get_dma_ops(struct device *dev)
60{
61#ifdef CONFIG_X86_32
62 return dma_ops;
63#else
64 if (unlikely(!dev) || !dev->archdata.dma_ops)
65 return dma_ops;
66 else
67 return dev->archdata.dma_ops;
68#endif /* CONFIG_X86_32 */
69}
70
71/* Make sure we keep the same behaviour */
72static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
73{
74#ifdef CONFIG_X86_32
75 return 0;
76#else
77 struct dma_mapping_ops *ops = get_dma_ops(dev);
78 if (ops->mapping_error)
79 return ops->mapping_error(dev, dma_addr);
80
81 return (dma_addr == bad_dma_address);
82#endif
83}
84
85#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
86#define dma_free_noncoherent(d, s, v, h) dma_free_coherent(d, s, v, h)
87#define dma_is_consistent(d, h) (1)
88
89extern int dma_supported(struct device *hwdev, u64 mask);
90extern int dma_set_mask(struct device *dev, u64 mask);
91
92extern void *dma_generic_alloc_coherent(struct device *dev, size_t size,
93 dma_addr_t *dma_addr, gfp_t flag);
94
95static inline dma_addr_t
96dma_map_single(struct device *hwdev, void *ptr, size_t size,
97 int direction)
98{
99 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
100
101 BUG_ON(!valid_dma_direction(direction));
102 return ops->map_single(hwdev, virt_to_phys(ptr), size, direction);
103}
104
105static inline void
106dma_unmap_single(struct device *dev, dma_addr_t addr, size_t size,
107 int direction)
108{
109 struct dma_mapping_ops *ops = get_dma_ops(dev);
110
111 BUG_ON(!valid_dma_direction(direction));
112 if (ops->unmap_single)
113 ops->unmap_single(dev, addr, size, direction);
114}
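/*
 * Streaming-DMA usage sketch for the two helpers above (illustrative
 * only; "dev", "buf" and "len" are hypothetical driver-local values):
 *
 *	dma_addr_t handle;
 *
 *	handle = dma_map_single(dev, buf, len, DMA_TO_DEVICE);
 *	if (dma_mapping_error(dev, handle))
 *		return -EIO;
 *	// ... program "handle" into the device and run the transfer ...
 *	dma_unmap_single(dev, handle, len, DMA_TO_DEVICE);
 */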
115
116static inline int
117dma_map_sg(struct device *hwdev, struct scatterlist *sg,
118 int nents, int direction)
119{
120 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
121
122 BUG_ON(!valid_dma_direction(direction));
123 return ops->map_sg(hwdev, sg, nents, direction);
124}
125
126static inline void
127dma_unmap_sg(struct device *hwdev, struct scatterlist *sg, int nents,
128 int direction)
129{
130 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
131
132 BUG_ON(!valid_dma_direction(direction));
133 if (ops->unmap_sg)
134 ops->unmap_sg(hwdev, sg, nents, direction);
135}
136
137static inline void
138dma_sync_single_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
139 size_t size, int direction)
140{
141 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
142
143 BUG_ON(!valid_dma_direction(direction));
144 if (ops->sync_single_for_cpu)
145 ops->sync_single_for_cpu(hwdev, dma_handle, size, direction);
146 flush_write_buffers();
147}
148
149static inline void
150dma_sync_single_for_device(struct device *hwdev, dma_addr_t dma_handle,
151 size_t size, int direction)
152{
153 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
154
155 BUG_ON(!valid_dma_direction(direction));
156 if (ops->sync_single_for_device)
157 ops->sync_single_for_device(hwdev, dma_handle, size, direction);
158 flush_write_buffers();
159}
160
161static inline void
162dma_sync_single_range_for_cpu(struct device *hwdev, dma_addr_t dma_handle,
163 unsigned long offset, size_t size, int direction)
164{
165 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
166
167 BUG_ON(!valid_dma_direction(direction));
168 if (ops->sync_single_range_for_cpu)
169 ops->sync_single_range_for_cpu(hwdev, dma_handle, offset,
170 size, direction);
171 flush_write_buffers();
172}
173
174static inline void
175dma_sync_single_range_for_device(struct device *hwdev, dma_addr_t dma_handle,
176 unsigned long offset, size_t size,
177 int direction)
178{
179 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
180
181 BUG_ON(!valid_dma_direction(direction));
182 if (ops->sync_single_range_for_device)
183 ops->sync_single_range_for_device(hwdev, dma_handle,
184 offset, size, direction);
185 flush_write_buffers();
186}
187
188static inline void
189dma_sync_sg_for_cpu(struct device *hwdev, struct scatterlist *sg,
190 int nelems, int direction)
191{
192 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
193
194 BUG_ON(!valid_dma_direction(direction));
195 if (ops->sync_sg_for_cpu)
196 ops->sync_sg_for_cpu(hwdev, sg, nelems, direction);
197 flush_write_buffers();
198}
199
200static inline void
201dma_sync_sg_for_device(struct device *hwdev, struct scatterlist *sg,
202 int nelems, int direction)
203{
204 struct dma_mapping_ops *ops = get_dma_ops(hwdev);
205
206 BUG_ON(!valid_dma_direction(direction));
207 if (ops->sync_sg_for_device)
208 ops->sync_sg_for_device(hwdev, sg, nelems, direction);
209
210 flush_write_buffers();
211}
212
213static inline dma_addr_t dma_map_page(struct device *dev, struct page *page,
214 size_t offset, size_t size,
215 int direction)
216{
217 struct dma_mapping_ops *ops = get_dma_ops(dev);
218
219 BUG_ON(!valid_dma_direction(direction));
220 return ops->map_single(dev, page_to_phys(page) + offset,
221 size, direction);
222}
223
224static inline void dma_unmap_page(struct device *dev, dma_addr_t addr,
225 size_t size, int direction)
226{
227 dma_unmap_single(dev, addr, size, direction);
228}
229
230static inline void
231dma_cache_sync(struct device *dev, void *vaddr, size_t size,
232 enum dma_data_direction dir)
233{
234 flush_write_buffers();
235}
236
237static inline int dma_get_cache_alignment(void)
238{
239 /* no easy way to get cache size on all x86, so return the
240 * maximum possible, to be safe */
241 return boot_cpu_data.x86_clflush_size;
242}
243
244static inline unsigned long dma_alloc_coherent_mask(struct device *dev,
245 gfp_t gfp)
246{
247 unsigned long dma_mask = 0;
248
249 dma_mask = dev->coherent_dma_mask;
250 if (!dma_mask)
251 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
252
253 return dma_mask;
254}
255
256static inline gfp_t dma_alloc_coherent_gfp_flags(struct device *dev, gfp_t gfp)
257{
258#ifdef CONFIG_X86_64
259 unsigned long dma_mask = dma_alloc_coherent_mask(dev, gfp);
260
261 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA))
262 gfp |= GFP_DMA32;
263#endif
264 return gfp;
265}
266
267static inline void *
268dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
269 gfp_t gfp)
270{
271 struct dma_mapping_ops *ops = get_dma_ops(dev);
272 void *memory;
273
274 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
275
276 if (dma_alloc_from_coherent(dev, size, dma_handle, &memory))
277 return memory;
278
279 if (!dev) {
280 dev = &x86_dma_fallback_dev;
281 gfp |= GFP_DMA;
282 }
283
284 if (!is_device_dma_capable(dev))
285 return NULL;
286
287 if (!ops->alloc_coherent)
288 return NULL;
289
290 return ops->alloc_coherent(dev, size, dma_handle,
291 dma_alloc_coherent_gfp_flags(dev, gfp));
292}
293
294static inline void dma_free_coherent(struct device *dev, size_t size,
295 void *vaddr, dma_addr_t bus)
296{
297 struct dma_mapping_ops *ops = get_dma_ops(dev);
298
299 WARN_ON(irqs_disabled()); /* for portability */
300
301 if (dma_release_from_coherent(dev, get_order(size), vaddr))
302 return;
303
304 if (ops->free_coherent)
305 ops->free_coherent(dev, size, vaddr, bus);
306}
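/*
 * Coherent-DMA usage sketch for the pair above (illustrative only;
 * "dev" and "ring_bytes" are hypothetical):
 *
 *	dma_addr_t bus;
 *	void *ring;
 *
 *	ring = dma_alloc_coherent(dev, ring_bytes, &bus, GFP_KERNEL);
 *	if (!ring)
 *		return -ENOMEM;
 *	// ... hand "bus" to the device, access "ring" from the CPU ...
 *	dma_free_coherent(dev, ring_bytes, ring, bus);
 */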
307
308#endif
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
new file mode 100644
index 000000000000..ca1098a7e580
--- /dev/null
+++ b/arch/x86/include/asm/dma.h
@@ -0,0 +1,318 @@
1/*
2 * linux/include/asm/dma.h: Defines for using and allocating dma channels.
3 * Written by Hennus Bergman, 1992.
4 * High DMA channel support & info by Hannu Savolainen
5 * and John Boyd, Nov. 1992.
6 */
7
8#ifndef _ASM_X86_DMA_H
9#define _ASM_X86_DMA_H
10
11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14
15#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
16#define dma_outb outb_p
17#else
18#define dma_outb outb
19#endif
20
21#define dma_inb inb
22
23/*
24 * NOTES about DMA transfers:
25 *
26 * controller 1: channels 0-3, byte operations, ports 00-1F
27 * controller 2: channels 4-7, word operations, ports C0-DF
28 *
29 * - ALL registers are 8 bits only, regardless of transfer size
30 * - channel 4 is not used - cascades 1 into 2.
31 * - channels 0-3 are byte - addresses/counts are for physical bytes
32 * - channels 5-7 are word - addresses/counts are for physical words
33 * - transfers must not cross physical 64K (0-3) or 128K (5-7) boundaries
34 * - transfer count loaded to registers is 1 less than actual count
35 * - controller 2 offsets are all even (2x offsets for controller 1)
36 * - page registers for 5-7 don't use data bit 0, represent 128K pages
37 * - page registers for 0-3 use bit 0, represent 64K pages
38 *
39 * DMA transfers are limited to the lower 16MB of _physical_ memory.
40 * Note that addresses loaded into registers must be _physical_ addresses,
41 * not logical addresses (which may differ if paging is active).
42 *
43 * Address mapping for channels 0-3:
44 *
45 * A23 ... A16 A15 ... A8 A7 ... A0 (Physical addresses)
46 * | ... | | ... | | ... |
47 * | ... | | ... | | ... |
48 * | ... | | ... | | ... |
49 * P7 ... P0 A7 ... A0 A7 ... A0
50 * | Page | Addr MSB | Addr LSB | (DMA registers)
51 *
52 * Address mapping for channels 5-7:
53 *
54 * A23 ... A17 A16 A15 ... A9 A8 A7 ... A1 A0 (Physical addresses)
55 * | ... | \ \ ... \ \ \ ... \ \
56 * | ... | \ \ ... \ \ \ ... \ (not used)
57 * | ... | \ \ ... \ \ \ ... \
58 * P7 ... P1 (0) A7 A6 ... A0 A7 A6 ... A0
59 * | Page | Addr MSB | Addr LSB | (DMA registers)
60 *
61 * Again, channels 5-7 transfer _physical_ words (16 bits), so addresses
62 * and counts _must_ be word-aligned (the lowest address bit is _ignored_ at
63 * the hardware level, so odd-byte transfers aren't possible).
64 *
65 * Transfer count (_not # bytes_) is limited to 64K, represented as actual
66 * count - 1 : 64K => 0xFFFF, 1 => 0x0000. Thus, count is always 1 or more,
67 * and up to 128K bytes may be transferred on channels 5-7 in one operation.
68 *
69 */
70
71#define MAX_DMA_CHANNELS 8
72
73#ifdef CONFIG_X86_32
74
75/* The maximum address that we can perform a DMA transfer to on this platform */
76#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
77
78#else
79
80/* 16MB ISA DMA zone */
81#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
82
83/* 4GB broken PCI/AGP hardware bus master zone */
84#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
85
86/* Compat define for old dma zone */
87#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
88
89#endif
90
91/* 8237 DMA controllers */
92#define IO_DMA1_BASE 0x00 /* 8 bit slave DMA, channels 0..3 */
93#define IO_DMA2_BASE 0xC0 /* 16 bit master DMA, ch 4(=slave input)..7 */
94
95/* DMA controller registers */
96#define DMA1_CMD_REG 0x08 /* command register (w) */
97#define DMA1_STAT_REG 0x08 /* status register (r) */
98#define DMA1_REQ_REG 0x09 /* request register (w) */
99#define DMA1_MASK_REG 0x0A /* single-channel mask (w) */
100#define DMA1_MODE_REG 0x0B /* mode register (w) */
101#define DMA1_CLEAR_FF_REG 0x0C /* clear pointer flip-flop (w) */
102#define DMA1_TEMP_REG 0x0D /* Temporary Register (r) */
103#define DMA1_RESET_REG 0x0D /* Master Clear (w) */
104#define DMA1_CLR_MASK_REG 0x0E /* Clear Mask */
105#define DMA1_MASK_ALL_REG 0x0F /* all-channels mask (w) */
106
107#define DMA2_CMD_REG 0xD0 /* command register (w) */
108#define DMA2_STAT_REG 0xD0 /* status register (r) */
109#define DMA2_REQ_REG 0xD2 /* request register (w) */
110#define DMA2_MASK_REG 0xD4 /* single-channel mask (w) */
111#define DMA2_MODE_REG 0xD6 /* mode register (w) */
112#define DMA2_CLEAR_FF_REG 0xD8 /* clear pointer flip-flop (w) */
113#define DMA2_TEMP_REG 0xDA /* Temporary Register (r) */
114#define DMA2_RESET_REG 0xDA /* Master Clear (w) */
115#define DMA2_CLR_MASK_REG 0xDC /* Clear Mask */
116#define DMA2_MASK_ALL_REG 0xDE /* all-channels mask (w) */
117
118#define DMA_ADDR_0 0x00 /* DMA address registers */
119#define DMA_ADDR_1 0x02
120#define DMA_ADDR_2 0x04
121#define DMA_ADDR_3 0x06
122#define DMA_ADDR_4 0xC0
123#define DMA_ADDR_5 0xC4
124#define DMA_ADDR_6 0xC8
125#define DMA_ADDR_7 0xCC
126
127#define DMA_CNT_0 0x01 /* DMA count registers */
128#define DMA_CNT_1 0x03
129#define DMA_CNT_2 0x05
130#define DMA_CNT_3 0x07
131#define DMA_CNT_4 0xC2
132#define DMA_CNT_5 0xC6
133#define DMA_CNT_6 0xCA
134#define DMA_CNT_7 0xCE
135
136#define DMA_PAGE_0 0x87 /* DMA page registers */
137#define DMA_PAGE_1 0x83
138#define DMA_PAGE_2 0x81
139#define DMA_PAGE_3 0x82
140#define DMA_PAGE_5 0x8B
141#define DMA_PAGE_6 0x89
142#define DMA_PAGE_7 0x8A
143
144/* I/O to memory, no autoinit, increment, single mode */
145#define DMA_MODE_READ 0x44
146/* memory to I/O, no autoinit, increment, single mode */
147#define DMA_MODE_WRITE 0x48
148/* pass thru DREQ->HRQ, DACK<-HLDA only */
149#define DMA_MODE_CASCADE 0xC0
150
151#define DMA_AUTOINIT 0x10
152
153
154extern spinlock_t dma_spin_lock;
155
156static inline unsigned long claim_dma_lock(void)
157{
158 unsigned long flags;
159 spin_lock_irqsave(&dma_spin_lock, flags);
160 return flags;
161}
162
163static inline void release_dma_lock(unsigned long flags)
164{
165 spin_unlock_irqrestore(&dma_spin_lock, flags);
166}
167
168/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr)
170{
171 if (dmanr <= 3)
172 dma_outb(dmanr, DMA1_MASK_REG);
173 else
174 dma_outb(dmanr & 3, DMA2_MASK_REG);
175}
176
177static inline void disable_dma(unsigned int dmanr)
178{
179 if (dmanr <= 3)
180 dma_outb(dmanr | 4, DMA1_MASK_REG);
181 else
182 dma_outb((dmanr & 3) | 4, DMA2_MASK_REG);
183}
184
185/* Clear the 'DMA Pointer Flip Flop'.
186 * Write 0 for LSB/MSB, 1 for MSB/LSB access.
187 * Use this once to initialize the FF to a known state.
188 * After that, keep track of it. :-)
189 * --- In order to do that, the DMA routines below should ---
190 * --- only be used while holding the DMA lock ! ---
191 */
192static inline void clear_dma_ff(unsigned int dmanr)
193{
194 if (dmanr <= 3)
195 dma_outb(0, DMA1_CLEAR_FF_REG);
196 else
197 dma_outb(0, DMA2_CLEAR_FF_REG);
198}
199
200/* set mode (above) for a specific DMA channel */
201static inline void set_dma_mode(unsigned int dmanr, char mode)
202{
203 if (dmanr <= 3)
204 dma_outb(mode | dmanr, DMA1_MODE_REG);
205 else
206 dma_outb(mode | (dmanr & 3), DMA2_MODE_REG);
207}
208
209/* Set only the page register bits of the transfer address.
210 * This is used for successive transfers when we know the contents of
211 * the lower 16 bits of the DMA current address register, but a 64k boundary
212 * may have been crossed.
213 */
214static inline void set_dma_page(unsigned int dmanr, char pagenr)
215{
216 switch (dmanr) {
217 case 0:
218 dma_outb(pagenr, DMA_PAGE_0);
219 break;
220 case 1:
221 dma_outb(pagenr, DMA_PAGE_1);
222 break;
223 case 2:
224 dma_outb(pagenr, DMA_PAGE_2);
225 break;
226 case 3:
227 dma_outb(pagenr, DMA_PAGE_3);
228 break;
229 case 5:
230 dma_outb(pagenr & 0xfe, DMA_PAGE_5);
231 break;
232 case 6:
233 dma_outb(pagenr & 0xfe, DMA_PAGE_6);
234 break;
235 case 7:
236 dma_outb(pagenr & 0xfe, DMA_PAGE_7);
237 break;
238 }
239}
240
241
242/* Set transfer address & page bits for specific DMA channel.
243 * Assumes dma flipflop is clear.
244 */
245static inline void set_dma_addr(unsigned int dmanr, unsigned int a)
246{
247 set_dma_page(dmanr, a>>16);
248 if (dmanr <= 3) {
249 dma_outb(a & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
250 dma_outb((a >> 8) & 0xff, ((dmanr & 3) << 1) + IO_DMA1_BASE);
251 } else {
252 dma_outb((a >> 1) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
253 dma_outb((a >> 9) & 0xff, ((dmanr & 3) << 2) + IO_DMA2_BASE);
254 }
255}
256
257
258/* Set transfer size (max 64k for DMA0..3, 128k for DMA5..7) for
259 * a specific DMA channel.
260 * You must ensure the parameters are valid.
261 * NOTE: from a manual: "the number of transfers is one more
262 * than the initial word count"! This is taken into account.
263 * Assumes dma flip-flop is clear.
264 * NOTE 2: "count" represents _bytes_ and must be even for channels 5-7.
265 */
266static inline void set_dma_count(unsigned int dmanr, unsigned int count)
267{
268 count--;
269 if (dmanr <= 3) {
270 dma_outb(count & 0xff, ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
271 dma_outb((count >> 8) & 0xff,
272 ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE);
273 } else {
274 dma_outb((count >> 1) & 0xff,
275 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
276 dma_outb((count >> 9) & 0xff,
277 ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE);
278 }
279}
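/*
 * Sketch of the canonical 8237 programming sequence built from the
 * helpers above (illustrative only; "chan", "phys" and "len" are
 * hypothetical, must respect the boundary/alignment rules described at
 * the top of this file, and the channel must already have been claimed
 * with request_dma()):
 *
 *	unsigned long flags = claim_dma_lock();
 *
 *	disable_dma(chan);
 *	clear_dma_ff(chan);
 *	set_dma_mode(chan, DMA_MODE_WRITE);	// memory -> device
 *	set_dma_addr(chan, phys);
 *	set_dma_count(chan, len);
 *	enable_dma(chan);
 *	release_dma_lock(flags);
 */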
280
281
282/* Get DMA residue count. After a DMA transfer, this
283 * should return zero. Reading this while a DMA transfer is
284 * still in progress will return unpredictable results.
285 * If called before the channel has been used, it may return 1.
286 * Otherwise, it returns the number of _bytes_ left to transfer.
287 *
288 * Assumes DMA flip-flop is clear.
289 */
290static inline int get_dma_residue(unsigned int dmanr)
291{
292 unsigned int io_port;
293 /* using short to get 16-bit wrap around */
294 unsigned short count;
295
296 io_port = (dmanr <= 3) ? ((dmanr & 3) << 1) + 1 + IO_DMA1_BASE
297 : ((dmanr & 3) << 2) + 2 + IO_DMA2_BASE;
298
299 count = 1 + dma_inb(io_port);
300 count += dma_inb(io_port) << 8;
301
302 return (dmanr <= 3) ? count : (count << 1);
303}
304
305
306/* These are in kernel/dma.c: */
307extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr);
309
310/* From PCI */
311
312#ifdef CONFIG_PCI
313extern int isa_dma_bridge_buggy;
314#else
315#define isa_dma_bridge_buggy (0)
316#endif
317
318#endif /* _ASM_X86_DMA_H */
diff --git a/arch/x86/include/asm/dmi.h b/arch/x86/include/asm/dmi.h
new file mode 100644
index 000000000000..bc68212c6bc0
--- /dev/null
+++ b/arch/x86/include/asm/dmi.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_DMI_H
2#define _ASM_X86_DMI_H
3
4#include <asm/io.h>
5
6#define DMI_MAX_DATA 2048
7
8extern int dmi_alloc_index;
9extern char dmi_alloc_data[DMI_MAX_DATA];
10
11/* This is so early that there is no good way to allocate dynamic memory.
12 Allocate data in a BSS array. */
13static inline void *dmi_alloc(unsigned len)
14{
15 int idx = dmi_alloc_index;
16 if ((dmi_alloc_index + len) > DMI_MAX_DATA)
17 return NULL;
18 dmi_alloc_index += len;
19 return dmi_alloc_data + idx;
20}
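/*
 * Usage sketch (illustrative; "s" is a hypothetical NUL-terminated
 * source string): early DMI code copies table strings into the static
 * buffer roughly like this:
 *
 *	char *p = dmi_alloc(strlen(s) + 1);
 *
 *	if (p)
 *		strcpy(p, s);
 */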
21
22/* Use early IO mappings for DMI because it's initialized early */
23#define dmi_ioremap early_ioremap
24#define dmi_iounmap early_iounmap
25
26#endif /* _ASM_X86_DMI_H */
diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h
new file mode 100644
index 000000000000..72c5a190bf48
--- /dev/null
+++ b/arch/x86/include/asm/ds.h
@@ -0,0 +1,238 @@
1/*
2 * Debug Store (DS) support
3 *
4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS).
7 *
8 * It manages:
9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
17 *
18 *
19 * Copyright (C) 2007-2008 Intel Corporation.
20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
21 */
22
23#ifndef _ASM_X86_DS_H
24#define _ASM_X86_DS_H
25
26#ifdef CONFIG_X86_DS
27
28#include <linux/types.h>
29#include <linux/init.h>
30
31
32struct task_struct;
33
34/*
35 * Request BTS or PEBS
36 *
37 * Due to alignment constraints, the actual buffer may be slightly
38 * smaller than the requested or provided buffer.
39 *
40 * Returns 0 on success; -Eerrno otherwise
41 *
42 * task: the task to request recording for;
43 * NULL for per-cpu recording on the current cpu
44 * base: the base pointer for the (non-pageable) buffer;
45 * NULL if buffer allocation requested
46 * size: the size of the requested or provided buffer
47 * ovfl: pointer to a function to be called on buffer overflow;
48 * NULL if cyclic buffer requested
49 */
50typedef void (*ds_ovfl_callback_t)(struct task_struct *);
51extern int ds_request_bts(struct task_struct *task, void *base, size_t size,
52 ds_ovfl_callback_t ovfl);
53extern int ds_request_pebs(struct task_struct *task, void *base, size_t size,
54 ds_ovfl_callback_t ovfl);
55
56/*
57 * Release BTS or PEBS resources
58 *
59 * Frees buffers allocated on ds_request.
60 *
61 * Returns 0 on success; -Eerrno otherwise
62 *
63 * task: the task to release resources for;
64 * NULL to release resources for the current cpu
65 */
66extern int ds_release_bts(struct task_struct *task);
67extern int ds_release_pebs(struct task_struct *task);
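/*
 * Usage sketch for the request/release pair above (illustrative only;
 * "child" is a hypothetical task pinned with get_task_struct(),
 * "on_overflow" a caller-defined ds_ovfl_callback_t, and BTS_SIZE a
 * caller-chosen buffer size):
 *
 *	int err = ds_request_bts(child, NULL, BTS_SIZE, on_overflow);
 *
 *	if (err)
 *		return err;
 *	// ... tracing runs; records accumulate in the allocated buffer ...
 *	ds_release_bts(child);
 */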
68
69/*
70 * Return the (array) index of the write pointer.
71 * (assuming an array of BTS/PEBS records)
72 *
73 * Returns -Eerrno on error
74 *
75 * task: the task to access;
76 * NULL to access the current cpu
77 * pos (out): if not NULL, will hold the result
78 */
79extern int ds_get_bts_index(struct task_struct *task, size_t *pos);
80extern int ds_get_pebs_index(struct task_struct *task, size_t *pos);
81
82/*
83 * Return the (array) index one record beyond the end of the array.
84 * (assuming an array of BTS/PEBS records)
85 *
86 * Returns -Eerrno on error
87 *
88 * task: the task to access;
89 * NULL to access the current cpu
90 * pos (out): if not NULL, will hold the result
91 */
92extern int ds_get_bts_end(struct task_struct *task, size_t *pos);
93extern int ds_get_pebs_end(struct task_struct *task, size_t *pos);
94
95/*
96 * Provide a pointer to the BTS/PEBS record at parameter index.
97 * (assuming an array of BTS/PEBS records)
98 *
99 * The pointer points directly into the buffer. The user is
100 * responsible for copying the record.
101 *
102 * Returns the size of a single record on success; -Eerrno on error
103 *
104 * task: the task to access;
105 * NULL to access the current cpu
106 * index: the index of the requested record
107 * record (out): pointer to the requested record
108 */
109extern int ds_access_bts(struct task_struct *task,
110 size_t index, const void **record);
111extern int ds_access_pebs(struct task_struct *task,
112 size_t index, const void **record);
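/*
 * Sketch of draining the buffer with the accessors above (illustrative
 * only; error handling abbreviated, "task" may be NULL for the current
 * cpu):
 *
 *	size_t i, end;
 *	const void *rec;
 *
 *	if (ds_get_bts_end(task, &end) < 0)
 *		return;
 *	for (i = 0; i < end; i++) {
 *		if (ds_access_bts(task, i, &rec) < 0)
 *			break;
 *		// ... copy or inspect the record at "rec" ...
 *	}
 */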
113
114/*
115 * Write one or more BTS/PEBS records at the write pointer index and
116 * advance the write pointer.
117 *
118 * If size is not a multiple of the record size, trailing bytes are
119 * zeroed out.
120 *
121 * May result in one or more overflow notifications.
122 *
123 * If called during overflow handling, that is, with index >=
124 * interrupt threshold, the write will wrap around.
125 *
126 * An overflow notification is given if and when the interrupt
127 * threshold is reached during or after the write.
128 *
129 * Returns the number of bytes written or -Eerrno.
130 *
131 * task: the task to access;
132 * NULL to access the current cpu
133 * buffer: the buffer to write
134 * size: the size of the buffer
135 */
136extern int ds_write_bts(struct task_struct *task,
137 const void *buffer, size_t size);
138extern int ds_write_pebs(struct task_struct *task,
139 const void *buffer, size_t size);
140
141/*
142 * Same as ds_write_bts/pebs, but omit ownership checks.
143 *
144 * This is needed to have some other task than the owner of the
145 * BTS/PEBS buffer or the parameter task itself write into the
146 * respective buffer.
147 */
148extern int ds_unchecked_write_bts(struct task_struct *task,
149 const void *buffer, size_t size);
150extern int ds_unchecked_write_pebs(struct task_struct *task,
151 const void *buffer, size_t size);
152
153/*
154 * Reset the write pointer of the BTS/PEBS buffer.
155 *
156 * Returns 0 on success; -Eerrno on error
157 *
158 * task: the task to access;
159 * NULL to access the current cpu
160 */
161extern int ds_reset_bts(struct task_struct *task);
162extern int ds_reset_pebs(struct task_struct *task);
163
164/*
165 * Clear the BTS/PEBS buffer and reset the write pointer.
166 * The entire buffer will be zeroed out.
167 *
168 * Returns 0 on success; -Eerrno on error
169 *
170 * task: the task to access;
171 * NULL to access the current cpu
172 */
173extern int ds_clear_bts(struct task_struct *task);
174extern int ds_clear_pebs(struct task_struct *task);
175
176/*
177 * Provide the PEBS counter reset value.
178 *
179 * Returns 0 on success; -Eerrno on error
180 *
181 * task: the task to access;
182 * NULL to access the current cpu
183 * value (out): the counter reset value
184 */
185extern int ds_get_pebs_reset(struct task_struct *task, u64 *value);
186
187/*
188 * Set the PEBS counter reset value.
189 *
190 * Returns 0 on success; -Eerrno on error
191 *
192 * task: the task to access;
193 * NULL to access the current cpu
194 * value: the new counter reset value
195 */
196extern int ds_set_pebs_reset(struct task_struct *task, u64 value);
197
198/*
199 * Initialization
200 */
201struct cpuinfo_x86;
202extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
203
204
205
206/*
207 * The DS context - part of struct thread_struct.
208 */
209struct ds_context {
210 /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
211 unsigned char *ds;
212 /* the owner of the BTS and PEBS configuration, respectively */
213 struct task_struct *owner[2];
214 /* buffer overflow notification function for BTS and PEBS */
215 ds_ovfl_callback_t callback[2];
216 /* the original buffer address */
217 void *buffer[2];
218 /* the number of allocated pages for on-request allocated buffers */
219 unsigned int pages[2];
220 /* use count */
221 unsigned long count;
222 /* a pointer to the context location inside the thread_struct
223 * or the per_cpu context array */
224 struct ds_context **this;
225 /* a pointer to the task owning this context, or NULL, if the
226 * context is owned by a cpu */
227 struct task_struct *task;
228};
229
230/* called by exit_thread() to free leftover contexts */
231extern void ds_free(struct ds_context *context);
232
233#else /* CONFIG_X86_DS */
234
235#define ds_init_intel(config) do {} while (0)
236
237#endif /* CONFIG_X86_DS */
238#endif /* _ASM_X86_DS_H */
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
new file mode 100644
index 000000000000..804b6e6be929
--- /dev/null
+++ b/arch/x86/include/asm/dwarf2.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_DWARF2_H
2#define _ASM_X86_DWARF2_H
3
4#ifndef __ASSEMBLY__
5#warning "asm/dwarf2.h should be only included in pure assembly files"
6#endif
7
8/*
9 Macros for dwarf2 CFI unwind table entries.
10 See "as.info" for details on these pseudo ops. Unfortunately
11 they are only supported in very new binutils, so define them
12 away for older versions.
13 */
14
15#ifdef CONFIG_AS_CFI
16
17#define CFI_STARTPROC .cfi_startproc
18#define CFI_ENDPROC .cfi_endproc
19#define CFI_DEF_CFA .cfi_def_cfa
20#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register
21#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset
22#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset
23#define CFI_OFFSET .cfi_offset
24#define CFI_REL_OFFSET .cfi_rel_offset
25#define CFI_REGISTER .cfi_register
26#define CFI_RESTORE .cfi_restore
27#define CFI_REMEMBER_STATE .cfi_remember_state
28#define CFI_RESTORE_STATE .cfi_restore_state
29#define CFI_UNDEFINED .cfi_undefined
30
31#ifdef CONFIG_AS_CFI_SIGNAL_FRAME
32#define CFI_SIGNAL_FRAME .cfi_signal_frame
33#else
34#define CFI_SIGNAL_FRAME
35#endif
36
37#else
38
39/* Due to the structure of pre-existing code, don't use the assembler line
40 comment character # to ignore the arguments. Instead, use a dummy macro. */
41.macro cfi_ignore a=0, b=0, c=0, d=0
42.endm
43
44#define CFI_STARTPROC cfi_ignore
45#define CFI_ENDPROC cfi_ignore
46#define CFI_DEF_CFA cfi_ignore
47#define CFI_DEF_CFA_REGISTER cfi_ignore
48#define CFI_DEF_CFA_OFFSET cfi_ignore
49#define CFI_ADJUST_CFA_OFFSET cfi_ignore
50#define CFI_OFFSET cfi_ignore
51#define CFI_REL_OFFSET cfi_ignore
52#define CFI_REGISTER cfi_ignore
53#define CFI_RESTORE cfi_ignore
54#define CFI_REMEMBER_STATE cfi_ignore
55#define CFI_RESTORE_STATE cfi_ignore
56#define CFI_UNDEFINED cfi_ignore
57#define CFI_SIGNAL_FRAME cfi_ignore
58
59#endif
60
61#endif /* _ASM_X86_DWARF2_H */
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
new file mode 100644
index 000000000000..3d8ceddbd407
--- /dev/null
+++ b/arch/x86/include/asm/e820.h
@@ -0,0 +1,146 @@
1#ifndef _ASM_X86_E820_H
2#define _ASM_X86_E820_H
3#define E820MAP 0x2d0 /* our map */
4#define E820MAX 128 /* number of entries in E820MAP */
5
6/*
7 * Legacy E820 BIOS limits us to 128 (E820MAX) nodes due to the
8 * constrained space in the zeropage. If we have more nodes than
9 * that, and if we've booted off EFI firmware, then the EFI tables
10 * passed us from the EFI firmware can list more nodes. Size our
11 * internal memory map tables to have room for these additional
12 * nodes, based on up to three entries per node for which the
13 * kernel was built: MAX_NUMNODES == (1 << CONFIG_NODES_SHIFT),
14 * plus E820MAX, allowing space for the possible duplicate E820
15 * entries that might need room in the same arrays, prior to the
16 * call to sanitize_e820_map() to remove duplicates. The allowance
17 * of three memory map entries per node is "enough" entries for
18 * the initial hardware platform motivating this mechanism to make
19 * use of additional EFI map entries. Future platforms may want
20 * to allow more than three entries per node or otherwise refine
21 * this size.
22 */
23
24/*
25 * Odd: 'make headers_check' complains about numa.h if I try
26 * to collapse the next two #ifdef lines to a single line:
27 * #if defined(__KERNEL__) && defined(CONFIG_EFI)
28 */
29#ifdef __KERNEL__
30#ifdef CONFIG_EFI
31#include <linux/numa.h>
32#define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES)
33#else /* ! CONFIG_EFI */
34#define E820_X_MAX E820MAX
35#endif
36#else /* ! __KERNEL__ */
37#define E820_X_MAX E820MAX
38#endif
39
40#define E820NR 0x1e8 /* # entries in E820MAP */
41
42#define E820_RAM 1
43#define E820_RESERVED 2
44#define E820_ACPI 3
45#define E820_NVS 4
46#define E820_UNUSABLE 5
47
48/* reserved RAM used by kernel itself */
49#define E820_RESERVED_KERN 128
50
51#ifndef __ASSEMBLY__
52struct e820entry {
53 __u64 addr; /* start of memory segment */
54 __u64 size; /* size of memory segment */
55 __u32 type; /* type of memory segment */
56} __attribute__((packed));
57
58struct e820map {
59 __u32 nr_map;
60 struct e820entry map[E820_X_MAX];
61};
62
63#ifdef __KERNEL__
64/* see comment in arch/x86/kernel/e820.c */
65extern struct e820map e820;
66extern struct e820map e820_saved;
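/*
 * Sketch of how consumers walk this map (the real users live in
 * arch/x86/kernel/e820.c; illustrative only):
 *
 *	int i;
 *
 *	for (i = 0; i < e820.nr_map; i++) {
 *		struct e820entry *ei = &e820.map[i];
 *
 *		if (ei->type == E820_RAM)
 *			; // ... account for [ei->addr, ei->addr + ei->size) ...
 *	}
 */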
67
68extern unsigned long pci_mem_start;
69extern int e820_any_mapped(u64 start, u64 end, unsigned type);
70extern int e820_all_mapped(u64 start, u64 end, unsigned type);
71extern void e820_add_region(u64 start, u64 size, int type);
72extern void e820_print_map(char *who);
73extern int
74sanitize_e820_map(struct e820entry *biosmap, int max_nr_map, int *pnr_map);
75extern u64 e820_update_range(u64 start, u64 size, unsigned old_type,
76 unsigned new_type);
77extern u64 e820_remove_range(u64 start, u64 size, unsigned old_type,
78 int checktype);
79extern void update_e820(void);
80extern void e820_setup_gap(void);
81extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
82 unsigned long start_addr, unsigned long long end_addr);
83struct setup_data;
84extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data);
85
86#if defined(CONFIG_X86_64) || \
87 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
88extern void e820_mark_nosave_regions(unsigned long limit_pfn);
89#else
90static inline void e820_mark_nosave_regions(unsigned long limit_pfn)
91{
92}
93#endif
94
95#ifdef CONFIG_MEMTEST
96extern void early_memtest(unsigned long start, unsigned long end);
97#else
98static inline void early_memtest(unsigned long start, unsigned long end)
99{
100}
101#endif
102
103extern unsigned long end_user_pfn;
104
105extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
106extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
107extern void reserve_early(u64 start, u64 end, char *name);
108extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
109extern void free_early(u64 start, u64 end);
110extern void early_res_to_bootmem(u64 start, u64 end);
111extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
112
113extern unsigned long e820_end_of_ram_pfn(void);
114extern unsigned long e820_end_of_low_ram_pfn(void);
115extern int e820_find_active_region(const struct e820entry *ei,
116 unsigned long start_pfn,
117 unsigned long last_pfn,
118 unsigned long *ei_startpfn,
119 unsigned long *ei_endpfn);
120extern void e820_register_active_regions(int nid, unsigned long start_pfn,
121 unsigned long end_pfn);
122extern u64 e820_hole_size(u64 start, u64 end);
123extern void finish_e820_parsing(void);
124extern void e820_reserve_resources(void);
125extern void e820_reserve_resources_late(void);
126extern void setup_memory_map(void);
127extern char *default_machine_specific_memory_setup(void);
128extern char *machine_specific_memory_setup(void);
129extern char *memory_setup(void);
130#endif /* __KERNEL__ */
131#endif /* __ASSEMBLY__ */
132
133#define ISA_START_ADDRESS 0xa0000
134#define ISA_END_ADDRESS 0x100000
135#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS)
136
137#define BIOS_BEGIN 0x000a0000
138#define BIOS_END 0x00100000
139
140#ifdef __KERNEL__
141#include <linux/ioport.h>
142
143#define HIGH_MEMORY (1024*1024)
144#endif /* __KERNEL__ */
145
146#endif /* _ASM_X86_E820_H */
diff --git a/arch/x86/include/asm/edac.h b/arch/x86/include/asm/edac.h
new file mode 100644
index 000000000000..e9b57ecc70c5
--- /dev/null
+++ b/arch/x86/include/asm/edac.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_EDAC_H
2#define _ASM_X86_EDAC_H
3
4/* ECC atomic, DMA, SMP and interrupt safe scrub function */
5
6static inline void atomic_scrub(void *va, u32 size)
7{
8 u32 i, *virt_addr = va;
9
10 /*
11 * Very carefully read and write to memory atomically so we
12 * are interrupt, DMA and SMP safe.
13 */
14 for (i = 0; i < size / 4; i++, virt_addr++)
15 asm volatile("lock; addl $0, %0"::"m" (*virt_addr));
16}
17
18#endif /* _ASM_X86_EDAC_H */
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
new file mode 100644
index 000000000000..a2e545c91c35
--- /dev/null
+++ b/arch/x86/include/asm/efi.h
@@ -0,0 +1,110 @@
1#ifndef _ASM_X86_EFI_H
2#define _ASM_X86_EFI_H
3
4#ifdef CONFIG_X86_32
5
6extern unsigned long asmlinkage efi_call_phys(void *, ...);
7
8#define efi_call_phys0(f) efi_call_phys(f)
9#define efi_call_phys1(f, a1) efi_call_phys(f, a1)
10#define efi_call_phys2(f, a1, a2) efi_call_phys(f, a1, a2)
11#define efi_call_phys3(f, a1, a2, a3) efi_call_phys(f, a1, a2, a3)
12#define efi_call_phys4(f, a1, a2, a3, a4) \
13 efi_call_phys(f, a1, a2, a3, a4)
14#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
15 efi_call_phys(f, a1, a2, a3, a4, a5)
16#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
17 efi_call_phys(f, a1, a2, a3, a4, a5, a6)
18/*
19 * Wrap all the virtual calls in a way that forces the parameters on the stack.
20 */
21
22#define efi_call_virt(f, args...) \
23 ((efi_##f##_t __attribute__((regparm(0)))*)efi.systab->runtime->f)(args)
24
25#define efi_call_virt0(f) efi_call_virt(f)
26#define efi_call_virt1(f, a1) efi_call_virt(f, a1)
27#define efi_call_virt2(f, a1, a2) efi_call_virt(f, a1, a2)
28#define efi_call_virt3(f, a1, a2, a3) efi_call_virt(f, a1, a2, a3)
29#define efi_call_virt4(f, a1, a2, a3, a4) \
30 efi_call_virt(f, a1, a2, a3, a4)
31#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
32 efi_call_virt(f, a1, a2, a3, a4, a5)
33#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
34 efi_call_virt(f, a1, a2, a3, a4, a5, a6)
35
36#define efi_ioremap(addr, size) ioremap_cache(addr, size)
37
38#else /* !CONFIG_X86_32 */
39
40#define MAX_EFI_IO_PAGES 100
41
42extern u64 efi_call0(void *fp);
43extern u64 efi_call1(void *fp, u64 arg1);
44extern u64 efi_call2(void *fp, u64 arg1, u64 arg2);
45extern u64 efi_call3(void *fp, u64 arg1, u64 arg2, u64 arg3);
46extern u64 efi_call4(void *fp, u64 arg1, u64 arg2, u64 arg3, u64 arg4);
47extern u64 efi_call5(void *fp, u64 arg1, u64 arg2, u64 arg3,
48 u64 arg4, u64 arg5);
49extern u64 efi_call6(void *fp, u64 arg1, u64 arg2, u64 arg3,
50 u64 arg4, u64 arg5, u64 arg6);
51
52#define efi_call_phys0(f) \
53 efi_call0((void *)(f))
54#define efi_call_phys1(f, a1) \
55 efi_call1((void *)(f), (u64)(a1))
56#define efi_call_phys2(f, a1, a2) \
57 efi_call2((void *)(f), (u64)(a1), (u64)(a2))
58#define efi_call_phys3(f, a1, a2, a3) \
59 efi_call3((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3))
60#define efi_call_phys4(f, a1, a2, a3, a4) \
61 efi_call4((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
62 (u64)(a4))
63#define efi_call_phys5(f, a1, a2, a3, a4, a5) \
64 efi_call5((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
65 (u64)(a4), (u64)(a5))
66#define efi_call_phys6(f, a1, a2, a3, a4, a5, a6) \
67 efi_call6((void *)(f), (u64)(a1), (u64)(a2), (u64)(a3), \
68 (u64)(a4), (u64)(a5), (u64)(a6))
69
70#define efi_call_virt0(f) \
71 efi_call0((void *)(efi.systab->runtime->f))
72#define efi_call_virt1(f, a1) \
73 efi_call1((void *)(efi.systab->runtime->f), (u64)(a1))
74#define efi_call_virt2(f, a1, a2) \
75 efi_call2((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2))
76#define efi_call_virt3(f, a1, a2, a3) \
77 efi_call3((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
78 (u64)(a3))
79#define efi_call_virt4(f, a1, a2, a3, a4) \
80 efi_call4((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
81 (u64)(a3), (u64)(a4))
82#define efi_call_virt5(f, a1, a2, a3, a4, a5) \
83 efi_call5((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
84 (u64)(a3), (u64)(a4), (u64)(a5))
85#define efi_call_virt6(f, a1, a2, a3, a4, a5, a6) \
86 efi_call6((void *)(efi.systab->runtime->f), (u64)(a1), (u64)(a2), \
87 (u64)(a3), (u64)(a4), (u64)(a5), (u64)(a6))
88
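/*
 * Usage sketch: runtime services are invoked through the wrappers
 * above, e.g. the get_time service (efi_status_t, efi_time_t and
 * efi_time_cap_t come from <linux/efi.h>; illustrative only):
 *
 *	static efi_status_t virt_efi_get_time(efi_time_t *tm,
 *					      efi_time_cap_t *tc)
 *	{
 *		return efi_call_virt2(get_time, tm, tc);
 *	}
 */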
89extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size);
90
91#endif /* CONFIG_X86_32 */
92
93extern void efi_reserve_early(void);
94extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void);
96
97#ifndef CONFIG_EFI
98/*
99 * If EFI is not configured, have the EFI calls return -ENOSYS.
100 */
101#define efi_call0(_f) (-ENOSYS)
102#define efi_call1(_f, _a1) (-ENOSYS)
103#define efi_call2(_f, _a1, _a2) (-ENOSYS)
104#define efi_call3(_f, _a1, _a2, _a3) (-ENOSYS)
105#define efi_call4(_f, _a1, _a2, _a3, _a4) (-ENOSYS)
106#define efi_call5(_f, _a1, _a2, _a3, _a4, _a5) (-ENOSYS)
107#define efi_call6(_f, _a1, _a2, _a3, _a4, _a5, _a6) (-ENOSYS)
108#endif /* CONFIG_EFI */
109
110#endif /* _ASM_X86_EFI_H */
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
new file mode 100644
index 000000000000..40ca1bea7916
--- /dev/null
+++ b/arch/x86/include/asm/elf.h
@@ -0,0 +1,336 @@
1#ifndef _ASM_X86_ELF_H
2#define _ASM_X86_ELF_H
3
4/*
5 * ELF register definitions..
6 */
7
8#include <asm/ptrace.h>
9#include <asm/user.h>
10#include <asm/auxvec.h>
11
12typedef unsigned long elf_greg_t;
13
14#define ELF_NGREG (sizeof(struct user_regs_struct) / sizeof(elf_greg_t))
15typedef elf_greg_t elf_gregset_t[ELF_NGREG];
16
17typedef struct user_i387_struct elf_fpregset_t;
18
19#ifdef __i386__
20
21typedef struct user_fxsr_struct elf_fpxregset_t;
22
23#define R_386_NONE 0
24#define R_386_32 1
25#define R_386_PC32 2
26#define R_386_GOT32 3
27#define R_386_PLT32 4
28#define R_386_COPY 5
29#define R_386_GLOB_DAT 6
30#define R_386_JMP_SLOT 7
31#define R_386_RELATIVE 8
32#define R_386_GOTOFF 9
33#define R_386_GOTPC 10
34#define R_386_NUM 11
35
36/*
37 * These are used to set parameters in the core dumps.
38 */
39#define ELF_CLASS ELFCLASS32
40#define ELF_DATA ELFDATA2LSB
41#define ELF_ARCH EM_386
42
43#else
44
45/* x86-64 relocation types */
46#define R_X86_64_NONE 0 /* No reloc */
47#define R_X86_64_64 1 /* Direct 64 bit */
48#define R_X86_64_PC32 2 /* PC relative 32 bit signed */
49#define R_X86_64_GOT32 3 /* 32 bit GOT entry */
50#define R_X86_64_PLT32 4 /* 32 bit PLT address */
51#define R_X86_64_COPY 5 /* Copy symbol at runtime */
52#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */
53#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */
54#define R_X86_64_RELATIVE 8 /* Adjust by program base */
55#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative
56 offset to GOT */
57#define R_X86_64_32 10 /* Direct 32 bit zero extended */
58#define R_X86_64_32S 11 /* Direct 32 bit sign extended */
59#define R_X86_64_16 12 /* Direct 16 bit zero extended */
60#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */
61#define R_X86_64_8 14 /* Direct 8 bit sign extended */
62#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */
63
64#define R_X86_64_NUM 16
65
66/*
67 * These are used to set parameters in the core dumps.
68 */
69#define ELF_CLASS ELFCLASS64
70#define ELF_DATA ELFDATA2LSB
71#define ELF_ARCH EM_X86_64
72
73#endif
74
75#include <asm/vdso.h>
76
77extern unsigned int vdso_enabled;
78
79/*
80 * This is used to ensure we don't load something for the wrong architecture.
81 */
82#define elf_check_arch_ia32(x) \
83 (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486))
84
85#include <asm/processor.h>
86#include <asm/system.h>
87
88#ifdef CONFIG_X86_32
89#include <asm/desc.h>
90
91#define elf_check_arch(x) elf_check_arch_ia32(x)
92
93/* SVR4/i386 ABI (pages 3-31, 3-32) says that when the program starts %edx
94 contains a pointer to a function which might be registered using `atexit'.
95 This provides a means for the dynamic linker to call DT_FINI functions for
96 shared libraries that have been loaded before the code runs.
97
98 A value of 0 tells us we have no such handler.
99
100 We might as well make sure everything else is cleared too (except for %esp),
101 just to make things more deterministic.
102 */
103#define ELF_PLAT_INIT(_r, load_addr) \
104 do { \
105 _r->bx = 0; _r->cx = 0; _r->dx = 0; \
106 _r->si = 0; _r->di = 0; _r->bp = 0; \
107 _r->ax = 0; \
108} while (0)
109
110/*
111 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
112 * now struct_user_regs, they are different)
113 */
114
115#define ELF_CORE_COPY_REGS(pr_reg, regs) \
116do { \
117 pr_reg[0] = regs->bx; \
118 pr_reg[1] = regs->cx; \
119 pr_reg[2] = regs->dx; \
120 pr_reg[3] = regs->si; \
121 pr_reg[4] = regs->di; \
122 pr_reg[5] = regs->bp; \
123 pr_reg[6] = regs->ax; \
124 pr_reg[7] = regs->ds & 0xffff; \
125 pr_reg[8] = regs->es & 0xffff; \
126 pr_reg[9] = regs->fs & 0xffff; \
127 savesegment(gs, pr_reg[10]); \
128 pr_reg[11] = regs->orig_ax; \
129 pr_reg[12] = regs->ip; \
130 pr_reg[13] = regs->cs & 0xffff; \
131 pr_reg[14] = regs->flags; \
132 pr_reg[15] = regs->sp; \
133 pr_reg[16] = regs->ss & 0xffff; \
134} while (0);
135
136#define ELF_PLATFORM (utsname()->machine)
137#define set_personality_64bit() do { } while (0)
138
139#else /* CONFIG_X86_32 */
140
141/*
142 * This is used to ensure we don't load something for the wrong architecture.
143 */
144#define elf_check_arch(x) \
145 ((x)->e_machine == EM_X86_64)
146
147#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
148
149static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
150{
151 loadsegment(fs, 0);
152 loadsegment(ds, __USER32_DS);
153 loadsegment(es, __USER32_DS);
154 load_gs_index(0);
155 regs->ip = ip;
156 regs->sp = sp;
157 regs->flags = X86_EFLAGS_IF;
158 regs->cs = __USER32_CS;
159 regs->ss = __USER32_DS;
160}
161
162static inline void elf_common_init(struct thread_struct *t,
163 struct pt_regs *regs, const u16 ds)
164{
165 regs->ax = regs->bx = regs->cx = regs->dx = 0;
166 regs->si = regs->di = regs->bp = 0;
167 regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0;
168 regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;
169 t->fs = t->gs = 0;
170 t->fsindex = t->gsindex = 0;
171 t->ds = t->es = ds;
172}
173
174#define ELF_PLAT_INIT(_r, load_addr) \
175do { \
176 elf_common_init(&current->thread, _r, 0); \
177 clear_thread_flag(TIF_IA32); \
178} while (0)
179
180#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
181 elf_common_init(&current->thread, regs, __USER_DS)
182
183#define compat_start_thread(regs, ip, sp) \
184do { \
185 start_ia32_thread(regs, ip, sp); \
186 set_fs(USER_DS); \
187} while (0)
188
189#define COMPAT_SET_PERSONALITY(ex) \
190do { \
191 if (test_thread_flag(TIF_IA32)) \
192 clear_thread_flag(TIF_ABI_PENDING); \
193 else \
194 set_thread_flag(TIF_ABI_PENDING); \
195 current->personality |= force_personality32; \
196} while (0)
197
198#define COMPAT_ELF_PLATFORM ("i686")
199
200/*
201 * regs is struct pt_regs, pr_reg is elf_gregset_t (which is
202 * now struct_user_regs, they are different). Assumes current is the process
203 * getting dumped.
204 */
205
206#define ELF_CORE_COPY_REGS(pr_reg, regs) \
207do { \
208 unsigned v; \
209 (pr_reg)[0] = (regs)->r15; \
210 (pr_reg)[1] = (regs)->r14; \
211 (pr_reg)[2] = (regs)->r13; \
212 (pr_reg)[3] = (regs)->r12; \
213 (pr_reg)[4] = (regs)->bp; \
214 (pr_reg)[5] = (regs)->bx; \
215 (pr_reg)[6] = (regs)->r11; \
216 (pr_reg)[7] = (regs)->r10; \
217 (pr_reg)[8] = (regs)->r9; \
218 (pr_reg)[9] = (regs)->r8; \
219 (pr_reg)[10] = (regs)->ax; \
220 (pr_reg)[11] = (regs)->cx; \
221 (pr_reg)[12] = (regs)->dx; \
222 (pr_reg)[13] = (regs)->si; \
223 (pr_reg)[14] = (regs)->di; \
224 (pr_reg)[15] = (regs)->orig_ax; \
225 (pr_reg)[16] = (regs)->ip; \
226 (pr_reg)[17] = (regs)->cs; \
227 (pr_reg)[18] = (regs)->flags; \
228 (pr_reg)[19] = (regs)->sp; \
229 (pr_reg)[20] = (regs)->ss; \
230 (pr_reg)[21] = current->thread.fs; \
231 (pr_reg)[22] = current->thread.gs; \
232 asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \
233 asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \
234 asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \
235 asm("movl %%gs,%0" : "=r" (v)); (pr_reg)[26] = v; \
236} while (0);
237
238/* I'm not sure if we can use '-' here */
239#define ELF_PLATFORM ("x86_64")
240extern void set_personality_64bit(void);
241extern unsigned int sysctl_vsyscall32;
242extern int force_personality32;
243
244#endif /* !CONFIG_X86_32 */
245
246#define CORE_DUMP_USE_REGSET
247#define USE_ELF_CORE_DUMP
248#define ELF_EXEC_PAGESIZE 4096
249
250/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
251 use of this is to invoke "./ld.so someprog" to test out a new version of
252 the loader. We need to make sure that it is out of the way of the program
253 that it will "exec", and that there is sufficient room for the brk. */
254
255#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2)
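/*
 * Illustrative arithmetic: with the usual 32-bit TASK_SIZE of
 * 0xC0000000 this evaluates to 0x80000000 (2 GB), well clear of the
 * default ELF load address and of the brk area.
 */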
256
257/* This yields a mask that user programs can use to figure out what
258 instruction set this CPU supports. This could be done in user space,
259 but it's not easy, and we've already done it here. */
260
261#define ELF_HWCAP (boot_cpu_data.x86_capability[0])
262
263/* This yields a string that ld.so will use to load implementation
264 specific libraries for optimization. This is more specific in
265 intent than poking at uname or /proc/cpuinfo.
266
267 For the moment, we have only optimizations for the Intel generations,
268 but that could change... */
269
270#define SET_PERSONALITY(ex) set_personality_64bit()
271
272/*
273 * An executable for which elf_read_implies_exec() returns TRUE will
274 * have the READ_IMPLIES_EXEC personality flag set automatically.
275 */
276#define elf_read_implies_exec(ex, executable_stack) \
277 (executable_stack != EXSTACK_DISABLE_X)
278
279struct task_struct;
280
281#define ARCH_DLINFO_IA32(vdso_enabled) \
282do { \
283 if (vdso_enabled) { \
284 NEW_AUX_ENT(AT_SYSINFO, VDSO_ENTRY); \
285 NEW_AUX_ENT(AT_SYSINFO_EHDR, VDSO_CURRENT_BASE); \
286 } \
287} while (0)
288
289#ifdef CONFIG_X86_32
290
291#define VDSO_HIGH_BASE (__fix_to_virt(FIX_VDSO))
292
293#define ARCH_DLINFO ARCH_DLINFO_IA32(vdso_enabled)
294
295/* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */
296
297#else /* CONFIG_X86_32 */
298
299#define VDSO_HIGH_BASE 0xffffe000U /* CONFIG_COMPAT_VDSO address */
300
301/* 1GB for 64bit, 8MB for 32bit */
302#define STACK_RND_MASK (test_thread_flag(TIF_IA32) ? 0x7ff : 0x3fffff)
303
304#define ARCH_DLINFO \
305do { \
306 if (vdso_enabled) \
307 NEW_AUX_ENT(AT_SYSINFO_EHDR, \
308 (unsigned long)current->mm->context.vdso); \
309} while (0)
310
311#define AT_SYSINFO 32
312
313#define COMPAT_ARCH_DLINFO ARCH_DLINFO_IA32(sysctl_vsyscall32)
314
315#define COMPAT_ELF_ET_DYN_BASE (TASK_UNMAPPED_BASE + 0x1000000)
316
317#endif /* !CONFIG_X86_32 */
318
319#define VDSO_CURRENT_BASE ((unsigned long)current->mm->context.vdso)
320
321#define VDSO_ENTRY \
322 ((unsigned long)VDSO32_SYMBOL(VDSO_CURRENT_BASE, vsyscall))
323
324struct linux_binprm;
325
326#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1
327extern int arch_setup_additional_pages(struct linux_binprm *bprm,
328 int executable_stack);
329
330extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
331#define compat_arch_setup_additional_pages syscall32_setup_pages
332
333extern unsigned long arch_randomize_brk(struct mm_struct *mm);
334#define arch_randomize_brk arch_randomize_brk
335
336#endif /* _ASM_X86_ELF_H */
diff --git a/arch/x86/include/asm/emergency-restart.h b/arch/x86/include/asm/emergency-restart.h
new file mode 100644
index 000000000000..94826cf87455
--- /dev/null
+++ b/arch/x86/include/asm/emergency-restart.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_EMERGENCY_RESTART_H
2#define _ASM_X86_EMERGENCY_RESTART_H
3
4enum reboot_type {
5 BOOT_TRIPLE = 't',
6 BOOT_KBD = 'k',
7#ifdef CONFIG_X86_32
8 BOOT_BIOS = 'b',
9#endif
10 BOOT_ACPI = 'a',
11 BOOT_EFI = 'e'
12};
13
14extern enum reboot_type reboot_type;
15
16extern void machine_emergency_restart(void);
17
18#endif /* _ASM_X86_EMERGENCY_RESTART_H */
diff --git a/arch/x86/include/asm/errno.h b/arch/x86/include/asm/errno.h
new file mode 100644
index 000000000000..4c82b503d92f
--- /dev/null
+++ b/arch/x86/include/asm/errno.h
@@ -0,0 +1 @@
#include <asm-generic/errno.h>
diff --git a/arch/x86/include/asm/es7000/apic.h b/arch/x86/include/asm/es7000/apic.h
new file mode 100644
index 000000000000..380f0b4f17ed
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apic.h
@@ -0,0 +1,193 @@
1#ifndef __ASM_ES7000_APIC_H
2#define __ASM_ES7000_APIC_H
3
4#define xapic_phys_to_log_apicid(cpu) per_cpu(x86_bios_cpu_apicid, cpu)
5#define esr_disable (1)
6
7static inline int apic_id_registered(void)
8{
9 return (1);
10}
11
12static inline cpumask_t target_cpus(void)
13{
14#if defined CONFIG_ES7000_CLUSTERED_APIC
15 return CPU_MASK_ALL;
16#else
17 return cpumask_of_cpu(smp_processor_id());
18#endif
19}
20
21#if defined CONFIG_ES7000_CLUSTERED_APIC
22#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
23#define INT_DELIVERY_MODE (dest_LowestPrio)
24#define INT_DEST_MODE (1) /* logical delivery broadcast to all procs */
25#define NO_BALANCE_IRQ (1)
26#undef WAKE_SECONDARY_VIA_INIT
27#define WAKE_SECONDARY_VIA_MIP
28#else
29#define APIC_DFR_VALUE (APIC_DFR_FLAT)
30#define INT_DELIVERY_MODE (dest_Fixed)
31#define INT_DEST_MODE (0) /* phys delivery to target procs */
32#define NO_BALANCE_IRQ (0)
33#undef APIC_DEST_LOGICAL
34#define APIC_DEST_LOGICAL 0x0
35#define WAKE_SECONDARY_VIA_INIT
36#endif
37
38static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
39{
40 return 0;
41}
42static inline unsigned long check_apicid_present(int bit)
43{
44 return physid_isset(bit, phys_cpu_present_map);
45}
46
47#define apicid_cluster(apicid) (apicid & 0xF0)
48
49static inline unsigned long calculate_ldr(int cpu)
50{
51 unsigned long id;
52 id = xapic_phys_to_log_apicid(cpu);
53 return (SET_APIC_LOGICAL_ID(id));
54}
55
56/*
57 * Set up the logical destination ID.
58 *
59 * Intel recommends to set DFR, LdR and TPR before enabling
60 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
61 * document number 292116). So here it goes...
62 */
63static inline void init_apic_ldr(void)
64{
65 unsigned long val;
66 int cpu = smp_processor_id();
67
68 apic_write(APIC_DFR, APIC_DFR_VALUE);
69 val = calculate_ldr(cpu);
70 apic_write(APIC_LDR, val);
71}
72
73#ifndef CONFIG_X86_GENERICARCH
74extern void enable_apic_mode(void);
75#endif
76
77extern int apic_version [MAX_APICS];
78static inline void setup_apic_routing(void)
79{
80 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
81 printk("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
82 (apic_version[apic] == 0x14) ?
83 "Physical Cluster" : "Logical Cluster", nr_ioapics, cpus_addr(target_cpus())[0]);
84}
85
86static inline int multi_timer_check(int apic, int irq)
87{
88 return 0;
89}
90
91static inline int apicid_to_node(int logical_apicid)
92{
93 return 0;
94}
95
96
97static inline int cpu_present_to_apicid(int mps_cpu)
98{
99 if (!mps_cpu)
100 return boot_cpu_physical_apicid;
101 else if (mps_cpu < NR_CPUS)
102 return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu);
103 else
104 return BAD_APICID;
105}
106
107static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
108{
109 static int id = 0;
110 physid_mask_t mask;
111 mask = physid_mask_of_physid(id);
112 ++id;
113 return mask;
114}
115
116extern u8 cpu_2_logical_apicid[];
117/* Mapping from cpu number to logical apicid */
118static inline int cpu_to_logical_apicid(int cpu)
119{
120#ifdef CONFIG_SMP
121 if (cpu >= NR_CPUS)
122 return BAD_APICID;
123 return (int)cpu_2_logical_apicid[cpu];
124#else
125 return logical_smp_processor_id();
126#endif
127}
128
129static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
130{
131 /* For clustered we don't have a good way to do this yet - hack */
132 return physids_promote(0xff);
133}
134
135
136static inline void setup_portio_remap(void)
137{
138}
139
140extern unsigned int boot_cpu_physical_apicid;
141static inline int check_phys_apicid_present(int cpu_physical_apicid)
142{
143 boot_cpu_physical_apicid = read_apic_id();
144 return (1);
145}
146
147static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
148{
149 int num_bits_set;
150 int cpus_found = 0;
151 int cpu;
152 int apicid;
153
154 num_bits_set = cpus_weight(cpumask);
155 /* Return id to all */
156 if (num_bits_set == NR_CPUS)
157#if defined CONFIG_ES7000_CLUSTERED_APIC
158 return 0xFF;
159#else
160 return cpu_to_logical_apicid(0);
161#endif
162 /*
163	 * The cpus in the mask must all be in the same apicid cluster. If they
164	 * are not, return the default value of TARGET_CPUS.
165 */
166 cpu = first_cpu(cpumask);
167 apicid = cpu_to_logical_apicid(cpu);
168 while (cpus_found < num_bits_set) {
169 if (cpu_isset(cpu, cpumask)) {
170 int new_apicid = cpu_to_logical_apicid(cpu);
171 if (apicid_cluster(apicid) !=
172 apicid_cluster(new_apicid)){
173 printk ("%s: Not a valid mask!\n", __func__);
174#if defined CONFIG_ES7000_CLUSTERED_APIC
175 return 0xFF;
176#else
177 return cpu_to_logical_apicid(0);
178#endif
179 }
180 apicid = new_apicid;
181 cpus_found++;
182 }
183 cpu++;
184 }
185 return apicid;
186}
187
188static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
189{
190 return cpuid_apic >> index_msb;
191}
192
193#endif /* __ASM_ES7000_APIC_H */
diff --git a/arch/x86/include/asm/es7000/apicdef.h b/arch/x86/include/asm/es7000/apicdef.h
new file mode 100644
index 000000000000..8b234a3cb851
--- /dev/null
+++ b/arch/x86/include/asm/es7000/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_ES7000_APICDEF_H
2#define __ASM_ES7000_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (((x)>>24)&0xFF);
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/es7000/ipi.h b/arch/x86/include/asm/es7000/ipi.h
new file mode 100644
index 000000000000..632a955fcc0a
--- /dev/null
+++ b/arch/x86/include/asm/es7000/ipi.h
@@ -0,0 +1,24 @@
1#ifndef __ASM_ES7000_IPI_H
2#define __ASM_ES7000_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15 if (!cpus_empty(mask))
16 send_IPI_mask(mask, vector);
17}
18
19static inline void send_IPI_all(int vector)
20{
21 send_IPI_mask(cpu_online_map, vector);
22}
23
24#endif /* __ASM_ES7000_IPI_H */
diff --git a/arch/x86/include/asm/es7000/mpparse.h b/arch/x86/include/asm/es7000/mpparse.h
new file mode 100644
index 000000000000..ed5a3caae141
--- /dev/null
+++ b/arch/x86/include/asm/es7000/mpparse.h
@@ -0,0 +1,30 @@
1#ifndef __ASM_ES7000_MPPARSE_H
2#define __ASM_ES7000_MPPARSE_H
3
4#include <linux/acpi.h>
5
6extern int parse_unisys_oem (char *oemptr);
7extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
8extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
9extern void setup_unisys(void);
10
11#ifndef CONFIG_X86_GENERICARCH
12extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
13extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
14 char *productid);
15#endif
16
17#ifdef CONFIG_ACPI
18
19static inline int es7000_check_dsdt(void)
20{
21 struct acpi_table_header header;
22
23 if (ACPI_SUCCESS(acpi_get_table_header(ACPI_SIG_DSDT, 0, &header)) &&
24 !strncmp(header.oem_id, "UNISYS", 6))
25 return 1;
26 return 0;
27}
28#endif
29
30#endif /* __ASM_ES7000_MPPARSE_H */
diff --git a/arch/x86/include/asm/es7000/wakecpu.h b/arch/x86/include/asm/es7000/wakecpu.h
new file mode 100644
index 000000000000..3ffc5a7bf667
--- /dev/null
+++ b/arch/x86/include/asm/es7000/wakecpu.h
@@ -0,0 +1,59 @@
1#ifndef __ASM_ES7000_WAKECPU_H
2#define __ASM_ES7000_WAKECPU_H
3
4/*
5 * This file copes with machines that wake up secondary CPUs by the
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#ifdef CONFIG_ES7000_CLUSTERED_APIC
10#define WAKE_SECONDARY_VIA_MIP
11#else
12#define WAKE_SECONDARY_VIA_INIT
13#endif
14
15#ifdef WAKE_SECONDARY_VIA_MIP
16extern int es7000_start_cpu(int cpu, unsigned long eip);
17static inline int
18wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
19{
20 int boot_error = 0;
21 boot_error = es7000_start_cpu(phys_apicid, start_eip);
22 return boot_error;
23}
24#endif
25
26#define TRAMPOLINE_LOW phys_to_virt(0x467)
27#define TRAMPOLINE_HIGH phys_to_virt(0x469)
28
29#define boot_cpu_apicid boot_cpu_physical_apicid
30
31static inline void wait_for_init_deassert(atomic_t *deassert)
32{
33#ifdef WAKE_SECONDARY_VIA_INIT
34 while (!atomic_read(deassert))
35 cpu_relax();
36#endif
37 return;
38}
39
40/* Nothing to do for most platforms, since cleared by the INIT cycle */
41static inline void smp_callin_clear_local_apic(void)
42{
43}
44
45static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
46{
47}
48
49static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
50{
51}
52
53#if APIC_DEBUG
54 #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
55#else
56 #define inquire_remote_apic(apicid) {}
57#endif
58
59#endif /* __ASM_ES7000_WAKECPU_H */
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
new file mode 100644
index 000000000000..53018464aea6
--- /dev/null
+++ b/arch/x86/include/asm/fb.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_FB_H
2#define _ASM_X86_FB_H
3
4#include <linux/fb.h>
5#include <linux/fs.h>
6#include <asm/page.h>
7
8static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
9 unsigned long off)
10{
11 if (boot_cpu_data.x86 > 3)
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13}
14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20
21#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fcntl.h b/arch/x86/include/asm/fcntl.h
new file mode 100644
index 000000000000..46ab12db5739
--- /dev/null
+++ b/arch/x86/include/asm/fcntl.h
@@ -0,0 +1 @@
#include <asm-generic/fcntl.h>
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
new file mode 100644
index 000000000000..8668a94f850e
--- /dev/null
+++ b/arch/x86/include/asm/fixmap.h
@@ -0,0 +1,68 @@
1#ifndef _ASM_X86_FIXMAP_H
2#define _ASM_X86_FIXMAP_H
3
4#ifdef CONFIG_X86_32
5# include "fixmap_32.h"
6#else
7# include "fixmap_64.h"
8#endif
9
10extern int fixmaps_set;
11
12void __native_set_fixmap(enum fixed_addresses idx, pte_t pte);
13void native_set_fixmap(enum fixed_addresses idx,
14 unsigned long phys, pgprot_t flags);
15
16#ifndef CONFIG_PARAVIRT
17static inline void __set_fixmap(enum fixed_addresses idx,
18 unsigned long phys, pgprot_t flags)
19{
20 native_set_fixmap(idx, phys, flags);
21}
22#endif
23
24#define set_fixmap(idx, phys) \
25 __set_fixmap(idx, phys, PAGE_KERNEL)
26
27/*
28 * Some hardware wants to get fixmapped without caching.
29 */
30#define set_fixmap_nocache(idx, phys) \
31 __set_fixmap(idx, phys, PAGE_KERNEL_NOCACHE)
32
33#define clear_fixmap(idx) \
34 __set_fixmap(idx, 0, __pgprot(0))
35
36#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT))
37#define __virt_to_fix(x) ((FIXADDR_TOP - ((x)&PAGE_MASK)) >> PAGE_SHIFT)
38
39extern void __this_fixmap_does_not_exist(void);
40
41/*
42 * 'index to address' translation. If anyone tries to use the idx
43 * directly without translation, we catch the bug with a NULL-dereference
44 * kernel oops. Illegal ranges of incoming indices are caught too.
45 */
46static __always_inline unsigned long fix_to_virt(const unsigned int idx)
47{
48 /*
49 * this branch gets completely eliminated after inlining,
50 * except when someone tries to use fixaddr indices in an
51 * illegal way. (such as mixing up address types or using
52 * out-of-range indices).
53 *
54 * If it doesn't get removed, the linker will complain
55 * loudly with a reasonably clear error message..
56 */
57 if (idx >= __end_of_fixed_addresses)
58 __this_fixmap_does_not_exist();
59
60 return __fix_to_virt(idx);
61}
62
63static inline unsigned long virt_to_fix(const unsigned long vaddr)
64{
65 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
66 return __virt_to_fix(vaddr);
67}
68#endif /* _ASM_X86_FIXMAP_H */
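fix_to_virt() above is pure arithmetic: index N maps N pages below FIXADDR_TOP, and virt_to_fix() inverts it. A standalone sketch of that round trip, using illustrative FIXADDR_TOP and PAGE_SHIFT values rather than the kernel's configuration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define FIXADDR_TOP	0xfffff000UL		/* example value only */

#define fix_to_virt(x)	(FIXADDR_TOP - ((unsigned long)(x) << PAGE_SHIFT))
#define virt_to_fix(x)	((FIXADDR_TOP - ((x) & PAGE_MASK)) >> PAGE_SHIFT)

int main(void)
{
	unsigned long idx = 5;
	unsigned long va = fix_to_virt(idx);

	/* Index 5 sits 5 pages below FIXADDR_TOP; converting back recovers it. */
	printf("idx %lu -> va %#lx -> idx %lu\n", idx, va, virt_to_fix(va));
	return 0;
}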
diff --git a/arch/x86/include/asm/fixmap_32.h b/arch/x86/include/asm/fixmap_32.h
new file mode 100644
index 000000000000..09f29ab5c139
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_32.h
@@ -0,0 +1,123 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 *
10 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
11 */
12
13#ifndef _ASM_X86_FIXMAP_32_H
14#define _ASM_X86_FIXMAP_32_H
15
16
17/* used by vmalloc.c, vsyscall.lds.S.
18 *
19 * Leave one empty page between vmalloc'ed areas and
20 * the start of the fixmap.
21 */
22extern unsigned long __FIXADDR_TOP;
23#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO)
24#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1)
25
26#ifndef __ASSEMBLY__
27#include <linux/kernel.h>
28#include <asm/acpi.h>
29#include <asm/apicdef.h>
30#include <asm/page.h>
31#ifdef CONFIG_HIGHMEM
32#include <linux/threads.h>
33#include <asm/kmap_types.h>
34#endif
35
36/*
37 * Here we define all the compile-time 'special' virtual
38 * addresses. The point is to have a constant address at
39 * compile time, but to set the physical address only
40 * in the boot process. We allocate these special addresses
41 * from the end of virtual memory (0xfffff000) backwards.
42 * Also this lets us do fail-safe vmalloc(): we
43 * can guarantee that these special addresses and
44 * vmalloc()-ed addresses never overlap.
45 *
46 * these 'compile-time allocated' memory buffers are
47 * fixed-size 4k pages (or larger if used with an increment
48 * higher than 1). Use set_fixmap(idx,phys) to associate
49 * physical memory with fixmap indices.
50 *
51 * TLB entries of such buffers will not be flushed across
52 * task switches.
53 */
54enum fixed_addresses {
55 FIX_HOLE,
56 FIX_VDSO,
57 FIX_DBGP_BASE,
58 FIX_EARLYCON_MEM_BASE,
59#ifdef CONFIG_X86_LOCAL_APIC
60	FIX_APIC_BASE,	/* local (CPU) APIC -- required for SMP or not */
61#endif
62#ifdef CONFIG_X86_IO_APIC
63 FIX_IO_APIC_BASE_0,
64 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS-1,
65#endif
66#ifdef CONFIG_X86_VISWS_APIC
67 FIX_CO_CPU, /* Cobalt timer */
68 FIX_CO_APIC, /* Cobalt APIC Redirection Table */
69 FIX_LI_PCIA, /* Lithium PCI Bridge A */
70 FIX_LI_PCIB, /* Lithium PCI Bridge B */
71#endif
72#ifdef CONFIG_X86_F00F_BUG
73 FIX_F00F_IDT, /* Virtual mapping for IDT */
74#endif
75#ifdef CONFIG_X86_CYCLONE_TIMER
76 FIX_CYCLONE_TIMER, /*cyclone timer register*/
77#endif
78#ifdef CONFIG_HIGHMEM
79 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */
80 FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1,
81#endif
82#ifdef CONFIG_PCI_MMCONFIG
83 FIX_PCIE_MCFG,
84#endif
85#ifdef CONFIG_PARAVIRT
86 FIX_PARAVIRT_BOOTMAP,
87#endif
88 __end_of_permanent_fixed_addresses,
89 /*
90 * 256 temporary boot-time mappings, used by early_ioremap(),
91 * before ioremap() is functional.
92 *
93	 * We round it up to the next 256-page boundary so that we
94 * can have a single pgd entry and a single pte table:
95 */
96#define NR_FIX_BTMAPS 64
97#define FIX_BTMAPS_SLOTS 4
98 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
99 (__end_of_permanent_fixed_addresses & 255),
100 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
101 FIX_WP_TEST,
102#ifdef CONFIG_ACPI
103 FIX_ACPI_BEGIN,
104 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
105#endif
106#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
107 FIX_OHCI1394_BASE,
108#endif
109 __end_of_fixed_addresses
110};
111
112extern void reserve_top_address(unsigned long reserve);
113
114
115#define FIXADDR_TOP ((unsigned long)__FIXADDR_TOP)
116
117#define __FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT)
118#define __FIXADDR_BOOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
119#define FIXADDR_START (FIXADDR_TOP - __FIXADDR_SIZE)
120#define FIXADDR_BOOT_START (FIXADDR_TOP - __FIXADDR_BOOT_SIZE)
121
122#endif /* !__ASSEMBLY__ */
123#endif /* _ASM_X86_FIXMAP_32_H */
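The FIX_BTMAP_END expression above pushes the first boot-time slot up to the next multiple of 256 indices so the early_ioremap() slots can share a single pgd entry and pte table. A small sketch of just that rounding arithmetic, runnable anywhere:

#include <stdio.h>

/* Mirrors: end + 256 - (end & 255), the FIX_BTMAP_END rounding above. */
static unsigned int btmap_end(unsigned int end_of_permanent)
{
	return end_of_permanent + 256 - (end_of_permanent & 255);
}

int main(void)
{
	/* 40 permanent slots: boot-time slots start at index 256. */
	printf("%u -> %u\n", 40u, btmap_end(40));
	/* An already-aligned count still moves to the *next* boundary. */
	printf("%u -> %u\n", 256u, btmap_end(256));
	return 0;
}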
diff --git a/arch/x86/include/asm/fixmap_64.h b/arch/x86/include/asm/fixmap_64.h
new file mode 100644
index 000000000000..00a30ab9b1a5
--- /dev/null
+++ b/arch/x86/include/asm/fixmap_64.h
@@ -0,0 +1,83 @@
1/*
2 * fixmap.h: compile-time virtual memory allocation
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1998 Ingo Molnar
9 */
10
11#ifndef _ASM_X86_FIXMAP_64_H
12#define _ASM_X86_FIXMAP_64_H
13
14#include <linux/kernel.h>
15#include <asm/acpi.h>
16#include <asm/apicdef.h>
17#include <asm/page.h>
18#include <asm/vsyscall.h>
19#include <asm/efi.h>
20
21/*
22 * Here we define all the compile-time 'special' virtual
23 * addresses. The point is to have a constant address at
24 * compile time, but to set the physical address only
25 * in the boot process.
26 *
27 * These 'compile-time allocated' memory buffers are
28 * fixed-size 4k pages (or larger if used with an increment
29 * higher than 1). Use set_fixmap(idx,phys) to associate
30 * physical memory with fixmap indices.
31 *
32 * TLB entries of such buffers will not be flushed across
33 * task switches.
34 */
35
36enum fixed_addresses {
37 VSYSCALL_LAST_PAGE,
38 VSYSCALL_FIRST_PAGE = VSYSCALL_LAST_PAGE
39 + ((VSYSCALL_END-VSYSCALL_START) >> PAGE_SHIFT) - 1,
40 VSYSCALL_HPET,
41 FIX_DBGP_BASE,
42 FIX_EARLYCON_MEM_BASE,
43	FIX_APIC_BASE,	/* local (CPU) APIC -- required for SMP or not */
44 FIX_IO_APIC_BASE_0,
45 FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1,
46 FIX_EFI_IO_MAP_LAST_PAGE,
47 FIX_EFI_IO_MAP_FIRST_PAGE = FIX_EFI_IO_MAP_LAST_PAGE
48 + MAX_EFI_IO_PAGES - 1,
49#ifdef CONFIG_PARAVIRT
50 FIX_PARAVIRT_BOOTMAP,
51#endif
52 __end_of_permanent_fixed_addresses,
53#ifdef CONFIG_ACPI
54 FIX_ACPI_BEGIN,
55 FIX_ACPI_END = FIX_ACPI_BEGIN + FIX_ACPI_PAGES - 1,
56#endif
57#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
58 FIX_OHCI1394_BASE,
59#endif
60 /*
61 * 256 temporary boot-time mappings, used by early_ioremap(),
62 * before ioremap() is functional.
63 *
64	 * We round it up to the next 256-page boundary so that we
65 * can have a single pgd entry and a single pte table:
66 */
67#define NR_FIX_BTMAPS 64
68#define FIX_BTMAPS_SLOTS 4
69 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 -
70 (__end_of_permanent_fixed_addresses & 255),
71 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1,
72 __end_of_fixed_addresses
73};
74
75#define FIXADDR_TOP (VSYSCALL_END-PAGE_SIZE)
76#define FIXADDR_SIZE (__end_of_fixed_addresses << PAGE_SHIFT)
77#define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE)
78
79/* Only covers 32bit vsyscalls currently. Need another set for 64bit. */
80#define FIXADDR_USER_START ((unsigned long)VSYSCALL32_VSYSCALL)
81#define FIXADDR_USER_END (FIXADDR_USER_START + PAGE_SIZE)
82
83#endif /* _ASM_X86_FIXMAP_64_H */
diff --git a/arch/x86/include/asm/floppy.h b/arch/x86/include/asm/floppy.h
new file mode 100644
index 000000000000..dbe82a5c5eac
--- /dev/null
+++ b/arch/x86/include/asm/floppy.h
@@ -0,0 +1,281 @@
1/*
2 * Architecture specific parts of the Floppy driver
3 *
4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file "COPYING" in the main directory of this archive
6 * for more details.
7 *
8 * Copyright (C) 1995
9 */
10#ifndef _ASM_X86_FLOPPY_H
11#define _ASM_X86_FLOPPY_H
12
13#include <linux/vmalloc.h>
14
15/*
16 * The DMA channel used by the floppy controller cannot access data at
17 * addresses >= 16MB
18 *
19 * Went back to the 1MB limit, as some people had problems with the floppy
20 * driver otherwise. It doesn't matter much for performance anyway, as most
21 * floppy accesses go through the track buffer.
22 */
23#define _CROSS_64KB(a, s, vdma) \
24 (!(vdma) && \
25 ((unsigned long)(a)/K_64 != ((unsigned long)(a) + (s) - 1) / K_64))
26
27#define CROSS_64KB(a, s) _CROSS_64KB(a, s, use_virtual_dma & 1)
28
29
30#define SW fd_routine[use_virtual_dma & 1]
31#define CSW fd_routine[can_use_virtual_dma & 1]
32
33
34#define fd_inb(port) inb_p(port)
35#define fd_outb(value, port) outb_p(value, port)
36
37#define fd_request_dma() CSW._request_dma(FLOPPY_DMA, "floppy")
38#define fd_free_dma() CSW._free_dma(FLOPPY_DMA)
39#define fd_enable_irq() enable_irq(FLOPPY_IRQ)
40#define fd_disable_irq() disable_irq(FLOPPY_IRQ)
41#define fd_free_irq() free_irq(FLOPPY_IRQ, NULL)
42#define fd_get_dma_residue() SW._get_dma_residue(FLOPPY_DMA)
43#define fd_dma_mem_alloc(size) SW._dma_mem_alloc(size)
44#define fd_dma_setup(addr, size, mode, io) SW._dma_setup(addr, size, mode, io)
45
46#define FLOPPY_CAN_FALLBACK_ON_NODMA
47
48static int virtual_dma_count;
49static int virtual_dma_residue;
50static char *virtual_dma_addr;
51static int virtual_dma_mode;
52static int doing_pdma;
53
54static irqreturn_t floppy_hardint(int irq, void *dev_id)
55{
56 unsigned char st;
57
58#undef TRACE_FLPY_INT
59
60#ifdef TRACE_FLPY_INT
61 static int calls;
62 static int bytes;
63 static int dma_wait;
64#endif
65 if (!doing_pdma)
66 return floppy_interrupt(irq, dev_id);
67
68#ifdef TRACE_FLPY_INT
69 if (!calls)
70 bytes = virtual_dma_count;
71#endif
72
73 {
74 int lcount;
75 char *lptr;
76
77 st = 1;
78 for (lcount = virtual_dma_count, lptr = virtual_dma_addr;
79 lcount; lcount--, lptr++) {
80 st = inb(virtual_dma_port + 4) & 0xa0;
81 if (st != 0xa0)
82 break;
83 if (virtual_dma_mode)
84 outb_p(*lptr, virtual_dma_port + 5);
85 else
86 *lptr = inb_p(virtual_dma_port + 5);
87 }
88 virtual_dma_count = lcount;
89 virtual_dma_addr = lptr;
90 st = inb(virtual_dma_port + 4);
91 }
92
93#ifdef TRACE_FLPY_INT
94 calls++;
95#endif
96 if (st == 0x20)
97 return IRQ_HANDLED;
98 if (!(st & 0x20)) {
99 virtual_dma_residue += virtual_dma_count;
100 virtual_dma_count = 0;
101#ifdef TRACE_FLPY_INT
102 printk("count=%x, residue=%x calls=%d bytes=%d dma_wait=%d\n",
103 virtual_dma_count, virtual_dma_residue, calls, bytes,
104 dma_wait);
105 calls = 0;
106 dma_wait = 0;
107#endif
108 doing_pdma = 0;
109 floppy_interrupt(irq, dev_id);
110 return IRQ_HANDLED;
111 }
112#ifdef TRACE_FLPY_INT
113 if (!virtual_dma_count)
114 dma_wait++;
115#endif
116 return IRQ_HANDLED;
117}
118
119static void fd_disable_dma(void)
120{
121 if (!(can_use_virtual_dma & 1))
122 disable_dma(FLOPPY_DMA);
123 doing_pdma = 0;
124 virtual_dma_residue += virtual_dma_count;
125 virtual_dma_count = 0;
126}
127
128static int vdma_request_dma(unsigned int dmanr, const char *device_id)
129{
130 return 0;
131}
132
133static void vdma_nop(unsigned int dummy)
134{
135}
136
137
138static int vdma_get_dma_residue(unsigned int dummy)
139{
140 return virtual_dma_count + virtual_dma_residue;
141}
142
143
144static int fd_request_irq(void)
145{
146 if (can_use_virtual_dma)
147 return request_irq(FLOPPY_IRQ, floppy_hardint,
148 IRQF_DISABLED, "floppy", NULL);
149 else
150 return request_irq(FLOPPY_IRQ, floppy_interrupt,
151 IRQF_DISABLED, "floppy", NULL);
152}
153
154static unsigned long dma_mem_alloc(unsigned long size)
155{
156 return __get_dma_pages(GFP_KERNEL|__GFP_NORETRY, get_order(size));
157}
158
159
160static unsigned long vdma_mem_alloc(unsigned long size)
161{
162 return (unsigned long)vmalloc(size);
163
164}
165
166#define nodma_mem_alloc(size) vdma_mem_alloc(size)
167
168static void _fd_dma_mem_free(unsigned long addr, unsigned long size)
169{
170 if ((unsigned long)addr >= (unsigned long)high_memory)
171 vfree((void *)addr);
172 else
173 free_pages(addr, get_order(size));
174}
175
176#define fd_dma_mem_free(addr, size) _fd_dma_mem_free(addr, size)
177
178static void _fd_chose_dma_mode(char *addr, unsigned long size)
179{
180 if (can_use_virtual_dma == 2) {
181 if ((unsigned long)addr >= (unsigned long)high_memory ||
182 isa_virt_to_bus(addr) >= 0x1000000 ||
183 _CROSS_64KB(addr, size, 0))
184 use_virtual_dma = 1;
185 else
186 use_virtual_dma = 0;
187 } else {
188 use_virtual_dma = can_use_virtual_dma & 1;
189 }
190}
191
192#define fd_chose_dma_mode(addr, size) _fd_chose_dma_mode(addr, size)
193
194
195static int vdma_dma_setup(char *addr, unsigned long size, int mode, int io)
196{
197 doing_pdma = 1;
198 virtual_dma_port = io;
199 virtual_dma_mode = (mode == DMA_MODE_WRITE);
200 virtual_dma_addr = addr;
201 virtual_dma_count = size;
202 virtual_dma_residue = 0;
203 return 0;
204}
205
206static int hard_dma_setup(char *addr, unsigned long size, int mode, int io)
207{
208#ifdef FLOPPY_SANITY_CHECK
209 if (CROSS_64KB(addr, size)) {
210 printk("DMA crossing 64-K boundary %p-%p\n", addr, addr+size);
211 return -1;
212 }
213#endif
214 /* actual, physical DMA */
215 doing_pdma = 0;
216 clear_dma_ff(FLOPPY_DMA);
217 set_dma_mode(FLOPPY_DMA, mode);
218 set_dma_addr(FLOPPY_DMA, isa_virt_to_bus(addr));
219 set_dma_count(FLOPPY_DMA, size);
220 enable_dma(FLOPPY_DMA);
221 return 0;
222}
223
224static struct fd_routine_l {
225 int (*_request_dma)(unsigned int dmanr, const char *device_id);
226 void (*_free_dma)(unsigned int dmanr);
227 int (*_get_dma_residue)(unsigned int dummy);
228 unsigned long (*_dma_mem_alloc)(unsigned long size);
229 int (*_dma_setup)(char *addr, unsigned long size, int mode, int io);
230} fd_routine[] = {
231 {
232 request_dma,
233 free_dma,
234 get_dma_residue,
235 dma_mem_alloc,
236 hard_dma_setup
237 },
238 {
239 vdma_request_dma,
240 vdma_nop,
241 vdma_get_dma_residue,
242 vdma_mem_alloc,
243 vdma_dma_setup
244 }
245};
246
247
248static int FDC1 = 0x3f0;
249static int FDC2 = -1;
250
251/*
252 * Floppy types are stored in the rtc's CMOS RAM and so rtc_lock
253 * is needed to prevent corrupted CMOS RAM in case "insmod floppy"
254 * coincides with another rtc CMOS user. Paul G.
255 */
256#define FLOPPY0_TYPE \
257({ \
258 unsigned long flags; \
259 unsigned char val; \
260 spin_lock_irqsave(&rtc_lock, flags); \
261 val = (CMOS_READ(0x10) >> 4) & 15; \
262 spin_unlock_irqrestore(&rtc_lock, flags); \
263 val; \
264})
265
266#define FLOPPY1_TYPE \
267({ \
268 unsigned long flags; \
269 unsigned char val; \
270 spin_lock_irqsave(&rtc_lock, flags); \
271 val = CMOS_READ(0x10) & 15; \
272 spin_unlock_irqrestore(&rtc_lock, flags); \
273 val; \
274})
275
276#define N_FDC 2
277#define N_DRIVE 8
278
279#define EXTRA_FLOPPY_PARAMS
280
281#endif /* _ASM_X86_FLOPPY_H */
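Two of the calculations above are pure arithmetic and easy to check in userspace: the CROSS_64KB() test (ISA DMA transfers must not straddle a 64 KiB boundary) and the FLOPPY0_TYPE/FLOPPY1_TYPE split of CMOS register 0x10 into two drive-type nibbles. A hedged sketch; the 0x24 CMOS value is invented for the example:

#include <stdio.h>

#define K_64	0x10000UL

/* Mirrors _CROSS_64KB() with vdma == 0. */
static int cross_64kb(unsigned long addr, unsigned long size)
{
	return addr / K_64 != (addr + size - 1) / K_64;
}

int main(void)
{
	/* A 2 KiB buffer ending exactly on a 64 KiB boundary is fine... */
	printf("%d\n", cross_64kb(0xf800, 0x800));	/* 0 */
	/* ...one more byte and the transfer straddles the boundary. */
	printf("%d\n", cross_64kb(0xf800, 0x801));	/* 1 */

	/* CMOS 0x10 packs drive A in the high nibble, drive B in the low. */
	unsigned char cmos10 = 0x24;			/* invented example value */
	printf("A=%u B=%u\n", (cmos10 >> 4) & 15, cmos10 & 15);	/* A=2 B=4 */
	return 0;
}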
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
new file mode 100644
index 000000000000..06850a7194e1
--- /dev/null
+++ b/arch/x86/include/asm/frame.h
@@ -0,0 +1,27 @@
1#ifdef __ASSEMBLY__
2
3#include <asm/dwarf2.h>
4
5/* The annotation hides the frame from the unwinder and makes it look
6   like an ordinary ebp save/restore. This avoids some special cases for
7   the frame pointer later */
8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME
10 pushl %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp
14 .endm
15 .macro ENDFRAME
16 popl %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp
19 .endm
20#else
21 .macro FRAME
22 .endm
23 .macro ENDFRAME
24 .endm
25#endif
26
27#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
new file mode 100644
index 000000000000..47f7e65e6c1d
--- /dev/null
+++ b/arch/x86/include/asm/ftrace.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_FTRACE_H
2#define _ASM_X86_FTRACE_H
3
4#ifdef CONFIG_FTRACE
5#define MCOUNT_ADDR ((long)(mcount))
6#define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */
7
8#ifndef __ASSEMBLY__
9extern void mcount(void);
10
11static inline unsigned long ftrace_call_adjust(unsigned long addr)
12{
13 /*
14	 * The call to mcount is encoded as "e8 <4 byte offset>".
15	 * The addr points to the 4 byte offset and the caller of this
16	 * function wants the address of the e8 opcode. Simply subtract one.
17 */
18 return addr - 1;
19}
20#endif
21
22#endif /* CONFIG_FTRACE */
23
24#endif /* _ASM_X86_FTRACE_H */
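ftrace_call_adjust() above encodes one fact about the x86 call encoding: a near call is the 0xe8 opcode followed by a 4-byte relative offset, so MCOUNT_INSN_SIZE is 5 and an address that points at the offset field is exactly one byte past the opcode. A userspace sketch with invented bytes and addresses (assuming a little-endian host, as on x86):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* A "call" at pretend address 0x1000 targeting 0x1105. */
	unsigned char insn[5] = { 0xe8, 0x00, 0x01, 0x00, 0x00 };
	unsigned int insn_addr = 0x1000;
	unsigned int rel;

	memcpy(&rel, &insn[1], 4);		/* 4-byte offset field (little-endian) */

	unsigned int recorded    = insn_addr + 1;	/* what the caller has  */
	unsigned int opcode_addr = recorded - 1;	/* ftrace_call_adjust() */
	unsigned int target      = insn_addr + 5 + rel;	/* next insn + rel32    */

	printf("opcode at %#x, call target %#x\n", opcode_addr, target);
	/* prints: opcode at 0x1000, call target 0x1105 */
	return 0;
}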
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
new file mode 100644
index 000000000000..1f11ce44e956
--- /dev/null
+++ b/arch/x86/include/asm/futex.h
@@ -0,0 +1,140 @@
1#ifndef _ASM_X86_FUTEX_H
2#define _ASM_X86_FUTEX_H
3
4#ifdef __KERNEL__
5
6#include <linux/futex.h>
7#include <linux/uaccess.h>
8
9#include <asm/asm.h>
10#include <asm/errno.h>
11#include <asm/processor.h>
12#include <asm/system.h>
13
14#define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \
15 asm volatile("1:\t" insn "\n" \
16 "2:\t.section .fixup,\"ax\"\n" \
17 "3:\tmov\t%3, %1\n" \
18 "\tjmp\t2b\n" \
19 "\t.previous\n" \
20 _ASM_EXTABLE(1b, 3b) \
21 : "=r" (oldval), "=r" (ret), "+m" (*uaddr) \
22 : "i" (-EFAULT), "0" (oparg), "1" (0))
23
24#define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg) \
25 asm volatile("1:\tmovl %2, %0\n" \
26 "\tmovl\t%0, %3\n" \
27 "\t" insn "\n" \
28 "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" \
29 "\tjnz\t1b\n" \
30 "3:\t.section .fixup,\"ax\"\n" \
31 "4:\tmov\t%5, %1\n" \
32 "\tjmp\t3b\n" \
33 "\t.previous\n" \
34 _ASM_EXTABLE(1b, 4b) \
35 _ASM_EXTABLE(2b, 4b) \
36 : "=&a" (oldval), "=&r" (ret), \
37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
41{
42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15;
44 int oparg = (encoded_op << 8) >> 20;
45 int cmparg = (encoded_op << 20) >> 20;
46 int oldval = 0, ret, tem;
47
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg;
50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
52 return -EFAULT;
53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
55 /* Real i386 machines can only support FUTEX_OP_SET */
56 if (op != FUTEX_OP_SET && boot_cpu_data.x86 == 3)
57 return -ENOSYS;
58#endif
59
60 pagefault_disable();
61
62 switch (op) {
63 case FUTEX_OP_SET:
64 __futex_atomic_op1("xchgl %0, %2", ret, oldval, uaddr, oparg);
65 break;
66 case FUTEX_OP_ADD:
67 __futex_atomic_op1(LOCK_PREFIX "xaddl %0, %2", ret, oldval,
68 uaddr, oparg);
69 break;
70 case FUTEX_OP_OR:
71 __futex_atomic_op2("orl %4, %3", ret, oldval, uaddr, oparg);
72 break;
73 case FUTEX_OP_ANDN:
74 __futex_atomic_op2("andl %4, %3", ret, oldval, uaddr, ~oparg);
75 break;
76 case FUTEX_OP_XOR:
77 __futex_atomic_op2("xorl %4, %3", ret, oldval, uaddr, oparg);
78 break;
79 default:
80 ret = -ENOSYS;
81 }
82
83 pagefault_enable();
84
85 if (!ret) {
86 switch (cmp) {
87 case FUTEX_OP_CMP_EQ:
88 ret = (oldval == cmparg);
89 break;
90 case FUTEX_OP_CMP_NE:
91 ret = (oldval != cmparg);
92 break;
93 case FUTEX_OP_CMP_LT:
94 ret = (oldval < cmparg);
95 break;
96 case FUTEX_OP_CMP_GE:
97 ret = (oldval >= cmparg);
98 break;
99 case FUTEX_OP_CMP_LE:
100 ret = (oldval <= cmparg);
101 break;
102 case FUTEX_OP_CMP_GT:
103 ret = (oldval > cmparg);
104 break;
105 default:
106 ret = -ENOSYS;
107 }
108 }
109 return ret;
110}
111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
113 int newval)
114{
115
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */
118 if (boot_cpu_data.x86 == 3)
119 return -ENOSYS;
120#endif
121
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int)))
123 return -EFAULT;
124
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n"
126 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n"
128 "\tjmp 2b\n"
129 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval)
133 : "memory"
134 );
135
136 return oldval;
137}
138
139#endif
140#endif /* _ASM_X86_FUTEX_H */
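futex_atomic_op_inuser() above unpacks its encoded_op argument with shifts: a 4-bit operation, a 4-bit comparison, and two 12-bit signed arguments recovered by shifting left and then right. A userspace sketch of just the decoding, using an arbitrary example value; like the kernel code, the sign extension relies on arithmetic right shifts of signed integers:

#include <stdio.h>

int main(void)
{
	/* op=1, cmp=3, oparg=10, cmparg=-1: arbitrary example bits. */
	int encoded_op = (1 << 28) | (3 << 24) | (0x00a << 12) | 0xfff;

	int op     = (encoded_op >> 28) & 7;	/* operation            */
	int cmp    = (encoded_op >> 24) & 15;	/* comparison           */
	int oparg  = (encoded_op << 8) >> 20;	/* 12-bit signed operand */
	int cmparg = (encoded_op << 20) >> 20;	/* 12-bit signed compare */

	printf("op=%d cmp=%d oparg=%d cmparg=%d\n", op, cmp, oparg, cmparg);
	/* prints: op=1 cmp=3 oparg=10 cmparg=-1 */
	return 0;
}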
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
new file mode 100644
index 000000000000..74252264433d
--- /dev/null
+++ b/arch/x86/include/asm/gart.h
@@ -0,0 +1,73 @@
1#ifndef _ASM_X86_GART_H
2#define _ASM_X86_GART_H
3
4#include <asm/e820.h>
5
6extern void set_up_gart_resume(u32, u32);
7
8extern int fallback_aper_order;
9extern int fallback_aper_force;
10extern int fix_aperture;
11
12/* PTE bits. */
13#define GPTE_VALID 1
14#define GPTE_COHERENT 2
15
16/* Aperture control register bits. */
17#define GARTEN (1<<0)
18#define DISGARTCPU (1<<4)
19#define DISGARTIO (1<<5)
20
21/* GART cache control register bits. */
22#define INVGART (1<<0)
23#define GARTPTEERR (1<<1)
24
25/* K8 On-cpu GART registers */
26#define AMD64_GARTAPERTURECTL 0x90
27#define AMD64_GARTAPERTUREBASE 0x94
28#define AMD64_GARTTABLEBASE 0x98
29#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0)
31
32extern int agp_amd64_init(void);
33
34static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
35{
36 u32 tmp, ctl;
37
38 /* address of the mappings table */
39 addr >>= 12;
40 tmp = (u32) addr<<4;
41 tmp &= ~0xf;
42 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
43
44 /* Enable GART translation for this hammer. */
45 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
46 ctl |= GARTEN;
47 ctl &= ~(DISGARTCPU | DISGARTIO);
48 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
49}
50
51static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
52{
53 if (!aper_base)
54 return 0;
55
56 if (aper_base + aper_size > 0x100000000ULL) {
57 printk(KERN_INFO "Aperture beyond 4GB. Ignoring.\n");
58 return 0;
59 }
60 if (e820_any_mapped(aper_base, aper_base + aper_size, E820_RAM)) {
61 printk(KERN_INFO "Aperture pointing to e820 RAM. Ignoring.\n");
62 return 0;
63 }
64 if (aper_size < min_size) {
65		printk(KERN_INFO "Aperture too small (%d MB), need at least %d MB\n",
66		       aper_size>>20, min_size>>20);
67 return 0;
68 }
69
70 return 1;
71}
72
73#endif /* _ASM_X86_GART_H */
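aperture_valid() above is mostly a chain of range checks; only the e820 overlap test needs kernel state. A standalone sketch of the remaining checks (missing aperture, reaching past 4 GB, smaller than the requested minimum), with invented example values:

#include <stdio.h>
#include <stdint.h>

static int aperture_ok(uint64_t base, uint32_t size, uint32_t min_size)
{
	if (!base)
		return 0;			/* no aperture programmed */
	if (base + size > 0x100000000ULL)
		return 0;			/* reaches beyond 4 GB */
	if (size < min_size)
		return 0;			/* too small to be useful */
	return 1;
}

int main(void)
{
	printf("%d\n", aperture_ok(0x80000000ULL, 64u << 20, 32u << 20));	/* 1 */
	printf("%d\n", aperture_ok(0xfc000000ULL, 128u << 20, 32u << 20));	/* 0: crosses 4 GB */
	return 0;
}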
diff --git a/arch/x86/include/asm/genapic.h b/arch/x86/include/asm/genapic.h
new file mode 100644
index 000000000000..d48bee663a6f
--- /dev/null
+++ b/arch/x86/include/asm/genapic.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "genapic_32.h"
3#else
4# include "genapic_64.h"
5#endif
diff --git a/arch/x86/include/asm/genapic_32.h b/arch/x86/include/asm/genapic_32.h
new file mode 100644
index 000000000000..5cbd4fcc06fd
--- /dev/null
+++ b/arch/x86/include/asm/genapic_32.h
@@ -0,0 +1,126 @@
1#ifndef _ASM_X86_GENAPIC_32_H
2#define _ASM_X86_GENAPIC_32_H
3
4#include <asm/mpspec.h>
5
6/*
7 * Generic APIC driver interface.
8 *
9 * A straightforward mapping of the APIC-related parts of the
10 * x86 subarchitecture interface to a dynamic object.
11 *
12 * This is used by the "generic" x86 subarchitecture.
13 *
14 * Copyright 2003 Andi Kleen, SuSE Labs.
15 */
16
17struct mpc_config_bus;
18struct mp_config_table;
19struct mpc_config_processor;
20
21struct genapic {
22 char *name;
23 int (*probe)(void);
24
25 int (*apic_id_registered)(void);
26 cpumask_t (*target_cpus)(void);
27 int int_delivery_mode;
28 int int_dest_mode;
29 int ESR_DISABLE;
30 int apic_destination_logical;
31 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid);
32 unsigned long (*check_apicid_present)(int apicid);
33 int no_balance_irq;
34 int no_ioapic_check;
35 void (*init_apic_ldr)(void);
36 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map);
37
38 void (*setup_apic_routing)(void);
39 int (*multi_timer_check)(int apic, int irq);
40 int (*apicid_to_node)(int logical_apicid);
41 int (*cpu_to_logical_apicid)(int cpu);
42 int (*cpu_present_to_apicid)(int mps_cpu);
43 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid);
44 void (*setup_portio_remap)(void);
45 int (*check_phys_apicid_present)(int boot_cpu_physical_apicid);
46 void (*enable_apic_mode)(void);
47 u32 (*phys_pkg_id)(u32 cpuid_apic, int index_msb);
48
49 /* mpparse */
50 /* When one of the next two hooks returns 1 the genapic
51 is switched to this. Essentially they are additional probe
52 functions. */
53 int (*mps_oem_check)(struct mp_config_table *mpc, char *oem,
54 char *productid);
55 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
56
57 unsigned (*get_apic_id)(unsigned long x);
58 unsigned long apic_id_mask;
59 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
60 cpumask_t (*vector_allocation_domain)(int cpu);
61
62#ifdef CONFIG_SMP
63 /* ipi */
64 void (*send_IPI_mask)(cpumask_t mask, int vector);
65 void (*send_IPI_allbutself)(int vector);
66 void (*send_IPI_all)(int vector);
67#endif
68};
69
70#define APICFUNC(x) .x = x,
71
72/* More functions could probably be marked IPIFUNC and save some space
73 in UP GENERICARCH kernels, but I don't have the nerve right now
74 to untangle this mess. -AK */
75#ifdef CONFIG_SMP
76#define IPIFUNC(x) APICFUNC(x)
77#else
78#define IPIFUNC(x)
79#endif
80
81#define APIC_INIT(aname, aprobe) \
82{ \
83 .name = aname, \
84 .probe = aprobe, \
85 .int_delivery_mode = INT_DELIVERY_MODE, \
86 .int_dest_mode = INT_DEST_MODE, \
87 .no_balance_irq = NO_BALANCE_IRQ, \
88 .ESR_DISABLE = esr_disable, \
89 .apic_destination_logical = APIC_DEST_LOGICAL, \
90 APICFUNC(apic_id_registered) \
91 APICFUNC(target_cpus) \
92 APICFUNC(check_apicid_used) \
93 APICFUNC(check_apicid_present) \
94 APICFUNC(init_apic_ldr) \
95 APICFUNC(ioapic_phys_id_map) \
96 APICFUNC(setup_apic_routing) \
97 APICFUNC(multi_timer_check) \
98 APICFUNC(apicid_to_node) \
99 APICFUNC(cpu_to_logical_apicid) \
100 APICFUNC(cpu_present_to_apicid) \
101 APICFUNC(apicid_to_cpu_present) \
102 APICFUNC(setup_portio_remap) \
103 APICFUNC(check_phys_apicid_present) \
104 APICFUNC(mps_oem_check) \
105 APICFUNC(get_apic_id) \
106 .apic_id_mask = APIC_ID_MASK, \
107 APICFUNC(cpu_mask_to_apicid) \
108 APICFUNC(vector_allocation_domain) \
109 APICFUNC(acpi_madt_oem_check) \
110 IPIFUNC(send_IPI_mask) \
111 IPIFUNC(send_IPI_allbutself) \
112 IPIFUNC(send_IPI_all) \
113 APICFUNC(enable_apic_mode) \
114 APICFUNC(phys_pkg_id) \
115}
116
117extern struct genapic *genapic;
118
119enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
120#define get_uv_system_type() UV_NONE
121#define is_uv_system() 0
122#define uv_wakeup_secondary(a, b) 1
123#define uv_system_init() do {} while (0)
124
125
126#endif /* _ASM_X86_GENAPIC_32_H */
diff --git a/arch/x86/include/asm/genapic_64.h b/arch/x86/include/asm/genapic_64.h
new file mode 100644
index 000000000000..13c4e96199ea
--- /dev/null
+++ b/arch/x86/include/asm/genapic_64.h
@@ -0,0 +1,58 @@
1#ifndef _ASM_X86_GENAPIC_64_H
2#define _ASM_X86_GENAPIC_64_H
3
4/*
5 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2
7 *
8 * Generic APIC sub-arch data struct.
9 *
10 * Hacked for x86-64 by James Cleverdon from i386 architecture code by
11 * Martin Bligh, Andi Kleen, James Bottomley, John Stultz, and
12 * James Cleverdon.
13 */
14
15struct genapic {
16 char *name;
17 int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
18 u32 int_delivery_mode;
19 u32 int_dest_mode;
20 int (*apic_id_registered)(void);
21 cpumask_t (*target_cpus)(void);
22 cpumask_t (*vector_allocation_domain)(int cpu);
23 void (*init_apic_ldr)(void);
24 /* ipi */
25 void (*send_IPI_mask)(cpumask_t mask, int vector);
26 void (*send_IPI_allbutself)(int vector);
27 void (*send_IPI_all)(int vector);
28 void (*send_IPI_self)(int vector);
29 /* */
30 unsigned int (*cpu_mask_to_apicid)(cpumask_t cpumask);
31 unsigned int (*phys_pkg_id)(int index_msb);
32 unsigned int (*get_apic_id)(unsigned long x);
33 unsigned long (*set_apic_id)(unsigned int id);
34 unsigned long apic_id_mask;
35};
36
37extern struct genapic *genapic;
38
39extern struct genapic apic_flat;
40extern struct genapic apic_physflat;
41extern struct genapic apic_x2apic_cluster;
42extern struct genapic apic_x2apic_phys;
43extern int acpi_madt_oem_check(char *, char *);
44
45extern void apic_send_IPI_self(int vector);
46enum uv_system_type {UV_NONE, UV_LEGACY_APIC, UV_X2APIC, UV_NON_UNIQUE_APIC};
47extern enum uv_system_type get_uv_system_type(void);
48extern int is_uv_system(void);
49
50extern struct genapic apic_x2apic_uv_x;
51DECLARE_PER_CPU(int, x2apic_extra_bits);
52extern void uv_cpu_init(void);
53extern void uv_system_init(void);
54extern int uv_wakeup_secondary(int phys_apicid, unsigned int start_rip);
55
56extern void setup_apic_routing(void);
57
58#endif /* _ASM_X86_GENAPIC_64_H */
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
new file mode 100644
index 000000000000..ad3c2ed75481
--- /dev/null
+++ b/arch/x86/include/asm/geode.h
@@ -0,0 +1,253 @@
1/*
2 * AMD Geode definitions
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of version 2 of the GNU General Public License
7 * as published by the Free Software Foundation.
8 */
9
10#ifndef _ASM_X86_GEODE_H
11#define _ASM_X86_GEODE_H
12
13#include <asm/processor.h>
14#include <linux/io.h>
15
16/* Generic southbridge functions */
17
18#define GEODE_DEV_PMS 0
19#define GEODE_DEV_ACPI 1
20#define GEODE_DEV_GPIO 2
21#define GEODE_DEV_MFGPT 3
22
23extern int geode_get_dev_base(unsigned int dev);
24
25/* Useful macros */
26#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
27#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
28#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
29#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
30
31/* MSRS */
32
33#define MSR_GLIU_P2D_RO0 0x10000029
34
35#define MSR_LX_GLD_MSR_CONFIG 0x48002001
36#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
37 * sheet has the wrong value */
38#define MSR_GLCP_SYS_RSTPLL 0x4C000014
39#define MSR_GLCP_DOTPLL 0x4C000015
40
41#define MSR_LBAR_SMB 0x5140000B
42#define MSR_LBAR_GPIO 0x5140000C
43#define MSR_LBAR_MFGPT 0x5140000D
44#define MSR_LBAR_ACPI 0x5140000E
45#define MSR_LBAR_PMS 0x5140000F
46
47#define MSR_DIVIL_SOFT_RESET 0x51400017
48
49#define MSR_PIC_YSEL_LOW 0x51400020
50#define MSR_PIC_YSEL_HIGH 0x51400021
51#define MSR_PIC_ZSEL_LOW 0x51400022
52#define MSR_PIC_ZSEL_HIGH 0x51400023
53#define MSR_PIC_IRQM_LPC 0x51400025
54
55#define MSR_MFGPT_IRQ 0x51400028
56#define MSR_MFGPT_NR 0x51400029
57#define MSR_MFGPT_SETUP 0x5140002B
58
59#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
60
61#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
62#define MSR_GX_MSR_PADSEL 0xC0002011
63
64/* Resource Sizes */
65
66#define LBAR_GPIO_SIZE 0xFF
67#define LBAR_MFGPT_SIZE 0x40
68#define LBAR_ACPI_SIZE 0x40
69#define LBAR_PMS_SIZE 0x80
70
71/* ACPI registers (PMS block) */
72
73/*
74 * PM1_EN is only valid when VSA is enabled for 16 bit reads.
75 * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
76 * with a 32 bit read at offset 0x0
77 */
78
79#define PM1_STS 0x00
80#define PM1_EN 0x02
81#define PM1_CNT 0x08
82#define PM2_CNT 0x0C
83#define PM_TMR 0x10
84#define PM_GPE0_STS 0x18
85#define PM_GPE0_EN 0x1C
86
87/* PMC registers (PMS block) */
88
89#define PM_SSD 0x00
90#define PM_SCXA 0x04
91#define PM_SCYA 0x08
92#define PM_OUT_SLPCTL 0x0C
93#define PM_SCLK 0x10
94#define PM_SED 0x1
95#define PM_SCXD 0x18
96#define PM_SCYD 0x1C
97#define PM_IN_SLPCTL 0x20
98#define PM_WKD 0x30
99#define PM_WKXD 0x34
100#define PM_RD 0x38
101#define PM_WKXA 0x3C
102#define PM_FSD 0x40
103#define PM_TSD 0x44
104#define PM_PSD 0x48
105#define PM_NWKD 0x4C
106#define PM_AWKD 0x50
107#define PM_SSC 0x54
108
109/* VSA2 magic values */
110
111#define VSA_VRC_INDEX 0xAC1C
112#define VSA_VRC_DATA 0xAC1E
113#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
114#define VSA_VR_SIGNATURE 0x0003
115#define VSA_VR_MEM_SIZE 0x0200
116#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
117#define GSW_VSA_SIG 0x534d /* General Software signature */
118/* GPIO */
119
120#define GPIO_OUTPUT_VAL 0x00
121#define GPIO_OUTPUT_ENABLE 0x04
122#define GPIO_OUTPUT_OPEN_DRAIN 0x08
123#define GPIO_OUTPUT_INVERT 0x0C
124#define GPIO_OUTPUT_AUX1 0x10
125#define GPIO_OUTPUT_AUX2 0x14
126#define GPIO_PULL_UP 0x18
127#define GPIO_PULL_DOWN 0x1C
128#define GPIO_INPUT_ENABLE 0x20
129#define GPIO_INPUT_INVERT 0x24
130#define GPIO_INPUT_FILTER 0x28
131#define GPIO_INPUT_EVENT_COUNT 0x2C
132#define GPIO_READ_BACK 0x30
133#define GPIO_INPUT_AUX1 0x34
134#define GPIO_EVENTS_ENABLE 0x38
135#define GPIO_LOCK_ENABLE 0x3C
136#define GPIO_POSITIVE_EDGE_EN 0x40
137#define GPIO_NEGATIVE_EDGE_EN 0x44
138#define GPIO_POSITIVE_EDGE_STS 0x48
139#define GPIO_NEGATIVE_EDGE_STS 0x4C
140
141#define GPIO_MAP_X 0xE0
142#define GPIO_MAP_Y 0xE4
143#define GPIO_MAP_Z 0xE8
144#define GPIO_MAP_W 0xEC
145
146static inline u32 geode_gpio(unsigned int nr)
147{
148 BUG_ON(nr > 28);
149 return 1 << nr;
150}
151
152extern void geode_gpio_set(u32, unsigned int);
153extern void geode_gpio_clear(u32, unsigned int);
154extern int geode_gpio_isset(u32, unsigned int);
155extern void geode_gpio_setup_event(unsigned int, int, int);
156extern void geode_gpio_set_irq(unsigned int, unsigned int);
157
158static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
159{
160 geode_gpio_setup_event(gpio, pair, 0);
161}
162
163static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
164{
165 geode_gpio_setup_event(gpio, pair, 1);
166}
167
168/* Specific geode tests */
169
170static inline int is_geode_gx(void)
171{
172 return ((boot_cpu_data.x86_vendor == X86_VENDOR_NSC) &&
173 (boot_cpu_data.x86 == 5) &&
174 (boot_cpu_data.x86_model == 5));
175}
176
177static inline int is_geode_lx(void)
178{
179 return ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) &&
180 (boot_cpu_data.x86 == 5) &&
181 (boot_cpu_data.x86_model == 10));
182}
183
184static inline int is_geode(void)
185{
186 return (is_geode_gx() || is_geode_lx());
187}
188
189#ifdef CONFIG_MGEODE_LX
190extern int geode_has_vsa2(void);
191#else
192static inline int geode_has_vsa2(void)
193{
194 return 0;
195}
196#endif
197
198/* MFGPTs */
199
200#define MFGPT_MAX_TIMERS 8
201#define MFGPT_TIMER_ANY (-1)
202
203#define MFGPT_DOMAIN_WORKING 1
204#define MFGPT_DOMAIN_STANDBY 2
205#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
206
207#define MFGPT_CMP1 0
208#define MFGPT_CMP2 1
209
210#define MFGPT_EVENT_IRQ 0
211#define MFGPT_EVENT_NMI 1
212#define MFGPT_EVENT_RESET 3
213
214#define MFGPT_REG_CMP1 0
215#define MFGPT_REG_CMP2 2
216#define MFGPT_REG_COUNTER 4
217#define MFGPT_REG_SETUP 6
218
219#define MFGPT_SETUP_CNTEN (1 << 15)
220#define MFGPT_SETUP_CMP2 (1 << 14)
221#define MFGPT_SETUP_CMP1 (1 << 13)
222#define MFGPT_SETUP_SETUP (1 << 12)
223#define MFGPT_SETUP_STOPEN (1 << 11)
224#define MFGPT_SETUP_EXTEN (1 << 10)
225#define MFGPT_SETUP_REVEN (1 << 5)
226#define MFGPT_SETUP_CLKSEL (1 << 4)
227
228static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
229{
230 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
231 outw(value, base + reg + (timer * 8));
232}
233
234static inline u16 geode_mfgpt_read(int timer, u16 reg)
235{
236 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
237 return inw(base + reg + (timer * 8));
238}
239
240extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
241extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
242extern int geode_mfgpt_alloc_timer(int timer, int domain);
243
244#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
245#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
246
247#ifdef CONFIG_GEODE_MFGPT_TIMER
248extern int __init mfgpt_timer_setup(void);
249#else
250static inline int mfgpt_timer_setup(void) { return 0; }
251#endif
252
253#endif /* _ASM_X86_GEODE_H */
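geode_mfgpt_read() and geode_mfgpt_write() above address each timer through an 8-byte window: the register lives at base + reg + timer * 8. A small sketch of that address computation; the 0x6200 base is a made-up LBAR value, not real hardware state:

#include <stdio.h>

#define MFGPT_REG_CMP1		0
#define MFGPT_REG_CMP2		2
#define MFGPT_REG_COUNTER	4
#define MFGPT_REG_SETUP		6

/* Mirrors the base + reg + timer * 8 addressing used above. */
static unsigned int mfgpt_io_addr(unsigned int base, int timer, unsigned int reg)
{
	return base + reg + timer * 8;
}

int main(void)
{
	unsigned int base = 0x6200;	/* hypothetical MFGPT LBAR */

	printf("timer 0 SETUP   -> %#x\n", mfgpt_io_addr(base, 0, MFGPT_REG_SETUP));
	printf("timer 3 COUNTER -> %#x\n", mfgpt_io_addr(base, 3, MFGPT_REG_COUNTER));
	return 0;
}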
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
new file mode 100644
index 000000000000..49dbfdfa50f9
--- /dev/null
+++ b/arch/x86/include/asm/gpio.h
@@ -0,0 +1,56 @@
1/*
2 * Generic GPIO API implementation for x86.
3 *
4 * Derived from the generic GPIO API for powerpc:
5 *
6 * Copyright (c) 2007-2008 MontaVista Software, Inc.
7 *
8 * Author: Anton Vorontsov <avorontsov@ru.mvista.com>
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; either version 2 of the License, or
13 * (at your option) any later version.
14 */
15
16#ifndef _ASM_X86_GPIO_H
17#define _ASM_X86_GPIO_H
18
19#include <asm-generic/gpio.h>
20
21#ifdef CONFIG_GPIOLIB
22
23/*
24 * Just call gpiolib.
25 */
26static inline int gpio_get_value(unsigned int gpio)
27{
28 return __gpio_get_value(gpio);
29}
30
31static inline void gpio_set_value(unsigned int gpio, int value)
32{
33 __gpio_set_value(gpio, value);
34}
35
36static inline int gpio_cansleep(unsigned int gpio)
37{
38 return __gpio_cansleep(gpio);
39}
40
41/*
42 * Not implemented, yet.
43 */
44static inline int gpio_to_irq(unsigned int gpio)
45{
46 return -ENOSYS;
47}
48
49static inline int irq_to_gpio(unsigned int irq)
50{
51 return -EINVAL;
52}
53
54#endif /* CONFIG_GPIOLIB */
55
56#endif /* _ASM_X86_GPIO_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
new file mode 100644
index 000000000000..000787df66e6
--- /dev/null
+++ b/arch/x86/include/asm/hardirq.h
@@ -0,0 +1,11 @@
1#ifdef CONFIG_X86_32
2# include "hardirq_32.h"
3#else
4# include "hardirq_64.h"
5#endif
6
7extern u64 arch_irq_stat_cpu(unsigned int cpu);
8#define arch_irq_stat_cpu arch_irq_stat_cpu
9
10extern u64 arch_irq_stat(void);
11#define arch_irq_stat arch_irq_stat
diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h
new file mode 100644
index 000000000000..5ca135e72f2b
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_32.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_HARDIRQ_32_H
2#define _ASM_X86_HARDIRQ_32_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6
7typedef struct {
8 unsigned int __softirq_pending;
9 unsigned long idle_timestamp;
10 unsigned int __nmi_count; /* arch dependent */
11 unsigned int apic_timer_irqs; /* arch dependent */
12 unsigned int irq0_irqs;
13 unsigned int irq_resched_count;
14 unsigned int irq_call_count;
15 unsigned int irq_tlb_count;
16 unsigned int irq_thermal_count;
17 unsigned int irq_spurious_count;
18} ____cacheline_aligned irq_cpustat_t;
19
20DECLARE_PER_CPU(irq_cpustat_t, irq_stat);
21
22#define __ARCH_IRQ_STAT
23#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat, cpu).member)
24
25void ack_bad_irq(unsigned int irq);
26#include <linux/irq_cpustat.h>
27
28#endif /* _ASM_X86_HARDIRQ_32_H */
diff --git a/arch/x86/include/asm/hardirq_64.h b/arch/x86/include/asm/hardirq_64.h
new file mode 100644
index 000000000000..1ba381fc51d3
--- /dev/null
+++ b/arch/x86/include/asm/hardirq_64.h
@@ -0,0 +1,23 @@
1#ifndef _ASM_X86_HARDIRQ_64_H
2#define _ASM_X86_HARDIRQ_64_H
3
4#include <linux/threads.h>
5#include <linux/irq.h>
6#include <asm/pda.h>
7#include <asm/apic.h>
8
9/* We can have at most NR_VECTORS irqs routed to a cpu at a time */
10#define MAX_HARDIRQS_PER_CPU NR_VECTORS
11
12#define __ARCH_IRQ_STAT 1
13
14#define local_softirq_pending() read_pda(__softirq_pending)
15
16#define __ARCH_SET_SOFTIRQ_PENDING 1
17
18#define set_softirq_pending(x) write_pda(__softirq_pending, (x))
19#define or_softirq_pending(x) or_pda(__softirq_pending, (x))
20
21extern void ack_bad_irq(unsigned int irq);
22
23#endif /* _ASM_X86_HARDIRQ_64_H */
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
new file mode 100644
index 000000000000..a3b3b7c3027b
--- /dev/null
+++ b/arch/x86/include/asm/highmem.h
@@ -0,0 +1,82 @@
1/*
2 * highmem.h: virtual kernel memory mappings for high memory
3 *
4 * Used in CONFIG_HIGHMEM systems for memory pages which
5 * are not addressable by direct kernel virtual addresses.
6 *
7 * Copyright (C) 1999 Gerhard Wichert, Siemens AG
8 * Gerhard.Wichert@pdb.siemens.de
9 *
10 *
11 * Redesigned the x86 32-bit VM architecture to deal with
12 * up to 16 Terabyte physical memory. With current x86 CPUs
13 * we now support up to 64 Gigabytes physical RAM.
14 *
15 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
16 */
17
18#ifndef _ASM_X86_HIGHMEM_H
19#define _ASM_X86_HIGHMEM_H
20
21#ifdef __KERNEL__
22
23#include <linux/interrupt.h>
24#include <linux/threads.h>
25#include <asm/kmap_types.h>
26#include <asm/tlbflush.h>
27#include <asm/paravirt.h>
28
29/* declarations for highmem.c */
30extern unsigned long highstart_pfn, highend_pfn;
31
32extern pte_t *kmap_pte;
33extern pgprot_t kmap_prot;
34extern pte_t *pkmap_page_table;
35
36/*
37 * Right now we initialize only a single pte table. It can be extended
38 * easily; subsequent pte tables have to be allocated in one physical
39 * chunk of RAM.
40 */
41/*
42 * Ordering is:
43 *
44 * FIXADDR_TOP
45 * fixed_addresses
46 * FIXADDR_START
47 * temp fixed addresses
48 * FIXADDR_BOOT_START
49 * Persistent kmap area
50 * PKMAP_BASE
51 * VMALLOC_END
52 * Vmalloc area
53 * VMALLOC_START
54 * high_memory
55 */
56#define LAST_PKMAP_MASK (LAST_PKMAP-1)
57#define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT)
58#define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT))
59
60extern void *kmap_high(struct page *page);
61extern void kunmap_high(struct page *page);
62
63void *kmap(struct page *page);
64void kunmap(struct page *page);
65void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot);
66void *kmap_atomic(struct page *page, enum km_type type);
67void kunmap_atomic(void *kvaddr, enum km_type type);
68void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
69struct page *kmap_atomic_to_page(void *ptr);
70
71#ifndef CONFIG_PARAVIRT
72#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
73#endif
74
75#define flush_cache_kmaps() do { } while (0)
76
77extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
78 unsigned long end_pfn);
79
80#endif /* __KERNEL__ */
81
82#endif /* _ASM_X86_HIGHMEM_H */
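PKMAP_NR() and PKMAP_ADDR() above are inverse page-granularity conversions inside the persistent-kmap window. A standalone sketch of the round trip with an illustrative PKMAP_BASE, not the kernel's actual layout:

#include <stdio.h>

#define PAGE_SHIFT	12
#define PKMAP_BASE	0xff800000UL	/* illustrative window start */

#define PKMAP_NR(virt)	(((virt) - PKMAP_BASE) >> PAGE_SHIFT)
#define PKMAP_ADDR(nr)	(PKMAP_BASE + ((unsigned long)(nr) << PAGE_SHIFT))

int main(void)
{
	unsigned long nr = 17;
	unsigned long va = PKMAP_ADDR(nr);

	/* Slot 17 is 17 pages above PKMAP_BASE; converting back recovers it. */
	printf("slot %lu -> va %#lx -> slot %lu\n", nr, va, PKMAP_NR(va));
	return 0;
}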
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
new file mode 100644
index 000000000000..1c22cb05ad6a
--- /dev/null
+++ b/arch/x86/include/asm/hpet.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_HPET_H
2#define _ASM_X86_HPET_H
3
4#include <linux/msi.h>
5
6#ifdef CONFIG_HPET_TIMER
7
8#define HPET_MMAP_SIZE 1024
9
10#define HPET_ID 0x000
11#define HPET_PERIOD 0x004
12#define HPET_CFG 0x010
13#define HPET_STATUS 0x020
14#define HPET_COUNTER 0x0f0
15
16#define HPET_Tn_CFG(n) (0x100 + 0x20 * n)
17#define HPET_Tn_CMP(n) (0x108 + 0x20 * n)
18#define HPET_Tn_ROUTE(n) (0x110 + 0x20 * n)
19
20#define HPET_T0_CFG 0x100
21#define HPET_T0_CMP 0x108
22#define HPET_T0_ROUTE 0x110
23#define HPET_T1_CFG 0x120
24#define HPET_T1_CMP 0x128
25#define HPET_T1_ROUTE 0x130
26#define HPET_T2_CFG 0x140
27#define HPET_T2_CMP 0x148
28#define HPET_T2_ROUTE 0x150
29
30#define HPET_ID_REV 0x000000ff
31#define HPET_ID_NUMBER 0x00001f00
32#define HPET_ID_64BIT 0x00002000
33#define HPET_ID_LEGSUP 0x00008000
34#define HPET_ID_VENDOR 0xffff0000
35#define HPET_ID_NUMBER_SHIFT 8
36#define HPET_ID_VENDOR_SHIFT 16
37
38#define HPET_ID_VENDOR_8086 0x8086
39
40#define HPET_CFG_ENABLE 0x001
41#define HPET_CFG_LEGACY 0x002
42#define HPET_LEGACY_8254 2
43#define HPET_LEGACY_RTC 8
44
45#define HPET_TN_LEVEL 0x0002
46#define HPET_TN_ENABLE 0x0004
47#define HPET_TN_PERIODIC 0x0008
48#define HPET_TN_PERIODIC_CAP 0x0010
49#define HPET_TN_64BIT_CAP 0x0020
50#define HPET_TN_SETVAL 0x0040
51#define HPET_TN_32BIT 0x0100
52#define HPET_TN_ROUTE 0x3e00
53#define HPET_TN_FSB 0x4000
54#define HPET_TN_FSB_CAP 0x8000
55#define HPET_TN_ROUTE_SHIFT 9
56
57/* Max HPET Period is 10^8 femto sec as in HPET spec */
58#define HPET_MAX_PERIOD 100000000UL
59/*
60 * Min HPET period is 10^5 femto sec just for safety. If it is less than this,
61 * then the 32 bit HPET counter wraps around in less than 0.5 sec.
62 */
63#define HPET_MIN_PERIOD 100000UL
64
65/* hpet memory map physical address */
66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address;
68extern int hpet_force_user;
69extern int is_hpet_enabled(void);
70extern int hpet_enable(void);
71extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a);
73extern void force_hpet_resume(void);
74
75extern void hpet_msi_unmask(unsigned int irq);
76extern void hpet_msi_mask(unsigned int irq);
77extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79
80#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq);
82#else
83static inline int arch_setup_hpet_msi(unsigned int irq)
84{
85 return -EINVAL;
86}
87#endif
88
89#ifdef CONFIG_HPET_EMULATE_RTC
90
91#include <linux/interrupt.h>
92
93typedef irqreturn_t (*rtc_irq_handler)(int interrupt, void *cookie);
94extern int hpet_mask_rtc_irq_bit(unsigned long bit_mask);
95extern int hpet_set_rtc_irq_bit(unsigned long bit_mask);
96extern int hpet_set_alarm_time(unsigned char hrs, unsigned char min,
97 unsigned char sec);
98extern int hpet_set_periodic_freq(unsigned long freq);
99extern int hpet_rtc_dropped_irq(void);
100extern int hpet_rtc_timer_init(void);
101extern irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id);
102extern int hpet_register_irq_handler(rtc_irq_handler handler);
103extern void hpet_unregister_irq_handler(rtc_irq_handler handler);
104
105#endif /* CONFIG_HPET_EMULATE_RTC */
106
107#else /* CONFIG_HPET_TIMER */
108
109static inline int hpet_enable(void) { return 0; }
110static inline int is_hpet_enabled(void) { return 0; }
111#define hpet_readl(a) 0
112
113#endif
114#endif /* _ASM_X86_HPET_H */
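Two calculations implied by the header above: each HPET timer owns a 0x20-byte register block starting at offset 0x100 (so HPET_Tn_CFG(2) lands on HPET_T2_CFG), and HPET_PERIOD reports femtoseconds per tick, which converts to a frequency by dividing into 10^15. A sketch with a typical but example period of 69841279 fs, roughly 14.3 MHz:

#include <stdio.h>

#define HPET_Tn_CFG(n)	(0x100 + 0x20 * (n))
#define HPET_Tn_CMP(n)	(0x108 + 0x20 * (n))

int main(void)
{
	/* Timer 2 registers land on the fixed HPET_T2_* offsets above. */
	printf("timer 2 CFG %#x CMP %#x\n", HPET_Tn_CFG(2), HPET_Tn_CMP(2));

	/* HPET_PERIOD is femtoseconds per tick; 69841279 fs is an example. */
	unsigned long long period_fs = 69841279ULL;
	unsigned long long freq_hz = 1000000000000000ULL / period_fs;
	printf("~%llu Hz\n", freq_hz);		/* about 14.3 MHz */
	return 0;
}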
diff --git a/arch/x86/include/asm/hugetlb.h b/arch/x86/include/asm/hugetlb.h
new file mode 100644
index 000000000000..439a9acc132d
--- /dev/null
+++ b/arch/x86/include/asm/hugetlb.h
@@ -0,0 +1,93 @@
1#ifndef _ASM_X86_HUGETLB_H
2#define _ASM_X86_HUGETLB_H
3
4#include <asm/page.h>
5
6
7static inline int is_hugepage_only_range(struct mm_struct *mm,
8 unsigned long addr,
9 unsigned long len) {
10 return 0;
11}
12
13/*
14 * If the arch doesn't supply something else, assume that hugepage
15 * size aligned regions are ok without further preparation.
16 */
17static inline int prepare_hugepage_range(struct file *file,
18 unsigned long addr, unsigned long len)
19{
20 struct hstate *h = hstate_file(file);
21 if (len & ~huge_page_mask(h))
22 return -EINVAL;
23 if (addr & ~huge_page_mask(h))
24 return -EINVAL;
25 return 0;
26}
27
28static inline void hugetlb_prefault_arch_hook(struct mm_struct *mm) {
29}
30
31static inline void hugetlb_free_pgd_range(struct mmu_gather *tlb,
32 unsigned long addr, unsigned long end,
33 unsigned long floor,
34 unsigned long ceiling)
35{
36 free_pgd_range(tlb, addr, end, floor, ceiling);
37}
38
39static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
40 pte_t *ptep, pte_t pte)
41{
42 set_pte_at(mm, addr, ptep, pte);
43}
44
45static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
46 unsigned long addr, pte_t *ptep)
47{
48 return ptep_get_and_clear(mm, addr, ptep);
49}
50
51static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
52 unsigned long addr, pte_t *ptep)
53{
54}
55
56static inline int huge_pte_none(pte_t pte)
57{
58 return pte_none(pte);
59}
60
61static inline pte_t huge_pte_wrprotect(pte_t pte)
62{
63 return pte_wrprotect(pte);
64}
65
66static inline void huge_ptep_set_wrprotect(struct mm_struct *mm,
67 unsigned long addr, pte_t *ptep)
68{
69 ptep_set_wrprotect(mm, addr, ptep);
70}
71
72static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
73 unsigned long addr, pte_t *ptep,
74 pte_t pte, int dirty)
75{
76 return ptep_set_access_flags(vma, addr, ptep, pte, dirty);
77}
78
79static inline pte_t huge_ptep_get(pte_t *ptep)
80{
81 return *ptep;
82}
83
84static inline int arch_prepare_hugepage(struct page *page)
85{
86 return 0;
87}
88
89static inline void arch_release_hugepage(struct page *page)
90{
91}
92
93#endif /* _ASM_X86_HUGETLB_H */
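prepare_hugepage_range() above boils down to two mask tests: both the start address and the length must be multiples of the huge page size. A userspace sketch of the same test with an example 2 MiB huge page size:

#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)		/* example: 2 MiB huge pages */
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

static int range_ok(unsigned long addr, unsigned long len)
{
	if (len & ~HPAGE_MASK)
		return 0;	/* length not a multiple of the huge page size */
	if (addr & ~HPAGE_MASK)
		return 0;	/* start address not huge-page aligned */
	return 1;
}

int main(void)
{
	printf("%d\n", range_ok(0x40000000UL, 4UL << 20));		/* 1 */
	printf("%d\n", range_ok(0x40000000UL + 4096, 2UL << 20));	/* 0 */
	return 0;
}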
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
new file mode 100644
index 000000000000..b97aecb0b61d
--- /dev/null
+++ b/arch/x86/include/asm/hw_irq.h
@@ -0,0 +1,131 @@
1#ifndef _ASM_X86_HW_IRQ_H
2#define _ASM_X86_HW_IRQ_H
3
4/*
5 * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
6 *
7 * moved some of the old arch/i386/kernel/irq.h to here. VY
8 *
9 * IRQ/IPI changes taken from work by Thomas Radke
10 * <tomsoft@informatik.tu-chemnitz.de>
11 *
12 * hacked by Andi Kleen for x86-64.
13 * unified by tglx
14 */
15
16#include <asm/irq_vectors.h>
17
18#ifndef __ASSEMBLY__
19
20#include <linux/percpu.h>
21#include <linux/profile.h>
22#include <linux/smp.h>
23
24#include <asm/atomic.h>
25#include <asm/irq.h>
26#include <asm/sections.h>
27
28#define platform_legacy_irq(irq) ((irq) < 16)
29
30/* Interrupt handlers registered during init_IRQ */
31extern void apic_timer_interrupt(void);
32extern void error_interrupt(void);
33extern void spurious_interrupt(void);
34extern void thermal_interrupt(void);
35extern void reschedule_interrupt(void);
36
37extern void invalidate_interrupt(void);
38extern void invalidate_interrupt0(void);
39extern void invalidate_interrupt1(void);
40extern void invalidate_interrupt2(void);
41extern void invalidate_interrupt3(void);
42extern void invalidate_interrupt4(void);
43extern void invalidate_interrupt5(void);
44extern void invalidate_interrupt6(void);
45extern void invalidate_interrupt7(void);
46
47extern void irq_move_cleanup_interrupt(void);
48extern void threshold_interrupt(void);
49
50extern void call_function_interrupt(void);
51extern void call_function_single_interrupt(void);
52
53/* PIC specific functions */
54extern void disable_8259A_irq(unsigned int irq);
55extern void enable_8259A_irq(unsigned int irq);
56extern int i8259A_irq_pending(unsigned int irq);
57extern void make_8259A_irq(unsigned int irq);
58extern void init_8259A(int aeoi);
59
60/* IOAPIC */
61#define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs))
62extern unsigned long io_apic_irqs;
63
64extern void init_VISWS_APIC_irqs(void);
65extern void setup_IO_APIC(void);
66extern void disable_IO_APIC(void);
67extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
68extern void setup_ioapic_dest(void);
69
70#ifdef CONFIG_X86_64
71extern void enable_IO_APIC(void);
72#endif
73
74/* IPI functions */
75#ifdef CONFIG_X86_32
76extern void send_IPI_self(int vector);
77#endif
78extern void send_IPI(int dest, int vector);
79
80/* Statistics */
81extern atomic_t irq_err_count;
82extern atomic_t irq_mis_count;
83
84/* EISA */
85extern void eisa_set_level_irq(unsigned int irq);
86
87/* Voyager functions */
88extern asmlinkage void vic_cpi_interrupt(void);
89extern asmlinkage void vic_sys_interrupt(void);
90extern asmlinkage void vic_cmn_interrupt(void);
91extern asmlinkage void qic_timer_interrupt(void);
92extern asmlinkage void qic_invalidate_interrupt(void);
93extern asmlinkage void qic_reschedule_interrupt(void);
94extern asmlinkage void qic_enable_irq_interrupt(void);
95extern asmlinkage void qic_call_function_interrupt(void);
96
97/* SMP */
98extern void smp_apic_timer_interrupt(struct pt_regs *);
99extern void smp_spurious_interrupt(struct pt_regs *);
100extern void smp_error_interrupt(struct pt_regs *);
101#ifdef CONFIG_X86_SMP
102extern void smp_reschedule_interrupt(struct pt_regs *);
103extern void smp_call_function_interrupt(struct pt_regs *);
104extern void smp_call_function_single_interrupt(struct pt_regs *);
105#ifdef CONFIG_X86_32
106extern void smp_invalidate_interrupt(struct pt_regs *);
107#else
108extern asmlinkage void smp_invalidate_interrupt(struct pt_regs *);
109#endif
110#endif
111
112#ifdef CONFIG_X86_32
113extern void (*const interrupt[NR_VECTORS])(void);
114#endif
115
116typedef int vector_irq_t[NR_VECTORS];
117DECLARE_PER_CPU(vector_irq_t, vector_irq);
118
119#ifdef CONFIG_X86_IO_APIC
120extern void lock_vector_lock(void);
121extern void unlock_vector_lock(void);
122extern void __setup_vector_irq(int cpu);
123#else
124static inline void lock_vector_lock(void) {}
125static inline void unlock_vector_lock(void) {}
126static inline void __setup_vector_irq(int cpu) {}
127#endif
128
129#endif /* !__ASSEMBLY__ */
130
131#endif /* _ASM_X86_HW_IRQ_H */
diff --git a/arch/x86/include/asm/hypertransport.h b/arch/x86/include/asm/hypertransport.h
new file mode 100644
index 000000000000..334b1a885d9c
--- /dev/null
+++ b/arch/x86/include/asm/hypertransport.h
@@ -0,0 +1,45 @@
1#ifndef _ASM_X86_HYPERTRANSPORT_H
2#define _ASM_X86_HYPERTRANSPORT_H
3
4/*
5 * Constants for x86 Hypertransport Interrupts.
6 */
7
8#define HT_IRQ_LOW_BASE 0xf8000000
9
10#define HT_IRQ_LOW_VECTOR_SHIFT 16
11#define HT_IRQ_LOW_VECTOR_MASK 0x00ff0000
12#define HT_IRQ_LOW_VECTOR(v) \
13 (((v) << HT_IRQ_LOW_VECTOR_SHIFT) & HT_IRQ_LOW_VECTOR_MASK)
14
15#define HT_IRQ_LOW_DEST_ID_SHIFT 8
16#define HT_IRQ_LOW_DEST_ID_MASK 0x0000ff00
17#define HT_IRQ_LOW_DEST_ID(v) \
18 (((v) << HT_IRQ_LOW_DEST_ID_SHIFT) & HT_IRQ_LOW_DEST_ID_MASK)
19
20#define HT_IRQ_LOW_DM_PHYSICAL 0x0000000
21#define HT_IRQ_LOW_DM_LOGICAL 0x0000040
22
23#define HT_IRQ_LOW_RQEOI_EDGE 0x0000000
24#define HT_IRQ_LOW_RQEOI_LEVEL 0x0000020
25
26
27#define HT_IRQ_LOW_MT_FIXED 0x0000000
28#define HT_IRQ_LOW_MT_ARBITRATED 0x0000004
29#define HT_IRQ_LOW_MT_SMI 0x0000008
30#define HT_IRQ_LOW_MT_NMI 0x000000c
31#define HT_IRQ_LOW_MT_INIT 0x0000010
32#define HT_IRQ_LOW_MT_STARTUP 0x0000014
33#define HT_IRQ_LOW_MT_EXTINT 0x0000018
34#define HT_IRQ_LOW_MT_LINT1 0x000008c
35#define HT_IRQ_LOW_MT_LINT0 0x0000098
36
37#define HT_IRQ_LOW_IRQ_MASKED 0x0000001
38
39
40#define HT_IRQ_HIGH_DEST_ID_SHIFT 0
41#define HT_IRQ_HIGH_DEST_ID_MASK 0x00ffffff
42#define HT_IRQ_HIGH_DEST_ID(v) \
43 ((((v) >> 8) << HT_IRQ_HIGH_DEST_ID_SHIFT) & HT_IRQ_HIGH_DEST_ID_MASK)
44
45#endif /* _ASM_X86_HYPERTRANSPORT_H */
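
As a quick illustration of how these masks compose, the sketch below (a hypothetical helper, not part of the header) builds the low dword of a HyperTransport interrupt message for a given vector and 8-bit destination ID, using physical destination mode, edge trigger and fixed delivery:

    #include <linux/types.h>
    #include <asm/hypertransport.h>

    /* Hypothetical helper: compose the low 32 bits of an HT interrupt message. */
    static inline u32 ht_irq_low_compose(u8 vector, u8 dest_id)
    {
            return HT_IRQ_LOW_BASE                 /* fixed base bits        */
                 | HT_IRQ_LOW_VECTOR(vector)       /* vector in bits 23:16   */
                 | HT_IRQ_LOW_DEST_ID(dest_id)     /* dest ID in bits 15:8   */
                 | HT_IRQ_LOW_DM_PHYSICAL          /* physical dest mode     */
                 | HT_IRQ_LOW_RQEOI_EDGE           /* edge triggered         */
                 | HT_IRQ_LOW_MT_FIXED;            /* fixed message type     */
    }
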
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
new file mode 100644
index 000000000000..48f0004db8c9
--- /dev/null
+++ b/arch/x86/include/asm/i387.h
@@ -0,0 +1,400 @@
1/*
2 * Copyright (C) 1994 Linus Torvalds
3 *
4 * Pentium III FXSR, SSE support
5 * General FPU state handling cleanups
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 * x86-64 work by Andi Kleen 2002
8 */
9
10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H
12
13#include <linux/sched.h>
14#include <linux/kernel_stat.h>
15#include <linux/regset.h>
16#include <linux/hardirq.h>
17#include <asm/asm.h>
18#include <asm/processor.h>
19#include <asm/sigcontext.h>
20#include <asm/user.h>
21#include <asm/uaccess.h>
22#include <asm/xsave.h>
23
24extern unsigned int sig_xstate_size;
25extern void fpu_init(void);
26extern void mxcsr_feature_mask_init(void);
27extern int init_fpu(struct task_struct *child);
28extern asmlinkage void math_state_restore(void);
29extern void init_thread_xstate(void);
30extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
31
32extern user_regset_active_fn fpregs_active, xfpregs_active;
33extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get;
34extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set;
35
36extern struct _fpx_sw_bytes fx_sw_reserved;
37#ifdef CONFIG_IA32_EMULATION
38extern unsigned int sig_xstate_ia32_size;
39extern struct _fpx_sw_bytes fx_sw_reserved_ia32;
40struct _fpstate_ia32;
41struct _xstate_ia32;
42extern int save_i387_xstate_ia32(void __user *buf);
43extern int restore_i387_xstate_ia32(void __user *buf);
44#endif
45
46#define X87_FSW_ES (1 << 7) /* Exception Summary */
47
48#ifdef CONFIG_X86_64
49
50/* Ignore delayed exceptions from user space */
51static inline void tolerant_fwait(void)
52{
53 asm volatile("1: fwait\n"
54 "2:\n"
55 _ASM_EXTABLE(1b, 2b));
56}
57
58static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
59{
60 int err;
61
62 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
63 "2:\n"
64 ".section .fixup,\"ax\"\n"
65 "3: movl $-1,%[err]\n"
66 " jmp 2b\n"
67 ".previous\n"
68 _ASM_EXTABLE(1b, 3b)
69 : [err] "=r" (err)
70#if 0 /* See comment in __save_init_fpu() below. */
71 : [fx] "r" (fx), "m" (*fx), "0" (0));
72#else
73 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
74#endif
75 return err;
76}
77
78static inline int restore_fpu_checking(struct task_struct *tsk)
79{
80 if (task_thread_info(tsk)->status & TS_XSAVE)
81 return xrstor_checking(&tsk->thread.xstate->xsave);
82 else
83 return fxrstor_checking(&tsk->thread.xstate->fxsave);
84}
85
86/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
87 is pending. Clear the x87 state here by setting it to fixed
88 values. The kernel data segment can sometimes be 0 and sometimes the
89 new user value. Both should be ok.
90 Use the PDA as safe address because it should be already in L1. */
91static inline void clear_fpu_state(struct task_struct *tsk)
92{
93 struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
94 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
95
96 /*
97 * xsave header may indicate the init state of the FP.
98 */
99 if ((task_thread_info(tsk)->status & TS_XSAVE) &&
100 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
101 return;
102
103 if (unlikely(fx->swd & X87_FSW_ES))
104 asm volatile("fnclex");
105 alternative_input(ASM_NOP8 ASM_NOP2,
106 " emms\n" /* clear stack tags */
107 " fildl %%gs:0", /* load to clear state */
108 X86_FEATURE_FXSAVE_LEAK);
109}
110
111static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
112{
113 int err;
114
115 asm volatile("1: rex64/fxsave (%[fx])\n\t"
116 "2:\n"
117 ".section .fixup,\"ax\"\n"
118 "3: movl $-1,%[err]\n"
119 " jmp 2b\n"
120 ".previous\n"
121 _ASM_EXTABLE(1b, 3b)
122 : [err] "=r" (err), "=m" (*fx)
123#if 0 /* See comment in __fxsave_clear() below. */
124 : [fx] "r" (fx), "0" (0));
125#else
126 : [fx] "cdaSDb" (fx), "0" (0));
127#endif
128 if (unlikely(err) &&
129 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
130 err = -EFAULT;
131 /* No need to clear here because the caller clears USED_MATH */
132 return err;
133}
134
135static inline void fxsave(struct task_struct *tsk)
136{
137 /* Using "rex64; fxsave %0" is broken because, if the memory operand
138 uses any extended registers for addressing, a second REX prefix
139 will be generated (to the assembler, rex64 followed by semicolon
140 is a separate instruction), and hence the 64-bitness is lost. */
141#if 0
142 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
143 starting with gas 2.16. */
144 __asm__ __volatile__("fxsaveq %0"
145 : "=m" (tsk->thread.xstate->fxsave));
146#elif 0
147 /* Using, as a workaround, the properly prefixed form below isn't
148 accepted by any binutils version so far released, complaining that
149 the same type of prefix is used twice if an extended register is
150 needed for addressing (fix submitted to mainline 2005-11-21). */
151 __asm__ __volatile__("rex64/fxsave %0"
152 : "=m" (tsk->thread.xstate->fxsave));
153#else
154 /* This, however, we can work around by forcing the compiler to select
155 an addressing mode that doesn't require extended registers. */
156 __asm__ __volatile__("rex64/fxsave (%1)"
157 : "=m" (tsk->thread.xstate->fxsave)
158 : "cdaSDb" (&tsk->thread.xstate->fxsave));
159#endif
160}
161
162static inline void __save_init_fpu(struct task_struct *tsk)
163{
164 if (task_thread_info(tsk)->status & TS_XSAVE)
165 xsave(tsk);
166 else
167 fxsave(tsk);
168
169 clear_fpu_state(tsk);
170 task_thread_info(tsk)->status &= ~TS_USEDFPU;
171}
172
173#else /* CONFIG_X86_32 */
174
175extern void finit(void);
176
177static inline void tolerant_fwait(void)
178{
179 asm volatile("fnclex ; fwait");
180}
181
182static inline void restore_fpu(struct task_struct *tsk)
183{
184 if (task_thread_info(tsk)->status & TS_XSAVE) {
185 xrstor_checking(&tsk->thread.xstate->xsave);
186 return;
187 }
188 /*
189 * The "nop" is needed to make the instructions the same
190 * length.
191 */
192 alternative_input(
193 "nop ; frstor %1",
194 "fxrstor %1",
195 X86_FEATURE_FXSR,
196 "m" (tsk->thread.xstate->fxsave));
197}
198
199/* We need a safe address that is cheap to find and that is already
200 in L1 during context switch. The best choices are unfortunately
201 different for UP and SMP */
202#ifdef CONFIG_SMP
203#define safe_address (__per_cpu_offset[0])
204#else
205#define safe_address (kstat_cpu(0).cpustat.user)
206#endif
207
208/*
209 * These must be called with preempt disabled
210 */
211static inline void __save_init_fpu(struct task_struct *tsk)
212{
213 if (task_thread_info(tsk)->status & TS_XSAVE) {
214 struct xsave_struct *xstate = &tsk->thread.xstate->xsave;
215 struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
216
217 xsave(tsk);
218
219 /*
220 * xsave header may indicate the init state of the FP.
221 */
222 if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
223 goto end;
224
225 if (unlikely(fx->swd & X87_FSW_ES))
226 asm volatile("fnclex");
227
228 /*
229 * we can do a simple return here or be paranoid :)
230 */
231 goto clear_state;
232 }
233
234 /* Use more nops than strictly needed in case the compiler
235 varies code */
236 alternative_input(
237 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
238 "fxsave %[fx]\n"
239 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
240 X86_FEATURE_FXSR,
241 [fx] "m" (tsk->thread.xstate->fxsave),
242 [fsw] "m" (tsk->thread.xstate->fxsave.swd) : "memory");
243clear_state:
244 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
245 is pending. Clear the x87 state here by setting it to fixed
246 values. safe_address is a random variable that should be in L1 */
247 alternative_input(
248 GENERIC_NOP8 GENERIC_NOP2,
249 "emms\n\t" /* clear stack tags */
250 "fildl %[addr]", /* set F?P to defined value */
251 X86_FEATURE_FXSAVE_LEAK,
252 [addr] "m" (safe_address));
253end:
254 task_thread_info(tsk)->status &= ~TS_USEDFPU;
255}
256
257#endif /* CONFIG_X86_64 */
258
259/*
260 * Signal frame handlers...
261 */
262extern int save_i387_xstate(void __user *buf);
263extern int restore_i387_xstate(void __user *buf);
264
265static inline void __unlazy_fpu(struct task_struct *tsk)
266{
267 if (task_thread_info(tsk)->status & TS_USEDFPU) {
268 __save_init_fpu(tsk);
269 stts();
270 } else
271 tsk->fpu_counter = 0;
272}
273
274static inline void __clear_fpu(struct task_struct *tsk)
275{
276 if (task_thread_info(tsk)->status & TS_USEDFPU) {
277 tolerant_fwait();
278 task_thread_info(tsk)->status &= ~TS_USEDFPU;
279 stts();
280 }
281}
282
283static inline void kernel_fpu_begin(void)
284{
285 struct thread_info *me = current_thread_info();
286 preempt_disable();
287 if (me->status & TS_USEDFPU)
288 __save_init_fpu(me->task);
289 else
290 clts();
291}
292
293static inline void kernel_fpu_end(void)
294{
295 stts();
296 preempt_enable();
297}
298
299/*
300 * Some instructions like VIA's padlock instructions generate a spurious
301 * DNA fault but don't modify SSE registers. And these instructions
302 * get used from interrupt context as well. To prevent these kernel instructions
303 * in interrupt context from interacting wrongly with other user/kernel FPU usage,
304 * they should be used only in the context of irq_ts_save()/irq_ts_restore().
305 */
306static inline int irq_ts_save(void)
307{
308 /*
309 * If we are in process context, we are ok to take a spurious DNA fault:
310 * avoiding it with clts() in process context would require preemption to
311 * be disabled or some heavy lifting like kernel_fpu_begin().
312 */
313 if (!in_interrupt())
314 return 0;
315
316 if (read_cr0() & X86_CR0_TS) {
317 clts();
318 return 1;
319 }
320
321 return 0;
322}
323
324static inline void irq_ts_restore(int TS_state)
325{
326 if (TS_state)
327 stts();
328}
329
330#ifdef CONFIG_X86_64
331
332static inline void save_init_fpu(struct task_struct *tsk)
333{
334 __save_init_fpu(tsk);
335 stts();
336}
337
338#define unlazy_fpu __unlazy_fpu
339#define clear_fpu __clear_fpu
340
341#else /* CONFIG_X86_32 */
342
343/*
344 * These disable preemption on their own and are safe
345 */
346static inline void save_init_fpu(struct task_struct *tsk)
347{
348 preempt_disable();
349 __save_init_fpu(tsk);
350 stts();
351 preempt_enable();
352}
353
354static inline void unlazy_fpu(struct task_struct *tsk)
355{
356 preempt_disable();
357 __unlazy_fpu(tsk);
358 preempt_enable();
359}
360
361static inline void clear_fpu(struct task_struct *tsk)
362{
363 preempt_disable();
364 __clear_fpu(tsk);
365 preempt_enable();
366}
367
368#endif /* CONFIG_X86_64 */
369
370/*
371 * i387 state interaction
372 */
373static inline unsigned short get_fpu_cwd(struct task_struct *tsk)
374{
375 if (cpu_has_fxsr) {
376 return tsk->thread.xstate->fxsave.cwd;
377 } else {
378 return (unsigned short)tsk->thread.xstate->fsave.cwd;
379 }
380}
381
382static inline unsigned short get_fpu_swd(struct task_struct *tsk)
383{
384 if (cpu_has_fxsr) {
385 return tsk->thread.xstate->fxsave.swd;
386 } else {
387 return (unsigned short)tsk->thread.xstate->fsave.swd;
388 }
389}
390
391static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
392{
393 if (cpu_has_xmm) {
394 return tsk->thread.xstate->fxsave.mxcsr;
395 } else {
396 return MXCSR_DEFAULT;
397 }
398}
399
400#endif /* _ASM_X86_I387_H */
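
For orientation, here is a minimal, hypothetical usage sketch of the kernel_fpu_begin()/kernel_fpu_end() pair declared above: any kernel code that touches x87/SSE registers must bracket that use so the current task's lazy FPU state is saved first and CR0.TS is set again afterwards.

    #include <asm/i387.h>

    /* Hypothetical sketch: use FPU/SSE registers safely from kernel context. */
    static void example_use_fpu(void)
    {
            kernel_fpu_begin();     /* save task FPU state if live, clear CR0.TS */

            /* ... code that clobbers x87/SSE registers goes here ... */

            kernel_fpu_end();       /* set CR0.TS again and re-enable preemption */
    }
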
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
new file mode 100644
index 000000000000..1edbf89680fd
--- /dev/null
+++ b/arch/x86/include/asm/i8253.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_I8253_H
2#define _ASM_X86_I8253_H
3
4/* i8253A PIT registers */
5#define PIT_MODE 0x43
6#define PIT_CH0 0x40
7#define PIT_CH2 0x42
8
9extern spinlock_t i8253_lock;
10
11extern struct clock_event_device *global_clock_event;
12
13extern void setup_pit_timer(void);
14
15#define inb_pit inb_p
16#define outb_pit outb_p
17
18#endif /* _ASM_X86_I8253_H */
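
The register defines above are enough to (re)program the PIT by hand. A hedged sketch, in the spirit of setup_pit_timer()-style code: load channel 0 as a mode-2 rate generator under i8253_lock (the control word 0x34 selects channel 0, lobyte/hibyte access, mode 2).

    #include <linux/spinlock.h>
    #include <asm/i8253.h>

    /* Hypothetical sketch: program PIT channel 0 as a periodic rate generator. */
    static void example_pit_set_periodic(unsigned int divisor)
    {
            unsigned long flags;

            spin_lock_irqsave(&i8253_lock, flags);
            outb_pit(0x34, PIT_MODE);               /* ch0, lobyte/hibyte, mode 2 */
            outb_pit(divisor & 0xff, PIT_CH0);      /* reload value, low byte     */
            outb_pit(divisor >> 8, PIT_CH0);        /* reload value, high byte    */
            spin_unlock_irqrestore(&i8253_lock, flags);
    }
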
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
new file mode 100644
index 000000000000..58d7091eeb1f
--- /dev/null
+++ b/arch/x86/include/asm/i8259.h
@@ -0,0 +1,63 @@
1#ifndef _ASM_X86_I8259_H
2#define _ASM_X86_I8259_H
3
4#include <linux/delay.h>
5
6extern unsigned int cached_irq_mask;
7
8#define __byte(x, y) (((unsigned char *)&(y))[x])
9#define cached_master_mask (__byte(0, cached_irq_mask))
10#define cached_slave_mask (__byte(1, cached_irq_mask))
11
12/* i8259A PIC registers */
13#define PIC_MASTER_CMD 0x20
14#define PIC_MASTER_IMR 0x21
15#define PIC_MASTER_ISR PIC_MASTER_CMD
16#define PIC_MASTER_POLL PIC_MASTER_ISR
17#define PIC_MASTER_OCW3 PIC_MASTER_ISR
18#define PIC_SLAVE_CMD 0xa0
19#define PIC_SLAVE_IMR 0xa1
20
21/* i8259A PIC related value */
22#define PIC_CASCADE_IR 2
23#define MASTER_ICW4_DEFAULT 0x01
24#define SLAVE_ICW4_DEFAULT 0x01
25#define PIC_ICW4_AEOI 2
26
27extern spinlock_t i8259A_lock;
28
29extern void init_8259A(int auto_eoi);
30extern void enable_8259A_irq(unsigned int irq);
31extern void disable_8259A_irq(unsigned int irq);
32extern unsigned int startup_8259A_irq(unsigned int irq);
33
34/* the PIC may need a careful delay on some platforms, hence specific calls */
35static inline unsigned char inb_pic(unsigned int port)
36{
37 unsigned char value = inb(port);
38
39 /*
40 * delay for some accesses to PIC on motherboard or in chipset
41 * must be at least one microsecond, so be safe here:
42 */
43 udelay(2);
44
45 return value;
46}
47
48static inline void outb_pic(unsigned char value, unsigned int port)
49{
50 outb(value, port);
51 /*
52 * delay for some accesses to PIC on motherboard or in chipset
53 * must be at least one microsecond, so be safe here:
54 */
55 udelay(2);
56}
57
58extern struct irq_chip i8259A_chip;
59
60extern void mask_8259A(void);
61extern void unmask_8259A(void);
62
63#endif /* _ASM_X86_I8259_H */
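
To show how the cached mask and the IMR ports fit together, here is a hedged sketch in the spirit of disable_8259A_irq(): masking one legacy IRQ line means setting its bit in cached_irq_mask and writing the relevant half back to the master or slave IMR, all under i8259A_lock (the function name is made up for illustration).

    #include <linux/spinlock.h>
    #include <asm/i8259.h>

    /* Hypothetical sketch: mask one legacy IRQ line on the 8259A pair. */
    static void example_mask_8259A_irq(unsigned int irq)
    {
            unsigned int mask = 1 << irq;
            unsigned long flags;

            spin_lock_irqsave(&i8259A_lock, flags);
            cached_irq_mask |= mask;
            if (irq & 8)                            /* IRQs 8-15 live on the slave */
                    outb_pic(cached_slave_mask, PIC_SLAVE_IMR);
            else                                    /* IRQs 0-7 on the master      */
                    outb_pic(cached_master_mask, PIC_MASTER_IMR);
            spin_unlock_irqrestore(&i8259A_lock, flags);
    }
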
diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
new file mode 100644
index 000000000000..97989c0e534c
--- /dev/null
+++ b/arch/x86/include/asm/ia32.h
@@ -0,0 +1,170 @@
1#ifndef _ASM_X86_IA32_H
2#define _ASM_X86_IA32_H
3
4
5#ifdef CONFIG_IA32_EMULATION
6
7#include <linux/compat.h>
8
9/*
10 * 32 bit structures for IA32 support.
11 */
12
13#include <asm/sigcontext32.h>
14
15/* signal.h */
16struct sigaction32 {
17 unsigned int sa_handler; /* Really a pointer, but need to deal
18 with 32 bits */
19 unsigned int sa_flags;
20 unsigned int sa_restorer; /* Another 32 bit pointer */
21 compat_sigset_t sa_mask; /* A 32 bit mask */
22};
23
24struct old_sigaction32 {
25 unsigned int sa_handler; /* Really a pointer, but need to deal
26 with 32 bits */
27 compat_old_sigset_t sa_mask; /* A 32 bit mask */
28 unsigned int sa_flags;
29 unsigned int sa_restorer; /* Another 32 bit pointer */
30};
31
32typedef struct sigaltstack_ia32 {
33 unsigned int ss_sp;
34 int ss_flags;
35 unsigned int ss_size;
36} stack_ia32_t;
37
38struct ucontext_ia32 {
39 unsigned int uc_flags;
40 unsigned int uc_link;
41 stack_ia32_t uc_stack;
42 struct sigcontext_ia32 uc_mcontext;
43 compat_sigset_t uc_sigmask; /* mask last for extensibility */
44};
45
46/* This matches struct stat64 in glibc2.2, hence the absolutely
47 * insane amounts of padding around dev_t's.
48 */
49struct stat64 {
50 unsigned long long st_dev;
51 unsigned char __pad0[4];
52
53#define STAT64_HAS_BROKEN_ST_INO 1
54 unsigned int __st_ino;
55
56 unsigned int st_mode;
57 unsigned int st_nlink;
58
59 unsigned int st_uid;
60 unsigned int st_gid;
61
62 unsigned long long st_rdev;
63 unsigned char __pad3[4];
64
65 long long st_size;
66 unsigned int st_blksize;
67
68 long long st_blocks;/* Number 512-byte blocks allocated */
69
70 unsigned st_atime;
71 unsigned st_atime_nsec;
72 unsigned st_mtime;
73 unsigned st_mtime_nsec;
74 unsigned st_ctime;
75 unsigned st_ctime_nsec;
76
77 unsigned long long st_ino;
78} __attribute__((packed));
79
80typedef struct compat_siginfo {
81 int si_signo;
82 int si_errno;
83 int si_code;
84
85 union {
86 int _pad[((128 / sizeof(int)) - 3)];
87
88 /* kill() */
89 struct {
90 unsigned int _pid; /* sender's pid */
91 unsigned int _uid; /* sender's uid */
92 } _kill;
93
94 /* POSIX.1b timers */
95 struct {
96 compat_timer_t _tid; /* timer id */
97 int _overrun; /* overrun count */
98 compat_sigval_t _sigval; /* same as below */
99 int _sys_private; /* not to be passed to user */
100 int _overrun_incr; /* amount to add to overrun */
101 } _timer;
102
103 /* POSIX.1b signals */
104 struct {
105 unsigned int _pid; /* sender's pid */
106 unsigned int _uid; /* sender's uid */
107 compat_sigval_t _sigval;
108 } _rt;
109
110 /* SIGCHLD */
111 struct {
112 unsigned int _pid; /* which child */
113 unsigned int _uid; /* sender's uid */
114 int _status; /* exit code */
115 compat_clock_t _utime;
116 compat_clock_t _stime;
117 } _sigchld;
118
119 /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */
120 struct {
121 unsigned int _addr; /* faulting insn/memory ref. */
122 } _sigfault;
123
124 /* SIGPOLL */
125 struct {
126 int _band; /* POLL_IN, POLL_OUT, POLL_MSG */
127 int _fd;
128 } _sigpoll;
129 } _sifields;
130} compat_siginfo_t;
131
132struct sigframe32 {
133 u32 pretcode;
134 int sig;
135 struct sigcontext_ia32 sc;
136 struct _fpstate_ia32 fpstate;
137 unsigned int extramask[_COMPAT_NSIG_WORDS-1];
138};
139
140struct rt_sigframe32 {
141 u32 pretcode;
142 int sig;
143 u32 pinfo;
144 u32 puc;
145 compat_siginfo_t info;
146 struct ucontext_ia32 uc;
147 struct _fpstate_ia32 fpstate;
148};
149
150struct ustat32 {
151 __u32 f_tfree;
152 compat_ino_t f_tinode;
153 char f_fname[6];
154 char f_fpack[6];
155};
156
157#define IA32_STACK_TOP IA32_PAGE_OFFSET
158
159#ifdef __KERNEL__
160struct linux_binprm;
161extern int ia32_setup_arg_pages(struct linux_binprm *bprm,
162 unsigned long stack_top, int exec_stack);
163struct mm_struct;
164extern void ia32_pick_mmap_layout(struct mm_struct *mm);
165
166#endif
167
168#endif /* CONFIG_IA32_EMULATION */
169
170#endif /* _ASM_X86_IA32_H */
diff --git a/arch/x86/include/asm/ia32_unistd.h b/arch/x86/include/asm/ia32_unistd.h
new file mode 100644
index 000000000000..976f6ecd2ce6
--- /dev/null
+++ b/arch/x86/include/asm/ia32_unistd.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_IA32_UNISTD_H
2#define _ASM_X86_IA32_UNISTD_H
3
4/*
5 * This file contains the system call numbers of the ia32 port,
6 * this is for the kernel only.
7 * Only add syscalls here where some part of the kernel needs to know
8 * the number. This should be otherwise in sync with asm-x86/unistd_32.h. -AK
9 */
10
11#define __NR_ia32_restart_syscall 0
12#define __NR_ia32_exit 1
13#define __NR_ia32_read 3
14#define __NR_ia32_write 4
15#define __NR_ia32_sigreturn 119
16#define __NR_ia32_rt_sigreturn 173
17
18#endif /* _ASM_X86_IA32_UNISTD_H */
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
new file mode 100644
index 000000000000..44c89c3a23e9
--- /dev/null
+++ b/arch/x86/include/asm/idle.h
@@ -0,0 +1,16 @@
1#ifndef _ASM_X86_IDLE_H
2#define _ASM_X86_IDLE_H
3
4#define IDLE_START 1
5#define IDLE_END 2
6
7struct notifier_block;
8void idle_notifier_register(struct notifier_block *n);
9void idle_notifier_unregister(struct notifier_block *n);
10
11void enter_idle(void);
12void exit_idle(void);
13
14void c1e_remove_cpu(int cpu);
15
16#endif /* _ASM_X86_IDLE_H */
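
A hedged sketch of how a consumer might hook these notifications; the callback shape comes from the generic struct notifier_block in <linux/notifier.h>, IDLE_START/IDLE_END are the action codes defined above, and the example_ names are made up for illustration.

    #include <linux/notifier.h>
    #include <asm/idle.h>

    /* Hypothetical sketch: get called on every idle enter/exit transition. */
    static int example_idle_event(struct notifier_block *nb,
                                  unsigned long action, void *data)
    {
            if (action == IDLE_START) {
                    /* CPU is about to go idle */
            } else if (action == IDLE_END) {
                    /* CPU just came out of idle */
            }
            return NOTIFY_DONE;
    }

    static struct notifier_block example_idle_nb = {
            .notifier_call = example_idle_event,
    };

    static void example_idle_setup(void)
    {
            idle_notifier_register(&example_idle_nb);
    }
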
diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h
new file mode 100644
index 000000000000..fa0fd068bc2e
--- /dev/null
+++ b/arch/x86/include/asm/intel_arch_perfmon.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H
2#define _ASM_X86_INTEL_ARCH_PERFMON_H
3
4#define MSR_ARCH_PERFMON_PERFCTR0 0xc1
5#define MSR_ARCH_PERFMON_PERFCTR1 0xc2
6
7#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
8#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
9
10#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22)
11#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
12#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
13#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
14
15#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c)
16#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
17#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0)
18#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
19 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
20
21union cpuid10_eax {
22 struct {
23 unsigned int version_id:8;
24 unsigned int num_counters:8;
25 unsigned int bit_width:8;
26 unsigned int mask_length:8;
27 } split;
28 unsigned int full;
29};
30
31#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */
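
The union above decodes EAX of CPUID leaf 0xA. A hedged probe sketch (cpuid() is the helper from <asm/processor.h>): the architectural perfmon is usable for the unhalted-core-cycles event when the version is at least 1, the event mask is wide enough to cover that event's index, and the corresponding EBX bit is clear (a set bit in EBX means the event is not available).

    #include <asm/processor.h>
    #include <asm/intel_arch_perfmon.h>

    /* Hypothetical sketch: probe architectural perfmon via CPUID leaf 0xA. */
    static int example_arch_perfmon_usable(void)
    {
            union cpuid10_eax eax;
            unsigned int ebx, unused;

            cpuid(10, &eax.full, &ebx, &unused, &unused);

            return eax.split.version_id > 0 &&
                   eax.split.mask_length > ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX &&
                   !(ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT);
    }
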
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
new file mode 100644
index 000000000000..5618a103f395
--- /dev/null
+++ b/arch/x86/include/asm/io.h
@@ -0,0 +1,91 @@
1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H
3
4#define ARCH_HAS_IOREMAP_WC
5
6#include <linux/compiler.h>
7
8#define build_mmio_read(name, size, type, reg, barrier) \
9static inline type name(const volatile void __iomem *addr) \
10{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
11:"m" (*(volatile type __force *)addr) barrier); return ret; }
12
13#define build_mmio_write(name, size, type, reg, barrier) \
14static inline void name(type val, volatile void __iomem *addr) \
15{ asm volatile("mov" size " %0,%1": :reg (val), \
16"m" (*(volatile type __force *)addr) barrier); }
17
18build_mmio_read(readb, "b", unsigned char, "=q", :"memory")
19build_mmio_read(readw, "w", unsigned short, "=r", :"memory")
20build_mmio_read(readl, "l", unsigned int, "=r", :"memory")
21
22build_mmio_read(__readb, "b", unsigned char, "=q", )
23build_mmio_read(__readw, "w", unsigned short, "=r", )
24build_mmio_read(__readl, "l", unsigned int, "=r", )
25
26build_mmio_write(writeb, "b", unsigned char, "q", :"memory")
27build_mmio_write(writew, "w", unsigned short, "r", :"memory")
28build_mmio_write(writel, "l", unsigned int, "r", :"memory")
29
30build_mmio_write(__writeb, "b", unsigned char, "q", )
31build_mmio_write(__writew, "w", unsigned short, "r", )
32build_mmio_write(__writel, "l", unsigned int, "r", )
33
34#define readb_relaxed(a) __readb(a)
35#define readw_relaxed(a) __readw(a)
36#define readl_relaxed(a) __readl(a)
37#define __raw_readb __readb
38#define __raw_readw __readw
39#define __raw_readl __readl
40
41#define __raw_writeb __writeb
42#define __raw_writew __writew
43#define __raw_writel __writel
44
45#define mmiowb() barrier()
46
47#ifdef CONFIG_X86_64
48build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
49build_mmio_read(__readq, "q", unsigned long, "=r", )
50build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
51build_mmio_write(__writeq, "q", unsigned long, "r", )
52
53#define readq_relaxed(a) __readq(a)
54#define __raw_readq __readq
55#define __raw_writeq writeq
56
57/* Let people know we have them */
58#define readq readq
59#define writeq writeq
60#endif
61
62extern int iommu_bio_merge;
63
64#ifdef CONFIG_X86_32
65# include "io_32.h"
66#else
67# include "io_64.h"
68#endif
69
70extern void *xlate_dev_mem_ptr(unsigned long phys);
71extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
72
73extern int ioremap_change_attr(unsigned long vaddr, unsigned long size,
74 unsigned long prot_val);
75extern void __iomem *ioremap_wc(unsigned long offset, unsigned long size);
76
77/*
78 * early_ioremap() and early_iounmap() are for temporary early boot-time
79 * mappings, before the real ioremap() is functional.
80 * A boot-time mapping is currently limited to at most 16 pages.
81 */
82extern void early_ioremap_init(void);
83extern void early_ioremap_clear(void);
84extern void early_ioremap_reset(void);
85extern void *early_ioremap(unsigned long offset, unsigned long size);
86extern void *early_memremap(unsigned long offset, unsigned long size);
87extern void early_iounmap(void *addr, unsigned long size);
88extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
89
90
91#endif /* _ASM_X86_IO_H */
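
The build_mmio_read()/build_mmio_write() macros above are what generate readb()/.../writeq(). As a hedged usage sketch (the physical address and register value are made-up parameters), a driver would map a region with ioremap(), access it with the generated accessors, then unmap it:

    #include <linux/errno.h>
    #include <linux/types.h>
    #include <asm/io.h>
    #include <asm/page.h>

    /* Hypothetical sketch: write one 32-bit MMIO register of a device. */
    static int example_mmio_poke(resource_size_t phys, u32 value)
    {
            void __iomem *regs = ioremap(phys, PAGE_SIZE);

            if (!regs)
                    return -ENOMEM;

            writel(value, regs);    /* build_mmio_write()-generated accessor */
            (void)readl(regs);      /* read back to flush the posted write   */
            iounmap(regs);
            return 0;
    }
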
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
new file mode 100644
index 000000000000..d8e242e1b396
--- /dev/null
+++ b/arch/x86/include/asm/io_32.h
@@ -0,0 +1,284 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) trying to avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define IO_SPACE_LIMIT 0xffff
41
42#define XQUAD_PORTIO_BASE 0xfe400000
43#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
44
45#ifdef __KERNEL__
46
47#include <asm-generic/iomap.h>
48
49#include <linux/vmalloc.h>
50
51/*
52 * Convert a virtual cached pointer to an uncached pointer
53 */
54#define xlate_dev_kmem_ptr(p) p
55
56/**
57 * virt_to_phys - map virtual addresses to physical
58 * @address: address to remap
59 *
60 * The returned physical address is the physical (CPU) mapping for
61 * the memory address given. It is only valid to use this function on
62 * addresses directly mapped or allocated via kmalloc.
63 *
64 * This function does not give bus mappings for DMA transfers. In
65 * almost all conceivable cases a device driver should not be using
66 * this function
67 */
68
69static inline unsigned long virt_to_phys(volatile void *address)
70{
71 return __pa(address);
72}
73
74/**
75 * phys_to_virt - map physical address to virtual
76 * @address: address to remap
77 *
78 * The returned virtual address is a current CPU mapping for
79 * the memory address given. It is only valid to use this function on
80 * addresses that have a kernel mapping
81 *
82 * This function does not handle bus mappings for DMA transfers. In
83 * almost all conceivable cases a device driver should not be using
84 * this function
85 */
86
87static inline void *phys_to_virt(unsigned long address)
88{
89 return __va(address);
90}
91
92/*
93 * Change "struct page" to physical address.
94 */
95#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
96
97/**
98 * ioremap - map bus memory into CPU space
99 * @offset: bus address of the memory
100 * @size: size of the resource to map
101 *
102 * ioremap performs a platform specific sequence of operations to
103 * make bus memory CPU accessible via the readb/readw/readl/writeb/
104 * writew/writel functions and the other mmio helpers. The returned
105 * address is not guaranteed to be usable directly as a virtual
106 * address.
107 *
108 * If the area you are trying to map is a PCI BAR you should have a
109 * look at pci_iomap().
110 */
111extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
112extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
113extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
114 unsigned long prot_val);
115
116/*
117 * The default ioremap() behavior is non-cached:
118 */
119static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
120{
121 return ioremap_nocache(offset, size);
122}
123
124extern void iounmap(volatile void __iomem *addr);
125
126/*
127 * ISA I/O bus memory addresses are 1:1 with the physical address.
128 */
129#define isa_virt_to_bus virt_to_phys
130#define isa_page_to_bus page_to_phys
131#define isa_bus_to_virt phys_to_virt
132
133/*
134 * However PCI ones are not necessarily 1:1 and therefore these interfaces
135 * are forbidden in portable PCI drivers.
136 *
137 * Allow them on x86 for legacy drivers, though.
138 */
139#define virt_to_bus virt_to_phys
140#define bus_to_virt phys_to_virt
141
142static inline void
143memset_io(volatile void __iomem *addr, unsigned char val, int count)
144{
145 memset((void __force *)addr, val, count);
146}
147
148static inline void
149memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
150{
151 __memcpy(dst, (const void __force *)src, count);
152}
153
154static inline void
155memcpy_toio(volatile void __iomem *dst, const void *src, int count)
156{
157 __memcpy((void __force *)dst, src, count);
158}
159
160/*
161 * ISA space is 'always mapped' on a typical x86 system, no need to
162 * explicitly ioremap() it. The fact that the ISA IO space is mapped
163 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
164 * are physical addresses. The following constant pointer can be
165 * used as the IO-area pointer (it can be iounmapped as well, so the
166 * analogy with PCI is quite large):
167 */
168#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
169
170/*
171 * Cache management
172 *
173 * This needed for two cases
174 * 1. Out of order aware processors
175 * 2. Accidentally out of order processors (PPro errata #51)
176 */
177
178#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
179
180static inline void flush_write_buffers(void)
181{
182 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
183}
184
185#else
186
187#define flush_write_buffers() do { } while (0)
188
189#endif
190
191#endif /* __KERNEL__ */
192
193extern void native_io_delay(void);
194
195extern int io_delay_type;
196extern void io_delay_init(void);
197
198#if defined(CONFIG_PARAVIRT)
199#include <asm/paravirt.h>
200#else
201
202static inline void slow_down_io(void)
203{
204 native_io_delay();
205#ifdef REALLY_SLOW_IO
206 native_io_delay();
207 native_io_delay();
208 native_io_delay();
209#endif
210}
211
212#endif
213
214#define __BUILDIO(bwl, bw, type) \
215static inline void out##bwl(unsigned type value, int port) \
216{ \
217 out##bwl##_local(value, port); \
218} \
219 \
220static inline unsigned type in##bwl(int port) \
221{ \
222 return in##bwl##_local(port); \
223}
224
225#define BUILDIO(bwl, bw, type) \
226static inline void out##bwl##_local(unsigned type value, int port) \
227{ \
228 asm volatile("out" #bwl " %" #bw "0, %w1" \
229 : : "a"(value), "Nd"(port)); \
230} \
231 \
232static inline unsigned type in##bwl##_local(int port) \
233{ \
234 unsigned type value; \
235 asm volatile("in" #bwl " %w1, %" #bw "0" \
236 : "=a"(value) : "Nd"(port)); \
237 return value; \
238} \
239 \
240static inline void out##bwl##_local_p(unsigned type value, int port) \
241{ \
242 out##bwl##_local(value, port); \
243 slow_down_io(); \
244} \
245 \
246static inline unsigned type in##bwl##_local_p(int port) \
247{ \
248 unsigned type value = in##bwl##_local(port); \
249 slow_down_io(); \
250 return value; \
251} \
252 \
253__BUILDIO(bwl, bw, type) \
254 \
255static inline void out##bwl##_p(unsigned type value, int port) \
256{ \
257 out##bwl(value, port); \
258 slow_down_io(); \
259} \
260 \
261static inline unsigned type in##bwl##_p(int port) \
262{ \
263 unsigned type value = in##bwl(port); \
264 slow_down_io(); \
265 return value; \
266} \
267 \
268static inline void outs##bwl(int port, const void *addr, unsigned long count) \
269{ \
270 asm volatile("rep; outs" #bwl \
271 : "+S"(addr), "+c"(count) : "d"(port)); \
272} \
273 \
274static inline void ins##bwl(int port, void *addr, unsigned long count) \
275{ \
276 asm volatile("rep; ins" #bwl \
277 : "+D"(addr), "+c"(count) : "d"(port)); \
278}
279
280BUILDIO(b, b, char)
281BUILDIO(w, w, short)
282BUILDIO(l, , int)
283
284#endif /* _ASM_X86_IO_32_H */
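
BUILDIO(b, b, char) is what ultimately provides outb()/inb() (plus the _p and _local variants). A hedged sketch of the classic use case, reading a CMOS/RTC register through ports 0x70/0x71:

    #include <asm/io.h>

    /* Hypothetical sketch: read a CMOS/RTC register via the port accessors
     * generated by BUILDIO() above. */
    static unsigned char example_cmos_read(unsigned char reg)
    {
            outb(reg, 0x70);        /* select the CMOS register */
            return inb(0x71);       /* read its contents back   */
    }
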
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
new file mode 100644
index 000000000000..fea325a1122f
--- /dev/null
+++ b/arch/x86/include/asm/io_64.h
@@ -0,0 +1,244 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) trying to avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#define IO_SPACE_LIMIT 0xffff
140
141#if defined(__KERNEL__) && defined(__x86_64__)
142
143#include <linux/vmalloc.h>
144
145#ifndef __i386__
146/*
147 * Change virtual addresses to physical addresses and vice versa.
148 * These are pretty trivial.
149 */
150static inline unsigned long virt_to_phys(volatile void *address)
151{
152 return __pa(address);
153}
154
155static inline void *phys_to_virt(unsigned long address)
156{
157 return __va(address);
158}
159#endif
160
161/*
162 * Change "struct page" to physical address.
163 */
164#define page_to_phys(page) ((dma_addr_t)page_to_pfn(page) << PAGE_SHIFT)
165
166#include <asm-generic/iomap.h>
167
168/*
169 * This one maps high address device memory and turns off caching for that area.
170 * It's useful if some control registers are in such an area and write combining
171 * or read caching is not desirable:
172 */
173extern void __iomem *ioremap_nocache(resource_size_t offset, unsigned long size);
174extern void __iomem *ioremap_cache(resource_size_t offset, unsigned long size);
175extern void __iomem *ioremap_prot(resource_size_t offset, unsigned long size,
176 unsigned long prot_val);
177
178/*
179 * The default ioremap() behavior is non-cached:
180 */
181static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
182{
183 return ioremap_nocache(offset, size);
184}
185
186extern void iounmap(volatile void __iomem *addr);
187
188extern void __iomem *fix_ioremap(unsigned idx, unsigned long phys);
189
190/*
191 * ISA I/O bus memory addresses are 1:1 with the physical address.
192 */
193#define isa_virt_to_bus virt_to_phys
194#define isa_page_to_bus page_to_phys
195#define isa_bus_to_virt phys_to_virt
196
197/*
198 * However PCI ones are not necessarily 1:1 and therefore these interfaces
199 * are forbidden in portable PCI drivers.
200 *
201 * Allow them on x86 for legacy drivers, though.
202 */
203#define virt_to_bus virt_to_phys
204#define bus_to_virt phys_to_virt
205
206void __memcpy_fromio(void *, unsigned long, unsigned);
207void __memcpy_toio(unsigned long, const void *, unsigned);
208
209static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
210 unsigned len)
211{
212 __memcpy_fromio(to, (unsigned long)from, len);
213}
214
215static inline void memcpy_toio(volatile void __iomem *to, const void *from,
216 unsigned len)
217{
218 __memcpy_toio((unsigned long)to, from, len);
219}
220
221void memset_io(volatile void __iomem *a, int b, size_t c);
222
223/*
224 * ISA space is 'always mapped' on a typical x86 system, no need to
225 * explicitly ioremap() it. The fact that the ISA IO space is mapped
226 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
227 * are physical addresses. The following constant pointer can be
228 * used as the IO-area pointer (it can be iounmapped as well, so the
229 * analogy with PCI is quite large):
230 */
231#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
232
233#define flush_write_buffers()
234
235#define BIO_VMERGE_BOUNDARY iommu_bio_merge
236
237/*
238 * Convert a virtual cached pointer to an uncached pointer
239 */
240#define xlate_dev_kmem_ptr(p) p
241
242#endif /* __KERNEL__ */
243
244#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
new file mode 100644
index 000000000000..e475e009ae5d
--- /dev/null
+++ b/arch/x86/include/asm/io_apic.h
@@ -0,0 +1,214 @@
1#ifndef _ASM_X86_IO_APIC_H
2#define _ASM_X86_IO_APIC_H
3
4#include <linux/types.h>
5#include <asm/mpspec.h>
6#include <asm/apicdef.h>
7#include <asm/irq_vectors.h>
8
9/*
10 * Intel IO-APIC support for SMP and UP systems.
11 *
12 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar
13 */
14
15/* I/O Unit Redirection Table */
16#define IO_APIC_REDIR_VECTOR_MASK 0x000FF
17#define IO_APIC_REDIR_DEST_LOGICAL 0x00800
18#define IO_APIC_REDIR_DEST_PHYSICAL 0x00000
19#define IO_APIC_REDIR_SEND_PENDING (1 << 12)
20#define IO_APIC_REDIR_REMOTE_IRR (1 << 14)
21#define IO_APIC_REDIR_LEVEL_TRIGGER (1 << 15)
22#define IO_APIC_REDIR_MASKED (1 << 16)
23
24/*
25 * The structure of the IO-APIC:
26 */
27union IO_APIC_reg_00 {
28 u32 raw;
29 struct {
30 u32 __reserved_2 : 14,
31 LTS : 1,
32 delivery_type : 1,
33 __reserved_1 : 8,
34 ID : 8;
35 } __attribute__ ((packed)) bits;
36};
37
38union IO_APIC_reg_01 {
39 u32 raw;
40 struct {
41 u32 version : 8,
42 __reserved_2 : 7,
43 PRQ : 1,
44 entries : 8,
45 __reserved_1 : 8;
46 } __attribute__ ((packed)) bits;
47};
48
49union IO_APIC_reg_02 {
50 u32 raw;
51 struct {
52 u32 __reserved_2 : 24,
53 arbitration : 4,
54 __reserved_1 : 4;
55 } __attribute__ ((packed)) bits;
56};
57
58union IO_APIC_reg_03 {
59 u32 raw;
60 struct {
61 u32 boot_DT : 1,
62 __reserved_1 : 31;
63 } __attribute__ ((packed)) bits;
64};
65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry {
78 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED
80 * 001: lowest prio
81 * 111: ExtINT
82 */
83 dest_mode : 1, /* 0: physical, 1: logical */
84 delivery_status : 1,
85 polarity : 1,
86 irr : 1,
87 trigger : 1, /* 0: edge, 1: level */
88 mask : 1, /* 0: enabled, 1: disabled */
89 __reserved_2 : 15;
90
91 __u32 __reserved_3 : 24,
92 dest : 8;
93} __attribute__ ((packed));
94
95struct IR_IO_APIC_route_entry {
96 __u64 vector : 8,
97 zero : 3,
98 index2 : 1,
99 delivery_status : 1,
100 polarity : 1,
101 irr : 1,
102 trigger : 1,
103 mask : 1,
104 reserved : 31,
105 format : 1,
106 index : 15;
107} __attribute__ ((packed));
108
109#ifdef CONFIG_X86_IO_APIC
110
111/*
112 * # of IO-APICs and # of IRQ routing registers
113 */
114extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS];
116
117/*
118 * MP-BIOS irq configuration table structures:
119 */
120
121#define MP_MAX_IOAPIC_PIN 127
122
123struct mp_config_ioapic {
124 unsigned long mp_apicaddr;
125 unsigned int mp_apicid;
126 unsigned char mp_type;
127 unsigned char mp_apicver;
128 unsigned char mp_flags;
129};
130
131struct mp_config_intsrc {
132 unsigned int mp_dstapic;
133 unsigned char mp_type;
134 unsigned char mp_irqtype;
135 unsigned short mp_irqflag;
136 unsigned char mp_srcbus;
137 unsigned char mp_srcbusirq;
138 unsigned char mp_dstirq;
139};
140
141/* I/O APIC entries */
142extern struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
143
144/* # of MP IRQ source entries */
145extern int mp_irq_entries;
146
147/* MP IRQ source entries */
148extern struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
149
150/* non-0 if default (table-less) MP configuration */
151extern int mpc_default_type;
152
153/* Older SiS APIC requires we rewrite the index register */
154extern int sis_apic_bug;
155
156/* 1 if "noapic" boot option passed */
157extern int skip_ioapic_setup;
158
159/* 1 if "noapic" boot option passed */
160extern int noioapicquirk;
161
162/* -1 if "noapic" boot option passed */
163extern int noioapicreroute;
164
165/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
166extern int timer_through_8259;
167
168static inline void disable_ioapic_setup(void)
169{
170#ifdef CONFIG_PCI
171 noioapicquirk = 1;
172 noioapicreroute = -1;
173#endif
174 skip_ioapic_setup = 1;
175}
176
177/*
178 * If we use the IO-APIC for IRQ routing, disable automatic
179 * assignment of PCI IRQ's.
180 */
181#define io_apic_assign_pci_irqs \
182 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
183
184#ifdef CONFIG_ACPI
185extern int io_apic_get_unique_id(int ioapic, int apic_id);
186extern int io_apic_get_version(int ioapic);
187extern int io_apic_get_redir_entries(int ioapic);
188extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
189 int edge_level, int active_high_low);
190#endif /* CONFIG_ACPI */
191
192extern int (*ioapic_renumber_irq)(int ioapic, int irq);
193extern void ioapic_init_mappings(void);
194
195#ifdef CONFIG_X86_64
196extern int save_mask_IO_APIC_setup(void);
197extern void restore_IO_APIC_setup(void);
198extern void reinit_intr_remapped_IO_APIC(int);
199#endif
200
201extern int probe_nr_irqs(void);
202
203#else /* !CONFIG_X86_IO_APIC */
204#define io_apic_assign_pci_irqs 0
205static const int timer_through_8259 = 0;
206static inline void ioapic_init_mappings(void) { }
207
208static inline int probe_nr_irqs(void)
209{
210 return NR_IRQS;
211}
212#endif
213
214#endif /* _ASM_X86_IO_APIC_H */
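
To make the bitfield layout of struct IO_APIC_route_entry concrete, here is a hedged sketch that fills one redirection-table entry by hand for a level-triggered, active-low source with fixed delivery, left masked initially (the function itself is made up for illustration):

    #include <linux/types.h>
    #include <asm/io_apic.h>

    /* Hypothetical sketch: compose a redirection-table entry field by field. */
    static struct IO_APIC_route_entry example_make_rte(u8 vector, u8 apic_dest)
    {
            struct IO_APIC_route_entry entry = { };

            entry.vector        = vector;
            entry.delivery_mode = dest_Fixed;       /* 000: fixed delivery  */
            entry.dest_mode     = 0;                /* physical destination */
            entry.polarity      = 1;                /* active low           */
            entry.trigger       = 1;                /* level triggered      */
            entry.mask          = 1;                /* start out masked     */
            entry.dest          = apic_dest;        /* target APIC ID       */

            return entry;
    }
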
diff --git a/arch/x86/include/asm/ioctl.h b/arch/x86/include/asm/ioctl.h
new file mode 100644
index 000000000000..b279fe06dfe5
--- /dev/null
+++ b/arch/x86/include/asm/ioctl.h
@@ -0,0 +1 @@
#include <asm-generic/ioctl.h>
diff --git a/arch/x86/include/asm/ioctls.h b/arch/x86/include/asm/ioctls.h
new file mode 100644
index 000000000000..0d5b23b7b06e
--- /dev/null
+++ b/arch/x86/include/asm/ioctls.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_IOCTLS_H
2#define _ASM_X86_IOCTLS_H
3
4#include <asm/ioctl.h>
5
6/* 0x54 is just a magic number to make these relatively unique ('T') */
7
8#define TCGETS 0x5401
9#define TCSETS 0x5402 /* Clashes with SNDCTL_TMR_START sound ioctl */
10#define TCSETSW 0x5403
11#define TCSETSF 0x5404
12#define TCGETA 0x5405
13#define TCSETA 0x5406
14#define TCSETAW 0x5407
15#define TCSETAF 0x5408
16#define TCSBRK 0x5409
17#define TCXONC 0x540A
18#define TCFLSH 0x540B
19#define TIOCEXCL 0x540C
20#define TIOCNXCL 0x540D
21#define TIOCSCTTY 0x540E
22#define TIOCGPGRP 0x540F
23#define TIOCSPGRP 0x5410
24#define TIOCOUTQ 0x5411
25#define TIOCSTI 0x5412
26#define TIOCGWINSZ 0x5413
27#define TIOCSWINSZ 0x5414
28#define TIOCMGET 0x5415
29#define TIOCMBIS 0x5416
30#define TIOCMBIC 0x5417
31#define TIOCMSET 0x5418
32#define TIOCGSOFTCAR 0x5419
33#define TIOCSSOFTCAR 0x541A
34#define FIONREAD 0x541B
35#define TIOCINQ FIONREAD
36#define TIOCLINUX 0x541C
37#define TIOCCONS 0x541D
38#define TIOCGSERIAL 0x541E
39#define TIOCSSERIAL 0x541F
40#define TIOCPKT 0x5420
41#define FIONBIO 0x5421
42#define TIOCNOTTY 0x5422
43#define TIOCSETD 0x5423
44#define TIOCGETD 0x5424
45#define TCSBRKP 0x5425 /* Needed for POSIX tcsendbreak() */
46/* #define TIOCTTYGSTRUCT 0x5426 - Former debugging-only ioctl */
47#define TIOCSBRK 0x5427 /* BSD compatibility */
48#define TIOCCBRK 0x5428 /* BSD compatibility */
49#define TIOCGSID 0x5429 /* Return the session ID of FD */
50#define TCGETS2 _IOR('T', 0x2A, struct termios2)
51#define TCSETS2 _IOW('T', 0x2B, struct termios2)
52#define TCSETSW2 _IOW('T', 0x2C, struct termios2)
53#define TCSETSF2 _IOW('T', 0x2D, struct termios2)
54#define TIOCGRS485 0x542E
55#define TIOCSRS485 0x542F
56#define TIOCGPTN _IOR('T', 0x30, unsigned int)
57 /* Get Pty Number (of pty-mux device) */
58#define TIOCSPTLCK _IOW('T', 0x31, int) /* Lock/unlock Pty */
59#define TCGETX 0x5432 /* SYS5 TCGETX compatibility */
60#define TCSETX 0x5433
61#define TCSETXF 0x5434
62#define TCSETXW 0x5435
63
64#define FIONCLEX 0x5450
65#define FIOCLEX 0x5451
66#define FIOASYNC 0x5452
67#define TIOCSERCONFIG 0x5453
68#define TIOCSERGWILD 0x5454
69#define TIOCSERSWILD 0x5455
70#define TIOCGLCKTRMIOS 0x5456
71#define TIOCSLCKTRMIOS 0x5457
72#define TIOCSERGSTRUCT 0x5458 /* For debugging only */
73#define TIOCSERGETLSR 0x5459 /* Get line status register */
74#define TIOCSERGETMULTI 0x545A /* Get multiport config */
75#define TIOCSERSETMULTI 0x545B /* Set multiport config */
76
77#define TIOCMIWAIT 0x545C /* wait for a change on serial input line(s) */
78#define TIOCGICOUNT 0x545D /* read serial port inline interrupt counts */
79#define TIOCGHAYESESP 0x545E /* Get Hayes ESP configuration */
80#define TIOCSHAYESESP 0x545F /* Set Hayes ESP configuration */
81#define FIOQSIZE 0x5460
82
83/* Used for packet mode */
84#define TIOCPKT_DATA 0
85#define TIOCPKT_FLUSHREAD 1
86#define TIOCPKT_FLUSHWRITE 2
87#define TIOCPKT_STOP 4
88#define TIOCPKT_START 8
89#define TIOCPKT_NOSTOP 16
90#define TIOCPKT_DOSTOP 32
91
92#define TIOCSER_TEMT 0x01 /* Transmitter physically empty */
93
94#endif /* _ASM_X86_IOCTLS_H */
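
These numbers are consumed from user space through ioctl(2). A hedged userspace sketch using TIOCGWINSZ (struct winsize comes from the usual termios headers):

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    /* Hypothetical userspace sketch: query the terminal size via TIOCGWINSZ. */
    int main(void)
    {
            struct winsize ws;

            if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws) == 0)
                    printf("%hu rows x %hu cols\n", ws.ws_row, ws.ws_col);
            return 0;
    }
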
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
new file mode 100644
index 000000000000..98e28ea8cd16
--- /dev/null
+++ b/arch/x86/include/asm/iommu.h
@@ -0,0 +1,50 @@
1#ifndef _ASM_X86_IOMMU_H
2#define _ASM_X86_IOMMU_H
3
4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void);
6extern struct dma_mapping_ops nommu_dma_ops;
7extern int force_iommu, no_iommu;
8extern int iommu_detected;
9extern int dmar_disabled;
10extern int forbid_dac;
11
12extern unsigned long iommu_nr_pages(unsigned long addr, unsigned long len);
13
14/* 10 seconds */
15#define DMAR_OPERATION_TIMEOUT ((cycles_t) tsc_khz*10*1000)
16
17#ifdef CONFIG_GART_IOMMU
18extern int gart_iommu_aperture;
19extern int gart_iommu_aperture_allowed;
20extern int gart_iommu_aperture_disabled;
21
22extern void early_gart_iommu_check(void);
23extern void gart_iommu_init(void);
24extern void gart_iommu_shutdown(void);
25extern void __init gart_parse_options(char *);
26extern void gart_iommu_hole_init(void);
27
28#else
29#define gart_iommu_aperture 0
30#define gart_iommu_aperture_allowed 0
31#define gart_iommu_aperture_disabled 1
32
33static inline void early_gart_iommu_check(void)
34{
35}
36static inline void gart_iommu_init(void)
37{
38}
39static inline void gart_iommu_shutdown(void)
40{
41}
42static inline void gart_parse_options(char *options)
43{
44}
45static inline void gart_iommu_hole_init(void)
46{
47}
48#endif
49
50#endif /* _ASM_X86_IOMMU_H */
diff --git a/arch/x86/include/asm/ipcbuf.h b/arch/x86/include/asm/ipcbuf.h
new file mode 100644
index 000000000000..ee678fd51594
--- /dev/null
+++ b/arch/x86/include/asm/ipcbuf.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_IPCBUF_H
2#define _ASM_X86_IPCBUF_H
3
4/*
5 * The ipc64_perm structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space is left for:
10 * - 32-bit mode_t and seq
11 * - 2 miscellaneous 32-bit values
12 */
13
14struct ipc64_perm {
15 __kernel_key_t key;
16 __kernel_uid32_t uid;
17 __kernel_gid32_t gid;
18 __kernel_uid32_t cuid;
19 __kernel_gid32_t cgid;
20 __kernel_mode_t mode;
21 unsigned short __pad1;
22 unsigned short seq;
23 unsigned short __pad2;
24 unsigned long __unused1;
25 unsigned long __unused2;
26};
27
28#endif /* _ASM_X86_IPCBUF_H */
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
new file mode 100644
index 000000000000..f89dffb28aa9
--- /dev/null
+++ b/arch/x86/include/asm/ipi.h
@@ -0,0 +1,138 @@
1#ifndef _ASM_X86_IPI_H
2#define _ASM_X86_IPI_H
3
4/*
5 * Copyright 2004 James Cleverdon, IBM.
6 * Subject to the GNU Public License, v.2
7 *
8 * Generic APIC InterProcessor Interrupt code.
9 *
10 * Moved to include file by James Cleverdon from
11 * arch/x86-64/kernel/smp.c
12 *
13 * Copyrights from kernel/smp.c:
14 *
15 * (c) 1995 Alan Cox, Building #3 <alan@redhat.com>
16 * (c) 1998-99, 2000 Ingo Molnar <mingo@redhat.com>
17 * (c) 2002,2003 Andi Kleen, SuSE Labs.
18 * Subject to the GNU Public License, v.2
19 */
20
21#include <asm/hw_irq.h>
22#include <asm/apic.h>
23#include <asm/smp.h>
24
25/*
26 * the following functions deal with sending IPIs between CPUs.
27 *
28 * We use 'broadcast', CPU->CPU IPIs and self-IPIs too.
29 */
30
31static inline unsigned int __prepare_ICR(unsigned int shortcut, int vector,
32 unsigned int dest)
33{
34 unsigned int icr = shortcut | dest;
35
36 switch (vector) {
37 default:
38 icr |= APIC_DM_FIXED | vector;
39 break;
40 case NMI_VECTOR:
41 icr |= APIC_DM_NMI;
42 break;
43 }
44 return icr;
45}
46
47static inline int __prepare_ICR2(unsigned int mask)
48{
49 return SET_APIC_DEST_FIELD(mask);
50}
51
52static inline void __xapic_wait_icr_idle(void)
53{
54 while (native_apic_mem_read(APIC_ICR) & APIC_ICR_BUSY)
55 cpu_relax();
56}
57
58static inline void __send_IPI_shortcut(unsigned int shortcut, int vector,
59 unsigned int dest)
60{
61 /*
62 * Subtle. In the case of the 'never do double writes' workaround
63 * we have to lock out interrupts to be safe. As we don't care
64 * about the value read, we use an atomic rmw access to avoid costly
65 * cli/sti. Otherwise we use an even cheaper single atomic write
66 * to the APIC.
67 */
68 unsigned int cfg;
69
70 /*
71 * Wait for idle.
72 */
73 __xapic_wait_icr_idle();
74
75 /*
76 * No need to touch the target chip field
77 */
78 cfg = __prepare_ICR(shortcut, vector, dest);
79
80 /*
81 * Send the IPI. The write to APIC_ICR fires this off.
82 */
83 native_apic_mem_write(APIC_ICR, cfg);
84}
85
86/*
87 * This is used to send an IPI with no shorthand notation (the destination is
88 * specified in bits 56 to 63 of the ICR).
89 */
90static inline void __send_IPI_dest_field(unsigned int mask, int vector,
91 unsigned int dest)
92{
93 unsigned long cfg;
94
95 /*
96 * Wait for idle.
97 */
98 if (unlikely(vector == NMI_VECTOR))
99 safe_apic_wait_icr_idle();
100 else
101 __xapic_wait_icr_idle();
102
103 /*
104 * prepare target chip field
105 */
106 cfg = __prepare_ICR2(mask);
107 native_apic_mem_write(APIC_ICR2, cfg);
108
109 /*
110 * program the ICR
111 */
112 cfg = __prepare_ICR(0, vector, dest);
113
114 /*
115 * Send the IPI. The write to APIC_ICR fires this off.
116 */
117 native_apic_mem_write(APIC_ICR, cfg);
118}
119
120static inline void send_IPI_mask_sequence(cpumask_t mask, int vector)
121{
122 unsigned long flags;
123 unsigned long query_cpu;
124
125 /*
126 * Hack. The clustered APIC addressing mode doesn't allow us to send
127 * to an arbitrary mask, so I do a unicast to each CPU instead.
128 * - mbligh
129 */
130 local_irq_save(flags);
131 for_each_cpu_mask_nr(query_cpu, mask) {
132 __send_IPI_dest_field(per_cpu(x86_cpu_to_apicid, query_cpu),
133 vector, APIC_DEST_PHYSICAL);
134 }
135 local_irq_restore(flags);
136}
137
138#endif /* _ASM_X86_IPI_H */
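
A hedged sketch of the 'self' shorthand built on __send_IPI_shortcut() above: with APIC_DEST_SELF no destination field needs to be programmed, which is essentially how the send_IPI_self() variants are implemented (the wrapper name here is made up).

    #include <asm/ipi.h>
    #include <asm/apicdef.h>

    /* Hypothetical sketch: interrupt the current CPU with the given vector. */
    static void example_send_self_ipi(int vector)
    {
            __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
    }
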
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
new file mode 100644
index 000000000000..bae0eda95486
--- /dev/null
+++ b/arch/x86/include/asm/irq.h
@@ -0,0 +1,50 @@
1#ifndef _ASM_X86_IRQ_H
2#define _ASM_X86_IRQ_H
3/*
4 * (C) 1992, 1993 Linus Torvalds, (C) 1997 Ingo Molnar
5 *
6 * IRQ/IPI changes taken from work by Thomas Radke
7 * <tomsoft@informatik.tu-chemnitz.de>
8 */
9
10#include <asm/apicdef.h>
11#include <asm/irq_vectors.h>
12
13static inline int irq_canonicalize(int irq)
14{
15 return ((irq == 2) ? 9 : irq);
16}
17
18#ifdef CONFIG_X86_LOCAL_APIC
19# define ARCH_HAS_NMI_WATCHDOG
20#endif
21
22#ifdef CONFIG_4KSTACKS
23 extern void irq_ctx_init(int cpu);
24 extern void irq_ctx_exit(int cpu);
25# define __ARCH_HAS_DO_SOFTIRQ
26#else
27# define irq_ctx_init(cpu) do { } while (0)
28# define irq_ctx_exit(cpu) do { } while (0)
29# ifdef CONFIG_X86_64
30# define __ARCH_HAS_DO_SOFTIRQ
31# endif
32#endif
33
34#ifdef CONFIG_IRQBALANCE
35extern int irqbalance_disable(char *str);
36#endif
37
38#ifdef CONFIG_HOTPLUG_CPU
39#include <linux/cpumask.h>
40extern void fixup_irqs(cpumask_t map);
41#endif
42
43extern unsigned int do_IRQ(struct pt_regs *regs);
44extern void init_IRQ(void);
45extern void native_init_IRQ(void);
46
47/* Interrupt vector management */
48extern DECLARE_BITMAP(used_vectors, NR_VECTORS);
49
50#endif /* _ASM_X86_IRQ_H */
diff --git a/arch/x86/include/asm/irq_regs.h b/arch/x86/include/asm/irq_regs.h
new file mode 100644
index 000000000000..89c898ab298b
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "irq_regs_32.h"
3#else
4# include "irq_regs_64.h"
5#endif
diff --git a/arch/x86/include/asm/irq_regs_32.h b/arch/x86/include/asm/irq_regs_32.h
new file mode 100644
index 000000000000..af2f02d27fc7
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_32.h
@@ -0,0 +1,29 @@
1/*
2 * Per-cpu current frame pointer - the location of the last exception frame on
3 * the stack, stored in the per-cpu area.
4 *
5 * Jeremy Fitzhardinge <jeremy@goop.org>
6 */
7#ifndef _ASM_X86_IRQ_REGS_32_H
8#define _ASM_X86_IRQ_REGS_32_H
9
10#include <asm/percpu.h>
11
12DECLARE_PER_CPU(struct pt_regs *, irq_regs);
13
14static inline struct pt_regs *get_irq_regs(void)
15{
16 return x86_read_percpu(irq_regs);
17}
18
19static inline struct pt_regs *set_irq_regs(struct pt_regs *new_regs)
20{
21 struct pt_regs *old_regs;
22
23 old_regs = get_irq_regs();
24 x86_write_percpu(irq_regs, new_regs);
25
26 return old_regs;
27}
28
29#endif /* _ASM_X86_IRQ_REGS_32_H */
diff --git a/arch/x86/include/asm/irq_regs_64.h b/arch/x86/include/asm/irq_regs_64.h
new file mode 100644
index 000000000000..3dd9c0b70270
--- /dev/null
+++ b/arch/x86/include/asm/irq_regs_64.h
@@ -0,0 +1 @@
#include <asm-generic/irq_regs.h>
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
new file mode 100644
index 000000000000..20e1fd588dbf
--- /dev/null
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -0,0 +1,8 @@
1#ifndef _ASM_X86_IRQ_REMAPPING_H
2#define _ASM_X86_IRQ_REMAPPING_H
3
4extern int x2apic;
5
6#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
7
8#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
new file mode 100644
index 000000000000..d843ed0e9b2e
--- /dev/null
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -0,0 +1,164 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H
3
4#include <linux/threads.h>
5
6#define NMI_VECTOR 0x02
7
8/*
9 * IDT vectors usable for external interrupt sources start
10 * at 0x20:
11 */
12#define FIRST_EXTERNAL_VECTOR 0x20
13
14#ifdef CONFIG_X86_32
15# define SYSCALL_VECTOR 0x80
16#else
17# define IA32_SYSCALL_VECTOR 0x80
18#endif
19
20/*
21 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering
22 * cleanup after irq migration.
23 */
24#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
25
26/*
27 * Vectors 0x30-0x3f are used for ISA interrupts.
28 */
29#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10)
30#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
31#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
32#define IRQ3_VECTOR (IRQ0_VECTOR + 3)
33#define IRQ4_VECTOR (IRQ0_VECTOR + 4)
34#define IRQ5_VECTOR (IRQ0_VECTOR + 5)
35#define IRQ6_VECTOR (IRQ0_VECTOR + 6)
36#define IRQ7_VECTOR (IRQ0_VECTOR + 7)
37#define IRQ8_VECTOR (IRQ0_VECTOR + 8)
38#define IRQ9_VECTOR (IRQ0_VECTOR + 9)
39#define IRQ10_VECTOR (IRQ0_VECTOR + 10)
40#define IRQ11_VECTOR (IRQ0_VECTOR + 11)
41#define IRQ12_VECTOR (IRQ0_VECTOR + 12)
42#define IRQ13_VECTOR (IRQ0_VECTOR + 13)
43#define IRQ14_VECTOR (IRQ0_VECTOR + 14)
44#define IRQ15_VECTOR (IRQ0_VECTOR + 15)
45
46/*
47 * Special IRQ vectors used by the SMP architecture, 0xf0-0xff
48 *
 49 * Some of the following vectors are 'rare'; they are merged
50 * into a single vector (CALL_FUNCTION_VECTOR) to save vector space.
51 * TLB, reschedule and local APIC vectors are performance-critical.
52 *
53 * Vectors 0xf0-0xfa are free (reserved for future Linux use).
54 */
55#ifdef CONFIG_X86_32
56
57# define SPURIOUS_APIC_VECTOR 0xff
58# define ERROR_APIC_VECTOR 0xfe
59# define INVALIDATE_TLB_VECTOR 0xfd
60# define RESCHEDULE_VECTOR 0xfc
61# define CALL_FUNCTION_VECTOR 0xfb
62# define CALL_FUNCTION_SINGLE_VECTOR 0xfa
63# define THERMAL_APIC_VECTOR 0xf0
64
65#else
66
67#define SPURIOUS_APIC_VECTOR 0xff
68#define ERROR_APIC_VECTOR 0xfe
69#define RESCHEDULE_VECTOR 0xfd
70#define CALL_FUNCTION_VECTOR 0xfc
71#define CALL_FUNCTION_SINGLE_VECTOR 0xfb
72#define THERMAL_APIC_VECTOR 0xfa
73#define THRESHOLD_APIC_VECTOR 0xf9
74#define UV_BAU_MESSAGE 0xf8
75#define INVALIDATE_TLB_VECTOR_END 0xf7
76#define INVALIDATE_TLB_VECTOR_START 0xf0 /* f0-f7 used for TLB flush */
77
78#define NUM_INVALIDATE_TLB_VECTORS 8
79
80#endif
81
82/*
83 * Local APIC timer IRQ vector is on a different priority level,
84 * to work around the 'lost local interrupt if more than 2 IRQ
85 * sources per level' errata.
86 */
87#define LOCAL_TIMER_VECTOR 0xef
88
89/*
90 * First APIC vector available to drivers: (vectors 0x30-0xee) we
91 * start at 0x31(0x41) to spread out vectors evenly between priority
92 * levels. (0x80 is the syscall vector)
93 */
94#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
95
96#define NR_VECTORS 256
97
98#define FPU_IRQ 13
99
100#define FIRST_VM86_IRQ 3
101#define LAST_VM86_IRQ 15
102#define invalid_vm86_irq(irq) ((irq) < 3 || (irq) > 15)
103
104#ifdef CONFIG_X86_64
105# if NR_CPUS < MAX_IO_APICS
106# define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
107# else
108# define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
109# endif
110
111#elif !defined(CONFIG_X86_VOYAGER)
112
113# if defined(CONFIG_X86_IO_APIC) || defined(CONFIG_PARAVIRT) || defined(CONFIG_X86_VISWS)
114
115# define NR_IRQS 224
116
117# else /* IO_APIC || PARAVIRT */
118
119# define NR_IRQS 16
120
121# endif
122
123#else /* !VISWS && !VOYAGER */
124
125# define NR_IRQS 224
126
127#endif /* VISWS */
128
129/* Voyager specific defines */
130/* These define the CPIs we use in linux */
131#define VIC_CPI_LEVEL0 0
132#define VIC_CPI_LEVEL1 1
133/* now the fake CPIs */
134#define VIC_TIMER_CPI 2
135#define VIC_INVALIDATE_CPI 3
136#define VIC_RESCHEDULE_CPI 4
137#define VIC_ENABLE_IRQ_CPI 5
138#define VIC_CALL_FUNCTION_CPI 6
139#define VIC_CALL_FUNCTION_SINGLE_CPI 7
140
141/* Now the QIC CPIs: Since we don't need the two initial levels,
142 * these are 2 less than the VIC CPIs */
143#define QIC_CPI_OFFSET 1
144#define QIC_TIMER_CPI (VIC_TIMER_CPI - QIC_CPI_OFFSET)
145#define QIC_INVALIDATE_CPI (VIC_INVALIDATE_CPI - QIC_CPI_OFFSET)
146#define QIC_RESCHEDULE_CPI (VIC_RESCHEDULE_CPI - QIC_CPI_OFFSET)
147#define QIC_ENABLE_IRQ_CPI (VIC_ENABLE_IRQ_CPI - QIC_CPI_OFFSET)
148#define QIC_CALL_FUNCTION_CPI (VIC_CALL_FUNCTION_CPI - QIC_CPI_OFFSET)
149#define QIC_CALL_FUNCTION_SINGLE_CPI (VIC_CALL_FUNCTION_SINGLE_CPI - QIC_CPI_OFFSET)
150
151#define VIC_START_FAKE_CPI VIC_TIMER_CPI
152#define VIC_END_FAKE_CPI VIC_CALL_FUNCTION_SINGLE_CPI
153
154/* this is the SYS_INT CPI. */
155#define VIC_SYS_INT 8
156#define VIC_CMN_INT 15
157
158/* This is the boot CPI for alternate processors. It gets overwritten
159 * by the above once the system has activated all available processors */
160#define VIC_CPU_BOOT_CPI VIC_CPI_LEVEL0
161#define VIC_CPU_BOOT_ERRATA_CPI (VIC_CPI_LEVEL0 + 8)
162
163
164#endif /* _ASM_X86_IRQ_VECTORS_H */
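
The ISA vector layout defined above can be sanity-checked with plain arithmetic. The following standalone snippet (illustrative only, not part of the patch) reproduces the relevant defines from irq_vectors.h and prints the resulting values:

/* Standalone sanity check (not kernel code): reproduces the vector
 * arithmetic from irq_vectors.h and prints the resulting layout. */
#include <stdio.h>

#define FIRST_EXTERNAL_VECTOR	0x20
#define IRQ0_VECTOR		(FIRST_EXTERNAL_VECTOR + 0x10)
#define IRQ15_VECTOR		(IRQ0_VECTOR + 15)
#define FIRST_DEVICE_VECTOR	(IRQ15_VECTOR + 2)

int main(void)
{
	printf("IRQ0_VECTOR         = 0x%02x\n", IRQ0_VECTOR);		/* 0x30 */
	printf("IRQ15_VECTOR        = 0x%02x\n", IRQ15_VECTOR);		/* 0x3f */
	printf("FIRST_DEVICE_VECTOR = 0x%02x\n", FIRST_DEVICE_VECTOR);	/* 0x41 */
	return 0;
}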
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
new file mode 100644
index 000000000000..2bdab21f0898
--- /dev/null
+++ b/arch/x86/include/asm/irqflags.h
@@ -0,0 +1,211 @@
1#ifndef _X86_IRQFLAGS_H_
2#define _X86_IRQFLAGS_H_
3
4#include <asm/processor-flags.h>
5
6#ifndef __ASSEMBLY__
7/*
8 * Interrupt control:
9 */
10
11static inline unsigned long native_save_fl(void)
12{
13 unsigned long flags;
14
15 asm volatile("# __raw_save_flags\n\t"
16 "pushf ; pop %0"
17 : "=g" (flags)
18 : /* no input */
19 : "memory");
20
21 return flags;
22}
23
24static inline void native_restore_fl(unsigned long flags)
25{
26 asm volatile("push %0 ; popf"
27 : /* no output */
28 :"g" (flags)
29 :"memory", "cc");
30}
31
32static inline void native_irq_disable(void)
33{
34 asm volatile("cli": : :"memory");
35}
36
37static inline void native_irq_enable(void)
38{
39 asm volatile("sti": : :"memory");
40}
41
42static inline void native_safe_halt(void)
43{
44 asm volatile("sti; hlt": : :"memory");
45}
46
47static inline void native_halt(void)
48{
49 asm volatile("hlt": : :"memory");
50}
51
52#endif
53
54#ifdef CONFIG_PARAVIRT
55#include <asm/paravirt.h>
56#else
57#ifndef __ASSEMBLY__
58
59static inline unsigned long __raw_local_save_flags(void)
60{
61 return native_save_fl();
62}
63
64static inline void raw_local_irq_restore(unsigned long flags)
65{
66 native_restore_fl(flags);
67}
68
69static inline void raw_local_irq_disable(void)
70{
71 native_irq_disable();
72}
73
74static inline void raw_local_irq_enable(void)
75{
76 native_irq_enable();
77}
78
79/*
 80 * Used in the idle loop; sti only takes effect after the next
 81 * instruction, so the following hlt cannot race with an interrupt:
82 */
83static inline void raw_safe_halt(void)
84{
85 native_safe_halt();
86}
87
88/*
89 * Used when interrupts are already enabled or to
90 * shutdown the processor:
91 */
92static inline void halt(void)
93{
94 native_halt();
95}
96
97/*
98 * For spinlocks, etc:
99 */
100static inline unsigned long __raw_local_irq_save(void)
101{
102 unsigned long flags = __raw_local_save_flags();
103
104 raw_local_irq_disable();
105
106 return flags;
107}
108#else
109
110#define ENABLE_INTERRUPTS(x) sti
111#define DISABLE_INTERRUPTS(x) cli
112
113#ifdef CONFIG_X86_64
114#define SWAPGS swapgs
115/*
116 * Currently paravirt can't handle swapgs nicely when we
117 * don't have a stack we can rely on (such as a user space
118 * stack). So we either find a way around these or just fault
119 * and emulate if a guest tries to call swapgs directly.
120 *
121 * Either way, this is a good way to document that we don't
122 * have a reliable stack. x86_64 only.
123 */
124#define SWAPGS_UNSAFE_STACK swapgs
125
126#define PARAVIRT_ADJUST_EXCEPTION_FRAME /* */
127
128#define INTERRUPT_RETURN iretq
129#define USERGS_SYSRET64 \
130 swapgs; \
131 sysretq;
132#define USERGS_SYSRET32 \
133 swapgs; \
134 sysretl
135#define ENABLE_INTERRUPTS_SYSEXIT32 \
136 swapgs; \
137 sti; \
138 sysexit
139
140#else
141#define INTERRUPT_RETURN iret
142#define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit
143#define GET_CR0_INTO_EAX movl %cr0, %eax
144#endif
145
146
147#endif /* __ASSEMBLY__ */
148#endif /* CONFIG_PARAVIRT */
149
150#ifndef __ASSEMBLY__
151#define raw_local_save_flags(flags) \
152 do { (flags) = __raw_local_save_flags(); } while (0)
153
154#define raw_local_irq_save(flags) \
155 do { (flags) = __raw_local_irq_save(); } while (0)
156
157static inline int raw_irqs_disabled_flags(unsigned long flags)
158{
159 return !(flags & X86_EFLAGS_IF);
160}
161
162static inline int raw_irqs_disabled(void)
163{
164 unsigned long flags = __raw_local_save_flags();
165
166 return raw_irqs_disabled_flags(flags);
167}
168
169#else
170
171#ifdef CONFIG_X86_64
172#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk
173#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
174 TRACE_IRQS_ON; \
175 sti; \
176 SAVE_REST; \
177 LOCKDEP_SYS_EXIT; \
178 RESTORE_REST; \
179 cli; \
180 TRACE_IRQS_OFF;
181
182#else
183#define ARCH_LOCKDEP_SYS_EXIT \
184 pushl %eax; \
185 pushl %ecx; \
186 pushl %edx; \
187 call lockdep_sys_exit; \
188 popl %edx; \
189 popl %ecx; \
190 popl %eax;
191
192#define ARCH_LOCKDEP_SYS_EXIT_IRQ
193#endif
194
195#ifdef CONFIG_TRACE_IRQFLAGS
196# define TRACE_IRQS_ON call trace_hardirqs_on_thunk;
197# define TRACE_IRQS_OFF call trace_hardirqs_off_thunk;
198#else
199# define TRACE_IRQS_ON
200# define TRACE_IRQS_OFF
201#endif
202#ifdef CONFIG_DEBUG_LOCK_ALLOC
203# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT
204# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ
205# else
206# define LOCKDEP_SYS_EXIT
207# define LOCKDEP_SYS_EXIT_IRQ
208# endif
209
210#endif /* __ASSEMBLY__ */
211#endif
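
native_save_fl() and raw_irqs_disabled_flags() above simply read EFLAGS and test the IF bit (X86_EFLAGS_IF, 0x200). Because pushf/pop is also legal in user mode, the same idea can be illustrated in a standalone program; this is a sketch for illustration only, not kernel code:

/* Standalone illustration (not kernel code): read EFLAGS the same way
 * native_save_fl() does and test the IF bit (X86_EFLAGS_IF == 0x200). */
#include <stdio.h>

#define X86_EFLAGS_IF 0x00000200

static unsigned long save_fl(void)
{
	unsigned long flags;

	asm volatile("pushf ; pop %0" : "=g" (flags) : : "memory");
	return flags;
}

static int irqs_disabled_flags(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);
}

int main(void)
{
	unsigned long flags = save_fl();

	/* In user mode IF is normally set, so this prints "enabled". */
	printf("interrupts %s (flags=%#lx)\n",
	       irqs_disabled_flags(flags) ? "disabled" : "enabled", flags);
	return 0;
}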
diff --git a/arch/x86/include/asm/ist.h b/arch/x86/include/asm/ist.h
new file mode 100644
index 000000000000..7e5dff1de0e9
--- /dev/null
+++ b/arch/x86/include/asm/ist.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_IST_H
2#define _ASM_X86_IST_H
3
4/*
5 * Include file for the interface to IST BIOS
6 * Copyright 2002 Andy Grover <andrew.grover@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License as published by the
10 * Free Software Foundation; either version 2, or (at your option) any
11 * later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 */
18
19
20#include <linux/types.h>
21
22struct ist_info {
23 __u32 signature;
24 __u32 command;
25 __u32 event;
26 __u32 perf_level;
27};
28
29#ifdef __KERNEL__
30
31extern struct ist_info ist_info;
32
33#endif /* __KERNEL__ */
34#endif /* _ASM_X86_IST_H */
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
new file mode 100644
index 000000000000..54c8cc53b24d
--- /dev/null
+++ b/arch/x86/include/asm/k8.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_K8_H
2#define _ASM_X86_K8_H
3
4#include <linux/pci.h>
5
6extern struct pci_device_id k8_nb_ids[];
7
8extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end);
14
15#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
new file mode 100644
index 000000000000..fa7c0b974761
--- /dev/null
+++ b/arch/x86/include/asm/kdebug.h
@@ -0,0 +1,37 @@
1#ifndef _ASM_X86_KDEBUG_H
2#define _ASM_X86_KDEBUG_H
3
4#include <linux/notifier.h>
5
6struct pt_regs;
7
8/* Grossly misnamed. */
9enum die_val {
10 DIE_OOPS = 1,
11 DIE_INT3,
12 DIE_DEBUG,
13 DIE_PANIC,
14 DIE_NMI,
15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG,
18 DIE_TRAP,
19 DIE_GPF,
20 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN,
24};
25
26extern void printk_address(unsigned long address, int reliable);
27extern void die(const char *, struct pt_regs *,long);
28extern int __must_check __die(const char *, struct pt_regs *, long);
29extern void show_registers(struct pt_regs *regs);
30extern void show_trace(struct task_struct *t, struct pt_regs *regs,
31 unsigned long *sp, unsigned long bp);
32extern void __show_regs(struct pt_regs *regs, int all);
33extern void show_regs(struct pt_regs *regs);
34extern unsigned long oops_begin(void);
35extern void oops_end(unsigned long, struct pt_regs *, int signr);
36
37#endif /* _ASM_X86_KDEBUG_H */
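
The die_val events above are delivered through the die notifier chain. A minimal sketch of a module that watches for DIE_OOPS follows; it assumes register_die_notifier() and struct die_args from <linux/kdebug.h> and is not part of this patch:

/* Minimal sketch (not part of this patch): watch for DIE_OOPS events via
 * the die notifier chain. */
#include <linux/module.h>
#include <linux/kdebug.h>

static int oops_watch(struct notifier_block *nb, unsigned long val, void *data)
{
	struct die_args *args = data;

	if (val == DIE_OOPS)
		pr_info("oops: %s (trap %d)\n", args->str, args->trapnr);
	return NOTIFY_DONE;
}

static struct notifier_block oops_nb = {
	.notifier_call = oops_watch,
};

static int __init oops_watch_init(void)
{
	return register_die_notifier(&oops_nb);
}

static void __exit oops_watch_exit(void)
{
	unregister_die_notifier(&oops_nb);
}

module_init(oops_watch_init);
module_exit(oops_watch_exit);
MODULE_LICENSE("GPL");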
diff --git a/arch/x86/include/asm/kexec.h b/arch/x86/include/asm/kexec.h
new file mode 100644
index 000000000000..a1f22771a15a
--- /dev/null
+++ b/arch/x86/include/asm/kexec.h
@@ -0,0 +1,175 @@
1#ifndef _ASM_X86_KEXEC_H
2#define _ASM_X86_KEXEC_H
3
4#ifdef CONFIG_X86_32
5# define PA_CONTROL_PAGE 0
6# define VA_CONTROL_PAGE 1
7# define PA_PGD 2
8# define VA_PGD 3
9# define PA_PTE_0 4
10# define VA_PTE_0 5
11# define PA_PTE_1 6
12# define VA_PTE_1 7
13# define PA_SWAP_PAGE 8
14# ifdef CONFIG_X86_PAE
15# define PA_PMD_0 9
16# define VA_PMD_0 10
17# define PA_PMD_1 11
18# define VA_PMD_1 12
19# define PAGES_NR 13
20# else
21# define PAGES_NR 9
22# endif
23#else
24# define PA_CONTROL_PAGE 0
25# define VA_CONTROL_PAGE 1
26# define PA_PGD 2
27# define VA_PGD 3
28# define PA_PUD_0 4
29# define VA_PUD_0 5
30# define PA_PMD_0 6
31# define VA_PMD_0 7
32# define PA_PTE_0 8
33# define VA_PTE_0 9
34# define PA_PUD_1 10
35# define VA_PUD_1 11
36# define PA_PMD_1 12
37# define VA_PMD_1 13
38# define PA_PTE_1 14
39# define VA_PTE_1 15
40# define PA_TABLE_PAGE 16
41# define PAGES_NR 17
42#endif
43
44#ifdef CONFIG_X86_32
45# define KEXEC_CONTROL_CODE_MAX_SIZE 2048
46#endif
47
48#ifndef __ASSEMBLY__
49
50#include <linux/string.h>
51
52#include <asm/page.h>
53#include <asm/ptrace.h>
54
55/*
 56 * KEXEC_SOURCE_MEMORY_LIMIT is the maximum page get_free_page can return,
 57 * i.e. the highest page that is mapped directly into kernel memory so
 58 * that kmap is not required.
59 *
60 * So far x86_64 is limited to 40 physical address bits.
61 */
62#ifdef CONFIG_X86_32
63/* Maximum physical address we can use pages from */
64# define KEXEC_SOURCE_MEMORY_LIMIT (-1UL)
65/* Maximum address we can reach in physical address mode */
66# define KEXEC_DESTINATION_MEMORY_LIMIT (-1UL)
67/* Maximum address we can use for the control code buffer */
68# define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
69
70# define KEXEC_CONTROL_PAGE_SIZE 4096
71
72/* The native architecture */
73# define KEXEC_ARCH KEXEC_ARCH_386
74
75/* We can also handle crash dumps from 64 bit kernel. */
76# define vmcore_elf_check_arch_cross(x) ((x)->e_machine == EM_X86_64)
77#else
78/* Maximum physical address we can use pages from */
79# define KEXEC_SOURCE_MEMORY_LIMIT (0xFFFFFFFFFFUL)
80/* Maximum address we can reach in physical address mode */
81# define KEXEC_DESTINATION_MEMORY_LIMIT (0xFFFFFFFFFFUL)
82/* Maximum address we can use for the control pages */
83# define KEXEC_CONTROL_MEMORY_LIMIT (0xFFFFFFFFFFUL)
84
85/* Allocate one page for the pdp and the second for the code */
86# define KEXEC_CONTROL_PAGE_SIZE (4096UL + 4096UL)
87
88/* The native architecture */
89# define KEXEC_ARCH KEXEC_ARCH_X86_64
90#endif
91
92/*
93 * CPU does not save ss and sp on stack if execution is already
94 * running in kernel mode at the time of NMI occurrence. This code
95 * fixes it.
96 */
97static inline void crash_fixup_ss_esp(struct pt_regs *newregs,
98 struct pt_regs *oldregs)
99{
100#ifdef CONFIG_X86_32
101 newregs->sp = (unsigned long)&(oldregs->sp);
102 asm volatile("xorl %%eax, %%eax\n\t"
103 "movw %%ss, %%ax\n\t"
104 :"=a"(newregs->ss));
105#endif
106}
107
108/*
109 * This function is responsible for capturing register states if coming
110 * via panic otherwise just fix up the ss and sp if coming via kernel
111 * mode exception.
112 */
113static inline void crash_setup_regs(struct pt_regs *newregs,
114 struct pt_regs *oldregs)
115{
116 if (oldregs) {
117 memcpy(newregs, oldregs, sizeof(*newregs));
118 crash_fixup_ss_esp(newregs, oldregs);
119 } else {
120#ifdef CONFIG_X86_32
121 asm volatile("movl %%ebx,%0" : "=m"(newregs->bx));
122 asm volatile("movl %%ecx,%0" : "=m"(newregs->cx));
123 asm volatile("movl %%edx,%0" : "=m"(newregs->dx));
124 asm volatile("movl %%esi,%0" : "=m"(newregs->si));
125 asm volatile("movl %%edi,%0" : "=m"(newregs->di));
126 asm volatile("movl %%ebp,%0" : "=m"(newregs->bp));
127 asm volatile("movl %%eax,%0" : "=m"(newregs->ax));
128 asm volatile("movl %%esp,%0" : "=m"(newregs->sp));
129 asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
130 asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
131 asm volatile("movl %%ds, %%eax;" :"=a"(newregs->ds));
132 asm volatile("movl %%es, %%eax;" :"=a"(newregs->es));
133 asm volatile("pushfl; popl %0" :"=m"(newregs->flags));
134#else
135 asm volatile("movq %%rbx,%0" : "=m"(newregs->bx));
136 asm volatile("movq %%rcx,%0" : "=m"(newregs->cx));
137 asm volatile("movq %%rdx,%0" : "=m"(newregs->dx));
138 asm volatile("movq %%rsi,%0" : "=m"(newregs->si));
139 asm volatile("movq %%rdi,%0" : "=m"(newregs->di));
140 asm volatile("movq %%rbp,%0" : "=m"(newregs->bp));
141 asm volatile("movq %%rax,%0" : "=m"(newregs->ax));
142 asm volatile("movq %%rsp,%0" : "=m"(newregs->sp));
143 asm volatile("movq %%r8,%0" : "=m"(newregs->r8));
144 asm volatile("movq %%r9,%0" : "=m"(newregs->r9));
145 asm volatile("movq %%r10,%0" : "=m"(newregs->r10));
146 asm volatile("movq %%r11,%0" : "=m"(newregs->r11));
147 asm volatile("movq %%r12,%0" : "=m"(newregs->r12));
148 asm volatile("movq %%r13,%0" : "=m"(newregs->r13));
149 asm volatile("movq %%r14,%0" : "=m"(newregs->r14));
150 asm volatile("movq %%r15,%0" : "=m"(newregs->r15));
151 asm volatile("movl %%ss, %%eax;" :"=a"(newregs->ss));
152 asm volatile("movl %%cs, %%eax;" :"=a"(newregs->cs));
153 asm volatile("pushfq; popq %0" :"=m"(newregs->flags));
154#endif
155 newregs->ip = (unsigned long)current_text_addr();
156 }
157}
158
159#ifdef CONFIG_X86_32
160asmlinkage unsigned long
161relocate_kernel(unsigned long indirection_page,
162 unsigned long control_page,
163 unsigned long start_address,
164 unsigned int has_pae,
165 unsigned int preserve_context);
166#else
167NORET_TYPE void
168relocate_kernel(unsigned long indirection_page,
169 unsigned long page_list,
170 unsigned long start_address) ATTRIB_NORET;
171#endif
172
173#endif /* __ASSEMBLY__ */
174
175#endif /* _ASM_X86_KEXEC_H */
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
new file mode 100644
index 000000000000..e6c6c808489f
--- /dev/null
+++ b/arch/x86/include/asm/kgdb.h
@@ -0,0 +1,79 @@
1#ifndef _ASM_X86_KGDB_H
2#define _ASM_X86_KGDB_H
3
4/*
5 * Copyright (C) 2001-2004 Amit S. Kale
6 * Copyright (C) 2008 Wind River Systems, Inc.
7 */
8
9/*
 10 * BUFMAX defines the maximum number of characters in the inbound/outbound
 11 * buffers; at least NUMREGBYTES*2 are needed for register packets.
 12 * A longer buffer is needed to list all threads.
13 */
14#define BUFMAX 1024
15
16/*
17 * Note that this register image is in a different order than
18 * the register image that Linux produces at interrupt time.
19 *
20 * Linux's register image is defined by struct pt_regs in ptrace.h.
21 * Just why GDB uses a different order is a historical mystery.
22 */
23#ifdef CONFIG_X86_32
24enum regnames {
25 GDB_AX, /* 0 */
26 GDB_CX, /* 1 */
27 GDB_DX, /* 2 */
28 GDB_BX, /* 3 */
29 GDB_SP, /* 4 */
30 GDB_BP, /* 5 */
31 GDB_SI, /* 6 */
32 GDB_DI, /* 7 */
33 GDB_PC, /* 8 also known as eip */
34 GDB_PS, /* 9 also known as eflags */
35 GDB_CS, /* 10 */
36 GDB_SS, /* 11 */
37 GDB_DS, /* 12 */
38 GDB_ES, /* 13 */
39 GDB_FS, /* 14 */
40 GDB_GS, /* 15 */
41};
42#define NUMREGBYTES ((GDB_GS+1)*4)
43#else /* ! CONFIG_X86_32 */
44enum regnames64 {
45 GDB_AX, /* 0 */
46 GDB_BX, /* 1 */
47 GDB_CX, /* 2 */
48 GDB_DX, /* 3 */
49 GDB_SI, /* 4 */
50 GDB_DI, /* 5 */
51 GDB_BP, /* 6 */
52 GDB_SP, /* 7 */
53 GDB_R8, /* 8 */
54 GDB_R9, /* 9 */
55 GDB_R10, /* 10 */
56 GDB_R11, /* 11 */
57 GDB_R12, /* 12 */
58 GDB_R13, /* 13 */
59 GDB_R14, /* 14 */
60 GDB_R15, /* 15 */
61 GDB_PC, /* 16 */
62};
63
64enum regnames32 {
65 GDB_PS = 34,
66 GDB_CS,
67 GDB_SS,
68};
69#define NUMREGBYTES ((GDB_SS+1)*4)
70#endif /* CONFIG_X86_32 */
71
72static inline void arch_kgdb_breakpoint(void)
73{
74 asm(" int $3");
75}
76#define BREAK_INSTR_SIZE 1
77#define CACHE_FLUSH_IS_SAFE 1
78
79#endif /* _ASM_X86_KGDB_H */
diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h
new file mode 100644
index 000000000000..5759c165a5cf
--- /dev/null
+++ b/arch/x86/include/asm/kmap_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_KMAP_TYPES_H
2#define _ASM_X86_KMAP_TYPES_H
3
4#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM)
5# define D(n) __KM_FENCE_##n ,
6#else
7# define D(n)
8#endif
9
10enum km_type {
11D(0) KM_BOUNCE_READ,
12D(1) KM_SKB_SUNRPC_DATA,
13D(2) KM_SKB_DATA_SOFTIRQ,
14D(3) KM_USER0,
15D(4) KM_USER1,
16D(5) KM_BIO_SRC_IRQ,
17D(6) KM_BIO_DST_IRQ,
18D(7) KM_PTE0,
19D(8) KM_PTE1,
20D(9) KM_IRQ0,
21D(10) KM_IRQ1,
22D(11) KM_SOFTIRQ0,
23D(12) KM_SOFTIRQ1,
24D(13) KM_TYPE_NR
25};
26
27#undef D
28
29#endif /* _ASM_X86_KMAP_TYPES_H */
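
The km_type values above name the fixed atomic-kmap slots. In kernels of this era a page is mapped into one of these slots with the two-argument kmap_atomic(); the sketch below assumes <linux/highmem.h> and a struct page obtained elsewhere, and is not part of this patch:

/* Minimal sketch (not part of this patch): temporarily map a page with an
 * atomic kmap slot, using the two-argument API of this era. */
#include <linux/highmem.h>
#include <linux/string.h>

static void zero_page_atomic(struct page *page)
{
	void *addr = kmap_atomic(page, KM_USER0);	/* claim the KM_USER0 slot */

	memset(addr, 0, PAGE_SIZE);
	kunmap_atomic(addr, KM_USER0);			/* release the slot */
}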
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
new file mode 100644
index 000000000000..4fe681de1e76
--- /dev/null
+++ b/arch/x86/include/asm/kprobes.h
@@ -0,0 +1,88 @@
1#ifndef _ASM_X86_KPROBES_H
2#define _ASM_X86_KPROBES_H
3/*
4 * Kernel Probes (KProbes)
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2002, 2004
21 *
22 * See arch/x86/kernel/kprobes.c for x86 kprobes history.
23 */
24#include <linux/types.h>
25#include <linux/ptrace.h>
26#include <linux/percpu.h>
27
28#define __ARCH_WANT_KPROBES_INSN_SLOT
29
30struct pt_regs;
31struct kprobe;
32
33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9
36#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \
39 (((MAX_STACK_SIZE) < (((unsigned long)current_thread_info()) + \
40 THREAD_SIZE - (unsigned long)(ADDR))) \
41 ? (MAX_STACK_SIZE) \
42 : (((unsigned long)current_thread_info()) + \
43 THREAD_SIZE - (unsigned long)(ADDR)))
44
45#define flush_insn_slot(p) do { } while (0)
46
47extern const int kretprobe_blacklist_size;
48
49void arch_remove_kprobe(struct kprobe *p);
50void kretprobe_trampoline(void);
51
 52/* Architecture specific copy of original instruction */
53struct arch_specific_insn {
54 /* copy of the original instruction */
55 kprobe_opcode_t *insn;
56 /*
57 * boostable = -1: This instruction type is not boostable.
58 * boostable = 0: This instruction type is boostable.
59 * boostable = 1: This instruction has been boosted: we have
60 * added a relative jump after the instruction copy in insn,
61 * so no single-step and fixup are needed (unless there's
62 * a post_handler or break_handler).
63 */
64 int boostable;
65};
66
67struct prev_kprobe {
68 struct kprobe *kp;
69 unsigned long status;
70 unsigned long old_flags;
71 unsigned long saved_flags;
72};
73
74/* per-cpu kprobe control block */
75struct kprobe_ctlblk {
76 unsigned long kprobe_status;
77 unsigned long kprobe_old_flags;
78 unsigned long kprobe_saved_flags;
79 unsigned long *jprobe_saved_sp;
80 struct pt_regs jprobe_saved_regs;
81 kprobe_opcode_t jprobes_stack[MAX_STACK_SIZE];
82 struct prev_kprobe prev_kprobe;
83};
84
85extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr);
86extern int kprobe_exceptions_notify(struct notifier_block *self,
87 unsigned long val, void *data);
88#endif /* _ASM_X86_KPROBES_H */
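
The structures above back the generic kprobes API from <linux/kprobes.h>. A minimal registration sketch follows; "do_fork" is only an example symbol and the module is illustrative, not part of this patch:

/* Minimal sketch (not part of this patch): place a kprobe on a kernel symbol. */
#include <linux/module.h>
#include <linux/kprobes.h>

static int pre_handler(struct kprobe *p, struct pt_regs *regs)
{
	pr_info("kprobe hit at %p\n", p->addr);
	return 0;	/* continue with single-step or boosted execution */
}

static struct kprobe kp = {
	.symbol_name	= "do_fork",	/* example symbol only */
	.pre_handler	= pre_handler,
};

static int __init kp_init(void)
{
	return register_kprobe(&kp);
}

static void __exit kp_exit(void)
{
	unregister_kprobe(&kp);
}

module_init(kp_init);
module_exit(kp_exit);
MODULE_LICENSE("GPL");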
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
new file mode 100644
index 000000000000..b95162af0bf6
--- /dev/null
+++ b/arch/x86/include/asm/kvm.h
@@ -0,0 +1,211 @@
1#ifndef _ASM_X86_KVM_H
2#define _ASM_X86_KVM_H
3
4/*
5 * KVM x86 specific structures and definitions
6 *
7 */
8
9#include <asm/types.h>
10#include <linux/ioctl.h>
11
12/* Architectural interrupt line count. */
13#define KVM_NR_INTERRUPTS 256
14
15struct kvm_memory_alias {
16 __u32 slot; /* this has a different namespace than memory slots */
17 __u32 flags;
18 __u64 guest_phys_addr;
19 __u64 memory_size;
20 __u64 target_phys_addr;
21};
22
23/* for KVM_GET_IRQCHIP and KVM_SET_IRQCHIP */
24struct kvm_pic_state {
25 __u8 last_irr; /* edge detection */
26 __u8 irr; /* interrupt request register */
27 __u8 imr; /* interrupt mask register */
28 __u8 isr; /* interrupt service register */
29 __u8 priority_add; /* highest irq priority */
30 __u8 irq_base;
31 __u8 read_reg_select;
32 __u8 poll;
33 __u8 special_mask;
34 __u8 init_state;
35 __u8 auto_eoi;
36 __u8 rotate_on_auto_eoi;
37 __u8 special_fully_nested_mode;
38 __u8 init4; /* true if 4 byte init */
39 __u8 elcr; /* PIIX edge/trigger selection */
40 __u8 elcr_mask;
41};
42
43#define KVM_IOAPIC_NUM_PINS 24
44struct kvm_ioapic_state {
45 __u64 base_address;
46 __u32 ioregsel;
47 __u32 id;
48 __u32 irr;
49 __u32 pad;
50 union {
51 __u64 bits;
52 struct {
53 __u8 vector;
54 __u8 delivery_mode:3;
55 __u8 dest_mode:1;
56 __u8 delivery_status:1;
57 __u8 polarity:1;
58 __u8 remote_irr:1;
59 __u8 trig_mode:1;
60 __u8 mask:1;
61 __u8 reserve:7;
62 __u8 reserved[4];
63 __u8 dest_id;
64 } fields;
65 } redirtbl[KVM_IOAPIC_NUM_PINS];
66};
67
68#define KVM_IRQCHIP_PIC_MASTER 0
69#define KVM_IRQCHIP_PIC_SLAVE 1
70#define KVM_IRQCHIP_IOAPIC 2
71
72/* for KVM_GET_REGS and KVM_SET_REGS */
73struct kvm_regs {
74 /* out (KVM_GET_REGS) / in (KVM_SET_REGS) */
75 __u64 rax, rbx, rcx, rdx;
76 __u64 rsi, rdi, rsp, rbp;
77 __u64 r8, r9, r10, r11;
78 __u64 r12, r13, r14, r15;
79 __u64 rip, rflags;
80};
81
82/* for KVM_GET_LAPIC and KVM_SET_LAPIC */
83#define KVM_APIC_REG_SIZE 0x400
84struct kvm_lapic_state {
85 char regs[KVM_APIC_REG_SIZE];
86};
87
88struct kvm_segment {
89 __u64 base;
90 __u32 limit;
91 __u16 selector;
92 __u8 type;
93 __u8 present, dpl, db, s, l, g, avl;
94 __u8 unusable;
95 __u8 padding;
96};
97
98struct kvm_dtable {
99 __u64 base;
100 __u16 limit;
101 __u16 padding[3];
102};
103
104
105/* for KVM_GET_SREGS and KVM_SET_SREGS */
106struct kvm_sregs {
107 /* out (KVM_GET_SREGS) / in (KVM_SET_SREGS) */
108 struct kvm_segment cs, ds, es, fs, gs, ss;
109 struct kvm_segment tr, ldt;
110 struct kvm_dtable gdt, idt;
111 __u64 cr0, cr2, cr3, cr4, cr8;
112 __u64 efer;
113 __u64 apic_base;
114 __u64 interrupt_bitmap[(KVM_NR_INTERRUPTS + 63) / 64];
115};
116
117/* for KVM_GET_FPU and KVM_SET_FPU */
118struct kvm_fpu {
119 __u8 fpr[8][16];
120 __u16 fcw;
121 __u16 fsw;
122 __u8 ftwx; /* in fxsave format */
123 __u8 pad1;
124 __u16 last_opcode;
125 __u64 last_ip;
126 __u64 last_dp;
127 __u8 xmm[16][16];
128 __u32 mxcsr;
129 __u32 pad2;
130};
131
132struct kvm_msr_entry {
133 __u32 index;
134 __u32 reserved;
135 __u64 data;
136};
137
138/* for KVM_GET_MSRS and KVM_SET_MSRS */
139struct kvm_msrs {
140 __u32 nmsrs; /* number of msrs in entries */
141 __u32 pad;
142
143 struct kvm_msr_entry entries[0];
144};
145
146/* for KVM_GET_MSR_INDEX_LIST */
147struct kvm_msr_list {
148 __u32 nmsrs; /* number of msrs in entries */
149 __u32 indices[0];
150};
151
152
153struct kvm_cpuid_entry {
154 __u32 function;
155 __u32 eax;
156 __u32 ebx;
157 __u32 ecx;
158 __u32 edx;
159 __u32 padding;
160};
161
162/* for KVM_SET_CPUID */
163struct kvm_cpuid {
164 __u32 nent;
165 __u32 padding;
166 struct kvm_cpuid_entry entries[0];
167};
168
169struct kvm_cpuid_entry2 {
170 __u32 function;
171 __u32 index;
172 __u32 flags;
173 __u32 eax;
174 __u32 ebx;
175 __u32 ecx;
176 __u32 edx;
177 __u32 padding[3];
178};
179
180#define KVM_CPUID_FLAG_SIGNIFCANT_INDEX 1
181#define KVM_CPUID_FLAG_STATEFUL_FUNC 2
182#define KVM_CPUID_FLAG_STATE_READ_NEXT 4
183
184/* for KVM_SET_CPUID2 */
185struct kvm_cpuid2 {
186 __u32 nent;
187 __u32 padding;
188 struct kvm_cpuid_entry2 entries[0];
189};
190
191/* for KVM_GET_PIT and KVM_SET_PIT */
192struct kvm_pit_channel_state {
193 __u32 count; /* can be 65536 */
194 __u16 latched_count;
195 __u8 count_latched;
196 __u8 status_latched;
197 __u8 status;
198 __u8 read_state;
199 __u8 write_state;
200 __u8 write_latch;
201 __u8 rw_mode;
202 __u8 mode;
203 __u8 bcd;
204 __u8 gate;
205 __s64 count_load_time;
206};
207
208struct kvm_pit_state {
209 struct kvm_pit_channel_state channels[3];
210};
211#endif /* _ASM_X86_KVM_H */
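
The kvm_regs layout above is what userspace exchanges with the kernel through the KVM_GET_REGS/KVM_SET_REGS ioctls on a vcpu file descriptor. A hedged userspace sketch follows (error handling omitted; assumes /dev/kvm is present and accessible; not part of this patch):

/* Userspace sketch (not part of this patch): read a vcpu's register file
 * through KVM_GET_REGS. Error handling omitted for brevity. */
#include <stdio.h>
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);
	struct kvm_regs regs;

	ioctl(vcpu, KVM_GET_REGS, &regs);
	printf("rip=%llx rsp=%llx rflags=%llx\n",
	       (unsigned long long)regs.rip,
	       (unsigned long long)regs.rsp,
	       (unsigned long long)regs.rflags);
	return 0;
}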
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
new file mode 100644
index 000000000000..65679d006337
--- /dev/null
+++ b/arch/x86/include/asm/kvm_host.h
@@ -0,0 +1,752 @@
1/*
2 * Kernel-based Virtual Machine driver for Linux
3 *
4 * This header defines architecture specific interfaces, x86 version
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 */
10
11#ifndef _ASM_X86_KVM_HOST_H
12#define _ASM_X86_KVM_HOST_H
13
14#include <linux/types.h>
15#include <linux/mm.h>
16#include <linux/mmu_notifier.h>
17
18#include <linux/kvm.h>
19#include <linux/kvm_para.h>
20#include <linux/kvm_types.h>
21
22#include <asm/pvclock-abi.h>
23#include <asm/desc.h>
24
25#define KVM_MAX_VCPUS 16
26#define KVM_MEMORY_SLOTS 32
 27/* memory slots that are not exposed to userspace */
28#define KVM_PRIVATE_MEM_SLOTS 4
29
30#define KVM_PIO_PAGE_OFFSET 1
31#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
32
33#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
34#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
35#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
36 0xFFFFFF0000000000ULL)
37
38#define KVM_GUEST_CR0_MASK \
39 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE \
40 | X86_CR0_NW | X86_CR0_CD)
41#define KVM_VM_CR0_ALWAYS_ON \
42 (X86_CR0_PG | X86_CR0_PE | X86_CR0_WP | X86_CR0_NE | X86_CR0_TS \
43 | X86_CR0_MP)
44#define KVM_GUEST_CR4_MASK \
45 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
46#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
47#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
48
49#define INVALID_PAGE (~(hpa_t)0)
50#define UNMAPPED_GVA (~(gpa_t)0)
51
52/* shadow tables are PAE even on non-PAE hosts */
53#define KVM_HPAGE_SHIFT 21
54#define KVM_HPAGE_SIZE (1UL << KVM_HPAGE_SHIFT)
55#define KVM_HPAGE_MASK (~(KVM_HPAGE_SIZE - 1))
56
57#define KVM_PAGES_PER_HPAGE (KVM_HPAGE_SIZE / PAGE_SIZE)
58
59#define DE_VECTOR 0
60#define DB_VECTOR 1
61#define BP_VECTOR 3
62#define OF_VECTOR 4
63#define BR_VECTOR 5
64#define UD_VECTOR 6
65#define NM_VECTOR 7
66#define DF_VECTOR 8
67#define TS_VECTOR 10
68#define NP_VECTOR 11
69#define SS_VECTOR 12
70#define GP_VECTOR 13
71#define PF_VECTOR 14
72#define MF_VECTOR 16
73#define MC_VECTOR 18
74
75#define SELECTOR_TI_MASK (1 << 2)
76#define SELECTOR_RPL_MASK 0x03
77
78#define IOPL_SHIFT 12
79
80#define KVM_ALIAS_SLOTS 4
81
82#define KVM_PERMILLE_MMU_PAGES 20
83#define KVM_MIN_ALLOC_MMU_PAGES 64
84#define KVM_MMU_HASH_SHIFT 10
85#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
86#define KVM_MIN_FREE_MMU_PAGES 5
87#define KVM_REFILL_PAGES 25
88#define KVM_MAX_CPUID_ENTRIES 40
89#define KVM_NR_VAR_MTRR 8
90
91extern spinlock_t kvm_lock;
92extern struct list_head vm_list;
93
94struct kvm_vcpu;
95struct kvm;
96
97enum kvm_reg {
98 VCPU_REGS_RAX = 0,
99 VCPU_REGS_RCX = 1,
100 VCPU_REGS_RDX = 2,
101 VCPU_REGS_RBX = 3,
102 VCPU_REGS_RSP = 4,
103 VCPU_REGS_RBP = 5,
104 VCPU_REGS_RSI = 6,
105 VCPU_REGS_RDI = 7,
106#ifdef CONFIG_X86_64
107 VCPU_REGS_R8 = 8,
108 VCPU_REGS_R9 = 9,
109 VCPU_REGS_R10 = 10,
110 VCPU_REGS_R11 = 11,
111 VCPU_REGS_R12 = 12,
112 VCPU_REGS_R13 = 13,
113 VCPU_REGS_R14 = 14,
114 VCPU_REGS_R15 = 15,
115#endif
116 VCPU_REGS_RIP,
117 NR_VCPU_REGS
118};
119
120enum {
121 VCPU_SREG_ES,
122 VCPU_SREG_CS,
123 VCPU_SREG_SS,
124 VCPU_SREG_DS,
125 VCPU_SREG_FS,
126 VCPU_SREG_GS,
127 VCPU_SREG_TR,
128 VCPU_SREG_LDTR,
129};
130
131#include <asm/kvm_x86_emulate.h>
132
133#define KVM_NR_MEM_OBJS 40
134
135struct kvm_guest_debug {
136 int enabled;
137 unsigned long bp[4];
138 int singlestep;
139};
140
141/*
142 * We don't want allocation failures within the mmu code, so we preallocate
143 * enough memory for a single page fault in a cache.
144 */
145struct kvm_mmu_memory_cache {
146 int nobjs;
147 void *objects[KVM_NR_MEM_OBJS];
148};
149
150#define NR_PTE_CHAIN_ENTRIES 5
151
152struct kvm_pte_chain {
153 u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES];
154 struct hlist_node link;
155};
156
157/*
158 * kvm_mmu_page_role, below, is defined as:
159 *
160 * bits 0:3 - total guest paging levels (2-4, or zero for real mode)
161 * bits 4:7 - page table level for this shadow (1-4)
162 * bits 8:9 - page table quadrant for 2-level guests
163 * bit 16 - "metaphysical" - gfn is not a real page (huge page/real mode)
164 * bits 17:19 - common access permissions for all ptes in this shadow page
165 */
166union kvm_mmu_page_role {
167 unsigned word;
168 struct {
169 unsigned glevels:4;
170 unsigned level:4;
171 unsigned quadrant:2;
172 unsigned pad_for_nice_hex_output:6;
173 unsigned metaphysical:1;
174 unsigned access:3;
175 unsigned invalid:1;
176 };
177};
178
179struct kvm_mmu_page {
180 struct list_head link;
181 struct hlist_node hash_link;
182
183 /*
184 * The following two entries are used to key the shadow page in the
185 * hash table.
186 */
187 gfn_t gfn;
188 union kvm_mmu_page_role role;
189
190 u64 *spt;
191 /* hold the gfn of each spte inside spt */
192 gfn_t *gfns;
193 unsigned long slot_bitmap; /* One bit set per slot which has memory
194 * in this shadow page.
195 */
196 int multimapped; /* More than one parent_pte? */
197 int root_count; /* Currently serving as active root */
198 bool unsync;
199 bool unsync_children;
200 union {
201 u64 *parent_pte; /* !multimapped */
202 struct hlist_head parent_ptes; /* multimapped, kvm_pte_chain */
203 };
204 DECLARE_BITMAP(unsync_child_bitmap, 512);
205};
206
207struct kvm_pv_mmu_op_buffer {
208 void *ptr;
209 unsigned len;
210 unsigned processed;
211 char buf[512] __aligned(sizeof(long));
212};
213
214/*
215 * x86 supports 3 paging modes (4-level 64-bit, 3-level 64-bit, and 2-level
216 * 32-bit). The kvm_mmu structure abstracts the details of the current mmu
217 * mode.
218 */
219struct kvm_mmu {
220 void (*new_cr3)(struct kvm_vcpu *vcpu);
221 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
222 void (*free)(struct kvm_vcpu *vcpu);
223 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva);
224 void (*prefetch_page)(struct kvm_vcpu *vcpu,
225 struct kvm_mmu_page *page);
226 int (*sync_page)(struct kvm_vcpu *vcpu,
227 struct kvm_mmu_page *sp);
228 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
229 hpa_t root_hpa;
230 int root_level;
231 int shadow_root_level;
232
233 u64 *pae_root;
234};
235
236struct kvm_vcpu_arch {
237 u64 host_tsc;
238 int interrupt_window_open;
239 unsigned long irq_summary; /* bit vector: 1 per word in irq_pending */
240 DECLARE_BITMAP(irq_pending, KVM_NR_INTERRUPTS);
241 /*
242 * rip and regs accesses must go through
243 * kvm_{register,rip}_{read,write} functions.
244 */
245 unsigned long regs[NR_VCPU_REGS];
246 u32 regs_avail;
247 u32 regs_dirty;
248
249 unsigned long cr0;
250 unsigned long cr2;
251 unsigned long cr3;
252 unsigned long cr4;
253 unsigned long cr8;
254 u64 pdptrs[4]; /* pae */
255 u64 shadow_efer;
256 u64 apic_base;
257 struct kvm_lapic *apic; /* kernel irqchip context */
258 int mp_state;
259 int sipi_vector;
260 u64 ia32_misc_enable_msr;
261 bool tpr_access_reporting;
262
263 struct kvm_mmu mmu;
264 /* only needed in kvm_pv_mmu_op() path, but it's hot so
265 * put it here to avoid allocation */
266 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
267
268 struct kvm_mmu_memory_cache mmu_pte_chain_cache;
269 struct kvm_mmu_memory_cache mmu_rmap_desc_cache;
270 struct kvm_mmu_memory_cache mmu_page_cache;
271 struct kvm_mmu_memory_cache mmu_page_header_cache;
272
273 gfn_t last_pt_write_gfn;
274 int last_pt_write_count;
275 u64 *last_pte_updated;
276 gfn_t last_pte_gfn;
277
278 struct {
279 gfn_t gfn; /* presumed gfn during guest pte update */
280 pfn_t pfn; /* pfn corresponding to that gfn */
281 int largepage;
282 unsigned long mmu_seq;
283 } update_pte;
284
285 struct i387_fxsave_struct host_fx_image;
286 struct i387_fxsave_struct guest_fx_image;
287
288 gva_t mmio_fault_cr2;
289 struct kvm_pio_request pio;
290 void *pio_data;
291
292 struct kvm_queued_exception {
293 bool pending;
294 bool has_error_code;
295 u8 nr;
296 u32 error_code;
297 } exception;
298
299 struct kvm_queued_interrupt {
300 bool pending;
301 u8 nr;
302 } interrupt;
303
304 struct {
305 int active;
306 u8 save_iopl;
307 struct kvm_save_segment {
308 u16 selector;
309 unsigned long base;
310 u32 limit;
311 u32 ar;
312 } tr, es, ds, fs, gs;
313 } rmode;
314 int halt_request; /* real mode on Intel only */
315
316 int cpuid_nent;
317 struct kvm_cpuid_entry2 cpuid_entries[KVM_MAX_CPUID_ENTRIES];
318 /* emulate context */
319
320 struct x86_emulate_ctxt emulate_ctxt;
321
322 gpa_t time;
323 struct pvclock_vcpu_time_info hv_clock;
324 unsigned int hv_clock_tsc_khz;
325 unsigned int time_offset;
326 struct page *time_page;
327
328 bool nmi_pending;
329 bool nmi_injected;
330
331 u64 mtrr[0x100];
332};
333
334struct kvm_mem_alias {
335 gfn_t base_gfn;
336 unsigned long npages;
337 gfn_t target_gfn;
338};
339
340struct kvm_arch{
341 int naliases;
342 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
343
344 unsigned int n_free_mmu_pages;
345 unsigned int n_requested_mmu_pages;
346 unsigned int n_alloc_mmu_pages;
347 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
348 /*
349 * Hash table of struct kvm_mmu_page.
350 */
351 struct list_head active_mmu_pages;
352 struct list_head assigned_dev_head;
353 struct dmar_domain *intel_iommu_domain;
354 struct kvm_pic *vpic;
355 struct kvm_ioapic *vioapic;
356 struct kvm_pit *vpit;
357 struct hlist_head irq_ack_notifier_list;
358
359 int round_robin_prev_vcpu;
360 unsigned int tss_addr;
361 struct page *apic_access_page;
362
363 gpa_t wall_clock;
364
365 struct page *ept_identity_pagetable;
366 bool ept_identity_pagetable_done;
367};
368
369struct kvm_vm_stat {
370 u32 mmu_shadow_zapped;
371 u32 mmu_pte_write;
372 u32 mmu_pte_updated;
373 u32 mmu_pde_zapped;
374 u32 mmu_flooded;
375 u32 mmu_recycled;
376 u32 mmu_cache_miss;
377 u32 mmu_unsync;
378 u32 remote_tlb_flush;
379 u32 lpages;
380};
381
382struct kvm_vcpu_stat {
383 u32 pf_fixed;
384 u32 pf_guest;
385 u32 tlb_flush;
386 u32 invlpg;
387
388 u32 exits;
389 u32 io_exits;
390 u32 mmio_exits;
391 u32 signal_exits;
392 u32 irq_window_exits;
393 u32 nmi_window_exits;
394 u32 halt_exits;
395 u32 halt_wakeup;
396 u32 request_irq_exits;
397 u32 irq_exits;
398 u32 host_state_reload;
399 u32 efer_reload;
400 u32 fpu_reload;
401 u32 insn_emulation;
402 u32 insn_emulation_fail;
403 u32 hypercalls;
404 u32 irq_injections;
405};
406
407struct descriptor_table {
408 u16 limit;
409 unsigned long base;
410} __attribute__((packed));
411
412struct kvm_x86_ops {
413 int (*cpu_has_kvm_support)(void); /* __init */
414 int (*disabled_by_bios)(void); /* __init */
415 void (*hardware_enable)(void *dummy); /* __init */
416 void (*hardware_disable)(void *dummy);
417 void (*check_processor_compatibility)(void *rtn);
418 int (*hardware_setup)(void); /* __init */
419 void (*hardware_unsetup)(void); /* __exit */
420 bool (*cpu_has_accelerated_tpr)(void);
421
422 /* Create, but do not attach this VCPU */
423 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
424 void (*vcpu_free)(struct kvm_vcpu *vcpu);
425 int (*vcpu_reset)(struct kvm_vcpu *vcpu);
426
427 void (*prepare_guest_switch)(struct kvm_vcpu *vcpu);
428 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
429 void (*vcpu_put)(struct kvm_vcpu *vcpu);
430
431 int (*set_guest_debug)(struct kvm_vcpu *vcpu,
432 struct kvm_debug_guest *dbg);
433 void (*guest_debug_pre)(struct kvm_vcpu *vcpu);
434 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
435 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
436 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
437 void (*get_segment)(struct kvm_vcpu *vcpu,
438 struct kvm_segment *var, int seg);
439 int (*get_cpl)(struct kvm_vcpu *vcpu);
440 void (*set_segment)(struct kvm_vcpu *vcpu,
441 struct kvm_segment *var, int seg);
442 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
443 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
444 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
445 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
446 void (*set_cr4)(struct kvm_vcpu *vcpu, unsigned long cr4);
447 void (*set_efer)(struct kvm_vcpu *vcpu, u64 efer);
448 void (*get_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
449 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
450 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
451 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
452 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr);
453 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value,
454 int *exception);
455 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
456 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
457 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
458
459 void (*tlb_flush)(struct kvm_vcpu *vcpu);
460
461 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run);
462 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu);
463 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
464 void (*patch_hypercall)(struct kvm_vcpu *vcpu,
465 unsigned char *hypercall_addr);
466 int (*get_irq)(struct kvm_vcpu *vcpu);
467 void (*set_irq)(struct kvm_vcpu *vcpu, int vec);
468 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
469 bool has_error_code, u32 error_code);
470 bool (*exception_injected)(struct kvm_vcpu *vcpu);
471 void (*inject_pending_irq)(struct kvm_vcpu *vcpu);
472 void (*inject_pending_vectors)(struct kvm_vcpu *vcpu,
473 struct kvm_run *run);
474
475 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
476 int (*get_tdp_level)(void);
477};
478
479extern struct kvm_x86_ops *kvm_x86_ops;
480
481int kvm_mmu_module_init(void);
482void kvm_mmu_module_exit(void);
483
484void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
485int kvm_mmu_create(struct kvm_vcpu *vcpu);
486int kvm_mmu_setup(struct kvm_vcpu *vcpu);
487void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
488void kvm_mmu_set_base_ptes(u64 base_pte);
489void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
490 u64 dirty_mask, u64 nx_mask, u64 x_mask);
491
492int kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
493void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot);
494void kvm_mmu_zap_all(struct kvm *kvm);
495unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
496void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
497
498int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
499
500int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
501 const void *val, int bytes);
502int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
503 gpa_t addr, unsigned long *ret);
504
505extern bool tdp_enabled;
506
507enum emulation_result {
508 EMULATE_DONE, /* no further processing */
509 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
510 EMULATE_FAIL, /* can't emulate this instruction */
511};
512
513#define EMULTYPE_NO_DECODE (1 << 0)
514#define EMULTYPE_TRAP_UD (1 << 1)
515int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run,
516 unsigned long cr2, u16 error_code, int emulation_type);
517void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context);
518void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
519void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
520void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
521 unsigned long *rflags);
522
523unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr);
524void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long value,
525 unsigned long *rflags);
526void kvm_enable_efer_bits(u64);
527int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
528int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
529
530struct x86_emulate_ctxt;
531
532int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
533 int size, unsigned port);
534int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
535 int size, unsigned long count, int down,
536 gva_t address, int rep, unsigned port);
537void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
538int kvm_emulate_halt(struct kvm_vcpu *vcpu);
539int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
540int emulate_clts(struct kvm_vcpu *vcpu);
541int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
542 unsigned long *dest);
543int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
544 unsigned long value);
545
546void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
547int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
548 int type_bits, int seg);
549
550int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
551
552void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
553void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
554void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
555void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
556unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
557void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw);
558void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
559
560int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
561int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
562
563void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
564void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
565void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
566 u32 error_code);
567
568void kvm_pic_set_irq(void *opaque, int irq, int level);
569
570void kvm_inject_nmi(struct kvm_vcpu *vcpu);
571
572void fx_init(struct kvm_vcpu *vcpu);
573
574int emulator_read_std(unsigned long addr,
575 void *val,
576 unsigned int bytes,
577 struct kvm_vcpu *vcpu);
578int emulator_write_emulated(unsigned long addr,
579 const void *val,
580 unsigned int bytes,
581 struct kvm_vcpu *vcpu);
582
583unsigned long segment_base(u16 selector);
584
585void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu);
586void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
587 const u8 *new, int bytes);
588int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva);
589void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
590int kvm_mmu_load(struct kvm_vcpu *vcpu);
591void kvm_mmu_unload(struct kvm_vcpu *vcpu);
592void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
593
594int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
595
596int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
597
598int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
599void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
600
601void kvm_enable_tdp(void);
602void kvm_disable_tdp(void);
603
604int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
605int complete_pio(struct kvm_vcpu *vcpu);
606
607static inline struct kvm_mmu_page *page_header(hpa_t shadow_page)
608{
609 struct page *page = pfn_to_page(shadow_page >> PAGE_SHIFT);
610
611 return (struct kvm_mmu_page *)page_private(page);
612}
613
614static inline u16 kvm_read_fs(void)
615{
616 u16 seg;
617 asm("mov %%fs, %0" : "=g"(seg));
618 return seg;
619}
620
621static inline u16 kvm_read_gs(void)
622{
623 u16 seg;
624 asm("mov %%gs, %0" : "=g"(seg));
625 return seg;
626}
627
628static inline u16 kvm_read_ldt(void)
629{
630 u16 ldt;
631 asm("sldt %0" : "=g"(ldt));
632 return ldt;
633}
634
635static inline void kvm_load_fs(u16 sel)
636{
637 asm("mov %0, %%fs" : : "rm"(sel));
638}
639
640static inline void kvm_load_gs(u16 sel)
641{
642 asm("mov %0, %%gs" : : "rm"(sel));
643}
644
645static inline void kvm_load_ldt(u16 sel)
646{
647 asm("lldt %0" : : "rm"(sel));
648}
649
650static inline void kvm_get_idt(struct descriptor_table *table)
651{
652 asm("sidt %0" : "=m"(*table));
653}
654
655static inline void kvm_get_gdt(struct descriptor_table *table)
656{
657 asm("sgdt %0" : "=m"(*table));
658}
659
660static inline unsigned long kvm_read_tr_base(void)
661{
662 u16 tr;
663 asm("str %0" : "=g"(tr));
664 return segment_base(tr);
665}
666
667#ifdef CONFIG_X86_64
668static inline unsigned long read_msr(unsigned long msr)
669{
670 u64 value;
671
672 rdmsrl(msr, value);
673 return value;
674}
675#endif
676
677static inline void kvm_fx_save(struct i387_fxsave_struct *image)
678{
679 asm("fxsave (%0)":: "r" (image));
680}
681
682static inline void kvm_fx_restore(struct i387_fxsave_struct *image)
683{
684 asm("fxrstor (%0)":: "r" (image));
685}
686
687static inline void kvm_fx_finit(void)
688{
689 asm("finit");
690}
691
692static inline u32 get_rdx_init_val(void)
693{
694 return 0x600; /* P6 family */
695}
696
697static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code)
698{
699 kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
700}
701
702#define ASM_VMX_VMCLEAR_RAX ".byte 0x66, 0x0f, 0xc7, 0x30"
703#define ASM_VMX_VMLAUNCH ".byte 0x0f, 0x01, 0xc2"
704#define ASM_VMX_VMRESUME ".byte 0x0f, 0x01, 0xc3"
705#define ASM_VMX_VMPTRLD_RAX ".byte 0x0f, 0xc7, 0x30"
706#define ASM_VMX_VMREAD_RDX_RAX ".byte 0x0f, 0x78, 0xd0"
707#define ASM_VMX_VMWRITE_RAX_RDX ".byte 0x0f, 0x79, 0xd0"
708#define ASM_VMX_VMWRITE_RSP_RDX ".byte 0x0f, 0x79, 0xd4"
709#define ASM_VMX_VMXOFF ".byte 0x0f, 0x01, 0xc4"
710#define ASM_VMX_VMXON_RAX ".byte 0xf3, 0x0f, 0xc7, 0x30"
711#define ASM_VMX_INVEPT ".byte 0x66, 0x0f, 0x38, 0x80, 0x08"
712#define ASM_VMX_INVVPID ".byte 0x66, 0x0f, 0x38, 0x81, 0x08"
713
714#define MSR_IA32_TIME_STAMP_COUNTER 0x010
715
716#define TSS_IOPB_BASE_OFFSET 0x66
717#define TSS_BASE_SIZE 0x68
718#define TSS_IOPB_SIZE (65536 / 8)
719#define TSS_REDIRECTION_SIZE (256 / 8)
720#define RMODE_TSS_SIZE \
721 (TSS_BASE_SIZE + TSS_REDIRECTION_SIZE + TSS_IOPB_SIZE + 1)
722
723enum {
724 TASK_SWITCH_CALL = 0,
725 TASK_SWITCH_IRET = 1,
726 TASK_SWITCH_JMP = 2,
727 TASK_SWITCH_GATE = 3,
728};
729
730/*
731 * Hardware virtualization extension instructions may fault if a
732 * reboot turns off virtualization while processes are running.
733 * Trap the fault and ignore the instruction if that happens.
734 */
735asmlinkage void kvm_handle_fault_on_reboot(void);
736
737#define __kvm_handle_fault_on_reboot(insn) \
738 "666: " insn "\n\t" \
739 ".pushsection .fixup, \"ax\" \n" \
740 "667: \n\t" \
741 __ASM_SIZE(push) " $666b \n\t" \
742 "jmp kvm_handle_fault_on_reboot \n\t" \
743 ".popsection \n\t" \
744 ".pushsection __ex_table, \"a\" \n\t" \
745 _ASM_PTR " 666b, 667b \n\t" \
746 ".popsection"
747
748#define KVM_ARCH_WANT_MMU_NOTIFIER
749int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
750int kvm_age_hva(struct kvm *kvm, unsigned long hva);
751
752#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
new file mode 100644
index 000000000000..b8a3305ae093
--- /dev/null
+++ b/arch/x86/include/asm/kvm_para.h
@@ -0,0 +1,147 @@
1#ifndef _ASM_X86_KVM_PARA_H
2#define _ASM_X86_KVM_PARA_H
3
4/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
  5 * should be used to determine whether a VM is running under KVM.
6 */
7#define KVM_CPUID_SIGNATURE 0x40000000
8
9/* This CPUID returns a feature bitmap in eax. Before enabling a particular
10 * paravirtualization, the appropriate feature bit should be checked.
11 */
12#define KVM_CPUID_FEATURES 0x40000001
13#define KVM_FEATURE_CLOCKSOURCE 0
14#define KVM_FEATURE_NOP_IO_DELAY 1
15#define KVM_FEATURE_MMU_OP 2
16
17#define MSR_KVM_WALL_CLOCK 0x11
18#define MSR_KVM_SYSTEM_TIME 0x12
19
20#define KVM_MAX_MMU_OP_BATCH 32
21
22/* Operations for KVM_HC_MMU_OP */
23#define KVM_MMU_OP_WRITE_PTE 1
24#define KVM_MMU_OP_FLUSH_TLB 2
25#define KVM_MMU_OP_RELEASE_PT 3
26
27/* Payload for KVM_HC_MMU_OP */
28struct kvm_mmu_op_header {
29 __u32 op;
30 __u32 pad;
31};
32
33struct kvm_mmu_op_write_pte {
34 struct kvm_mmu_op_header header;
35 __u64 pte_phys;
36 __u64 pte_val;
37};
38
39struct kvm_mmu_op_flush_tlb {
40 struct kvm_mmu_op_header header;
41};
42
43struct kvm_mmu_op_release_pt {
44 struct kvm_mmu_op_header header;
45 __u64 pt_phys;
46};
47
48#ifdef __KERNEL__
49#include <asm/processor.h>
50
51extern void kvmclock_init(void);
52
53
54/* This instruction is vmcall. On non-VT architectures, it will generate a
55 * trap that we will then rewrite to the appropriate instruction.
56 */
57#define KVM_HYPERCALL ".byte 0x0f,0x01,0xc1"
58
 59/* For KVM hypercalls, a three-byte sequence of either the vmcall or the vmmcall
60 * instruction. The hypervisor may replace it with something else but only the
61 * instructions are guaranteed to be supported.
62 *
63 * Up to four arguments may be passed in rbx, rcx, rdx, and rsi respectively.
64 * The hypercall number should be placed in rax and the return value will be
 65 * placed in rax. No other registers will be clobbered unless explicitly
 66 * noted by the particular hypercall.
67 */
68
69static inline long kvm_hypercall0(unsigned int nr)
70{
71 long ret;
72 asm volatile(KVM_HYPERCALL
73 : "=a"(ret)
74 : "a"(nr)
75 : "memory");
76 return ret;
77}
78
79static inline long kvm_hypercall1(unsigned int nr, unsigned long p1)
80{
81 long ret;
82 asm volatile(KVM_HYPERCALL
83 : "=a"(ret)
84 : "a"(nr), "b"(p1)
85 : "memory");
86 return ret;
87}
88
89static inline long kvm_hypercall2(unsigned int nr, unsigned long p1,
90 unsigned long p2)
91{
92 long ret;
93 asm volatile(KVM_HYPERCALL
94 : "=a"(ret)
95 : "a"(nr), "b"(p1), "c"(p2)
96 : "memory");
97 return ret;
98}
99
100static inline long kvm_hypercall3(unsigned int nr, unsigned long p1,
101 unsigned long p2, unsigned long p3)
102{
103 long ret;
104 asm volatile(KVM_HYPERCALL
105 : "=a"(ret)
106 : "a"(nr), "b"(p1), "c"(p2), "d"(p3)
107 : "memory");
108 return ret;
109}
110
111static inline long kvm_hypercall4(unsigned int nr, unsigned long p1,
112 unsigned long p2, unsigned long p3,
113 unsigned long p4)
114{
115 long ret;
116 asm volatile(KVM_HYPERCALL
117 : "=a"(ret)
118 : "a"(nr), "b"(p1), "c"(p2), "d"(p3), "S"(p4)
119 : "memory");
120 return ret;
121}
122
123static inline int kvm_para_available(void)
124{
125 unsigned int eax, ebx, ecx, edx;
126 char signature[13];
127
128 cpuid(KVM_CPUID_SIGNATURE, &eax, &ebx, &ecx, &edx);
129 memcpy(signature + 0, &ebx, 4);
130 memcpy(signature + 4, &ecx, 4);
131 memcpy(signature + 8, &edx, 4);
132 signature[12] = 0;
133
134 if (strcmp(signature, "KVMKVMKVM") == 0)
135 return 1;
136
137 return 0;
138}
139
140static inline unsigned int kvm_arch_para_features(void)
141{
142 return cpuid_eax(KVM_CPUID_FEATURES);
143}
144
145#endif
146
147#endif /* _ASM_X86_KVM_PARA_H */
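
A minimal usage sketch (illustrative only, not part of this header): how guest code might probe for KVM and check a feature bit before issuing a hypercall. KVM_HC_EXAMPLE_NR is a hypothetical hypercall number used purely for illustration; real numbers are defined in <linux/kvm_para.h>, which is not part of this patch.

static inline int example_kvm_probe(void)
{
	if (!kvm_para_available())
		return 0;			/* not running under KVM */

	if (!(kvm_arch_para_features() & (1 << KVM_FEATURE_NOP_IO_DELAY)))
		return 0;			/* feature not offered by the host */

	/* KVM_HC_EXAMPLE_NR is hypothetical, for illustration only */
	return kvm_hypercall1(KVM_HC_EXAMPLE_NR, 0) == 0;
}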
diff --git a/arch/x86/include/asm/kvm_x86_emulate.h b/arch/x86/include/asm/kvm_x86_emulate.h
new file mode 100644
index 000000000000..25179a29f208
--- /dev/null
+++ b/arch/x86/include/asm/kvm_x86_emulate.h
@@ -0,0 +1,184 @@
1/******************************************************************************
2 * x86_emulate.h
3 *
4 * Generic x86 (32-bit and 64-bit) instruction decoder and emulator.
5 *
6 * Copyright (c) 2005 Keir Fraser
7 *
8 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
9 */
10
11#ifndef _ASM_X86_KVM_X86_EMULATE_H
12#define _ASM_X86_KVM_X86_EMULATE_H
13
14struct x86_emulate_ctxt;
15
16/*
17 * x86_emulate_ops:
18 *
19 * These operations represent the instruction emulator's interface to memory.
20 * There are two categories of operation: those that act on ordinary memory
21 * regions (*_std), and those that act on memory regions known to require
22 * special treatment or emulation (*_emulated).
23 *
24 * The emulator assumes that an instruction accesses only one 'emulated memory'
25 * location, that this location is the given linear faulting address (cr2), and
26 * that this is one of the instruction's data operands. Instruction fetches and
27 * stack operations are assumed never to access emulated memory. The emulator
28 * automatically deduces which operand of a string-move operation is accessing
29 * emulated memory, and assumes that the other operand accesses normal memory.
30 *
31 * NOTES:
32 * 1. The emulator isn't very smart about emulated vs. standard memory.
33 * 'Emulated memory' access addresses should be checked for sanity.
34 * 'Normal memory' accesses may fault, and the caller must arrange to
35 * detect and handle reentrancy into the emulator via recursive faults.
36 * Accesses may be unaligned and may cross page boundaries.
37 * 2. If the access fails (cannot emulate, or a standard access faults) then
38 * it is up to the memop to propagate the fault to the guest VM via
39 * some out-of-band mechanism, unknown to the emulator. The memop signals
40 * failure by returning X86EMUL_PROPAGATE_FAULT to the emulator, which will
41 * then immediately bail.
42 * 3. Valid access sizes are 1, 2, 4 and 8 bytes. On x86/32 systems only
43 * cmpxchg8b_emulated need support 8-byte accesses.
44 * 4. The emulator cannot handle 64-bit mode emulation on an x86/32 system.
45 */
46/* Access completed successfully: continue emulation as normal. */
47#define X86EMUL_CONTINUE 0
48/* Access is unhandleable: bail from emulation and return error to caller. */
49#define X86EMUL_UNHANDLEABLE 1
50/* Terminate emulation but return success to the caller. */
51#define X86EMUL_PROPAGATE_FAULT 2 /* propagate a generated fault to guest */
52#define X86EMUL_RETRY_INSTR 2 /* retry the instruction for some reason */
53#define X86EMUL_CMPXCHG_FAILED 2 /* cmpxchg did not see expected value */
54struct x86_emulate_ops {
55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others.
58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory.
61 */
62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu);
64
65 /*
66 * read_emulated: Read bytes from emulated/special memory area.
67 * @addr: [IN ] Linear address from which to read.
68 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
69 * @bytes: [IN ] Number of bytes to read from memory.
70 */
71 int (*read_emulated)(unsigned long addr,
72 void *val,
73 unsigned int bytes,
74 struct kvm_vcpu *vcpu);
75
76 /*
77 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required).
81 * @bytes: [IN ] Number of bytes to write to memory.
82 */
83 int (*write_emulated)(unsigned long addr,
84 const void *val,
85 unsigned int bytes,
86 struct kvm_vcpu *vcpu);
87
88 /*
89 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
90 * emulated/special memory area.
91 * @addr: [IN ] Linear address to access.
92 * @old: [IN ] Value expected to be current at @addr.
93 * @new: [IN ] Value to write to @addr.
94 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
95 */
96 int (*cmpxchg_emulated)(unsigned long addr,
97 const void *old,
98 const void *new,
99 unsigned int bytes,
100 struct kvm_vcpu *vcpu);
101
102};
103
104/* Type, address-of, and value of an instruction's operand. */
105struct operand {
106 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type;
107 unsigned int bytes;
108 unsigned long val, orig_val, *ptr;
109};
110
111struct fetch_cache {
112 u8 data[15];
113 unsigned long start;
114 unsigned long end;
115};
116
117struct decode_cache {
118 u8 twobyte;
119 u8 b;
120 u8 lock_prefix;
121 u8 rep_prefix;
122 u8 op_bytes;
123 u8 ad_bytes;
124 u8 rex_prefix;
125 struct operand src;
126 struct operand dst;
127 bool has_seg_override;
128 u8 seg_override;
129 unsigned int d;
130 unsigned long regs[NR_VCPU_REGS];
131 unsigned long eip;
132 /* modrm */
133 u8 modrm;
134 u8 modrm_mod;
135 u8 modrm_reg;
136 u8 modrm_rm;
137 u8 use_modrm_ea;
138 bool rip_relative;
139 unsigned long modrm_ea;
140 void *modrm_ptr;
141 unsigned long modrm_val;
142 struct fetch_cache fetch;
143};
144
145struct x86_emulate_ctxt {
146 /* Register state before/after emulation. */
147 struct kvm_vcpu *vcpu;
148
149	/* Guest EFLAGS before/after emulation. */
150 unsigned long eflags;
151
152 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
153 int mode;
154
155 u32 cs_base;
156
157 /* decode cache */
158
159 struct decode_cache decode;
160};
161
162/* Repeat String Operation Prefix */
163#define REPE_PREFIX 1
164#define REPNE_PREFIX 2
165
166/* Execution mode, passed to the emulator. */
167#define X86EMUL_MODE_REAL 0 /* Real mode. */
168#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
169#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
170#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
171
172/* Host execution mode. */
173#if defined(__i386__)
174#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
175#elif defined(CONFIG_X86_64)
176#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
177#endif
178
179int x86_decode_insn(struct x86_emulate_ctxt *ctxt,
180 struct x86_emulate_ops *ops);
181int x86_emulate_insn(struct x86_emulate_ctxt *ctxt,
182 struct x86_emulate_ops *ops);
183
184#endif /* _ASM_X86_KVM_X86_EMULATE_H */
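
A sketch of the intended calling sequence, not the actual KVM call site: it assumes the surrounding KVM code supplies a populated ops table and guest state, and it simply treats any nonzero return as failure.

static int example_emulate_one(struct kvm_vcpu *vcpu,
			       struct x86_emulate_ops *ops,
			       struct x86_emulate_ctxt *ctxt)
{
	int rc;

	ctxt->vcpu = vcpu;
	ctxt->mode = X86EMUL_MODE_PROT32;	/* mode picked for illustration */

	rc = x86_decode_insn(ctxt, ops);	/* fills ctxt->decode */
	if (rc)
		return rc;			/* could not decode the insn */

	return x86_emulate_insn(ctxt, ops);	/* executes via the ops callbacks */
}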
diff --git a/arch/x86/include/asm/ldt.h b/arch/x86/include/asm/ldt.h
new file mode 100644
index 000000000000..46727eb37bfe
--- /dev/null
+++ b/arch/x86/include/asm/ldt.h
@@ -0,0 +1,40 @@
1/*
2 * ldt.h
3 *
4 * Definitions of structures used with the modify_ldt system call.
5 */
6#ifndef _ASM_X86_LDT_H
7#define _ASM_X86_LDT_H
8
9/* Maximum number of LDT entries supported. */
10#define LDT_ENTRIES 8192
11/* The size of each LDT entry. */
12#define LDT_ENTRY_SIZE 8
13
14#ifndef __ASSEMBLY__
15/*
16 * Note: on 64-bit, base and limit are ignored, and you cannot set DS/ES/CS
17 * to non-default values if you still want to do syscalls. This call is
18 * therefore mainly useful in 32-bit mode.
19 */
20struct user_desc {
21 unsigned int entry_number;
22 unsigned int base_addr;
23 unsigned int limit;
24 unsigned int seg_32bit:1;
25 unsigned int contents:2;
26 unsigned int read_exec_only:1;
27 unsigned int limit_in_pages:1;
28 unsigned int seg_not_present:1;
29 unsigned int useable:1;
30#ifdef __x86_64__
31 unsigned int lm:1;
32#endif
33};
34
35#define MODIFY_LDT_CONTENTS_DATA 0
36#define MODIFY_LDT_CONTENTS_STACK 1
37#define MODIFY_LDT_CONTENTS_CODE 2
38
39#endif /* !__ASSEMBLY__ */
40#endif /* _ASM_X86_LDT_H */
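
A userspace sketch of how these definitions are used with the modify_ldt(2) system call; the function code 1 means "write an LDT entry", error handling is elided, and the segment parameters are chosen only for illustration.

#include <asm/ldt.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

/* Install a small 32-bit data segment in the first LDT slot. */
int example_install_ldt_entry(void)
{
	struct user_desc desc;

	memset(&desc, 0, sizeof(desc));
	desc.entry_number   = 0;
	desc.base_addr      = 0;
	desc.limit          = 0xfffff;
	desc.seg_32bit      = 1;
	desc.contents       = MODIFY_LDT_CONTENTS_DATA;
	desc.limit_in_pages = 1;
	desc.useable        = 1;

	return syscall(SYS_modify_ldt, 1, &desc, sizeof(desc));
}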
diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
new file mode 100644
index 000000000000..d28a507cef39
--- /dev/null
+++ b/arch/x86/include/asm/lguest.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_LGUEST_H
2#define _ASM_X86_LGUEST_H
3
4#define GDT_ENTRY_LGUEST_CS 10
5#define GDT_ENTRY_LGUEST_DS 11
6#define LGUEST_CS (GDT_ENTRY_LGUEST_CS * 8)
7#define LGUEST_DS (GDT_ENTRY_LGUEST_DS * 8)
8
9#ifndef __ASSEMBLY__
10#include <asm/desc.h>
11
12#define GUEST_PL 1
13
14/* Every guest maps the core switcher code. */
15#define SHARED_SWITCHER_PAGES \
16 DIV_ROUND_UP(end_switcher_text - start_switcher_text, PAGE_SIZE)
17/* Pages for switcher itself, then two pages per cpu */
18#define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * NR_CPUS)
19
20/* We map at -4M for ease of mapping into the guest (one PTE page). */
21#define SWITCHER_ADDR 0xFFC00000
22
23/* Found in switcher.S */
24extern unsigned long default_idt_entries[];
25
26/* Declarations for definitions in lguest_guest.S */
27extern char lguest_noirq_start[], lguest_noirq_end[];
28extern const char lgstart_cli[], lgend_cli[];
29extern const char lgstart_sti[], lgend_sti[];
30extern const char lgstart_popf[], lgend_popf[];
31extern const char lgstart_pushf[], lgend_pushf[];
32extern const char lgstart_iret[], lgend_iret[];
33
34extern void lguest_iret(void);
35extern void lguest_init(void);
36
37struct lguest_regs {
38 /* Manually saved part. */
39 unsigned long eax, ebx, ecx, edx;
40 unsigned long esi, edi, ebp;
41 unsigned long gs;
42 unsigned long fs, ds, es;
43 unsigned long trapnum, errcode;
44 /* Trap pushed part */
45 unsigned long eip;
46 unsigned long cs;
47 unsigned long eflags;
48 unsigned long esp;
49 unsigned long ss;
50};
51
52/* This is a guest-specific page (mapped ro) into the guest. */
53struct lguest_ro_state {
54 /* Host information we need to restore when we switch back. */
55 u32 host_cr3;
56 struct desc_ptr host_idt_desc;
57 struct desc_ptr host_gdt_desc;
58 u32 host_sp;
59
60 /* Fields which are used when guest is running. */
61 struct desc_ptr guest_idt_desc;
62 struct desc_ptr guest_gdt_desc;
63 struct x86_hw_tss guest_tss;
64 struct desc_struct guest_idt[IDT_ENTRIES];
65 struct desc_struct guest_gdt[GDT_ENTRIES];
66};
67
68struct lg_cpu_arch {
69 /* The GDT entries copied into lguest_ro_state when running. */
70 struct desc_struct gdt[GDT_ENTRIES];
71
72 /* The IDT entries: some copied into lguest_ro_state when running. */
73 struct desc_struct idt[IDT_ENTRIES];
74
75 /* The address of the last guest-visible pagefault (ie. cr2). */
76 unsigned long last_pagefault;
77};
78
79static inline void lguest_set_ts(void)
80{
81 u32 cr0;
82
83 cr0 = read_cr0();
84 if (!(cr0 & 8))
85 write_cr0(cr0 | 8);
86}
87
88/* Full 4G segment descriptors, suitable for CS and DS. */
89#define FULL_EXEC_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9b00} } })
90#define FULL_SEGMENT ((struct desc_struct){ { {0x0000ffff, 0x00cf9300} } })
91
92#endif /* __ASSEMBLY__ */
93
94#endif /* _ASM_X86_LGUEST_H */
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
new file mode 100644
index 000000000000..43894428c3c2
--- /dev/null
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -0,0 +1,71 @@
1/* Architecture specific portion of the lguest hypercalls */
2#ifndef _ASM_X86_LGUEST_HCALL_H
3#define _ASM_X86_LGUEST_HCALL_H
4
5#define LHCALL_FLUSH_ASYNC 0
6#define LHCALL_LGUEST_INIT 1
7#define LHCALL_SHUTDOWN 2
8#define LHCALL_LOAD_GDT 3
9#define LHCALL_NEW_PGTABLE 4
10#define LHCALL_FLUSH_TLB 5
11#define LHCALL_LOAD_IDT_ENTRY 6
12#define LHCALL_SET_STACK 7
13#define LHCALL_TS 8
14#define LHCALL_SET_CLOCKEVENT 9
15#define LHCALL_HALT 10
16#define LHCALL_SET_PTE 14
17#define LHCALL_SET_PMD 15
18#define LHCALL_LOAD_TLS 16
19#define LHCALL_NOTIFY 17
20
21#define LGUEST_TRAP_ENTRY 0x1F
22
23/* Argument number 3 to LHCALL_SHUTDOWN */
24#define LGUEST_SHUTDOWN_POWEROFF 1
25#define LGUEST_SHUTDOWN_RESTART 2
26
27#ifndef __ASSEMBLY__
28#include <asm/hw_irq.h>
29
30/*G:031 But first, how does our Guest contact the Host to ask for privileged
31 * operations? There are two ways: the direct way is to make a "hypercall",
32 * to make requests of the Host Itself.
33 *
34 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
35 * above are used by real hardware interrupts). Fifteen hypercalls are
36 * available: the hypercall number is put in the %eax register, and the
37 * arguments (when required) are placed in %edx, %ebx and %ecx. If a return
38 * value makes sense, it's returned in %eax.
39 *
40 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
41 * Host, rather than returning failure. This reflects Winston Churchill's
42 * definition of a gentleman: "someone who is only rude intentionally". */
43static inline unsigned long
44hcall(unsigned long call,
45 unsigned long arg1, unsigned long arg2, unsigned long arg3)
46{
47 /* "int" is the Intel instruction to trigger a trap. */
48 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
49 /* The call in %eax (aka "a") might be overwritten */
50 : "=a"(call)
51 /* The arguments are in %eax, %edx, %ebx & %ecx */
52 : "a"(call), "d"(arg1), "b"(arg2), "c"(arg3)
53 /* "memory" means this might write somewhere in memory.
54 * This isn't true for all calls, but it's safe to tell
55 * gcc that it might happen so it doesn't get clever. */
56 : "memory");
57 return call;
58}
59/*:*/
60
61/* Can't use our min() macro here: needs to be a constant */
62#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
63
64#define LHCALL_RING_SIZE 64
65struct hcall_args {
66 /* These map directly onto eax, ebx, ecx, edx in struct lguest_regs */
67 unsigned long arg0, arg2, arg3, arg1;
68};
69
70#endif /* !__ASSEMBLY__ */
71#endif /* _ASM_X86_LGUEST_HCALL_H */
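
A sketch of a Guest-side call through hcall(): __pa() is assumed to be available in the Guest kernel, and the "Power down" message mirrors how the lguest Guest reports a shutdown, but treat the exact arguments as an assumption rather than the canonical call site.

static inline void example_lguest_poweroff(void)
{
	/* arg1 = physical address of a message string, arg2 = reason code */
	hcall(LHCALL_SHUTDOWN, __pa("Power down"), LGUEST_SHUTDOWN_POWEROFF, 0);
}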
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
new file mode 100644
index 000000000000..f61ee8f937e4
--- /dev/null
+++ b/arch/x86/include/asm/linkage.h
@@ -0,0 +1,61 @@
1#ifndef _ASM_X86_LINKAGE_H
2#define _ASM_X86_LINKAGE_H
3
4#undef notrace
5#define notrace __attribute__((no_instrument_function))
6
7#ifdef CONFIG_X86_64
8#define __ALIGN .p2align 4,,15
9#define __ALIGN_STR ".p2align 4,,15"
10#endif
11
12#ifdef CONFIG_X86_32
13#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
14/*
15 * For 32-bit UML - mark functions implemented in assembly that use
16 * regparm input parameters:
17 */
18#define asmregparm __attribute__((regparm(3)))
19
20/*
21 * Make sure the compiler doesn't do anything stupid with the
22 * arguments on the stack - they are owned by the *caller*, not
23 * the callee. This just fools gcc into not spilling into them,
24 * and keeps it from doing tailcall recursion and/or using the
25 * stack slots for temporaries, since they are live and "used"
26 * all the way to the end of the function.
27 *
28 * NOTE! On x86-64, all the arguments are in registers, so this
29 * only matters on a 32-bit kernel.
30 */
31#define asmlinkage_protect(n, ret, args...) \
32 __asmlinkage_protect##n(ret, ##args)
33#define __asmlinkage_protect_n(ret, args...) \
34 __asm__ __volatile__ ("" : "=r" (ret) : "0" (ret), ##args)
35#define __asmlinkage_protect0(ret) \
36 __asmlinkage_protect_n(ret)
37#define __asmlinkage_protect1(ret, arg1) \
38 __asmlinkage_protect_n(ret, "g" (arg1))
39#define __asmlinkage_protect2(ret, arg1, arg2) \
40 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2))
41#define __asmlinkage_protect3(ret, arg1, arg2, arg3) \
42 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3))
43#define __asmlinkage_protect4(ret, arg1, arg2, arg3, arg4) \
44 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
45 "g" (arg4))
46#define __asmlinkage_protect5(ret, arg1, arg2, arg3, arg4, arg5) \
47 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
48 "g" (arg4), "g" (arg5))
49#define __asmlinkage_protect6(ret, arg1, arg2, arg3, arg4, arg5, arg6) \
50 __asmlinkage_protect_n(ret, "g" (arg1), "g" (arg2), "g" (arg3), \
51 "g" (arg4), "g" (arg5), "g" (arg6))
52
53#endif
54
55#ifdef CONFIG_X86_ALIGNMENT_16
56#define __ALIGN .align 16,0x90
57#define __ALIGN_STR ".align 16,0x90"
58#endif
59
60#endif /* _ASM_X86_LINKAGE_H */
61
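
A sketch (32-bit build assumed) of how a syscall-style function uses asmlinkage_protect() so gcc will not reuse the caller-owned argument slots on the stack; example_sys_pair is hypothetical.

asmlinkage long example_sys_pair(int a, int b)
{
	long ret = a + b;		/* stand-in for the real work */

	/* keep gcc away from the caller-owned argument slots */
	asmlinkage_protect(2, ret, a, b);
	return ret;
}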
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
new file mode 100644
index 000000000000..47b9b6f19057
--- /dev/null
+++ b/arch/x86/include/asm/local.h
@@ -0,0 +1,235 @@
1#ifndef _ASM_X86_LOCAL_H
2#define _ASM_X86_LOCAL_H
3
4#include <linux/percpu.h>
5
6#include <asm/system.h>
7#include <asm/atomic.h>
8#include <asm/asm.h>
9
10typedef struct {
11 atomic_long_t a;
12} local_t;
13
14#define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) }
15
16#define local_read(l) atomic_long_read(&(l)->a)
17#define local_set(l, i) atomic_long_set(&(l)->a, (i))
18
19static inline void local_inc(local_t *l)
20{
21 asm volatile(_ASM_INC "%0"
22 : "+m" (l->a.counter));
23}
24
25static inline void local_dec(local_t *l)
26{
27 asm volatile(_ASM_DEC "%0"
28 : "+m" (l->a.counter));
29}
30
31static inline void local_add(long i, local_t *l)
32{
33 asm volatile(_ASM_ADD "%1,%0"
34 : "+m" (l->a.counter)
35 : "ir" (i));
36}
37
38static inline void local_sub(long i, local_t *l)
39{
40 asm volatile(_ASM_SUB "%1,%0"
41 : "+m" (l->a.counter)
42 : "ir" (i));
43}
44
45/**
46 * local_sub_and_test - subtract value from variable and test result
47 * @i: integer value to subtract
48 * @l: pointer to type local_t
49 *
50 * Atomically subtracts @i from @l and returns
51 * true if the result is zero, or false for all
52 * other cases.
53 */
54static inline int local_sub_and_test(long i, local_t *l)
55{
56 unsigned char c;
57
58 asm volatile(_ASM_SUB "%2,%0; sete %1"
59 : "+m" (l->a.counter), "=qm" (c)
60 : "ir" (i) : "memory");
61 return c;
62}
63
64/**
65 * local_dec_and_test - decrement and test
66 * @l: pointer to type local_t
67 *
68 * Atomically decrements @l by 1 and
69 * returns true if the result is 0, or false for all other
70 * cases.
71 */
72static inline int local_dec_and_test(local_t *l)
73{
74 unsigned char c;
75
76 asm volatile(_ASM_DEC "%0; sete %1"
77 : "+m" (l->a.counter), "=qm" (c)
78 : : "memory");
79 return c != 0;
80}
81
82/**
83 * local_inc_and_test - increment and test
84 * @l: pointer to type local_t
85 *
86 * Atomically increments @l by 1
87 * and returns true if the result is zero, or false for all
88 * other cases.
89 */
90static inline int local_inc_and_test(local_t *l)
91{
92 unsigned char c;
93
94 asm volatile(_ASM_INC "%0; sete %1"
95 : "+m" (l->a.counter), "=qm" (c)
96 : : "memory");
97 return c != 0;
98}
99
100/**
101 * local_add_negative - add and test if negative
102 * @i: integer value to add
103 * @l: pointer to type local_t
104 *
105 * Atomically adds @i to @l and returns true
106 * if the result is negative, or false when
107 * result is greater than or equal to zero.
108 */
109static inline int local_add_negative(long i, local_t *l)
110{
111 unsigned char c;
112
113 asm volatile(_ASM_ADD "%2,%0; sets %1"
114 : "+m" (l->a.counter), "=qm" (c)
115 : "ir" (i) : "memory");
116 return c;
117}
118
119/**
120 * local_add_return - add and return
121 * @i: integer value to add
122 * @l: pointer to type local_t
123 *
124 * Atomically adds @i to @l and returns @i + @l
125 */
126static inline long local_add_return(long i, local_t *l)
127{
128 long __i;
129#ifdef CONFIG_M386
130 unsigned long flags;
131 if (unlikely(boot_cpu_data.x86 <= 3))
132 goto no_xadd;
133#endif
134 /* Modern 486+ processor */
135 __i = i;
136 asm volatile(_ASM_XADD "%0, %1;"
137 : "+r" (i), "+m" (l->a.counter)
138 : : "memory");
139 return i + __i;
140
141#ifdef CONFIG_M386
142no_xadd: /* Legacy 386 processor */
143 local_irq_save(flags);
144 __i = local_read(l);
145 local_set(l, i + __i);
146 local_irq_restore(flags);
147 return i + __i;
148#endif
149}
150
151static inline long local_sub_return(long i, local_t *l)
152{
153 return local_add_return(-i, l);
154}
155
156#define local_inc_return(l) (local_add_return(1, l))
157#define local_dec_return(l) (local_sub_return(1, l))
158
159#define local_cmpxchg(l, o, n) \
160 (cmpxchg_local(&((l)->a.counter), (o), (n)))
161/* Always has a lock prefix */
162#define local_xchg(l, n) (xchg(&((l)->a.counter), (n)))
163
164/**
165 * local_add_unless - add unless the number is a given value
166 * @l: pointer of type local_t
167 * @a: the amount to add to l...
168 * @u: ...unless l is equal to u.
169 *
170 * Atomically adds @a to @l, so long as it was not @u.
171 * Returns non-zero if @l was not @u, and zero otherwise.
172 */
173#define local_add_unless(l, a, u) \
174({ \
175 long c, old; \
176 c = local_read((l)); \
177 for (;;) { \
178 if (unlikely(c == (u))) \
179 break; \
180 old = local_cmpxchg((l), c, c + (a)); \
181 if (likely(old == c)) \
182 break; \
183 c = old; \
184 } \
185 c != (u); \
186})
187#define local_inc_not_zero(l) local_add_unless((l), 1, 0)
188
189/* On x86_32, these are no better than the atomic variants.
190 * On x86-64 these are better than the atomic variants on SMP kernels
191 * because they don't use a lock prefix.
192 */
193#define __local_inc(l) local_inc(l)
194#define __local_dec(l) local_dec(l)
195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l))
197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the cpu local counters; otherwise we could
207 still access a variable of a previous CPU in a non-atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221})
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */
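
A sketch of a per-cpu event counter built on these wrappers; example_events is hypothetical and DEFINE_PER_CPU comes from <linux/percpu.h>, which this header already includes.

static DEFINE_PER_CPU(local_t, example_events) = LOCAL_INIT(0);

static inline void example_count_event(void)
{
	cpu_local_inc(example_events);		/* wrapper handles preemption */
}

static inline long example_read_events(void)
{
	return cpu_local_read(example_events);	/* this CPU's count only */
}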
diff --git a/arch/x86/include/asm/mach-default/apm.h b/arch/x86/include/asm/mach-default/apm.h
new file mode 100644
index 000000000000..20370c6db74b
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/apm.h
@@ -0,0 +1,73 @@
1/*
2 * Machine specific APM BIOS functions for generic.
3 * Split out from apm.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5
6#ifndef _ASM_X86_MACH_DEFAULT_APM_H
7#define _ASM_X86_MACH_DEFAULT_APM_H
8
9#ifdef APM_ZERO_SEGS
10# define APM_DO_ZERO_SEGS \
11 "pushl %%ds\n\t" \
12 "pushl %%es\n\t" \
13 "xorl %%edx, %%edx\n\t" \
14 "mov %%dx, %%ds\n\t" \
15 "mov %%dx, %%es\n\t" \
16 "mov %%dx, %%fs\n\t" \
17 "mov %%dx, %%gs\n\t"
18# define APM_DO_POP_SEGS \
19 "popl %%es\n\t" \
20 "popl %%ds\n\t"
21#else
22# define APM_DO_ZERO_SEGS
23# define APM_DO_POP_SEGS
24#endif
25
26static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in,
27 u32 *eax, u32 *ebx, u32 *ecx,
28 u32 *edx, u32 *esi)
29{
30 /*
31 * N.B. We do NOT need a cld after the BIOS call
32 * because we always save and restore the flags.
33 */
34 __asm__ __volatile__(APM_DO_ZERO_SEGS
35 "pushl %%edi\n\t"
36 "pushl %%ebp\n\t"
37 "lcall *%%cs:apm_bios_entry\n\t"
38 "setc %%al\n\t"
39 "popl %%ebp\n\t"
40 "popl %%edi\n\t"
41 APM_DO_POP_SEGS
42 : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx),
43 "=S" (*esi)
44 : "a" (func), "b" (ebx_in), "c" (ecx_in)
45 : "memory", "cc");
46}
47
48static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in,
49 u32 ecx_in, u32 *eax)
50{
51 int cx, dx, si;
52 u8 error;
53
54 /*
55 * N.B. We do NOT need a cld after the BIOS call
56 * because we always save and restore the flags.
57 */
58 __asm__ __volatile__(APM_DO_ZERO_SEGS
59 "pushl %%edi\n\t"
60 "pushl %%ebp\n\t"
61 "lcall *%%cs:apm_bios_entry\n\t"
62 "setc %%bl\n\t"
63 "popl %%ebp\n\t"
64 "popl %%edi\n\t"
65 APM_DO_POP_SEGS
66 : "=a" (*eax), "=b" (error), "=c" (cx), "=d" (dx),
67 "=S" (si)
68 : "a" (func), "b" (ebx_in), "c" (ecx_in)
69 : "memory", "cc");
70 return error;
71}
72
73#endif /* _ASM_X86_MACH_DEFAULT_APM_H */
diff --git a/arch/x86/include/asm/mach-default/do_timer.h b/arch/x86/include/asm/mach-default/do_timer.h
new file mode 100644
index 000000000000..23ecda0b28a0
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/do_timer.h
@@ -0,0 +1,16 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/i8259.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the PIT clock event handler. See asm/i8253.h.
11 **/
12
13static inline void do_timer_interrupt_hook(void)
14{
15 global_clock_event->event_handler(global_clock_event);
16}
diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h
new file mode 100644
index 000000000000..6b1add8e31dd
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/entry_arch.h
@@ -0,0 +1,36 @@
1/*
2 * This file is designed to contain the BUILD_INTERRUPT specifications for
3 * all of the extra named interrupt vectors used by the architecture.
4 * Usually these are the Inter-Processor Interrupts (IPIs).
5 */
6
7/*
8 * The following vectors are part of the Linux architecture, there
9 * is no hardware IRQ pin equivalent for them, they are triggered
10 * through the ICC by us (IPIs)
11 */
12#ifdef CONFIG_X86_SMP
13BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
14BUILD_INTERRUPT(invalidate_interrupt,INVALIDATE_TLB_VECTOR)
15BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
16BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
18#endif
19
20/*
21 * every pentium local APIC has two 'local interrupts', with a
22 * soft-definable vector attached to both interrupts, one of
23 * which is a timer interrupt, the other one is error counter
24 * overflow. Linux uses the local APIC timer interrupt to get
25 * a much simpler SMP time architecture:
26 */
27#ifdef CONFIG_X86_LOCAL_APIC
28BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
29BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
30BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
31
32#ifdef CONFIG_X86_MCE_P4THERMAL
33BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR)
34#endif
35
36#endif
diff --git a/arch/x86/include/asm/mach-default/mach_apic.h b/arch/x86/include/asm/mach-default/mach_apic.h
new file mode 100644
index 000000000000..ff3a6c236c00
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apic.h
@@ -0,0 +1,156 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APIC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APIC_H
3
4#ifdef CONFIG_X86_LOCAL_APIC
5
6#include <mach_apicdef.h>
7#include <asm/smp.h>
8
9#define APIC_DFR_VALUE (APIC_DFR_FLAT)
10
11static inline cpumask_t target_cpus(void)
12{
13#ifdef CONFIG_SMP
14 return cpu_online_map;
15#else
16 return cpumask_of_cpu(0);
17#endif
18}
19
20#define NO_BALANCE_IRQ (0)
21#define esr_disable (0)
22
23#ifdef CONFIG_X86_64
24#include <asm/genapic.h>
25#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
26#define INT_DEST_MODE (genapic->int_dest_mode)
27#define TARGET_CPUS (genapic->target_cpus())
28#define apic_id_registered (genapic->apic_id_registered)
29#define init_apic_ldr (genapic->init_apic_ldr)
30#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
31#define phys_pkg_id (genapic->phys_pkg_id)
32#define vector_allocation_domain (genapic->vector_allocation_domain)
33#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID)))
34#define send_IPI_self (genapic->send_IPI_self)
35extern void setup_apic_routing(void);
36#else
37#define INT_DELIVERY_MODE dest_LowestPrio
38#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
39#define TARGET_CPUS (target_cpus())
40/*
41 * Set up the logical destination ID.
42 *
43 * Intel recommends to set DFR, LDR and TPR before enabling
44 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
45 * document number 292116). So here it goes...
46 */
47static inline void init_apic_ldr(void)
48{
49 unsigned long val;
50
51 apic_write(APIC_DFR, APIC_DFR_VALUE);
52 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
53 val |= SET_APIC_LOGICAL_ID(1UL << smp_processor_id());
54 apic_write(APIC_LDR, val);
55}
56
57static inline int apic_id_registered(void)
58{
59 return physid_isset(read_apic_id(), phys_cpu_present_map);
60}
61
62static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
63{
64 return cpus_addr(cpumask)[0];
65}
66
67static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
68{
69 return cpuid_apic >> index_msb;
70}
71
72static inline void setup_apic_routing(void)
73{
74#ifdef CONFIG_X86_IO_APIC
75 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
76 "Flat", nr_ioapics);
77#endif
78}
79
80static inline int apicid_to_node(int logical_apicid)
81{
82#ifdef CONFIG_SMP
83 return apicid_2_node[hard_smp_processor_id()];
84#else
85 return 0;
86#endif
87}
88
89static inline cpumask_t vector_allocation_domain(int cpu)
90{
91 /* Careful. Some cpus do not strictly honor the set of cpus
92 * specified in the interrupt destination when using lowest
93 * priority interrupt delivery mode.
94 *
95 * In particular there was a hyperthreading cpu observed to
96 * deliver interrupts to the wrong hyperthread when only one
97 * hyperthread was specified in the interrupt destination.
98 */
99 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
100 return domain;
101}
102#endif
103
104static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
105{
106 return physid_isset(apicid, bitmap);
107}
108
109static inline unsigned long check_apicid_present(int bit)
110{
111 return physid_isset(bit, phys_cpu_present_map);
112}
113
114static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
115{
116 return phys_map;
117}
118
119static inline int multi_timer_check(int apic, int irq)
120{
121 return 0;
122}
123
124/* Mapping from cpu number to logical apicid */
125static inline int cpu_to_logical_apicid(int cpu)
126{
127 return 1 << cpu;
128}
129
130static inline int cpu_present_to_apicid(int mps_cpu)
131{
132 if (mps_cpu < NR_CPUS && cpu_present(mps_cpu))
133 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
134 else
135 return BAD_APICID;
136}
137
138static inline physid_mask_t apicid_to_cpu_present(int phys_apicid)
139{
140 return physid_mask_of_physid(phys_apicid);
141}
142
143static inline void setup_portio_remap(void)
144{
145}
146
147static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
148{
149 return physid_isset(boot_cpu_physical_apicid, phys_cpu_present_map);
150}
151
152static inline void enable_apic_mode(void)
153{
154}
155#endif /* CONFIG_X86_LOCAL_APIC */
156#endif /* _ASM_X86_MACH_DEFAULT_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_apicdef.h b/arch/x86/include/asm/mach-default/mach_apicdef.h
new file mode 100644
index 000000000000..53179936d6c6
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_apicdef.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
2#define _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H
3
4#include <asm/apic.h>
5
6#ifdef CONFIG_X86_64
7#define APIC_ID_MASK (genapic->apic_id_mask)
8#define GET_APIC_ID(x) (genapic->get_apic_id(x))
9#define SET_APIC_ID(x) (genapic->set_apic_id(x))
10#else
11#define APIC_ID_MASK (0xF<<24)
12static inline unsigned get_apic_id(unsigned long x)
13{
14 unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
15 if (APIC_XAPIC(ver))
16 return (((x)>>24)&0xFF);
17 else
18 return (((x)>>24)&0xF);
19}
20
21#define GET_APIC_ID(x) get_apic_id(x)
22#endif
23
24#endif /* _ASM_X86_MACH_DEFAULT_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-default/mach_ipi.h b/arch/x86/include/asm/mach-default/mach_ipi.h
new file mode 100644
index 000000000000..fabca01ebacf
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_ipi.h
@@ -0,0 +1,64 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_IPI_H
2#define _ASM_X86_MACH_DEFAULT_MACH_IPI_H
3
4/* Avoid include hell */
5#define NMI_VECTOR 0x02
6
7void send_IPI_mask_bitmask(cpumask_t mask, int vector);
8void __send_IPI_shortcut(unsigned int shortcut, int vector);
9
10extern int no_broadcast;
11
12#ifdef CONFIG_X86_64
13#include <asm/genapic.h>
14#define send_IPI_mask (genapic->send_IPI_mask)
15#else
16static inline void send_IPI_mask(cpumask_t mask, int vector)
17{
18 send_IPI_mask_bitmask(mask, vector);
19}
20#endif
21
22static inline void __local_send_IPI_allbutself(int vector)
23{
24 if (no_broadcast || vector == NMI_VECTOR) {
25 cpumask_t mask = cpu_online_map;
26
27 cpu_clear(smp_processor_id(), mask);
28 send_IPI_mask(mask, vector);
29 } else
30 __send_IPI_shortcut(APIC_DEST_ALLBUT, vector);
31}
32
33static inline void __local_send_IPI_all(int vector)
34{
35 if (no_broadcast || vector == NMI_VECTOR)
36 send_IPI_mask(cpu_online_map, vector);
37 else
38 __send_IPI_shortcut(APIC_DEST_ALLINC, vector);
39}
40
41#ifdef CONFIG_X86_64
42#define send_IPI_allbutself (genapic->send_IPI_allbutself)
43#define send_IPI_all (genapic->send_IPI_all)
44#else
45static inline void send_IPI_allbutself(int vector)
46{
47 /*
48 * if there are no other CPUs in the system then we get an APIC send
49 * error if we try to broadcast, thus avoid sending IPIs in this case.
50 */
51 if (!(num_online_cpus() > 1))
52 return;
53
54 __local_send_IPI_allbutself(vector);
55 return;
56}
57
58static inline void send_IPI_all(int vector)
59{
60 __local_send_IPI_all(vector);
61}
62#endif
63
64#endif /* _ASM_X86_MACH_DEFAULT_MACH_IPI_H */
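
A sketch of kicking one remote CPU through these helpers; RESCHEDULE_VECTOR and cpumask_of_cpu() come from other headers and are assumptions here.

static inline void example_kick_cpu(int cpu)
{
	/* send a reschedule IPI to exactly one online CPU */
	send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR);
}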
diff --git a/arch/x86/include/asm/mach-default/mach_mpparse.h b/arch/x86/include/asm/mach-default/mach_mpparse.h
new file mode 100644
index 000000000000..8c1ea21238a7
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpparse.h
@@ -0,0 +1,17 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H
3
4static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
5 char *productid)
6{
7 return 0;
8}
9
10/* Hook from generic ACPI tables.c */
11static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
12{
13 return 0;
14}
15
16
17#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-default/mach_mpspec.h b/arch/x86/include/asm/mach-default/mach_mpspec.h
new file mode 100644
index 000000000000..e85ede686be8
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_mpspec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
2#define _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6#if CONFIG_BASE_SMALL == 0
7#define MAX_MP_BUSSES 256
8#else
9#define MAX_MP_BUSSES 32
10#endif
11
12#endif /* _ASM_X86_MACH_DEFAULT_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-default/mach_timer.h b/arch/x86/include/asm/mach-default/mach_timer.h
new file mode 100644
index 000000000000..853728519ae9
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_timer.h
@@ -0,0 +1,48 @@
1/*
2 * Machine specific calibrate_tsc() for generic.
3 * Split out from timer_tsc.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5/* ------ Calibrate the TSC -------
6 * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
7 * Too much 64-bit arithmetic here to do this cleanly in C, and for
8 * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
9 * output busy loop as low as possible. We avoid reading the CTC registers
10 * directly because of the awkward 8-bit access mechanism of the 82C54
11 * device.
12 */
13#ifndef _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
14#define _ASM_X86_MACH_DEFAULT_MACH_TIMER_H
15
16#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
17#define CALIBRATE_LATCH \
18 ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
19
20static inline void mach_prepare_counter(void)
21{
22 /* Set the Gate high, disable speaker */
23 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
24
25 /*
26 * Now let's take care of CTC channel 2
27 *
28 * Set the Gate high, program CTC channel 2 for mode 0,
29 * (interrupt on terminal count mode), binary count,
30 * load 5 * LATCH count, (LSB and MSB) to begin countdown.
31 *
32 * Some devices need a delay here.
33 */
34 outb(0xb0, 0x43); /* binary, mode 0, LSB/MSB, Ch 2 */
35 outb_p(CALIBRATE_LATCH & 0xff, 0x42); /* LSB of count */
36 outb_p(CALIBRATE_LATCH >> 8, 0x42); /* MSB of count */
37}
38
39static inline void mach_countup(unsigned long *count_p)
40{
41 unsigned long count = 0;
42 do {
43 count++;
44 } while ((inb_p(0x61) & 0x20) == 0);
45 *count_p = count;
46}
47
48#endif /* _ASM_X86_MACH_DEFAULT_MACH_TIMER_H */
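
A sketch of how a calibration loop might combine these helpers with the TSC; rdtscll() from <asm/msr.h> is assumed, and real calibration code also checks for counter overruns.

static inline unsigned long long example_time_pit_window(void)
{
	unsigned long long start, end;
	unsigned long count;

	mach_prepare_counter();		/* gate on, program PIT channel 2 */
	rdtscll(start);
	mach_countup(&count);		/* spin until the 30 msec window ends */
	rdtscll(end);

	return end - start;		/* TSC cycles per CALIBRATE_TIME_MSEC */
}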
diff --git a/arch/x86/include/asm/mach-default/mach_traps.h b/arch/x86/include/asm/mach-default/mach_traps.h
new file mode 100644
index 000000000000..f7920601e472
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_traps.h
@@ -0,0 +1,33 @@
1/*
2 * Machine specific NMI handling for generic.
3 * Split out from traps.c by Osamu Tomita <tomita@cinet.co.jp>
4 */
5#ifndef _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
6#define _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H
7
8#include <asm/mc146818rtc.h>
9
10static inline unsigned char get_nmi_reason(void)
11{
12 return inb(0x61);
13}
14
15static inline void reassert_nmi(void)
16{
17 int old_reg = -1;
18
19 if (do_i_have_lock_cmos())
20 old_reg = current_lock_cmos_reg();
21 else
22 lock_cmos(0); /* register doesn't matter here */
23 outb(0x8f, 0x70);
24 inb(0x71); /* dummy */
25 outb(0x0f, 0x70);
26 inb(0x71); /* dummy */
27 if (old_reg >= 0)
28 outb(old_reg, 0x70);
29 else
30 unlock_cmos();
31}
32
33#endif /* _ASM_X86_MACH_DEFAULT_MACH_TRAPS_H */
diff --git a/arch/x86/include/asm/mach-default/mach_wakecpu.h b/arch/x86/include/asm/mach-default/mach_wakecpu.h
new file mode 100644
index 000000000000..d5c0b826a4ff
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/mach_wakecpu.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
2#define _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H
3
4/*
5 * This file copes with machines that wake up secondary CPUs by the
6 * INIT, INIT, STARTUP sequence.
7 */
8
9#define WAKE_SECONDARY_VIA_INIT
10
11#define TRAMPOLINE_LOW phys_to_virt(0x467)
12#define TRAMPOLINE_HIGH phys_to_virt(0x469)
13
14#define boot_cpu_apicid boot_cpu_physical_apicid
15
16static inline void wait_for_init_deassert(atomic_t *deassert)
17{
18 while (!atomic_read(deassert))
19 cpu_relax();
20 return;
21}
22
23/* Nothing to do for most platforms, since cleared by the INIT cycle */
24static inline void smp_callin_clear_local_apic(void)
25{
26}
27
28static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
29{
30}
31
32static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
33{
34}
35
36#if APIC_DEBUG
37 #define inquire_remote_apic(apicid) __inquire_remote_apic(apicid)
38#else
39 #define inquire_remote_apic(apicid) {}
40#endif
41
42#endif /* _ASM_X86_MACH_DEFAULT_MACH_WAKECPU_H */
diff --git a/arch/x86/include/asm/mach-default/pci-functions.h b/arch/x86/include/asm/mach-default/pci-functions.h
new file mode 100644
index 000000000000..ed0bab427354
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/pci-functions.h
@@ -0,0 +1,19 @@
1/*
2 * PCI BIOS function numbering for conventional PCI BIOS
3 * systems
4 */
5
6#define PCIBIOS_PCI_FUNCTION_ID 0xb1XX
7#define PCIBIOS_PCI_BIOS_PRESENT 0xb101
8#define PCIBIOS_FIND_PCI_DEVICE 0xb102
9#define PCIBIOS_FIND_PCI_CLASS_CODE 0xb103
10#define PCIBIOS_GENERATE_SPECIAL_CYCLE 0xb106
11#define PCIBIOS_READ_CONFIG_BYTE 0xb108
12#define PCIBIOS_READ_CONFIG_WORD 0xb109
13#define PCIBIOS_READ_CONFIG_DWORD 0xb10a
14#define PCIBIOS_WRITE_CONFIG_BYTE 0xb10b
15#define PCIBIOS_WRITE_CONFIG_WORD 0xb10c
16#define PCIBIOS_WRITE_CONFIG_DWORD 0xb10d
17#define PCIBIOS_GET_ROUTING_OPTIONS 0xb10e
18#define PCIBIOS_SET_PCI_HW_INT 0xb10f
19
diff --git a/arch/x86/include/asm/mach-default/setup_arch.h b/arch/x86/include/asm/mach-default/setup_arch.h
new file mode 100644
index 000000000000..38846208b548
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/setup_arch.h
@@ -0,0 +1,3 @@
1/* Hook to call BIOS initialisation function */
2
3/* no action for generic */
diff --git a/arch/x86/include/asm/mach-default/smpboot_hooks.h b/arch/x86/include/asm/mach-default/smpboot_hooks.h
new file mode 100644
index 000000000000..dbab36d64d48
--- /dev/null
+++ b/arch/x86/include/asm/mach-default/smpboot_hooks.h
@@ -0,0 +1,59 @@
1/* two abstractions specific to kernel/smpboot.c, mainly to cater to visws
2 * which needs to alter them. */
3
4static inline void smpboot_clear_io_apic_irqs(void)
5{
6#ifdef CONFIG_X86_IO_APIC
7 io_apic_irqs = 0;
8#endif
9}
10
11static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip)
12{
13 CMOS_WRITE(0xa, 0xf);
14 local_flush_tlb();
15 pr_debug("1.\n");
16 *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4;
17 pr_debug("2.\n");
18 *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf;
19 pr_debug("3.\n");
20}
21
22static inline void smpboot_restore_warm_reset_vector(void)
23{
24 /*
25 * Install writable page 0 entry to set BIOS data area.
26 */
27 local_flush_tlb();
28
29 /*
30 * Paranoid: Set warm reset code and vector here back
31 * to default values.
32 */
33 CMOS_WRITE(0, 0xf);
34
35 *((volatile long *) phys_to_virt(0x467)) = 0;
36}
37
38static inline void __init smpboot_setup_io_apic(void)
39{
40#ifdef CONFIG_X86_IO_APIC
41 /*
42 * Here we can be sure that there is an IO-APIC in the system. Let's
43 * go and set it up:
44 */
45 if (!skip_ioapic_setup && nr_ioapics)
46 setup_IO_APIC();
47 else {
48 nr_ioapics = 0;
49 localise_nmi_watchdog();
50 }
51#endif
52}
53
54static inline void smpboot_clear_io_apic(void)
55{
56#ifdef CONFIG_X86_IO_APIC
57 nr_ioapics = 0;
58#endif
59}
diff --git a/arch/x86/include/asm/mach-generic/gpio.h b/arch/x86/include/asm/mach-generic/gpio.h
new file mode 100644
index 000000000000..995c45efdb33
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/gpio.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_MACH_GENERIC_GPIO_H
2#define _ASM_X86_MACH_GENERIC_GPIO_H
3
4int gpio_request(unsigned gpio, const char *label);
5void gpio_free(unsigned gpio);
6int gpio_direction_input(unsigned gpio);
7int gpio_direction_output(unsigned gpio, int value);
8int gpio_get_value(unsigned gpio);
9void gpio_set_value(unsigned gpio, int value);
10int gpio_to_irq(unsigned gpio);
11int irq_to_gpio(unsigned irq);
12
13#include <asm-generic/gpio.h> /* cansleep wrappers */
14
15#endif /* _ASM_X86_MACH_GENERIC_GPIO_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apic.h b/arch/x86/include/asm/mach-generic/mach_apic.h
new file mode 100644
index 000000000000..5180bd7478fb
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apic.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APIC_H
2#define _ASM_X86_MACH_GENERIC_MACH_APIC_H
3
4#include <asm/genapic.h>
5
6#define esr_disable (genapic->ESR_DISABLE)
7#define NO_BALANCE_IRQ (genapic->no_balance_irq)
8#define INT_DELIVERY_MODE (genapic->int_delivery_mode)
9#define INT_DEST_MODE (genapic->int_dest_mode)
10#undef APIC_DEST_LOGICAL
11#define APIC_DEST_LOGICAL (genapic->apic_destination_logical)
12#define TARGET_CPUS (genapic->target_cpus())
13#define apic_id_registered (genapic->apic_id_registered)
14#define init_apic_ldr (genapic->init_apic_ldr)
15#define ioapic_phys_id_map (genapic->ioapic_phys_id_map)
16#define setup_apic_routing (genapic->setup_apic_routing)
17#define multi_timer_check (genapic->multi_timer_check)
18#define apicid_to_node (genapic->apicid_to_node)
19#define cpu_to_logical_apicid (genapic->cpu_to_logical_apicid)
20#define cpu_present_to_apicid (genapic->cpu_present_to_apicid)
21#define apicid_to_cpu_present (genapic->apicid_to_cpu_present)
22#define setup_portio_remap (genapic->setup_portio_remap)
23#define check_apicid_present (genapic->check_apicid_present)
24#define check_phys_apicid_present (genapic->check_phys_apicid_present)
25#define check_apicid_used (genapic->check_apicid_used)
26#define cpu_mask_to_apicid (genapic->cpu_mask_to_apicid)
27#define vector_allocation_domain (genapic->vector_allocation_domain)
28#define enable_apic_mode (genapic->enable_apic_mode)
29#define phys_pkg_id (genapic->phys_pkg_id)
30
31extern void generic_bigsmp_probe(void);
32
33#endif /* _ASM_X86_MACH_GENERIC_MACH_APIC_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_apicdef.h b/arch/x86/include/asm/mach-generic/mach_apicdef.h
new file mode 100644
index 000000000000..68041f3802f4
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_apicdef.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
2#define _ASM_X86_MACH_GENERIC_MACH_APICDEF_H
3
4#ifndef APIC_DEFINITION
5#include <asm/genapic.h>
6
7#define GET_APIC_ID (genapic->get_apic_id)
8#define APIC_ID_MASK (genapic->apic_id_mask)
9#endif
10
11#endif /* _ASM_X86_MACH_GENERIC_MACH_APICDEF_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_ipi.h b/arch/x86/include/asm/mach-generic/mach_ipi.h
new file mode 100644
index 000000000000..ffd637e3c3d9
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_ipi.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_IPI_H
2#define _ASM_X86_MACH_GENERIC_MACH_IPI_H
3
4#include <asm/genapic.h>
5
6#define send_IPI_mask (genapic->send_IPI_mask)
7#define send_IPI_allbutself (genapic->send_IPI_allbutself)
8#define send_IPI_all (genapic->send_IPI_all)
9
10#endif /* _ASM_X86_MACH_GENERIC_MACH_IPI_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpparse.h b/arch/x86/include/asm/mach-generic/mach_mpparse.h
new file mode 100644
index 000000000000..048f1d468535
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpparse.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H
3
4
5extern int mps_oem_check(struct mp_config_table *mpc, char *oem,
6 char *productid);
7
8extern int acpi_madt_oem_check(char *oem_id, char *oem_table_id);
9
10#endif /* _ASM_X86_MACH_GENERIC_MACH_MPPARSE_H */
diff --git a/arch/x86/include/asm/mach-generic/mach_mpspec.h b/arch/x86/include/asm/mach-generic/mach_mpspec.h
new file mode 100644
index 000000000000..bbab5ccfd4fe
--- /dev/null
+++ b/arch/x86/include/asm/mach-generic/mach_mpspec.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
2#define _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H
3
4#define MAX_IRQ_SOURCES 256
5
6/* Summit or generic (i.e. installer) kernels need lots of bus entries. */
7/* Maximum 256 PCI busses, plus 1 ISA bus in each of 4 cabinets. */
8#define MAX_MP_BUSSES 260
9
10extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
11 char *productid);
12#endif /* _ASM_X86_MACH_GENERIC_MACH_MPSPEC_H */
diff --git a/arch/x86/include/asm/mach-rdc321x/gpio.h b/arch/x86/include/asm/mach-rdc321x/gpio.h
new file mode 100644
index 000000000000..c210ab5788b0
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/gpio.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_MACH_RDC321X_GPIO_H
2#define _ASM_X86_MACH_RDC321X_GPIO_H
3
4#include <linux/kernel.h>
5
6extern int rdc_gpio_get_value(unsigned gpio);
7extern void rdc_gpio_set_value(unsigned gpio, int value);
8extern int rdc_gpio_direction_input(unsigned gpio);
9extern int rdc_gpio_direction_output(unsigned gpio, int value);
10extern int rdc_gpio_request(unsigned gpio, const char *label);
11extern void rdc_gpio_free(unsigned gpio);
12extern void __init rdc321x_gpio_setup(void);
13
14/* Wrappers for the arch-neutral GPIO API */
15
16static inline int gpio_request(unsigned gpio, const char *label)
17{
18 return rdc_gpio_request(gpio, label);
19}
20
21static inline void gpio_free(unsigned gpio)
22{
23 might_sleep();
24 rdc_gpio_free(gpio);
25}
26
27static inline int gpio_direction_input(unsigned gpio)
28{
29 return rdc_gpio_direction_input(gpio);
30}
31
32static inline int gpio_direction_output(unsigned gpio, int value)
33{
34 return rdc_gpio_direction_output(gpio, value);
35}
36
37static inline int gpio_get_value(unsigned gpio)
38{
39 return rdc_gpio_get_value(gpio);
40}
41
42static inline void gpio_set_value(unsigned gpio, int value)
43{
44 rdc_gpio_set_value(gpio, value);
45}
46
47static inline int gpio_to_irq(unsigned gpio)
48{
49 return gpio;
50}
51
52static inline int irq_to_gpio(unsigned irq)
53{
54 return irq;
55}
56
57/* For cansleep */
58#include <asm-generic/gpio.h>
59
60#endif /* _ASM_X86_MACH_RDC321X_GPIO_H */
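
A sketch of driving a board LED through these wrappers; the pin number is passed in by the caller and error handling is minimal.

static int example_blink_led(unsigned led_gpio)
{
	int err;

	err = gpio_request(led_gpio, "example-led");
	if (err)
		return err;

	gpio_direction_output(led_gpio, 0);	/* start low */
	gpio_set_value(led_gpio, 1);		/* turn the LED on */
	gpio_free(led_gpio);
	return 0;
}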
diff --git a/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
new file mode 100644
index 000000000000..c8e9c8bed3d0
--- /dev/null
+++ b/arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h
@@ -0,0 +1,12 @@
1#define PFX "rdc321x: "
2
3/* General purpose configuration and data registers */
4#define RDC3210_CFGREG_ADDR 0x0CF8
5#define RDC3210_CFGREG_DATA 0x0CFC
6
7#define RDC321X_GPIO_CTRL_REG1 0x48
8#define RDC321X_GPIO_CTRL_REG2 0x84
9#define RDC321X_GPIO_DATA_REG1 0x4c
10#define RDC321X_GPIO_DATA_REG2 0x88
11
12#define RDC321X_MAX_GPIO 58
diff --git a/arch/x86/include/asm/mach-voyager/do_timer.h b/arch/x86/include/asm/mach-voyager/do_timer.h
new file mode 100644
index 000000000000..9e5a459fd15b
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/do_timer.h
@@ -0,0 +1,17 @@
1/* defines for inline arch setup functions */
2#include <linux/clockchips.h>
3
4#include <asm/voyager.h>
5#include <asm/i8253.h>
6
7/**
8 * do_timer_interrupt_hook - hook into timer tick
9 *
10 * Call the PIT clock event handler. See asm/i8253.h.
11 **/
12static inline void do_timer_interrupt_hook(void)
13{
14 global_clock_event->event_handler(global_clock_event);
15 voyager_timer_interrupt();
16}
17
diff --git a/arch/x86/include/asm/mach-voyager/entry_arch.h b/arch/x86/include/asm/mach-voyager/entry_arch.h
new file mode 100644
index 000000000000..ae52624b5937
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/entry_arch.h
@@ -0,0 +1,26 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Copyright (C) 2002
4 *
5 * Author: James.Bottomley@HansenPartnership.com
6 *
7 * linux/arch/i386/voyager/entry_arch.h
8 *
9 * This file builds the VIC and QIC CPI gates
10 */
11
12/* initialise the voyager interrupt gates
13 *
14 * This uses the macros in irq.h to set up assembly jump gates. The
15 * calls are then redirected to the same routine with smp_ prefixed */
16BUILD_INTERRUPT(vic_sys_interrupt, VIC_SYS_INT)
17BUILD_INTERRUPT(vic_cmn_interrupt, VIC_CMN_INT)
18BUILD_INTERRUPT(vic_cpi_interrupt, VIC_CPI_LEVEL0);
19
20/* do all the QIC interrupts */
21BUILD_INTERRUPT(qic_timer_interrupt, QIC_TIMER_CPI);
22BUILD_INTERRUPT(qic_invalidate_interrupt, QIC_INVALIDATE_CPI);
23BUILD_INTERRUPT(qic_reschedule_interrupt, QIC_RESCHEDULE_CPI);
24BUILD_INTERRUPT(qic_enable_irq_interrupt, QIC_ENABLE_IRQ_CPI);
25BUILD_INTERRUPT(qic_call_function_interrupt, QIC_CALL_FUNCTION_CPI);
26BUILD_INTERRUPT(qic_call_function_single_interrupt, QIC_CALL_FUNCTION_SINGLE_CPI);
diff --git a/arch/x86/include/asm/mach-voyager/setup_arch.h b/arch/x86/include/asm/mach-voyager/setup_arch.h
new file mode 100644
index 000000000000..71729ca05cd7
--- /dev/null
+++ b/arch/x86/include/asm/mach-voyager/setup_arch.h
@@ -0,0 +1,12 @@
1#include <asm/voyager.h>
2#include <asm/setup.h>
3#define VOYAGER_BIOS_INFO ((struct voyager_bios_info *) \
4 (&boot_params.apm_bios_info))
5
6/* Hook to call BIOS initialisation function */
7
8/* for voyager, pass the voyager BIOS/SUS info area to the detection
9 * routines */
10
11#define ARCH_SETUP voyager_detect(VOYAGER_BIOS_INFO);
12
diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
new file mode 100644
index 000000000000..5a65b107ad58
--- /dev/null
+++ b/arch/x86/include/asm/math_emu.h
@@ -0,0 +1,31 @@
1#ifndef _ASM_X86_MATH_EMU_H
2#define _ASM_X86_MATH_EMU_H
3
4/* This structure matches the layout of the data saved to the stack
5 following a device-not-present interrupt, part of it saved
6 automatically by the 80386/80486.
7 */
8struct info {
9 long ___orig_eip;
10 long ___ebx;
11 long ___ecx;
12 long ___edx;
13 long ___esi;
14 long ___edi;
15 long ___ebp;
16 long ___eax;
17 long ___ds;
18 long ___es;
19 long ___fs;
20 long ___orig_eax;
21 long ___eip;
22 long ___cs;
23 long ___eflags;
24 long ___esp;
25 long ___ss;
26 long ___vm86_es; /* This and the following only in vm86 mode */
27 long ___vm86_ds;
28 long ___vm86_fs;
29 long ___vm86_gs;
30};
31#endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h
new file mode 100644
index 000000000000..01fdf5674e24
--- /dev/null
+++ b/arch/x86/include/asm/mc146818rtc.h
@@ -0,0 +1,104 @@
1/*
2 * Machine dependent access functions for RTC registers.
3 */
4#ifndef _ASM_X86_MC146818RTC_H
5#define _ASM_X86_MC146818RTC_H
6
7#include <asm/io.h>
8#include <asm/system.h>
9#include <asm/processor.h>
10#include <linux/mc146818rtc.h>
11
12#ifndef RTC_PORT
13#define RTC_PORT(x) (0x70 + (x))
14#define RTC_ALWAYS_BCD 1 /* RTC operates in binary mode */
15#endif
16
17#if defined(CONFIG_X86_32) && defined(__HAVE_ARCH_CMPXCHG)
18/*
19 * This lock provides nmi access to the CMOS/RTC registers. It has some
20 * special properties. It is owned by a CPU and stores the index register
21 * currently being accessed (if owned). The idea here is that it works
22 * like a normal lock (normally). However, in an NMI, the NMI code will
23 * first check to see if its CPU owns the lock, meaning that the NMI
24 * interrupted during the read/write of the device. If it does, it goes ahead
25 * and performs the access and then restores the index register. If it does
26 * not, it locks normally.
27 *
28 * Note that since we are working with NMIs, we need this lock even in
29 * a non-SMP machine just to mark that the lock is owned.
30 *
31 * This only works with compare-and-swap. There is no other way to
32 * atomically claim the lock and set the owner.
33 */
34#include <linux/smp.h>
35extern volatile unsigned long cmos_lock;
36
37/*
38 * All of these below must be called with interrupts off, preempt
39 * disabled, etc.
40 */
41
42static inline void lock_cmos(unsigned char reg)
43{
44 unsigned long new;
45 new = ((smp_processor_id() + 1) << 8) | reg;
46 for (;;) {
47 if (cmos_lock) {
48 cpu_relax();
49 continue;
50 }
51 if (__cmpxchg(&cmos_lock, 0, new, sizeof(cmos_lock)) == 0)
52 return;
53 }
54}
55
56static inline void unlock_cmos(void)
57{
58 cmos_lock = 0;
59}
60
61static inline int do_i_have_lock_cmos(void)
62{
63 return (cmos_lock >> 8) == (smp_processor_id() + 1);
64}
65
66static inline unsigned char current_lock_cmos_reg(void)
67{
68 return cmos_lock & 0xff;
69}
70
71#define lock_cmos_prefix(reg) \
72 do { \
73 unsigned long cmos_flags; \
74 local_irq_save(cmos_flags); \
75 lock_cmos(reg)
76
77#define lock_cmos_suffix(reg) \
78 unlock_cmos(); \
79 local_irq_restore(cmos_flags); \
80 } while (0)
81#else
82#define lock_cmos_prefix(reg) do {} while (0)
83#define lock_cmos_suffix(reg) do {} while (0)
84#define lock_cmos(reg)
85#define unlock_cmos()
86#define do_i_have_lock_cmos() 0
87#define current_lock_cmos_reg() 0
88#endif
89
90/*
91 * All machines supported so far access the RTC index register via
92 * an ISA port access, but the way to access the data register differs ...
93 */
94#define CMOS_READ(addr) rtc_cmos_read(addr)
95#define CMOS_WRITE(val, addr) rtc_cmos_write(val, addr)
96unsigned char rtc_cmos_read(unsigned char addr);
97void rtc_cmos_write(unsigned char val, unsigned char addr);
98
99extern int mach_set_rtc_mmss(unsigned long nowtime);
100extern unsigned long mach_get_cmos_time(void);
101
102#define RTC_IRQ 8
103
104#endif /* _ASM_X86_MC146818RTC_H */
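
A sketch of NMI-safe CMOS access using the prefix/suffix pair; RTC_SECONDS comes from <linux/mc146818rtc.h>, which this header includes.

static inline unsigned char example_read_rtc_seconds(void)
{
	unsigned char sec;

	lock_cmos_prefix(RTC_SECONDS);		/* irqs off, claim the lock */
	sec = CMOS_READ(RTC_SECONDS);
	lock_cmos_suffix(RTC_SECONDS);		/* release, restore irqs */

	return sec;
}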
diff --git a/arch/x86/include/asm/mca.h b/arch/x86/include/asm/mca.h
new file mode 100644
index 000000000000..eedbb6cc1efb
--- /dev/null
+++ b/arch/x86/include/asm/mca.h
@@ -0,0 +1,43 @@
1/* -*- mode: c; c-basic-offset: 8 -*- */
2
3/* Platform specific MCA defines */
4#ifndef _ASM_X86_MCA_H
5#define _ASM_X86_MCA_H
6
7/* Maximal number of MCA slots - actually, some machines have less, but
8 * they all have sufficient number of POS registers to cover 8.
9 */
10#define MCA_MAX_SLOT_NR 8
11
12/* Most machines have only one MCA bus. The only multiple bus machines
13 * I know have at most two */
14#define MAX_MCA_BUSSES 2
15
16#define MCA_PRIMARY_BUS 0
17#define MCA_SECONDARY_BUS 1
18
19/* Dummy slot numbers on primary MCA for integrated functions */
20#define MCA_INTEGSCSI (MCA_MAX_SLOT_NR)
21#define MCA_INTEGVIDEO (MCA_MAX_SLOT_NR+1)
22#define MCA_MOTHERBOARD (MCA_MAX_SLOT_NR+2)
23
24/* Dummy POS values for integrated functions */
25#define MCA_DUMMY_POS_START 0x10000
26#define MCA_INTEGSCSI_POS (MCA_DUMMY_POS_START+1)
27#define MCA_INTEGVIDEO_POS (MCA_DUMMY_POS_START+2)
28#define MCA_MOTHERBOARD_POS (MCA_DUMMY_POS_START+3)
29
30/* MCA registers */
31
32#define MCA_MOTHERBOARD_SETUP_REG 0x94
33#define MCA_ADAPTER_SETUP_REG 0x96
34#define MCA_POS_REG(n) (0x100+(n))
35
36#define MCA_ENABLED 0x01 /* POS 2, set if adapter enabled */
37
38/* Max number of adapters, including both slots and various integrated
39 * things.
40 */
41#define MCA_NUMADAPTERS (MCA_MAX_SLOT_NR+3)
42
43#endif /* _ASM_X86_MCA_H */
diff --git a/arch/x86/include/asm/mca_dma.h b/arch/x86/include/asm/mca_dma.h
new file mode 100644
index 000000000000..45271aef82dd
--- /dev/null
+++ b/arch/x86/include/asm/mca_dma.h
@@ -0,0 +1,201 @@
1#ifndef _ASM_X86_MCA_DMA_H
2#define _ASM_X86_MCA_DMA_H
3
4#include <asm/io.h>
5#include <linux/ioport.h>
6
7/*
8 * Microchannel specific DMA stuff. DMA on an MCA machine is fairly similar to
9 * standard PC dma, but it certainly has its quirks. DMA register addresses
10 * are in a different place and there are some added functions. Most of this
11 * should be pretty obvious on inspection. Note that the user must divide
12 * count by 2 when using 16-bit dma; that is not handled by these functions.
13 *
14 * Ramen Noodles are yummy.
15 *
16 * 1998 Tymm Twillman <tymm@computer.org>
17 */
18
19/*
20 * Registers that are used by the DMA controller; FN is the function register
21 * (tell the controller what to do) and EXE is the execution register (how
22 * to do it)
23 */
24
25#define MCA_DMA_REG_FN 0x18
26#define MCA_DMA_REG_EXE 0x1A
27
28/*
29 * Functions that the DMA controller can do
30 */
31
32#define MCA_DMA_FN_SET_IO 0x00
33#define MCA_DMA_FN_SET_ADDR 0x20
34#define MCA_DMA_FN_GET_ADDR 0x30
35#define MCA_DMA_FN_SET_COUNT 0x40
36#define MCA_DMA_FN_GET_COUNT 0x50
37#define MCA_DMA_FN_GET_STATUS 0x60
38#define MCA_DMA_FN_SET_MODE 0x70
39#define MCA_DMA_FN_SET_ARBUS 0x80
40#define MCA_DMA_FN_MASK 0x90
41#define MCA_DMA_FN_RESET_MASK 0xA0
42#define MCA_DMA_FN_MASTER_CLEAR 0xD0
43
44/*
45 * Modes (used by setting MCA_DMA_FN_MODE in the function register)
46 *
47 * Note that the MODE_READ is read from memory (write to device), and
48 * MODE_WRITE is vice-versa.
49 */
50
51#define MCA_DMA_MODE_XFER 0x04 /* read by default */
52#define MCA_DMA_MODE_READ 0x04 /* same as XFER */
53#define MCA_DMA_MODE_WRITE 0x08 /* OR with MODE_XFER to use */
54#define MCA_DMA_MODE_IO 0x01 /* DMA from IO register */
55#define MCA_DMA_MODE_16 0x40 /* 16 bit xfers */
56
57
58/**
59 * mca_enable_dma - enable DMA on a channel
60 * @dmanr: DMA channel
61 *
62 * Enable the MCA bus DMA on a channel. This can be called from
63 * IRQ context.
64 */
65
66static inline void mca_enable_dma(unsigned int dmanr)
67{
68 outb(MCA_DMA_FN_RESET_MASK | dmanr, MCA_DMA_REG_FN);
69}
70
71/**
72 * mca_disable_dma - disable DMA on a channel
73 * @dmanr: DMA channel
74 *
75 * Disable the MCA bus DMA on a channel. This can be called from
76 * IRQ context.
77 */
78
79static inline void mca_disable_dma(unsigned int dmanr)
80{
81 outb(MCA_DMA_FN_MASK | dmanr, MCA_DMA_REG_FN);
82}
83
84/**
85 * mca_set_dma_addr - load a 24bit DMA address
86 * @dmanr: DMA channel
87 * @a: 24bit bus address
88 *
89 * Load the address register in the DMA controller. This has a 24bit
90 * limitation (16Mb).
91 */
92
93static inline void mca_set_dma_addr(unsigned int dmanr, unsigned int a)
94{
95 outb(MCA_DMA_FN_SET_ADDR | dmanr, MCA_DMA_REG_FN);
96 outb(a & 0xff, MCA_DMA_REG_EXE);
97 outb((a >> 8) & 0xff, MCA_DMA_REG_EXE);
98 outb((a >> 16) & 0xff, MCA_DMA_REG_EXE);
99}
100
101/**
102 * mca_get_dma_addr - read the 24bit DMA address
103 * @dmanr: DMA channel
104 *
105 * Read the address register in the DMA controller. This has a 24bit
106 * limitation (16Mb). The return is a bus address.
107 */
108
109static inline unsigned int mca_get_dma_addr(unsigned int dmanr)
110{
111 unsigned int addr;
112
113 outb(MCA_DMA_FN_GET_ADDR | dmanr, MCA_DMA_REG_FN);
114 addr = inb(MCA_DMA_REG_EXE);
115 addr |= inb(MCA_DMA_REG_EXE) << 8;
116 addr |= inb(MCA_DMA_REG_EXE) << 16;
117
118 return addr;
119}
120
121/**
122 * mca_set_dma_count - load a 16bit transfer count
123 * @dmanr: DMA channel
124 * @count: count
125 *
126 * Set the DMA count for this channel. This can be up to 64Kbytes.
127 * Setting a count of zero will not do what you expect.
128 */
129
130static inline void mca_set_dma_count(unsigned int dmanr, unsigned int count)
131{
132 count--; /* transfers one more than count -- correct for this */
133
134 outb(MCA_DMA_FN_SET_COUNT | dmanr, MCA_DMA_REG_FN);
135 outb(count & 0xff, MCA_DMA_REG_EXE);
136 outb((count >> 8) & 0xff, MCA_DMA_REG_EXE);
137}
138
139/**
140 * mca_get_dma_residue - get the remaining bytes to transfer
141 * @dmanr: DMA channel
142 *
143 * This function returns the number of bytes left to transfer
144 * on this DMA channel.
145 */
146
147static inline unsigned int mca_get_dma_residue(unsigned int dmanr)
148{
149 unsigned short count;
150
151 outb(MCA_DMA_FN_GET_COUNT | dmanr, MCA_DMA_REG_FN);
152 count = 1 + inb(MCA_DMA_REG_EXE);
153 count += inb(MCA_DMA_REG_EXE) << 8;
154
155 return count;
156}
157
158/**
159 * mca_set_dma_io - set the port for an I/O transfer
160 * @dmanr: DMA channel
161 * @io_addr: an I/O port number
162 *
163 * Unlike the ISA bus DMA controllers, DMA on the MCA bus can transfer
164 * to or from an I/O port target.
165 */
166
167static inline void mca_set_dma_io(unsigned int dmanr, unsigned int io_addr)
168{
169 /*
170 * DMA from a port address -- set the io address
171 */
172
173 outb(MCA_DMA_FN_SET_IO | dmanr, MCA_DMA_REG_FN);
174 outb(io_addr & 0xff, MCA_DMA_REG_EXE);
175 outb((io_addr >> 8) & 0xff, MCA_DMA_REG_EXE);
176}
177
178/**
179 * mca_set_dma_mode - set the DMA mode
180 * @dmanr: DMA channel
181 * @mode: mode to set
182 *
183 * The DMA controller supports several modes. The mode values you can
184 * set are-
185 *
186 * %MCA_DMA_MODE_READ when reading from the DMA device.
187 *
188 * %MCA_DMA_MODE_WRITE when writing to the DMA device.
189 *
190 * %MCA_DMA_MODE_IO to do DMA to or from an I/O port.
191 *
192 * %MCA_DMA_MODE_16 to do 16bit transfers.
193 */
194
195static inline void mca_set_dma_mode(unsigned int dmanr, unsigned int mode)
196{
197 outb(MCA_DMA_FN_SET_MODE | dmanr, MCA_DMA_REG_FN);
198 outb(mode, MCA_DMA_REG_EXE);
199}
200
201#endif /* _ASM_X86_MCA_DMA_H */
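
A hedged sketch of programming a memory-to-device transfer with the helpers above (illustrative only; the channel, bus address and transfer count are caller-supplied, and a 16-bit transfer would also need MCA_DMA_MODE_16 and a halved count):

static void example_mca_dma_setup(unsigned int dmanr,
				  unsigned int bus_addr, unsigned int len)
{
	mca_disable_dma(dmanr);				/* mask the channel first */
	mca_set_dma_mode(dmanr, MCA_DMA_MODE_READ);	/* read memory, write device */
	mca_set_dma_addr(dmanr, bus_addr);		/* 24-bit bus address */
	mca_set_dma_count(dmanr, len);			/* up to 64K transfers */
	mca_enable_dma(dmanr);				/* unmask and start */
}
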
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
new file mode 100644
index 000000000000..1d6e17c2f23a
--- /dev/null
+++ b/arch/x86/include/asm/mce.h
@@ -0,0 +1,130 @@
1#ifndef _ASM_X86_MCE_H
2#define _ASM_X86_MCE_H
3
4#ifdef __x86_64__
5
6#include <asm/ioctls.h>
7#include <asm/types.h>
8
9/*
10 * Machine Check support for x86
11 */
12
13#define MCG_CTL_P (1UL<<8) /* MCG_CAP register available */
14
15#define MCG_STATUS_RIPV (1UL<<0) /* restart ip valid */
16#define MCG_STATUS_EIPV (1UL<<1) /* ip points to correct instruction */
17#define MCG_STATUS_MCIP (1UL<<2) /* machine check in progress */
18
19#define MCI_STATUS_VAL (1UL<<63) /* valid error */
20#define MCI_STATUS_OVER (1UL<<62) /* previous errors lost */
21#define MCI_STATUS_UC (1UL<<61) /* uncorrected error */
22#define MCI_STATUS_EN (1UL<<60) /* error enabled */
23#define MCI_STATUS_MISCV (1UL<<59) /* misc error reg. valid */
24#define MCI_STATUS_ADDRV (1UL<<58) /* addr reg. valid */
25#define MCI_STATUS_PCC (1UL<<57) /* processor context corrupt */
26
27/* Fields are zero when not available */
28struct mce {
29 __u64 status;
30 __u64 misc;
31 __u64 addr;
32 __u64 mcgstatus;
33 __u64 ip;
34 __u64 tsc; /* cpu time stamp counter */
35 __u64 res1; /* for future extension */
36	__u64 res2;	/* ditto */
37 __u8 cs; /* code segment */
38 __u8 bank; /* machine check bank */
39 __u8 cpu; /* cpu that raised the error */
40 __u8 finished; /* entry is valid */
41 __u32 pad;
42};
43
44/*
45 * This structure contains all data related to the MCE log. Also
46 * carries a signature to make it easier to find from external
47 * debugging tools. Each entry is only valid when its finished flag
48 * is set.
49 */
50
51#define MCE_LOG_LEN 32
52
53struct mce_log {
54 char signature[12]; /* "MACHINECHECK" */
55 unsigned len; /* = MCE_LOG_LEN */
56 unsigned next;
57 unsigned flags;
58 unsigned pad0;
59 struct mce entry[MCE_LOG_LEN];
60};
61
62#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
63
64#define MCE_LOG_SIGNATURE "MACHINECHECK"
65
66#define MCE_GET_RECORD_LEN _IOR('M', 1, int)
67#define MCE_GET_LOG_LEN _IOR('M', 2, int)
68#define MCE_GETCLEAR_FLAGS _IOR('M', 3, int)
69
70/* Software defined banks */
71#define MCE_EXTENDED_BANK 128
72#define MCE_THERMAL_BANK MCE_EXTENDED_BANK + 0
73
74#define K8_MCE_THRESHOLD_BASE (MCE_EXTENDED_BANK + 1) /* MCE_AMD */
75#define K8_MCE_THRESHOLD_BANK_0    (K8_MCE_THRESHOLD_BASE + 0 * 9)
76#define K8_MCE_THRESHOLD_BANK_1    (K8_MCE_THRESHOLD_BASE + 1 * 9)
77#define K8_MCE_THRESHOLD_BANK_2    (K8_MCE_THRESHOLD_BASE + 2 * 9)
78#define K8_MCE_THRESHOLD_BANK_3    (K8_MCE_THRESHOLD_BASE + 3 * 9)
79#define K8_MCE_THRESHOLD_BANK_4    (K8_MCE_THRESHOLD_BASE + 4 * 9)
80#define K8_MCE_THRESHOLD_BANK_5    (K8_MCE_THRESHOLD_BASE + 5 * 9)
81#define K8_MCE_THRESHOLD_DRAM_ECC  (K8_MCE_THRESHOLD_BANK_4 + 0)
82
83#endif /* __x86_64__ */
84
85#ifdef __KERNEL__
86
87#ifdef CONFIG_X86_32
88extern int mce_disabled;
89#else /* CONFIG_X86_32 */
90
91#include <asm/atomic.h>
92
93void mce_log(struct mce *m);
94DECLARE_PER_CPU(struct sys_device, device_mce);
95extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);
96
97#ifdef CONFIG_X86_MCE_INTEL
98void mce_intel_feature_init(struct cpuinfo_x86 *c);
99#else
100static inline void mce_intel_feature_init(struct cpuinfo_x86 *c) { }
101#endif
102
103#ifdef CONFIG_X86_MCE_AMD
104void mce_amd_feature_init(struct cpuinfo_x86 *c);
105#else
106static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { }
107#endif
108
109void mce_log_therm_throt_event(unsigned int cpu, __u64 status);
110
111extern atomic_t mce_entry;
112
113extern void do_machine_check(struct pt_regs *, long);
114extern int mce_notify_user(void);
115
116#endif /* !CONFIG_X86_32 */
117
118
119
120#ifdef CONFIG_X86_MCE
121extern void mcheck_init(struct cpuinfo_x86 *c);
122#else
123#define mcheck_init(c) do { } while (0)
124#endif
125extern void stop_mce(void);
126extern void restart_mce(void);
127
128#endif /* __KERNEL__ */
129
130#endif /* _ASM_X86_MCE_H */
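
A hedged sketch of walking the log structure defined above (the function name is hypothetical; per the comment, an entry is only meaningful once its finished flag is set):

static int example_count_valid_mce_entries(const struct mce_log *log)
{
	int i, n = 0;

	for (i = 0; i < MCE_LOG_LEN; i++)
		if (log->entry[i].finished)	/* entry is complete */
			n++;
	return n;
}
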
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
new file mode 100644
index 000000000000..c882664716c1
--- /dev/null
+++ b/arch/x86/include/asm/microcode.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_MICROCODE_H
2#define _ASM_X86_MICROCODE_H
3
4struct cpu_signature {
5 unsigned int sig;
6 unsigned int pf;
7 unsigned int rev;
8};
9
10struct device;
11
12struct microcode_ops {
13 int (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
14 int (*request_microcode_fw) (int cpu, struct device *device);
15
16 void (*apply_microcode) (int cpu);
17
18 int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
19 void (*microcode_fini_cpu) (int cpu);
20};
21
22struct ucode_cpu_info {
23 struct cpu_signature cpu_sig;
24 int valid;
25 void *mc;
26};
27extern struct ucode_cpu_info ucode_cpu_info[];
28
29#ifdef CONFIG_MICROCODE_INTEL
30extern struct microcode_ops * __init init_intel_microcode(void);
31#else
32static inline struct microcode_ops * __init init_intel_microcode(void)
33{
34 return NULL;
35}
36#endif /* CONFIG_MICROCODE_INTEL */
37
38#ifdef CONFIG_MICROCODE_AMD
39extern struct microcode_ops * __init init_amd_microcode(void);
40#else
41static inline struct microcode_ops * __init init_amd_microcode(void)
42{
43 return NULL;
44}
45#endif
46
47#endif /* _ASM_X86_MICROCODE_H */
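
A hedged sketch of how generic driver code might drive a vendor backend through the ops table above (the control flow is an assumption for illustration; error codes are placeholders):

static int example_update_one_cpu(struct microcode_ops *ops, int cpu,
				  struct device *dev)
{
	struct ucode_cpu_info *uci = &ucode_cpu_info[cpu];

	if (ops->collect_cpu_info(cpu, &uci->cpu_sig))
		return -1;			/* could not identify the CPU */
	if (ops->request_microcode_fw(cpu, dev))
		return -1;			/* no suitable image found */
	ops->apply_microcode(cpu);		/* load the update into the CPU */
	return 0;
}
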
diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index 000000000000..90bc4108a4fd
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_MMAN_H
2#define _ASM_X86_MMAN_H
3
4#include <asm-generic/mman.h>
5
6#define MAP_32BIT 0x40 /* only give out 32bit addresses */
7
8#define MAP_GROWSDOWN 0x0100 /* stack-like segment */
9#define MAP_DENYWRITE 0x0800 /* ETXTBSY */
10#define MAP_EXECUTABLE 0x1000 /* mark it as an executable */
11#define MAP_LOCKED 0x2000 /* pages are locked */
12#define MAP_NORESERVE 0x4000 /* don't check for reservations */
13#define MAP_POPULATE 0x8000 /* populate (prefault) pagetables */
14#define MAP_NONBLOCK 0x10000 /* do not block on IO */
15#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
16
17#define MCL_CURRENT 1 /* lock all current mappings */
18#define MCL_FUTURE 2 /* lock all future mappings */
19
20#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/asm/mmconfig.h b/arch/x86/include/asm/mmconfig.h
new file mode 100644
index 000000000000..9b119da1d105
--- /dev/null
+++ b/arch/x86/include/asm/mmconfig.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_MMCONFIG_H
2#define _ASM_X86_MMCONFIG_H
3
4#ifdef CONFIG_PCI_MMCONFIG
5extern void __cpuinit fam10h_check_enable_mmcfg(void);
6extern void __cpuinit check_enable_amd_mmconf_dmi(void);
7#else
8static inline void fam10h_check_enable_mmcfg(void) { }
9static inline void check_enable_amd_mmconf_dmi(void) { }
10#endif
11
12#endif /* _ASM_X86_MMCONFIG_H */
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
new file mode 100644
index 000000000000..80a1dee5bea5
--- /dev/null
+++ b/arch/x86/include/asm/mmu.h
@@ -0,0 +1,26 @@
1#ifndef _ASM_X86_MMU_H
2#define _ASM_X86_MMU_H
3
4#include <linux/spinlock.h>
5#include <linux/mutex.h>
6
7/*
8 * The x86 doesn't have an MMU context, but
9 * we put the segment information here.
10 */
11typedef struct {
12 void *ldt;
13 int size;
14 struct mutex lock;
15 void *vdso;
16} mm_context_t;
17
18#ifdef CONFIG_SMP
19void leave_mm(int cpu);
20#else
21static inline void leave_mm(int cpu)
22{
23}
24#endif
25
26#endif /* _ASM_X86_MMU_H */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
new file mode 100644
index 000000000000..8aeeb3fd73db
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context.h
@@ -0,0 +1,37 @@
1#ifndef _ASM_X86_MMU_CONTEXT_H
2#define _ASM_X86_MMU_CONTEXT_H
3
4#include <asm/desc.h>
5#include <asm/atomic.h>
6#include <asm/pgalloc.h>
7#include <asm/tlbflush.h>
8#include <asm/paravirt.h>
9#ifndef CONFIG_PARAVIRT
10#include <asm-generic/mm_hooks.h>
11
12static inline void paravirt_activate_mm(struct mm_struct *prev,
13 struct mm_struct *next)
14{
15}
16#endif /* !CONFIG_PARAVIRT */
17
18/*
19 * Used for LDT copy/destruction.
20 */
21int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
22void destroy_context(struct mm_struct *mm);
23
24#ifdef CONFIG_X86_32
25# include "mmu_context_32.h"
26#else
27# include "mmu_context_64.h"
28#endif
29
30#define activate_mm(prev, next) \
31do { \
32 paravirt_activate_mm((prev), (next)); \
33 switch_mm((prev), (next), NULL); \
34} while (0);
35
36
37#endif /* _ASM_X86_MMU_CONTEXT_H */
diff --git a/arch/x86/include/asm/mmu_context_32.h b/arch/x86/include/asm/mmu_context_32.h
new file mode 100644
index 000000000000..8e10015781fb
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_32.h
@@ -0,0 +1,56 @@
1#ifndef _ASM_X86_MMU_CONTEXT_32_H
2#define _ASM_X86_MMU_CONTEXT_32_H
3
4static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
5{
6#ifdef CONFIG_SMP
7 unsigned cpu = smp_processor_id();
8 if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK)
9 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_LAZY;
10#endif
11}
12
13static inline void switch_mm(struct mm_struct *prev,
14 struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 int cpu = smp_processor_id();
18
19 if (likely(prev != next)) {
20 /* stop flush ipis for the previous mm */
21 cpu_clear(cpu, prev->cpu_vm_mask);
22#ifdef CONFIG_SMP
23 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
24 per_cpu(cpu_tlbstate, cpu).active_mm = next;
25#endif
26 cpu_set(cpu, next->cpu_vm_mask);
27
28 /* Re-load page tables */
29 load_cr3(next->pgd);
30
31 /*
32 * load the LDT, if the LDT is different:
33 */
34 if (unlikely(prev->context.ldt != next->context.ldt))
35 load_LDT_nolock(&next->context);
36 }
37#ifdef CONFIG_SMP
38 else {
39 per_cpu(cpu_tlbstate, cpu).state = TLBSTATE_OK;
40 BUG_ON(per_cpu(cpu_tlbstate, cpu).active_mm != next);
41
42 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
43 /* We were in lazy tlb mode and leave_mm disabled
44 * tlb flush IPI delivery. We must reload %cr3.
45 */
46 load_cr3(next->pgd);
47 load_LDT_nolock(&next->context);
48 }
49 }
50#endif
51}
52
53#define deactivate_mm(tsk, mm) \
54 asm("movl %0,%%gs": :"r" (0));
55
56#endif /* _ASM_X86_MMU_CONTEXT_32_H */
diff --git a/arch/x86/include/asm/mmu_context_64.h b/arch/x86/include/asm/mmu_context_64.h
new file mode 100644
index 000000000000..677d36e9540a
--- /dev/null
+++ b/arch/x86/include/asm/mmu_context_64.h
@@ -0,0 +1,54 @@
1#ifndef _ASM_X86_MMU_CONTEXT_64_H
2#define _ASM_X86_MMU_CONTEXT_64_H
3
4#include <asm/pda.h>
5
6static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
7{
8#ifdef CONFIG_SMP
9 if (read_pda(mmu_state) == TLBSTATE_OK)
10 write_pda(mmu_state, TLBSTATE_LAZY);
11#endif
12}
13
14static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
15 struct task_struct *tsk)
16{
17 unsigned cpu = smp_processor_id();
18 if (likely(prev != next)) {
19 /* stop flush ipis for the previous mm */
20 cpu_clear(cpu, prev->cpu_vm_mask);
21#ifdef CONFIG_SMP
22 write_pda(mmu_state, TLBSTATE_OK);
23 write_pda(active_mm, next);
24#endif
25 cpu_set(cpu, next->cpu_vm_mask);
26 load_cr3(next->pgd);
27
28 if (unlikely(next->context.ldt != prev->context.ldt))
29 load_LDT_nolock(&next->context);
30 }
31#ifdef CONFIG_SMP
32 else {
33 write_pda(mmu_state, TLBSTATE_OK);
34 if (read_pda(active_mm) != next)
35 BUG();
36 if (!cpu_test_and_set(cpu, next->cpu_vm_mask)) {
37 /* We were in lazy tlb mode and leave_mm disabled
38 * tlb flush IPI delivery. We must reload CR3
39 * to make sure to use no freed page tables.
40 */
41 load_cr3(next->pgd);
42 load_LDT_nolock(&next->context);
43 }
44 }
45#endif
46}
47
48#define deactivate_mm(tsk, mm) \
49do { \
50 load_gs_index(0); \
51 asm volatile("movl %0,%%fs"::"r"(0)); \
52} while (0)
53
54#endif /* _ASM_X86_MMU_CONTEXT_64_H */
diff --git a/arch/x86/include/asm/mmx.h b/arch/x86/include/asm/mmx.h
new file mode 100644
index 000000000000..5cbf3135b971
--- /dev/null
+++ b/arch/x86/include/asm/mmx.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_MMX_H
2#define _ASM_X86_MMX_H
3
4/*
5 * MMX 3Dnow! helper operations
6 */
7
8#include <linux/types.h>
9
10extern void *_mmx_memcpy(void *to, const void *from, size_t size);
11extern void mmx_clear_page(void *page);
12extern void mmx_copy_page(void *to, void *from);
13
14#endif /* _ASM_X86_MMX_H */
diff --git a/arch/x86/include/asm/mmzone.h b/arch/x86/include/asm/mmzone.h
new file mode 100644
index 000000000000..64217ea16a36
--- /dev/null
+++ b/arch/x86/include/asm/mmzone.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "mmzone_32.h"
3#else
4# include "mmzone_64.h"
5#endif
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
new file mode 100644
index 000000000000..485bdf059ffb
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -0,0 +1,134 @@
1/*
2 * Written by Pat Gaughen (gone@us.ibm.com) Mar 2002
3 *
4 */
5
6#ifndef _ASM_X86_MMZONE_32_H
7#define _ASM_X86_MMZONE_32_H
8
9#include <asm/smp.h>
10
11#ifdef CONFIG_NUMA
12extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid])
14
15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows the kernel to be compiled for any one NUMA
22 * architecture and still fall back to the flat function if it
23 * fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34
35extern int early_pfn_to_nid(unsigned long pfn);
36
37#else /* !CONFIG_NUMA */
38
39#define get_memcfg_numa get_memcfg_numa_flat
40
41#endif /* CONFIG_NUMA */
42
43#ifdef CONFIG_DISCONTIGMEM
44
45/*
46 * generic node memory support, the following assumptions apply:
47 *
48 * 1) memory comes in 64Mb contiguous chunks which are either present or not
49 * 2) we will not have more than 64Gb in total
50 *
51 * for now assume that 64Gb is the maximum amount of RAM for the whole system
52 * 64Gb / 4096bytes/page = 16777216 pages
53 */
54#define MAX_NR_PAGES 16777216
55#define MAX_ELEMENTS 1024
56#define PAGES_PER_ELEMENT (MAX_NR_PAGES/MAX_ELEMENTS)
57
58extern s8 physnode_map[];
59
60static inline int pfn_to_nid(unsigned long pfn)
61{
62#ifdef CONFIG_NUMA
63 return((int) physnode_map[(pfn) / PAGES_PER_ELEMENT]);
64#else
65 return 0;
66#endif
67}
68
69/*
70 * Following are macros that each NUMA implementation must define.
71 */
72
73#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
74#define node_end_pfn(nid) \
75({ \
76 pg_data_t *__pgdat = NODE_DATA(nid); \
77 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
78})
79
80static inline int pfn_valid(int pfn)
81{
82 int nid = pfn_to_nid(pfn);
83
84 if (nid >= 0)
85 return (pfn < node_end_pfn(nid));
86 return 0;
87}
88
89#endif /* CONFIG_DISCONTIGMEM */
90
91#ifdef CONFIG_NEED_MULTIPLE_NODES
92
93/*
94 * Following are macros that are specific to this numa platform.
95 */
96#define reserve_bootmem(addr, size, flags) \
97 reserve_bootmem_node(NODE_DATA(0), (addr), (size), (flags))
98#define alloc_bootmem(x) \
99 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS))
100#define alloc_bootmem_nopanic(x) \
101 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
102 __pa(MAX_DMA_ADDRESS))
103#define alloc_bootmem_low(x) \
104 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, 0)
105#define alloc_bootmem_pages(x) \
106 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, __pa(MAX_DMA_ADDRESS))
107#define alloc_bootmem_pages_nopanic(x) \
108 __alloc_bootmem_node_nopanic(NODE_DATA(0), (x), PAGE_SIZE, \
109 __pa(MAX_DMA_ADDRESS))
110#define alloc_bootmem_low_pages(x) \
111 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0)
112#define alloc_bootmem_node(pgdat, x) \
113({ \
114 struct pglist_data __maybe_unused \
115 *__alloc_bootmem_node__pgdat = (pgdat); \
116 __alloc_bootmem_node(NODE_DATA(0), (x), SMP_CACHE_BYTES, \
117 __pa(MAX_DMA_ADDRESS)); \
118})
119#define alloc_bootmem_pages_node(pgdat, x) \
120({ \
121 struct pglist_data __maybe_unused \
122 *__alloc_bootmem_node__pgdat = (pgdat); \
123 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, \
124 __pa(MAX_DMA_ADDRESS)); \
125})
126#define alloc_bootmem_low_pages_node(pgdat, x) \
127({ \
128 struct pglist_data __maybe_unused \
129 *__alloc_bootmem_node__pgdat = (pgdat); \
130 __alloc_bootmem_node(NODE_DATA(0), (x), PAGE_SIZE, 0); \
131})
132#endif /* CONFIG_NEED_MULTIPLE_NODES */
133
134#endif /* _ASM_X86_MMZONE_32_H */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
new file mode 100644
index 000000000000..a5b3817d4b9e
--- /dev/null
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -0,0 +1,51 @@
1/* K8 NUMA support */
2/* Copyright 2002,2003 by Andi Kleen, SuSE Labs */
3/* 2.5 Version loosely based on the NUMAQ Code by Pat Gaughen. */
4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H
6
7
8#ifdef CONFIG_NUMA
9
10#include <linux/mmdebug.h>
11
12#include <asm/smp.h>
13
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[];
27
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid])
38
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages)
42
43extern int early_pfn_to_nid(unsigned long pfn);
44
45#ifdef CONFIG_NUMA_EMU
46#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
47#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
48#endif
49
50#endif
51#endif /* _ASM_X86_MMZONE_64_H */
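
A hedged sketch of the intended use of the memnode hash above: translate a physical address to its node and fetch that node's pg_data_t (assumes CONFIG_NUMA and an initialized memnodemap; the function name is hypothetical):

static struct pglist_data *example_pgdat_for_paddr(unsigned long paddr)
{
	int nid = phys_to_nid(paddr);	/* perfect-hash lookup */

	return NODE_DATA(nid);		/* per-node pglist_data */
}
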
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
new file mode 100644
index 000000000000..47d62743c4d5
--- /dev/null
+++ b/arch/x86/include/asm/module.h
@@ -0,0 +1,80 @@
1#ifndef _ASM_X86_MODULE_H
2#define _ASM_X86_MODULE_H
3
4/* x86_32/64 are simple */
5struct mod_arch_specific {};
6
7#ifdef CONFIG_X86_32
8# define Elf_Shdr Elf32_Shdr
9# define Elf_Sym Elf32_Sym
10# define Elf_Ehdr Elf32_Ehdr
11#else
12# define Elf_Shdr Elf64_Shdr
13# define Elf_Sym Elf64_Sym
14# define Elf_Ehdr Elf64_Ehdr
15#endif
16
17#ifdef CONFIG_X86_64
18/* X86_64 does not define MODULE_PROC_FAMILY */
19#elif defined CONFIG_M386
20#define MODULE_PROC_FAMILY "386 "
21#elif defined CONFIG_M486
22#define MODULE_PROC_FAMILY "486 "
23#elif defined CONFIG_M586
24#define MODULE_PROC_FAMILY "586 "
25#elif defined CONFIG_M586TSC
26#define MODULE_PROC_FAMILY "586TSC "
27#elif defined CONFIG_M586MMX
28#define MODULE_PROC_FAMILY "586MMX "
29#elif defined CONFIG_MCORE2
30#define MODULE_PROC_FAMILY "CORE2 "
31#elif defined CONFIG_M686
32#define MODULE_PROC_FAMILY "686 "
33#elif defined CONFIG_MPENTIUMII
34#define MODULE_PROC_FAMILY "PENTIUMII "
35#elif defined CONFIG_MPENTIUMIII
36#define MODULE_PROC_FAMILY "PENTIUMIII "
37#elif defined CONFIG_MPENTIUMM
38#define MODULE_PROC_FAMILY "PENTIUMM "
39#elif defined CONFIG_MPENTIUM4
40#define MODULE_PROC_FAMILY "PENTIUM4 "
41#elif defined CONFIG_MK6
42#define MODULE_PROC_FAMILY "K6 "
43#elif defined CONFIG_MK7
44#define MODULE_PROC_FAMILY "K7 "
45#elif defined CONFIG_MK8
46#define MODULE_PROC_FAMILY "K8 "
47#elif defined CONFIG_X86_ELAN
48#define MODULE_PROC_FAMILY "ELAN "
49#elif defined CONFIG_MCRUSOE
50#define MODULE_PROC_FAMILY "CRUSOE "
51#elif defined CONFIG_MEFFICEON
52#define MODULE_PROC_FAMILY "EFFICEON "
53#elif defined CONFIG_MWINCHIPC6
54#define MODULE_PROC_FAMILY "WINCHIPC6 "
55#elif defined CONFIG_MWINCHIP3D
56#define MODULE_PROC_FAMILY "WINCHIP3D "
57#elif defined CONFIG_MCYRIXIII
58#define MODULE_PROC_FAMILY "CYRIXIII "
59#elif defined CONFIG_MVIAC3_2
60#define MODULE_PROC_FAMILY "VIAC3-2 "
61#elif defined CONFIG_MVIAC7
62#define MODULE_PROC_FAMILY "VIAC7 "
63#elif defined CONFIG_MGEODEGX1
64#define MODULE_PROC_FAMILY "GEODEGX1 "
65#elif defined CONFIG_MGEODE_LX
66#define MODULE_PROC_FAMILY "GEODE "
67#else
68#error unknown processor family
69#endif
70
71#ifdef CONFIG_X86_32
72# ifdef CONFIG_4KSTACKS
73# define MODULE_STACKSIZE "4KSTACKS "
74# else
75# define MODULE_STACKSIZE ""
76# endif
77# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
78#endif
79
80#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
new file mode 100644
index 000000000000..91885c28f66b
--- /dev/null
+++ b/arch/x86/include/asm/mpspec.h
@@ -0,0 +1,145 @@
1#ifndef _ASM_X86_MPSPEC_H
2#define _ASM_X86_MPSPEC_H
3
4#include <linux/init.h>
5
6#include <asm/mpspec_def.h>
7
8extern int apic_version[MAX_APICS];
9
10#ifdef CONFIG_X86_32
11#include <mach_mpspec.h>
12
13extern unsigned int def_to_bigsmp;
14extern u8 apicid_2_node[];
15extern int pic_mode;
16
17#ifdef CONFIG_X86_NUMAQ
18extern int mp_bus_id_to_node[MAX_MP_BUSSES];
19extern int mp_bus_id_to_local[MAX_MP_BUSSES];
20extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
21#endif
22
23#define MAX_APICID 256
24
25#else
26
27#define MAX_MP_BUSSES 256
28/* Each PCI slot may be a combo card with its own bus. 4 IRQ pins per slot. */
29#define MAX_IRQ_SOURCES (MAX_MP_BUSSES * 4)
30
31#endif
32
33extern void early_find_smp_config(void);
34extern void early_get_smp_config(void);
35
36#if defined(CONFIG_MCA) || defined(CONFIG_EISA)
37extern int mp_bus_id_to_type[MAX_MP_BUSSES];
38#endif
39
40extern DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
41
42extern unsigned int boot_cpu_physical_apicid;
43extern unsigned int max_physical_apicid;
44extern int smp_found_config;
45extern int mpc_default_type;
46extern unsigned long mp_lapic_addr;
47
48extern void find_smp_config(void);
49extern void get_smp_config(void);
50#ifdef CONFIG_X86_MPPARSE
51extern void early_reserve_e820_mpc_new(void);
52#else
53static inline void early_reserve_e820_mpc_new(void) { }
54#endif
55
56void __cpuinit generic_processor_info(int apicid, int version);
57#ifdef CONFIG_ACPI
58extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
59extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
60 u32 gsi);
61extern void mp_config_acpi_legacy_irqs(void);
62extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
63#ifdef CONFIG_X86_IO_APIC
64extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
65 u32 gsi, int triggering, int polarity);
66#else
67static inline int
68mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
69 u32 gsi, int triggering, int polarity)
70{
71 return 0;
72}
73#endif
74#endif /* CONFIG_ACPI */
75
76#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS)
77
78struct physid_mask {
79 unsigned long mask[PHYSID_ARRAY_SIZE];
80};
81
82typedef struct physid_mask physid_mask_t;
83
84#define physid_set(physid, map) set_bit(physid, (map).mask)
85#define physid_clear(physid, map) clear_bit(physid, (map).mask)
86#define physid_isset(physid, map) test_bit(physid, (map).mask)
87#define physid_test_and_set(physid, map) \
88 test_and_set_bit(physid, (map).mask)
89
90#define physids_and(dst, src1, src2) \
91 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
92
93#define physids_or(dst, src1, src2) \
94 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS)
95
96#define physids_clear(map) \
97 bitmap_zero((map).mask, MAX_APICS)
98
99#define physids_complement(dst, src) \
100 bitmap_complement((dst).mask, (src).mask, MAX_APICS)
101
102#define physids_empty(map) \
103 bitmap_empty((map).mask, MAX_APICS)
104
105#define physids_equal(map1, map2) \
106 bitmap_equal((map1).mask, (map2).mask, MAX_APICS)
107
108#define physids_weight(map) \
109 bitmap_weight((map).mask, MAX_APICS)
110
111#define physids_shift_right(d, s, n) \
112 bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS)
113
114#define physids_shift_left(d, s, n) \
115 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
116
117#define physids_coerce(map) ((map).mask[0])
118
119#define physids_promote(physids) \
120 ({ \
121 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
122 __physid_mask.mask[0] = physids; \
123 __physid_mask; \
124 })
125
126/* Note: will create very large stack frames if physid_mask_t is big */
127#define physid_mask_of_physid(physid) \
128 ({ \
129 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
130 physid_set(physid, __physid_mask); \
131 __physid_mask; \
132 })
133
134static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
135{
136 physids_clear(*map);
137 physid_set(physid, *map);
138}
139
140#define PHYSID_MASK_ALL { {[0 ... PHYSID_ARRAY_SIZE-1] = ~0UL} }
141#define PHYSID_MASK_NONE { {[0 ... PHYSID_ARRAY_SIZE-1] = 0UL} }
142
143extern physid_mask_t phys_cpu_present_map;
144
145#endif /* _ASM_X86_MPSPEC_H */
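
A hedged sketch of the physid mask helpers above, building a single-ID mask and testing membership in the present map (illustrative only; the function name is hypothetical):

static int example_apicid_is_present(int apicid)
{
	physid_mask_t one = physid_mask_of_physid(apicid);	/* one bit set */

	return physids_weight(one) == 1 &&
	       physid_isset(apicid, phys_cpu_present_map);	/* APIC ID present? */
}
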
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
new file mode 100644
index 000000000000..e3ace7d1d35d
--- /dev/null
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -0,0 +1,180 @@
1#ifndef _ASM_X86_MPSPEC_DEF_H
2#define _ASM_X86_MPSPEC_DEF_H
3
4/*
5 * Structure definitions for SMP machines following the
6 * Intel Multiprocessing Specification 1.1 and 1.4.
7 */
8
9/*
10 * This tag identifies where the SMP configuration
11 * information is.
12 */
13
14#define SMP_MAGIC_IDENT (('_'<<24) | ('P'<<16) | ('M'<<8) | '_')
15
16#ifdef CONFIG_X86_32
17# define MAX_MPC_ENTRY 1024
18# define MAX_APICS 256
19#else
20# if NR_CPUS <= 255
21# define MAX_APICS 255
22# else
23# define MAX_APICS 32768
24# endif
25#endif
26
27struct intel_mp_floating {
28 char mpf_signature[4]; /* "_MP_" */
29 unsigned int mpf_physptr; /* Configuration table address */
30 unsigned char mpf_length; /* Our length (paragraphs) */
31 unsigned char mpf_specification;/* Specification version */
32 unsigned char mpf_checksum; /* Checksum (makes sum 0) */
33 unsigned char mpf_feature1; /* Standard or configuration ? */
34 unsigned char mpf_feature2; /* Bit7 set for IMCR|PIC */
35 unsigned char mpf_feature3; /* Unused (0) */
36 unsigned char mpf_feature4; /* Unused (0) */
37 unsigned char mpf_feature5; /* Unused (0) */
38};
39
40#define MPC_SIGNATURE "PCMP"
41
42struct mp_config_table {
43 char mpc_signature[4];
44 unsigned short mpc_length; /* Size of table */
45 char mpc_spec; /* 0x01 */
46 char mpc_checksum;
47 char mpc_oem[8];
48 char mpc_productid[12];
49 unsigned int mpc_oemptr; /* 0 if not present */
50 unsigned short mpc_oemsize; /* 0 if not present */
51 unsigned short mpc_oemcount;
52 unsigned int mpc_lapic; /* APIC address */
53 unsigned int reserved;
54};
55
56/* Followed by entries */
57
58#define MP_PROCESSOR 0
59#define MP_BUS 1
60#define MP_IOAPIC 2
61#define MP_INTSRC 3
62#define MP_LINTSRC 4
63/* Used by IBM NUMA-Q to describe node locality */
64#define MP_TRANSLATION 192
65
66#define CPU_ENABLED 1 /* Processor is available */
67#define CPU_BOOTPROCESSOR 2 /* Processor is the BP */
68
69#define CPU_STEPPING_MASK 0x000F
70#define CPU_MODEL_MASK 0x00F0
71#define CPU_FAMILY_MASK 0x0F00
72
73struct mpc_config_processor {
74 unsigned char mpc_type;
75 unsigned char mpc_apicid; /* Local APIC number */
76 unsigned char mpc_apicver; /* Its versions */
77 unsigned char mpc_cpuflag;
78 unsigned int mpc_cpufeature;
79 unsigned int mpc_featureflag; /* CPUID feature value */
80 unsigned int mpc_reserved[2];
81};
82
83struct mpc_config_bus {
84 unsigned char mpc_type;
85 unsigned char mpc_busid;
86 unsigned char mpc_bustype[6];
87};
88
89/* List of Bus Type string values, Intel MP Spec. */
90#define BUSTYPE_EISA "EISA"
91#define BUSTYPE_ISA "ISA"
92#define BUSTYPE_INTERN "INTERN" /* Internal BUS */
93#define BUSTYPE_MCA "MCA"
94#define BUSTYPE_VL "VL" /* Local bus */
95#define BUSTYPE_PCI "PCI"
96#define BUSTYPE_PCMCIA "PCMCIA"
97#define BUSTYPE_CBUS "CBUS"
98#define BUSTYPE_CBUSII "CBUSII"
99#define BUSTYPE_FUTURE "FUTURE"
100#define BUSTYPE_MBI "MBI"
101#define BUSTYPE_MBII "MBII"
102#define BUSTYPE_MPI "MPI"
103#define BUSTYPE_MPSA "MPSA"
104#define BUSTYPE_NUBUS "NUBUS"
105#define BUSTYPE_TC "TC"
106#define BUSTYPE_VME "VME"
107#define BUSTYPE_XPRESS "XPRESS"
108
109#define MPC_APIC_USABLE 0x01
110
111struct mpc_config_ioapic {
112 unsigned char mpc_type;
113 unsigned char mpc_apicid;
114 unsigned char mpc_apicver;
115 unsigned char mpc_flags;
116 unsigned int mpc_apicaddr;
117};
118
119struct mpc_config_intsrc {
120 unsigned char mpc_type;
121 unsigned char mpc_irqtype;
122 unsigned short mpc_irqflag;
123 unsigned char mpc_srcbus;
124 unsigned char mpc_srcbusirq;
125 unsigned char mpc_dstapic;
126 unsigned char mpc_dstirq;
127};
128
129enum mp_irq_source_types {
130 mp_INT = 0,
131 mp_NMI = 1,
132 mp_SMI = 2,
133 mp_ExtINT = 3
134};
135
136#define MP_IRQDIR_DEFAULT 0
137#define MP_IRQDIR_HIGH 1
138#define MP_IRQDIR_LOW 3
139
140#define MP_APIC_ALL 0xFF
141
142struct mpc_config_lintsrc {
143 unsigned char mpc_type;
144 unsigned char mpc_irqtype;
145 unsigned short mpc_irqflag;
146 unsigned char mpc_srcbusid;
147 unsigned char mpc_srcbusirq;
148 unsigned char mpc_destapic;
149 unsigned char mpc_destapiclint;
150};
151
152#define MPC_OEM_SIGNATURE "_OEM"
153
154struct mp_config_oemtable {
155 char oem_signature[4];
156 unsigned short oem_length; /* Size of table */
157 char oem_rev; /* 0x01 */
158 char oem_checksum;
159 char mpc_oem[8];
160};
161
162/*
163 * Default configurations
164 *
165 * 1 2 CPU ISA 82489DX
166 * 2 2 CPU EISA 82489DX neither IRQ 0 timer nor IRQ 13 DMA chaining
167 * 3 2 CPU EISA 82489DX
168 * 4 2 CPU MCA 82489DX
169 * 5 2 CPU ISA+PCI
170 * 6 2 CPU EISA+PCI
171 * 7 2 CPU MCA+PCI
172 */
173
174enum mp_bustype {
175 MP_BUS_ISA = 1,
176 MP_BUS_EISA,
177 MP_BUS_PCI,
178 MP_BUS_MCA,
179};
180#endif /* _ASM_X86_MPSPEC_DEF_H */
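
A hedged sketch of how a scanner might recognise the floating pointer structure defined above (checksum verification omitted; the signature comparison follows the SMP_MAGIC_IDENT encoding):

static int example_is_mp_floating(const struct intel_mp_floating *mpf)
{
	const unsigned int *sig = (const unsigned int *)mpf->mpf_signature;

	return *sig == SMP_MAGIC_IDENT;		/* little-endian "_MP_" */
}
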
diff --git a/arch/x86/include/asm/msgbuf.h b/arch/x86/include/asm/msgbuf.h
new file mode 100644
index 000000000000..7e4e9481f51c
--- /dev/null
+++ b/arch/x86/include/asm/msgbuf.h
@@ -0,0 +1,39 @@
1#ifndef _ASM_X86_MSGBUF_H
2#define _ASM_X86_MSGBUF_H
3
4/*
5 * The msqid64_ds structure for the i386 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on i386 is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on x86-64 is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16struct msqid64_ds {
17 struct ipc64_perm msg_perm;
18 __kernel_time_t msg_stime; /* last msgsnd time */
19#ifdef __i386__
20 unsigned long __unused1;
21#endif
22 __kernel_time_t msg_rtime; /* last msgrcv time */
23#ifdef __i386__
24 unsigned long __unused2;
25#endif
26 __kernel_time_t msg_ctime; /* last change time */
27#ifdef __i386__
28 unsigned long __unused3;
29#endif
30 unsigned long msg_cbytes; /* current number of bytes on queue */
31 unsigned long msg_qnum; /* number of messages in queue */
32 unsigned long msg_qbytes; /* max number of bytes on queue */
33 __kernel_pid_t msg_lspid; /* pid of last msgsnd */
34 __kernel_pid_t msg_lrpid; /* last receive pid */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39#endif /* _ASM_X86_MSGBUF_H */
diff --git a/arch/x86/include/asm/msidef.h b/arch/x86/include/asm/msidef.h
new file mode 100644
index 000000000000..6706b3006f13
--- /dev/null
+++ b/arch/x86/include/asm/msidef.h
@@ -0,0 +1,55 @@
1#ifndef _ASM_X86_MSIDEF_H
2#define _ASM_X86_MSIDEF_H
3
4/*
5 * Constants for Intel APIC based MSI messages.
6 */
7
8/*
9 * Shifts for MSI data
10 */
11
12#define MSI_DATA_VECTOR_SHIFT 0
13#define MSI_DATA_VECTOR_MASK 0x000000ff
14#define MSI_DATA_VECTOR(v) (((v) << MSI_DATA_VECTOR_SHIFT) & \
15 MSI_DATA_VECTOR_MASK)
16
17#define MSI_DATA_DELIVERY_MODE_SHIFT 8
18#define MSI_DATA_DELIVERY_FIXED (0 << MSI_DATA_DELIVERY_MODE_SHIFT)
19#define MSI_DATA_DELIVERY_LOWPRI (1 << MSI_DATA_DELIVERY_MODE_SHIFT)
20
21#define MSI_DATA_LEVEL_SHIFT 14
22#define MSI_DATA_LEVEL_DEASSERT (0 << MSI_DATA_LEVEL_SHIFT)
23#define MSI_DATA_LEVEL_ASSERT (1 << MSI_DATA_LEVEL_SHIFT)
24
25#define MSI_DATA_TRIGGER_SHIFT 15
26#define MSI_DATA_TRIGGER_EDGE (0 << MSI_DATA_TRIGGER_SHIFT)
27#define MSI_DATA_TRIGGER_LEVEL (1 << MSI_DATA_TRIGGER_SHIFT)
28
29/*
30 * Shift/mask fields for msi address
31 */
32
33#define MSI_ADDR_BASE_HI 0
34#define MSI_ADDR_BASE_LO 0xfee00000
35
36#define MSI_ADDR_DEST_MODE_SHIFT 2
37#define MSI_ADDR_DEST_MODE_PHYSICAL (0 << MSI_ADDR_DEST_MODE_SHIFT)
38#define MSI_ADDR_DEST_MODE_LOGICAL (1 << MSI_ADDR_DEST_MODE_SHIFT)
39
40#define MSI_ADDR_REDIRECTION_SHIFT 3
41#define MSI_ADDR_REDIRECTION_CPU (0 << MSI_ADDR_REDIRECTION_SHIFT)
42 /* dedicated cpu */
43#define MSI_ADDR_REDIRECTION_LOWPRI (1 << MSI_ADDR_REDIRECTION_SHIFT)
44 /* lowest priority */
45
46#define MSI_ADDR_DEST_ID_SHIFT 12
47#define MSI_ADDR_DEST_ID_MASK 0x00ffff0
48#define MSI_ADDR_DEST_ID(dest) (((dest) << MSI_ADDR_DEST_ID_SHIFT) & \
49 MSI_ADDR_DEST_ID_MASK)
50
51#define MSI_ADDR_IR_EXT_INT (1 << 4)
52#define MSI_ADDR_IR_SHV (1 << 3)
53#define MSI_ADDR_IR_INDEX1(index) ((index & 0x8000) >> 13)
54#define MSI_ADDR_IR_INDEX2(index) ((index & 0x7fff) << 5)
55#endif /* _ASM_X86_MSIDEF_H */
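
A hedged sketch composing the MSI data word from the fields above for an edge-triggered, fixed-delivery interrupt (the vector is caller-supplied; the function name is hypothetical):

static unsigned int example_msi_data(unsigned int vector)
{
	return MSI_DATA_TRIGGER_EDGE |		/* edge triggered */
	       MSI_DATA_LEVEL_ASSERT |		/* level field asserted */
	       MSI_DATA_DELIVERY_FIXED |	/* fixed delivery mode */
	       MSI_DATA_VECTOR(vector);		/* low byte carries the vector */
}
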
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
new file mode 100644
index 000000000000..e38859d577a1
--- /dev/null
+++ b/arch/x86/include/asm/msr-index.h
@@ -0,0 +1,332 @@
1#ifndef _ASM_X86_MSR_INDEX_H
2#define _ASM_X86_MSR_INDEX_H
3
4/* CPU model specific register (MSR) numbers */
5
6/* x86-64 specific MSRs */
7#define MSR_EFER 0xc0000080 /* extended feature register */
8#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */
9#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */
10#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target */
11#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */
12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
15
16/* EFER bits: */
17#define _EFER_SCE 0 /* SYSCALL/SYSRET */
18#define _EFER_LME 8 /* Long mode enable */
19#define _EFER_LMA 10 /* Long mode active (read-only) */
20#define _EFER_NX 11 /* No execute enable */
21
22#define EFER_SCE (1<<_EFER_SCE)
23#define EFER_LME (1<<_EFER_LME)
24#define EFER_LMA (1<<_EFER_LMA)
25#define EFER_NX (1<<_EFER_NX)
26
27/* Intel MSRs. Some also available on other CPUs */
28#define MSR_IA32_PERFCTR0 0x000000c1
29#define MSR_IA32_PERFCTR1 0x000000c2
30#define MSR_FSB_FREQ 0x000000cd
31
32#define MSR_MTRRcap 0x000000fe
33#define MSR_IA32_BBL_CR_CTL 0x00000119
34
35#define MSR_IA32_SYSENTER_CS 0x00000174
36#define MSR_IA32_SYSENTER_ESP 0x00000175
37#define MSR_IA32_SYSENTER_EIP 0x00000176
38
39#define MSR_IA32_MCG_CAP 0x00000179
40#define MSR_IA32_MCG_STATUS 0x0000017a
41#define MSR_IA32_MCG_CTL 0x0000017b
42
43#define MSR_IA32_PEBS_ENABLE 0x000003f1
44#define MSR_IA32_DS_AREA 0x00000600
45#define MSR_IA32_PERF_CAPABILITIES 0x00000345
46
47#define MSR_MTRRfix64K_00000 0x00000250
48#define MSR_MTRRfix16K_80000 0x00000258
49#define MSR_MTRRfix16K_A0000 0x00000259
50#define MSR_MTRRfix4K_C0000 0x00000268
51#define MSR_MTRRfix4K_C8000 0x00000269
52#define MSR_MTRRfix4K_D0000 0x0000026a
53#define MSR_MTRRfix4K_D8000 0x0000026b
54#define MSR_MTRRfix4K_E0000 0x0000026c
55#define MSR_MTRRfix4K_E8000 0x0000026d
56#define MSR_MTRRfix4K_F0000 0x0000026e
57#define MSR_MTRRfix4K_F8000 0x0000026f
58#define MSR_MTRRdefType 0x000002ff
59
60#define MSR_IA32_CR_PAT 0x00000277
61
62#define MSR_IA32_DEBUGCTLMSR 0x000001d9
63#define MSR_IA32_LASTBRANCHFROMIP 0x000001db
64#define MSR_IA32_LASTBRANCHTOIP 0x000001dc
65#define MSR_IA32_LASTINTFROMIP 0x000001dd
66#define MSR_IA32_LASTINTTOIP 0x000001de
67
68/* DEBUGCTLMSR bits (others vary by model): */
69#define _DEBUGCTLMSR_LBR 0 /* last branch recording */
70#define _DEBUGCTLMSR_BTF 1 /* single-step on branches */
71
72#define DEBUGCTLMSR_LBR (1UL << _DEBUGCTLMSR_LBR)
73#define DEBUGCTLMSR_BTF (1UL << _DEBUGCTLMSR_BTF)
74
75#define MSR_IA32_MC0_CTL 0x00000400
76#define MSR_IA32_MC0_STATUS 0x00000401
77#define MSR_IA32_MC0_ADDR 0x00000402
78#define MSR_IA32_MC0_MISC 0x00000403
79
80#define MSR_P6_PERFCTR0 0x000000c1
81#define MSR_P6_PERFCTR1 0x000000c2
82#define MSR_P6_EVNTSEL0 0x00000186
83#define MSR_P6_EVNTSEL1 0x00000187
84
85/* AMD64 MSRs. Not complete. See the architecture manual for a more
86 complete list. */
87
88#define MSR_AMD64_NB_CFG 0xc001001f
89#define MSR_AMD64_IBSFETCHCTL 0xc0011030
90#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
91#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
92#define MSR_AMD64_IBSOPCTL 0xc0011033
93#define MSR_AMD64_IBSOPRIP 0xc0011034
94#define MSR_AMD64_IBSOPDATA 0xc0011035
95#define MSR_AMD64_IBSOPDATA2 0xc0011036
96#define MSR_AMD64_IBSOPDATA3 0xc0011037
97#define MSR_AMD64_IBSDCLINAD 0xc0011038
98#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
99#define MSR_AMD64_IBSCTL 0xc001103a
100
101/* Fam 10h MSRs */
102#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
103#define FAM10H_MMIO_CONF_ENABLE (1<<0)
104#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
105#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
106#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
107#define FAM10H_MMIO_CONF_BASE_SHIFT 20
108
109/* K8 MSRs */
110#define MSR_K8_TOP_MEM1 0xc001001a
111#define MSR_K8_TOP_MEM2 0xc001001d
112#define MSR_K8_SYSCFG 0xc0010010
113#define MSR_K8_HWCR 0xc0010015
114#define MSR_K8_INT_PENDING_MSG 0xc0010055
115/* C1E active bits in int pending message */
116#define K8_INTP_C1E_ACTIVE_MASK 0x18000000
117#define MSR_K8_TSEG_ADDR 0xc0010112
118#define K8_MTRRFIXRANGE_DRAM_ENABLE 0x00040000 /* MtrrFixDramEn bit */
119#define K8_MTRRFIXRANGE_DRAM_MODIFY 0x00080000 /* MtrrFixDramModEn bit */
120#define K8_MTRR_RDMEM_WRMEM_MASK 0x18181818 /* Mask: RdMem|WrMem */
121
122/* K7 MSRs */
123#define MSR_K7_EVNTSEL0 0xc0010000
124#define MSR_K7_PERFCTR0 0xc0010004
125#define MSR_K7_EVNTSEL1 0xc0010001
126#define MSR_K7_PERFCTR1 0xc0010005
127#define MSR_K7_EVNTSEL2 0xc0010002
128#define MSR_K7_PERFCTR2 0xc0010006
129#define MSR_K7_EVNTSEL3 0xc0010003
130#define MSR_K7_PERFCTR3 0xc0010007
131#define MSR_K7_CLK_CTL 0xc001001b
132#define MSR_K7_HWCR 0xc0010015
133#define MSR_K7_FID_VID_CTL 0xc0010041
134#define MSR_K7_FID_VID_STATUS 0xc0010042
135
136/* K6 MSRs */
137#define MSR_K6_EFER 0xc0000080
138#define MSR_K6_STAR 0xc0000081
139#define MSR_K6_WHCR 0xc0000082
140#define MSR_K6_UWCCR 0xc0000085
141#define MSR_K6_EPMR 0xc0000086
142#define MSR_K6_PSOR 0xc0000087
143#define MSR_K6_PFIR 0xc0000088
144
145/* Centaur-Hauls/IDT defined MSRs. */
146#define MSR_IDT_FCR1 0x00000107
147#define MSR_IDT_FCR2 0x00000108
148#define MSR_IDT_FCR3 0x00000109
149#define MSR_IDT_FCR4 0x0000010a
150
151#define MSR_IDT_MCR0 0x00000110
152#define MSR_IDT_MCR1 0x00000111
153#define MSR_IDT_MCR2 0x00000112
154#define MSR_IDT_MCR3 0x00000113
155#define MSR_IDT_MCR4 0x00000114
156#define MSR_IDT_MCR5 0x00000115
157#define MSR_IDT_MCR6 0x00000116
158#define MSR_IDT_MCR7 0x00000117
159#define MSR_IDT_MCR_CTRL 0x00000120
160
161/* VIA Cyrix defined MSRs*/
162#define MSR_VIA_FCR 0x00001107
163#define MSR_VIA_LONGHAUL 0x0000110a
164#define MSR_VIA_RNG 0x0000110b
165#define MSR_VIA_BCR2 0x00001147
166
167/* Transmeta defined MSRs */
168#define MSR_TMTA_LONGRUN_CTRL 0x80868010
169#define MSR_TMTA_LONGRUN_FLAGS 0x80868011
170#define MSR_TMTA_LRTI_READOUT 0x80868018
171#define MSR_TMTA_LRTI_VOLT_MHZ 0x8086801a
172
173/* Intel defined MSRs. */
174#define MSR_IA32_P5_MC_ADDR 0x00000000
175#define MSR_IA32_P5_MC_TYPE 0x00000001
176#define MSR_IA32_TSC 0x00000010
177#define MSR_IA32_PLATFORM_ID 0x00000017
178#define MSR_IA32_EBL_CR_POWERON 0x0000002a
179#define MSR_IA32_FEATURE_CONTROL 0x0000003a
180
181#define FEATURE_CONTROL_LOCKED (1<<0)
182#define FEATURE_CONTROL_VMXON_ENABLED (1<<2)
183
184#define MSR_IA32_APICBASE 0x0000001b
185#define MSR_IA32_APICBASE_BSP (1<<8)
186#define MSR_IA32_APICBASE_ENABLE (1<<11)
187#define MSR_IA32_APICBASE_BASE (0xfffff<<12)
188
189#define MSR_IA32_UCODE_WRITE 0x00000079
190#define MSR_IA32_UCODE_REV 0x0000008b
191
192#define MSR_IA32_PERF_STATUS 0x00000198
193#define MSR_IA32_PERF_CTL 0x00000199
194
195#define MSR_IA32_MPERF 0x000000e7
196#define MSR_IA32_APERF 0x000000e8
197
198#define MSR_IA32_THERM_CONTROL 0x0000019a
199#define MSR_IA32_THERM_INTERRUPT 0x0000019b
200#define MSR_IA32_THERM_STATUS 0x0000019c
201#define MSR_IA32_MISC_ENABLE 0x000001a0
202
203/* Intel Model 6 */
204#define MSR_P6_EVNTSEL0 0x00000186
205#define MSR_P6_EVNTSEL1 0x00000187
206
207/* P4/Xeon+ specific */
208#define MSR_IA32_MCG_EAX 0x00000180
209#define MSR_IA32_MCG_EBX 0x00000181
210#define MSR_IA32_MCG_ECX 0x00000182
211#define MSR_IA32_MCG_EDX 0x00000183
212#define MSR_IA32_MCG_ESI 0x00000184
213#define MSR_IA32_MCG_EDI 0x00000185
214#define MSR_IA32_MCG_EBP 0x00000186
215#define MSR_IA32_MCG_ESP 0x00000187
216#define MSR_IA32_MCG_EFLAGS 0x00000188
217#define MSR_IA32_MCG_EIP 0x00000189
218#define MSR_IA32_MCG_RESERVED 0x0000018a
219
220/* Pentium IV performance counter MSRs */
221#define MSR_P4_BPU_PERFCTR0 0x00000300
222#define MSR_P4_BPU_PERFCTR1 0x00000301
223#define MSR_P4_BPU_PERFCTR2 0x00000302
224#define MSR_P4_BPU_PERFCTR3 0x00000303
225#define MSR_P4_MS_PERFCTR0 0x00000304
226#define MSR_P4_MS_PERFCTR1 0x00000305
227#define MSR_P4_MS_PERFCTR2 0x00000306
228#define MSR_P4_MS_PERFCTR3 0x00000307
229#define MSR_P4_FLAME_PERFCTR0 0x00000308
230#define MSR_P4_FLAME_PERFCTR1 0x00000309
231#define MSR_P4_FLAME_PERFCTR2 0x0000030a
232#define MSR_P4_FLAME_PERFCTR3 0x0000030b
233#define MSR_P4_IQ_PERFCTR0 0x0000030c
234#define MSR_P4_IQ_PERFCTR1 0x0000030d
235#define MSR_P4_IQ_PERFCTR2 0x0000030e
236#define MSR_P4_IQ_PERFCTR3 0x0000030f
237#define MSR_P4_IQ_PERFCTR4 0x00000310
238#define MSR_P4_IQ_PERFCTR5 0x00000311
239#define MSR_P4_BPU_CCCR0 0x00000360
240#define MSR_P4_BPU_CCCR1 0x00000361
241#define MSR_P4_BPU_CCCR2 0x00000362
242#define MSR_P4_BPU_CCCR3 0x00000363
243#define MSR_P4_MS_CCCR0 0x00000364
244#define MSR_P4_MS_CCCR1 0x00000365
245#define MSR_P4_MS_CCCR2 0x00000366
246#define MSR_P4_MS_CCCR3 0x00000367
247#define MSR_P4_FLAME_CCCR0 0x00000368
248#define MSR_P4_FLAME_CCCR1 0x00000369
249#define MSR_P4_FLAME_CCCR2 0x0000036a
250#define MSR_P4_FLAME_CCCR3 0x0000036b
251#define MSR_P4_IQ_CCCR0 0x0000036c
252#define MSR_P4_IQ_CCCR1 0x0000036d
253#define MSR_P4_IQ_CCCR2 0x0000036e
254#define MSR_P4_IQ_CCCR3 0x0000036f
255#define MSR_P4_IQ_CCCR4 0x00000370
256#define MSR_P4_IQ_CCCR5 0x00000371
257#define MSR_P4_ALF_ESCR0 0x000003ca
258#define MSR_P4_ALF_ESCR1 0x000003cb
259#define MSR_P4_BPU_ESCR0 0x000003b2
260#define MSR_P4_BPU_ESCR1 0x000003b3
261#define MSR_P4_BSU_ESCR0 0x000003a0
262#define MSR_P4_BSU_ESCR1 0x000003a1
263#define MSR_P4_CRU_ESCR0 0x000003b8
264#define MSR_P4_CRU_ESCR1 0x000003b9
265#define MSR_P4_CRU_ESCR2 0x000003cc
266#define MSR_P4_CRU_ESCR3 0x000003cd
267#define MSR_P4_CRU_ESCR4 0x000003e0
268#define MSR_P4_CRU_ESCR5 0x000003e1
269#define MSR_P4_DAC_ESCR0 0x000003a8
270#define MSR_P4_DAC_ESCR1 0x000003a9
271#define MSR_P4_FIRM_ESCR0 0x000003a4
272#define MSR_P4_FIRM_ESCR1 0x000003a5
273#define MSR_P4_FLAME_ESCR0 0x000003a6
274#define MSR_P4_FLAME_ESCR1 0x000003a7
275#define MSR_P4_FSB_ESCR0 0x000003a2
276#define MSR_P4_FSB_ESCR1 0x000003a3
277#define MSR_P4_IQ_ESCR0 0x000003ba
278#define MSR_P4_IQ_ESCR1 0x000003bb
279#define MSR_P4_IS_ESCR0 0x000003b4
280#define MSR_P4_IS_ESCR1 0x000003b5
281#define MSR_P4_ITLB_ESCR0 0x000003b6
282#define MSR_P4_ITLB_ESCR1 0x000003b7
283#define MSR_P4_IX_ESCR0 0x000003c8
284#define MSR_P4_IX_ESCR1 0x000003c9
285#define MSR_P4_MOB_ESCR0 0x000003aa
286#define MSR_P4_MOB_ESCR1 0x000003ab
287#define MSR_P4_MS_ESCR0 0x000003c0
288#define MSR_P4_MS_ESCR1 0x000003c1
289#define MSR_P4_PMH_ESCR0 0x000003ac
290#define MSR_P4_PMH_ESCR1 0x000003ad
291#define MSR_P4_RAT_ESCR0 0x000003bc
292#define MSR_P4_RAT_ESCR1 0x000003bd
293#define MSR_P4_SAAT_ESCR0 0x000003ae
294#define MSR_P4_SAAT_ESCR1 0x000003af
295#define MSR_P4_SSU_ESCR0 0x000003be
296#define MSR_P4_SSU_ESCR1 0x000003bf /* guess: not in manual */
297
298#define MSR_P4_TBPU_ESCR0 0x000003c2
299#define MSR_P4_TBPU_ESCR1 0x000003c3
300#define MSR_P4_TC_ESCR0 0x000003c4
301#define MSR_P4_TC_ESCR1 0x000003c5
302#define MSR_P4_U2L_ESCR0 0x000003b0
303#define MSR_P4_U2L_ESCR1 0x000003b1
304
305/* Intel Core-based CPU performance counters */
306#define MSR_CORE_PERF_FIXED_CTR0 0x00000309
307#define MSR_CORE_PERF_FIXED_CTR1 0x0000030a
308#define MSR_CORE_PERF_FIXED_CTR2 0x0000030b
309#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x0000038d
310#define MSR_CORE_PERF_GLOBAL_STATUS 0x0000038e
311#define MSR_CORE_PERF_GLOBAL_CTRL 0x0000038f
312#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x00000390
313
314/* Geode defined MSRs */
315#define MSR_GEODE_BUSCONT_CONF0 0x00001900
316
317/* Intel VT MSRs */
318#define MSR_IA32_VMX_BASIC 0x00000480
319#define MSR_IA32_VMX_PINBASED_CTLS 0x00000481
320#define MSR_IA32_VMX_PROCBASED_CTLS 0x00000482
321#define MSR_IA32_VMX_EXIT_CTLS 0x00000483
322#define MSR_IA32_VMX_ENTRY_CTLS 0x00000484
323#define MSR_IA32_VMX_MISC 0x00000485
324#define MSR_IA32_VMX_CR0_FIXED0 0x00000486
325#define MSR_IA32_VMX_CR0_FIXED1 0x00000487
326#define MSR_IA32_VMX_CR4_FIXED0 0x00000488
327#define MSR_IA32_VMX_CR4_FIXED1 0x00000489
328#define MSR_IA32_VMX_VMCS_ENUM 0x0000048a
329#define MSR_IA32_VMX_PROCBASED_CTLS2 0x0000048b
330#define MSR_IA32_VMX_EPT_VPID_CAP 0x0000048c
331
332#endif /* _ASM_X86_MSR_INDEX_H */
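
A hedged sketch using the EFER constants above (assumes the rdmsrl() helper from <asm/msr.h>, which follows below; the function name is hypothetical):

static int example_nx_enabled(void)
{
	unsigned long long efer;

	rdmsrl(MSR_EFER, efer);		/* read the extended feature register */
	return !!(efer & EFER_NX);	/* no-execute enable bit */
}
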
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
new file mode 100644
index 000000000000..46be2fa7ac26
--- /dev/null
+++ b/arch/x86/include/asm/msr.h
@@ -0,0 +1,247 @@
1#ifndef _ASM_X86_MSR_H
2#define _ASM_X86_MSR_H
3
4#include <asm/msr-index.h>
5
6#ifndef __ASSEMBLY__
7# include <linux/types.h>
8#endif
9
10#ifdef __KERNEL__
11#ifndef __ASSEMBLY__
12
13#include <asm/asm.h>
14#include <asm/errno.h>
15
16static inline unsigned long long native_read_tscp(unsigned int *aux)
17{
18 unsigned long low, high;
19 asm volatile(".byte 0x0f,0x01,0xf9"
20 : "=a" (low), "=d" (high), "=c" (*aux));
21 return low | ((u64)high << 32);
22}
23
24/*
25 * The i386 calling convention returns a 64-bit value in edx:eax, while
26 * x86_64 returns it in rax. Also, the "A" constraint does not really
27 * mean rdx:rax on x86_64, so we need specialized behaviour for each
28 * architecture.
29 */
30#ifdef CONFIG_X86_64
31#define DECLARE_ARGS(val, low, high) unsigned low, high
32#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32))
33#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high)
34#define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high)
35#else
36#define DECLARE_ARGS(val, low, high) unsigned long long val
37#define EAX_EDX_VAL(val, low, high) (val)
38#define EAX_EDX_ARGS(val, low, high) "A" (val)
39#define EAX_EDX_RET(val, low, high) "=A" (val)
40#endif
41
42static inline unsigned long long native_read_msr(unsigned int msr)
43{
44 DECLARE_ARGS(val, low, high);
45
46 asm volatile("rdmsr" : EAX_EDX_RET(val, low, high) : "c" (msr));
47 return EAX_EDX_VAL(val, low, high);
48}
49
50static inline unsigned long long native_read_msr_safe(unsigned int msr,
51 int *err)
52{
53 DECLARE_ARGS(val, low, high);
54
55 asm volatile("2: rdmsr ; xor %[err],%[err]\n"
56 "1:\n\t"
57 ".section .fixup,\"ax\"\n\t"
58 "3: mov %[fault],%[err] ; jmp 1b\n\t"
59 ".previous\n\t"
60 _ASM_EXTABLE(2b, 3b)
61 : [err] "=r" (*err), EAX_EDX_RET(val, low, high)
62 : "c" (msr), [fault] "i" (-EFAULT));
63 return EAX_EDX_VAL(val, low, high);
64}
65
66static inline unsigned long long native_read_msr_amd_safe(unsigned int msr,
67 int *err)
68{
69 DECLARE_ARGS(val, low, high);
70
71 asm volatile("2: rdmsr ; xor %0,%0\n"
72 "1:\n\t"
73 ".section .fixup,\"ax\"\n\t"
74 "3: mov %3,%0 ; jmp 1b\n\t"
75 ".previous\n\t"
76 _ASM_EXTABLE(2b, 3b)
77 : "=r" (*err), EAX_EDX_RET(val, low, high)
78 : "c" (msr), "D" (0x9c5a203a), "i" (-EFAULT));
79 return EAX_EDX_VAL(val, low, high);
80}
81
82static inline void native_write_msr(unsigned int msr,
83 unsigned low, unsigned high)
84{
85 asm volatile("wrmsr" : : "c" (msr), "a"(low), "d" (high) : "memory");
86}
87
88static inline int native_write_msr_safe(unsigned int msr,
89 unsigned low, unsigned high)
90{
91 int err;
92 asm volatile("2: wrmsr ; xor %[err],%[err]\n"
93 "1:\n\t"
94 ".section .fixup,\"ax\"\n\t"
95 "3: mov %[fault],%[err] ; jmp 1b\n\t"
96 ".previous\n\t"
97 _ASM_EXTABLE(2b, 3b)
98 : [err] "=a" (err)
99 : "c" (msr), "0" (low), "d" (high),
100 [fault] "i" (-EFAULT)
101 : "memory");
102 return err;
103}
104
105extern unsigned long long native_read_tsc(void);
106
107static __always_inline unsigned long long __native_read_tsc(void)
108{
109 DECLARE_ARGS(val, low, high);
110
111 rdtsc_barrier();
112 asm volatile("rdtsc" : EAX_EDX_RET(val, low, high));
113 rdtsc_barrier();
114
115 return EAX_EDX_VAL(val, low, high);
116}
117
118static inline unsigned long long native_read_pmc(int counter)
119{
120 DECLARE_ARGS(val, low, high);
121
122 asm volatile("rdpmc" : EAX_EDX_RET(val, low, high) : "c" (counter));
123 return EAX_EDX_VAL(val, low, high);
124}
125
126#ifdef CONFIG_PARAVIRT
127#include <asm/paravirt.h>
128#else
129#include <linux/errno.h>
130/*
131 * Access to machine-specific registers (available on 586 and better only)
132 * Note: the rd* operations modify the parameters directly (without using
133 * pointer indirection), which allows gcc to optimize better.
134 */
135
136#define rdmsr(msr, val1, val2) \
137do { \
138 u64 __val = native_read_msr((msr)); \
139 (val1) = (u32)__val; \
140 (val2) = (u32)(__val >> 32); \
141} while (0)
142
143static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
144{
145 native_write_msr(msr, low, high);
146}
147
148#define rdmsrl(msr, val) \
149 ((val) = native_read_msr((msr)))
150
151#define wrmsrl(msr, val) \
152 native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
153
154/* wrmsr with exception handling */
155static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
156{
157 return native_write_msr_safe(msr, low, high);
158}
159
160/* rdmsr with exception handling */
161#define rdmsr_safe(msr, p1, p2) \
162({ \
163 int __err; \
164 u64 __val = native_read_msr_safe((msr), &__err); \
165 (*p1) = (u32)__val; \
166 (*p2) = (u32)(__val >> 32); \
167 __err; \
168})
169
170static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
171{
172 int err;
173
174 *p = native_read_msr_safe(msr, &err);
175 return err;
176}
177static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
178{
179 int err;
180
181 *p = native_read_msr_amd_safe(msr, &err);
182 return err;
183}
184
185#define rdtscl(low) \
186 ((low) = (u32)native_read_tsc())
187
188#define rdtscll(val) \
189 ((val) = native_read_tsc())
190
191#define rdpmc(counter, low, high) \
192do { \
193 u64 _l = native_read_pmc((counter)); \
194 (low) = (u32)_l; \
195 (high) = (u32)(_l >> 32); \
196} while (0)
197
198#define rdtscp(low, high, aux) \
199do { \
200 unsigned long long _val = native_read_tscp(&(aux)); \
201 (low) = (u32)_val; \
202 (high) = (u32)(_val >> 32); \
203} while (0)
204
205#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
206
207#endif /* !CONFIG_PARAVIRT */
208
209
210#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
211 (u32)((val) >> 32))
212
213#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2))
214
215#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0)
216
217#ifdef CONFIG_SMP
218int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
219int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
220int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
221int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
222#else /* CONFIG_SMP */
223static inline int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
224{
225 rdmsr(msr_no, *l, *h);
226 return 0;
227}
228static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
229{
230 wrmsr(msr_no, l, h);
231 return 0;
232}
233static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no,
234 u32 *l, u32 *h)
235{
236 return rdmsr_safe(msr_no, l, h);
237}
238static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
239{
240 return wrmsr_safe(msr_no, l, h);
241}
242#endif /* CONFIG_SMP */
243#endif /* __ASSEMBLY__ */
244#endif /* __KERNEL__ */
245
246
247#endif /* _ASM_X86_MSR_H */
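A minimal sketch of how a driver might use the safe accessors above from kernel context (not part of this patch); the MSR index, the feature bit and the function name are purely illustrative:

#include <linux/types.h>
#include <asm/msr.h>

#define MSR_EXAMPLE_CTL         0x00000123      /* hypothetical MSR index */
#define EXAMPLE_CTL_ENABLE      (1ULL << 3)     /* hypothetical feature bit */

static int example_enable_feature(void)
{
        u64 val;
        int err;

        err = rdmsrl_safe(MSR_EXAMPLE_CTL, &val);       /* -EFAULT if the rdmsr faults */
        if (err)
                return err;

        val |= EXAMPLE_CTL_ENABLE;
        return checking_wrmsrl(MSR_EXAMPLE_CTL, val);   /* wrmsr_safe() underneath */
}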
diff --git a/arch/x86/include/asm/mtrr.h b/arch/x86/include/asm/mtrr.h
new file mode 100644
index 000000000000..7c1e4258b31e
--- /dev/null
+++ b/arch/x86/include/asm/mtrr.h
@@ -0,0 +1,173 @@
1/* Generic MTRR (Memory Type Range Register) ioctls.
2
3 Copyright (C) 1997-1999 Richard Gooch
4
5 This library is free software; you can redistribute it and/or
6 modify it under the terms of the GNU Library General Public
7 License as published by the Free Software Foundation; either
8 version 2 of the License, or (at your option) any later version.
9
10 This library is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 Library General Public License for more details.
14
15 You should have received a copy of the GNU Library General Public
16 License along with this library; if not, write to the Free
17 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18
19 Richard Gooch may be reached by email at rgooch@atnf.csiro.au
20 The postal address is:
21 Richard Gooch, c/o ATNF, P. O. Box 76, Epping, N.S.W., 2121, Australia.
22*/
23#ifndef _ASM_X86_MTRR_H
24#define _ASM_X86_MTRR_H
25
26#include <linux/ioctl.h>
27#include <linux/errno.h>
28
29#define MTRR_IOCTL_BASE 'M'
30
31struct mtrr_sentry {
32 unsigned long base; /* Base address */
33 unsigned int size; /* Size of region */
34 unsigned int type; /* Type of region */
35};
36
37/* Warning: this structure has a different order from i386
38 on x86-64. The 32bit emulation code takes care of that.
39 But you need to use this for 64bit, otherwise your X server
40 will break. */
41
42#ifdef __i386__
43struct mtrr_gentry {
44 unsigned int regnum; /* Register number */
45 unsigned long base; /* Base address */
46 unsigned int size; /* Size of region */
47 unsigned int type; /* Type of region */
48};
49
50#else /* __i386__ */
51
52struct mtrr_gentry {
53 unsigned long base; /* Base address */
54 unsigned int size; /* Size of region */
55 unsigned int regnum; /* Register number */
56 unsigned int type; /* Type of region */
57};
58#endif /* !__i386__ */
59
60/* These are the various ioctls */
61#define MTRRIOC_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry)
62#define MTRRIOC_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry)
63#define MTRRIOC_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry)
64#define MTRRIOC_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry)
65#define MTRRIOC_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry)
66#define MTRRIOC_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry)
67#define MTRRIOC_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry)
68#define MTRRIOC_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry)
69#define MTRRIOC_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry)
70#define MTRRIOC_KILL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry)
71
72/* These are the region types */
73#define MTRR_TYPE_UNCACHABLE 0
74#define MTRR_TYPE_WRCOMB 1
75/*#define MTRR_TYPE_ 2*/
76/*#define MTRR_TYPE_ 3*/
77#define MTRR_TYPE_WRTHROUGH 4
78#define MTRR_TYPE_WRPROT 5
79#define MTRR_TYPE_WRBACK 6
80#define MTRR_NUM_TYPES 7
81
82#ifdef __KERNEL__
83
84/* The following functions are for use by other drivers */
85# ifdef CONFIG_MTRR
86extern u8 mtrr_type_lookup(u64 addr, u64 end);
87extern void mtrr_save_fixed_ranges(void *);
88extern void mtrr_save_state(void);
89extern int mtrr_add(unsigned long base, unsigned long size,
90 unsigned int type, bool increment);
91extern int mtrr_add_page(unsigned long base, unsigned long size,
92 unsigned int type, bool increment);
93extern int mtrr_del(int reg, unsigned long base, unsigned long size);
94extern int mtrr_del_page(int reg, unsigned long base, unsigned long size);
95extern void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi);
96extern void mtrr_ap_init(void);
97extern void mtrr_bp_init(void);
98extern int mtrr_trim_uncached_memory(unsigned long end_pfn);
99extern int amd_special_default_mtrr(void);
100# else
101static inline u8 mtrr_type_lookup(u64 addr, u64 end)
102{
103 /*
104 * Return no-MTRRs:
105 */
106 return 0xff;
107}
108#define mtrr_save_fixed_ranges(arg) do {} while (0)
109#define mtrr_save_state() do {} while (0)
110static inline int mtrr_add(unsigned long base, unsigned long size,
111 unsigned int type, bool increment)
112{
113 return -ENODEV;
114}
115static inline int mtrr_add_page(unsigned long base, unsigned long size,
116 unsigned int type, bool increment)
117{
118 return -ENODEV;
119}
120static inline int mtrr_del(int reg, unsigned long base, unsigned long size)
121{
122 return -ENODEV;
123}
124static inline int mtrr_del_page(int reg, unsigned long base, unsigned long size)
125{
126 return -ENODEV;
127}
128static inline int mtrr_trim_uncached_memory(unsigned long end_pfn)
129{
130 return 0;
131}
132static inline void mtrr_centaur_report_mcr(int mcr, u32 lo, u32 hi)
133{
134}
135
136#define mtrr_ap_init() do {} while (0)
137#define mtrr_bp_init() do {} while (0)
138# endif
139
140#ifdef CONFIG_COMPAT
141#include <linux/compat.h>
142
143struct mtrr_sentry32 {
144 compat_ulong_t base; /* Base address */
145 compat_uint_t size; /* Size of region */
146 compat_uint_t type; /* Type of region */
147};
148
149struct mtrr_gentry32 {
150 compat_ulong_t regnum; /* Register number */
151 compat_uint_t base; /* Base address */
152 compat_uint_t size; /* Size of region */
153 compat_uint_t type; /* Type of region */
154};
155
156#define MTRR_IOCTL_BASE 'M'
157
158#define MTRRIOC32_ADD_ENTRY _IOW(MTRR_IOCTL_BASE, 0, struct mtrr_sentry32)
159#define MTRRIOC32_SET_ENTRY _IOW(MTRR_IOCTL_BASE, 1, struct mtrr_sentry32)
160#define MTRRIOC32_DEL_ENTRY _IOW(MTRR_IOCTL_BASE, 2, struct mtrr_sentry32)
161#define MTRRIOC32_GET_ENTRY _IOWR(MTRR_IOCTL_BASE, 3, struct mtrr_gentry32)
162#define MTRRIOC32_KILL_ENTRY _IOW(MTRR_IOCTL_BASE, 4, struct mtrr_sentry32)
163#define MTRRIOC32_ADD_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 5, struct mtrr_sentry32)
164#define MTRRIOC32_SET_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 6, struct mtrr_sentry32)
165#define MTRRIOC32_DEL_PAGE_ENTRY _IOW(MTRR_IOCTL_BASE, 7, struct mtrr_sentry32)
166#define MTRRIOC32_GET_PAGE_ENTRY _IOWR(MTRR_IOCTL_BASE, 8, struct mtrr_gentry32)
167#define MTRRIOC32_KILL_PAGE_ENTRY \
168 _IOW(MTRR_IOCTL_BASE, 9, struct mtrr_sentry32)
169#endif /* CONFIG_COMPAT */
170
171#endif /* __KERNEL__ */
172
173#endif /* _ASM_X86_MTRR_H */
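The ioctl numbers above are the user-space half of the MTRR interface and are driven through /proc/mtrr. A minimal user-space sketch (not part of this patch) that requests a write-combining region; the base and size values are illustrative only, real callers use the physical range of their device:

#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <asm/mtrr.h>

int main(void)
{
        struct mtrr_sentry sentry;
        int fd = open("/proc/mtrr", O_WRONLY);

        if (fd < 0) {
                perror("open /proc/mtrr");
                return 1;
        }

        memset(&sentry, 0, sizeof(sentry));
        sentry.base = 0xd0000000;        /* illustrative frame-buffer base */
        sentry.size = 0x01000000;        /* 16 MB */
        sentry.type = MTRR_TYPE_WRCOMB;  /* write-combining */

        if (ioctl(fd, MTRRIOC_ADD_ENTRY, &sentry) < 0)
                perror("MTRRIOC_ADD_ENTRY");

        close(fd);
        return 0;
}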
diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h
new file mode 100644
index 000000000000..a731b9c573a6
--- /dev/null
+++ b/arch/x86/include/asm/mutex.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "mutex_32.h"
3#else
4# include "mutex_64.h"
5#endif
diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h
new file mode 100644
index 000000000000..03f90c8a5a7c
--- /dev/null
+++ b/arch/x86/include/asm/mutex_32.h
@@ -0,0 +1,125 @@
1/*
2 * Assembly implementation of the mutex fastpath, based on atomic
3 * decrement/increment.
4 *
5 * started by Ingo Molnar:
6 *
7 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
8 */
9#ifndef _ASM_X86_MUTEX_32_H
10#define _ASM_X86_MUTEX_32_H
11
12#include <asm/alternative.h>
13
14/**
15 * __mutex_fastpath_lock - try to take the lock by moving the count
16 * from 1 to a 0 value
17 * @count: pointer of type atomic_t
18 * @fail_fn: function to call if the original value was not 1
19 *
20 * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
21 * wasn't 1 originally. This function MUST leave the value lower than 1
22 * even when the "1" assertion wasn't true.
23 */
24#define __mutex_fastpath_lock(count, fail_fn) \
25do { \
26 unsigned int dummy; \
27 \
28 typecheck(atomic_t *, count); \
29 typecheck_fn(void (*)(atomic_t *), fail_fn); \
30 \
31 asm volatile(LOCK_PREFIX " decl (%%eax)\n" \
32 " jns 1f \n" \
33 " call " #fail_fn "\n" \
34 "1:\n" \
35 : "=a" (dummy) \
36 : "a" (count) \
37 : "memory", "ecx", "edx"); \
38} while (0)
39
40
41/**
42 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
43 * from 1 to a 0 value
44 * @count: pointer of type atomic_t
45 * @fail_fn: function to call if the original value was not 1
46 *
47 * Change the count from 1 to a value lower than 1, and call <fail_fn> if it
48 * wasn't 1 originally. This function returns 0 if the fastpath succeeds,
49 * or anything the slow path function returns
50 */
51static inline int __mutex_fastpath_lock_retval(atomic_t *count,
52 int (*fail_fn)(atomic_t *))
53{
54 if (unlikely(atomic_dec_return(count) < 0))
55 return fail_fn(count);
56 else
57 return 0;
58}
59
60/**
61 * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1
62 * @count: pointer of type atomic_t
63 * @fail_fn: function to call if the original value was not 0
64 *
65 * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>.
66 * In the failure case, this function is allowed to either set the value
67 * to 1, or to set it to a value lower than 1.
68 *
69 * If the implementation sets it to a value lower than 1, the
70 * __mutex_slowpath_needs_to_unlock() macro needs to return 1; otherwise
71 * it needs to return 0.
72 */
73#define __mutex_fastpath_unlock(count, fail_fn) \
74do { \
75 unsigned int dummy; \
76 \
77 typecheck(atomic_t *, count); \
78 typecheck_fn(void (*)(atomic_t *), fail_fn); \
79 \
80 asm volatile(LOCK_PREFIX " incl (%%eax)\n" \
81 " jg 1f\n" \
82 " call " #fail_fn "\n" \
83 "1:\n" \
84 : "=a" (dummy) \
85 : "a" (count) \
86 : "memory", "ecx", "edx"); \
87} while (0)
88
89#define __mutex_slowpath_needs_to_unlock() 1
90
91/**
92 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
93 *
94 * @count: pointer of type atomic_t
95 * @fail_fn: fallback function
96 *
97 * Change the count from 1 to a value lower than 1, and return 0 (failure)
98 * if it wasn't 1 originally, or return 1 (success) otherwise. This function
99 * MUST leave the value lower than 1 even when the "1" assertion wasn't true.
100 * Additionally, if the value was < 0 originally, this function must not set
101 * it to 0 on failure.
102 */
103static inline int __mutex_fastpath_trylock(atomic_t *count,
104 int (*fail_fn)(atomic_t *))
105{
106 /*
107 * We have two variants here. The cmpxchg based one is the best one
108 * because it never induces a false contention state. It is included
109 * here because architectures using the inc/dec algorithms over the
110 * xchg ones are much more likely to support cmpxchg natively.
111 *
112 * If not we fall back to the spinlock based variant - that is
113 * just as efficient (and simpler) as a 'destructive' probing of
114 * the mutex state would be.
115 */
116#ifdef __HAVE_ARCH_CMPXCHG
117 if (likely(atomic_cmpxchg(count, 1, 0) == 1))
118 return 1;
119 return 0;
120#else
121 return fail_fn(count);
122#endif
123}
124
125#endif /* _ASM_X86_MUTEX_32_H */
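The fastpath above keeps the mutex count at 1 (unlocked), 0 (locked, uncontended) or a negative value (locked, possibly with waiters). A hedged user-space analogue of __mutex_fastpath_lock_retval() and __mutex_fastpath_unlock(), using plain GCC atomics instead of the kernel's atomic_t, just to show the counting contract:

#include <stdio.h>

static int count = 1;                   /* 1 == unlocked */

static int  slowpath_lock(int *c)   { printf("contended: would sleep\n"); return 0; }
static void slowpath_unlock(int *c) { printf("would wake a waiter\n"); }

static int fastpath_lock(void)
{
        if (__atomic_sub_fetch(&count, 1, __ATOMIC_ACQUIRE) < 0)
                return slowpath_lock(&count);   /* count was not 1: take the slow path */
        return 0;
}

static void fastpath_unlock(void)
{
        if (__atomic_add_fetch(&count, 1, __ATOMIC_RELEASE) <= 0)
                slowpath_unlock(&count);        /* a waiter had pushed the count below 0 */
}

int main(void)
{
        fastpath_lock();        /* 1 -> 0: no slow path needed */
        fastpath_unlock();      /* 0 -> 1: nobody to wake */
        return 0;
}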
diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h
new file mode 100644
index 000000000000..68a87b0f8e29
--- /dev/null
+++ b/arch/x86/include/asm/mutex_64.h
@@ -0,0 +1,100 @@
1/*
2 * Assembly implementation of the mutex fastpath, based on atomic
3 * decrement/increment.
4 *
5 * started by Ingo Molnar:
6 *
7 * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
8 */
9#ifndef _ASM_X86_MUTEX_64_H
10#define _ASM_X86_MUTEX_64_H
11
12/**
13 * __mutex_fastpath_lock - decrement and call function if negative
14 * @v: pointer of type atomic_t
15 * @fail_fn: function to call if the result is negative
16 *
17 * Atomically decrements @v and calls <fail_fn> if the result is negative.
18 */
19#define __mutex_fastpath_lock(v, fail_fn) \
20do { \
21 unsigned long dummy; \
22 \
23 typecheck(atomic_t *, v); \
24 typecheck_fn(void (*)(atomic_t *), fail_fn); \
25 \
26 asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \
27 " jns 1f \n" \
28 " call " #fail_fn "\n" \
29 "1:" \
30 : "=D" (dummy) \
31 : "D" (v) \
32 : "rax", "rsi", "rdx", "rcx", \
33 "r8", "r9", "r10", "r11", "memory"); \
34} while (0)
35
36/**
37 * __mutex_fastpath_lock_retval - try to take the lock by moving the count
38 * from 1 to a 0 value
39 * @count: pointer of type atomic_t
40 * @fail_fn: function to call if the original value was not 1
41 *
42 * Change the count from 1 to a value lower than 1, and call <fail_fn> if
43 * it wasn't 1 originally. This function returns 0 if the fastpath succeeds,
44 * or anything the slow path function returns
45 */
46static inline int __mutex_fastpath_lock_retval(atomic_t *count,
47 int (*fail_fn)(atomic_t *))
48{
49 if (unlikely(atomic_dec_return(count) < 0))
50 return fail_fn(count);
51 else
52 return 0;
53}
54
55/**
56 * __mutex_fastpath_unlock - increment and call function if nonpositive
57 * @v: pointer of type atomic_t
58 * @fail_fn: function to call if the result is nonpositive
59 *
60 * Atomically increments @v and calls <fail_fn> if the result is nonpositive.
61 */
62#define __mutex_fastpath_unlock(v, fail_fn) \
63do { \
64 unsigned long dummy; \
65 \
66 typecheck(atomic_t *, v); \
67 typecheck_fn(void (*)(atomic_t *), fail_fn); \
68 \
69 asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \
70 " jg 1f\n" \
71 " call " #fail_fn "\n" \
72 "1:" \
73 : "=D" (dummy) \
74 : "D" (v) \
75 : "rax", "rsi", "rdx", "rcx", \
76 "r8", "r9", "r10", "r11", "memory"); \
77} while (0)
78
79#define __mutex_slowpath_needs_to_unlock() 1
80
81/**
82 * __mutex_fastpath_trylock - try to acquire the mutex, without waiting
83 *
84 * @count: pointer of type atomic_t
85 * @fail_fn: fallback function
86 *
87 * Change the count from 1 to 0 and return 1 (success), or return 0 (failure)
88 * if it wasn't 1 originally. [the fallback function is never used on
89 * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.]
90 */
91static inline int __mutex_fastpath_trylock(atomic_t *count,
92 int (*fail_fn)(atomic_t *))
93{
94 if (likely(atomic_cmpxchg(count, 1, 0) == 1))
95 return 1;
96 else
97 return 0;
98}
99
100#endif /* _ASM_X86_MUTEX_64_H */
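__mutex_fastpath_trylock() above relies on cmpxchg moving the count from 1 to 0 exactly once, so only a single caller can win the race. The same contract, sketched as a user-space analogue with a GCC compare-and-swap builtin (not kernel code):

#include <stdio.h>

static int count = 1;                   /* 1 == unlocked */

static int trylock(void)
{
        int expected = 1;

        /* succeeds (returns 1) only for the caller that still saw count == 1 */
        return __atomic_compare_exchange_n(&count, &expected, 0, 0,
                                           __ATOMIC_ACQUIRE, __ATOMIC_RELAXED);
}

int main(void)
{
        printf("first trylock:  %d\n", trylock());      /* 1: acquired */
        printf("second trylock: %d\n", trylock());      /* 0: already held */
        return 0;
}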
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
new file mode 100644
index 000000000000..c45a0a568dff
--- /dev/null
+++ b/arch/x86/include/asm/nmi.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_NMI_H
2#define _ASM_X86_NMI_H
3
4#include <linux/pm.h>
5#include <asm/irq.h>
6#include <asm/io.h>
7
8#ifdef ARCH_HAS_NMI_WATCHDOG
9
10/**
11 * do_nmi_callback
12 *
13 * Check to see if a callback exists and execute it. Return 1
14 * if the handler exists and the NMI was handled successfully.
15 */
16int do_nmi_callback(struct pt_regs *regs, int cpu);
17
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int);
26extern void release_evntsel_nmi(unsigned int);
27
28extern void setup_apic_nmi_watchdog(void *);
29extern void stop_apic_nmi_watchdog(void *);
30extern void disable_timer_nmi_watchdog(void);
31extern void enable_timer_nmi_watchdog(void);
32extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
33extern void cpu_nmi_set_wd_enabled(void);
34
35extern atomic_t nmi_active;
36extern unsigned int nmi_watchdog;
37#define NMI_NONE 0
38#define NMI_IO_APIC 1
39#define NMI_LOCAL_APIC 2
40#define NMI_INVALID 3
41
42struct ctl_table;
43struct file;
44extern int proc_nmi_enabled(struct ctl_table *, int , struct file *,
45 void __user *, size_t *, loff_t *);
46extern int unknown_nmi_panic;
47
48void __trigger_all_cpu_backtrace(void);
49#define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace()
50
51static inline void localise_nmi_watchdog(void)
52{
53 if (nmi_watchdog == NMI_IO_APIC)
54 nmi_watchdog = NMI_LOCAL_APIC;
55}
56
57/* check if nmi_watchdog is active (ie was specified at boot) */
58static inline int nmi_watchdog_active(void)
59{
60 /*
61 * actually it should be:
62 * return (nmi_watchdog == NMI_LOCAL_APIC ||
63 * nmi_watchdog == NMI_IO_APIC)
64 * but since they are powers of two we could use a
65 * cheaper way --cvg
66 */
67 return nmi_watchdog & 0x3;
68}
69#endif
70
71void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz);
75int lapic_watchdog_ok(void);
76void disable_lapic_nmi_watchdog(void);
77void enable_lapic_nmi_watchdog(void);
78void stop_nmi(void);
79void restart_nmi(void);
80
81#endif /* _ASM_X86_NMI_H */
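The comment in nmi_watchdog_active() argues that masking with 0x3 is a cheaper form of comparing against NMI_IO_APIC and NMI_LOCAL_APIC explicitly. A small standalone check of that equivalence over the modes the watchdog is normally left in:

#include <assert.h>

#define NMI_NONE        0
#define NMI_IO_APIC     1
#define NMI_LOCAL_APIC  2

int main(void)
{
        int mode;

        for (mode = NMI_NONE; mode <= NMI_LOCAL_APIC; mode++) {
                int explicit = (mode == NMI_IO_APIC || mode == NMI_LOCAL_APIC);
                int cheap    = (mode & 0x3) != 0;

                assert(explicit == cheap);
        }
        return 0;
}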
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
new file mode 100644
index 000000000000..ad2668ee1aa7
--- /dev/null
+++ b/arch/x86/include/asm/nops.h
@@ -0,0 +1,118 @@
1#ifndef _ASM_X86_NOPS_H
2#define _ASM_X86_NOPS_H
3
4/* Define nops for use with alternative() */
5
6/* generic versions from gas
7 1: nop
8 the following instructions are NOT nops in 64-bit mode,
9 for 64-bit mode use K8 or P6 nops instead
10 2: movl %esi,%esi
11 3: leal 0x00(%esi),%esi
12 4: leal 0x00(,%esi,1),%esi
13 6: leal 0x00000000(%esi),%esi
14 7: leal 0x00000000(,%esi,1),%esi
15*/
16#define GENERIC_NOP1 ".byte 0x90\n"
17#define GENERIC_NOP2 ".byte 0x89,0xf6\n"
18#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n"
19#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n"
20#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4
21#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n"
22#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n"
23#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7
24
25/* Opteron 64bit nops
26 1: nop
27 2: osp nop
28 3: osp osp nop
29 4: osp osp osp nop
30*/
31#define K8_NOP1 GENERIC_NOP1
32#define K8_NOP2 ".byte 0x66,0x90\n"
33#define K8_NOP3 ".byte 0x66,0x66,0x90\n"
34#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n"
35#define K8_NOP5 K8_NOP3 K8_NOP2
36#define K8_NOP6 K8_NOP3 K8_NOP3
37#define K8_NOP7 K8_NOP4 K8_NOP3
38#define K8_NOP8 K8_NOP4 K8_NOP4
39
40/* K7 nops
41 uses eax dependencies (arbitrary choice)
42 1: nop
43 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax
45 4: leal 0x00(,%eax,1),%eax
46 6: leal 0x00000000(%eax),%eax
47 7: leal 0x00000000(,%eax,1),%eax
48*/
49#define K7_NOP1 GENERIC_NOP1
50#define K7_NOP2 ".byte 0x8b,0xc0\n"
51#define K7_NOP3 ".byte 0x8d,0x04,0x20\n"
52#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n"
53#define K7_NOP5 K7_NOP4 ASM_NOP1
54#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n"
55#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n"
56#define K7_NOP8 K7_NOP7 ASM_NOP1
57
58/* P6 nops
59 uses eax dependencies (Intel-recommended choice)
60 1: nop
61 2: osp nop
62 3: nopl (%eax)
63 4: nopl 0x00(%eax)
64 5: nopl 0x00(%eax,%eax,1)
65 6: osp nopl 0x00(%eax,%eax,1)
66 7: nopl 0x00000000(%eax)
67 8: nopl 0x00000000(%eax,%eax,1)
68*/
69#define P6_NOP1 GENERIC_NOP1
70#define P6_NOP2 ".byte 0x66,0x90\n"
71#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n"
72#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n"
73#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n"
74#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n"
75#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n"
76#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n"
77
78#if defined(CONFIG_MK7)
79#define ASM_NOP1 K7_NOP1
80#define ASM_NOP2 K7_NOP2
81#define ASM_NOP3 K7_NOP3
82#define ASM_NOP4 K7_NOP4
83#define ASM_NOP5 K7_NOP5
84#define ASM_NOP6 K7_NOP6
85#define ASM_NOP7 K7_NOP7
86#define ASM_NOP8 K7_NOP8
87#elif defined(CONFIG_X86_P6_NOP)
88#define ASM_NOP1 P6_NOP1
89#define ASM_NOP2 P6_NOP2
90#define ASM_NOP3 P6_NOP3
91#define ASM_NOP4 P6_NOP4
92#define ASM_NOP5 P6_NOP5
93#define ASM_NOP6 P6_NOP6
94#define ASM_NOP7 P6_NOP7
95#define ASM_NOP8 P6_NOP8
96#elif defined(CONFIG_X86_64)
97#define ASM_NOP1 K8_NOP1
98#define ASM_NOP2 K8_NOP2
99#define ASM_NOP3 K8_NOP3
100#define ASM_NOP4 K8_NOP4
101#define ASM_NOP5 K8_NOP5
102#define ASM_NOP6 K8_NOP6
103#define ASM_NOP7 K8_NOP7
104#define ASM_NOP8 K8_NOP8
105#else
106#define ASM_NOP1 GENERIC_NOP1
107#define ASM_NOP2 GENERIC_NOP2
108#define ASM_NOP3 GENERIC_NOP3
109#define ASM_NOP4 GENERIC_NOP4
110#define ASM_NOP5 GENERIC_NOP5
111#define ASM_NOP6 GENERIC_NOP6
112#define ASM_NOP7 GENERIC_NOP7
113#define ASM_NOP8 GENERIC_NOP8
114#endif
115
116#define ASM_NOP_MAX 8
117
118#endif /* _ASM_X86_NOPS_H */
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
new file mode 100644
index 000000000000..27da400d3138
--- /dev/null
+++ b/arch/x86/include/asm/numa.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "numa_32.h"
3#else
4# include "numa_64.h"
5#endif
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
new file mode 100644
index 000000000000..e9f5db796244
--- /dev/null
+++ b/arch/x86/include/asm/numa_32.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H
3
4extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu);
6
7#ifdef CONFIG_NUMA
8extern void set_highmem_pages_init(void);
9#endif
10
11#endif /* _ASM_X86_NUMA_32_H */
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
new file mode 100644
index 000000000000..064ed6df4cbe
--- /dev/null
+++ b/arch/x86/include/asm/numa_64.h
@@ -0,0 +1,43 @@
1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H
3
4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6
7struct bootnode {
8 u64 start;
9 u64 end;
10};
11
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16
17extern void numa_init_array(void);
18extern int numa_off;
19
20extern void srat_reserve_add_area(int nodeid);
21extern int hotadd_percent;
22
23extern s16 apicid_to_node[MAX_LOCAL_APIC];
24
25extern unsigned long numa_free_all_bootmem(void);
26extern void setup_node_bootmem(int nodeid, unsigned long start,
27 unsigned long end);
28
29#ifdef CONFIG_NUMA
30extern void __init init_cpu_to_node(void);
31extern void __cpuinit numa_set_node(int cpu, int node);
32extern void __cpuinit numa_clear_node(int cpu);
33extern void __cpuinit numa_add_cpu(int cpu);
34extern void __cpuinit numa_remove_cpu(int cpu);
35#else
36static inline void init_cpu_to_node(void) { }
37static inline void numa_set_node(int cpu, int node) { }
38static inline void numa_clear_node(int cpu) { }
39static inline void numa_add_cpu(int cpu) { }
40static inline void numa_remove_cpu(int cpu) { }
41#endif
42
43#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
new file mode 100644
index 000000000000..1e8bd30b4c16
--- /dev/null
+++ b/arch/x86/include/asm/numaq.h
@@ -0,0 +1,169 @@
1/*
2 * Written by: Patricia Gaughen, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <gone@us.ibm.com>
24 */
25
26#ifndef _ASM_X86_NUMAQ_H
27#define _ASM_X86_NUMAQ_H
28
29#ifdef CONFIG_X86_NUMAQ
30
31extern int found_numaq;
32extern int get_memcfg_numaq(void);
33
34/*
35 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
36 */
37#define SYS_CFG_DATA_PRIV_ADDR 0x0009d000 /* place for scd in private
38 quad space */
39
40/*
41 * Communication area for each processor on lynxer-processor tests.
42 *
43 * NOTE: If you change the size of this eachquadmem structure you need
44 * to change the definition for EACH_QUAD_SIZE.
45 */
46struct eachquadmem {
47 unsigned int priv_mem_start; /* Starting address of this */
48 /* quad's private memory. */
49 /* This is always 0. */
50 /* In MB. */
51 unsigned int priv_mem_size; /* Size of this quad's */
52 /* private memory. */
53 /* In MB. */
54 unsigned int low_shrd_mem_strp_start;/* Starting address of this */
55 /* quad's low shared block */
56 /* (untranslated). */
57 /* In MB. */
58 unsigned int low_shrd_mem_start; /* Starting address of this */
59 /* quad's low shared memory */
60 /* (untranslated). */
61 /* In MB. */
62 unsigned int low_shrd_mem_size; /* Size of this quad's low */
63 /* shared memory. */
64 /* In MB. */
65 unsigned int lmmio_copb_start; /* Starting address of this */
66 /* quad's local memory */
67 /* mapped I/O in the */
68 /* compatibility OPB. */
69 /* In MB. */
70 unsigned int lmmio_copb_size; /* Size of this quad's local */
71 /* memory mapped I/O in the */
72 /* compatibility OPB. */
73 /* In MB. */
74 unsigned int lmmio_nopb_start; /* Starting address of this */
75 /* quad's local memory */
76 /* mapped I/O in the */
77 /* non-compatibility OPB. */
78 /* In MB. */
79 unsigned int lmmio_nopb_size; /* Size of this quad's local */
80 /* memory mapped I/O in the */
81 /* non-compatibility OPB. */
82 /* In MB. */
83 unsigned int io_apic_0_start; /* Starting address of I/O */
84 /* APIC 0. */
85 unsigned int io_apic_0_sz; /* Size I/O APIC 0. */
86 unsigned int io_apic_1_start; /* Starting address of I/O */
87 /* APIC 1. */
88 unsigned int io_apic_1_sz; /* Size I/O APIC 1. */
89 unsigned int hi_shrd_mem_start; /* Starting address of this */
90 /* quad's high shared memory.*/
91 /* In MB. */
92 unsigned int hi_shrd_mem_size; /* Size of this quad's high */
93 /* shared memory. */
94 /* In MB. */
95 unsigned int mps_table_addr; /* Address of this quad's */
96 /* MPS tables from BIOS, */
97 /* in system space.*/
98 unsigned int lcl_MDC_pio_addr; /* Port-I/O address for */
99 /* local access of MDC. */
100 unsigned int rmt_MDC_mmpio_addr; /* MM-Port-I/O address for */
101 /* remote access of MDC. */
102 unsigned int mm_port_io_start; /* Starting address of this */
103 /* quad's memory mapped Port */
104 /* I/O space. */
105 unsigned int mm_port_io_size; /* Size of this quad's memory*/
106 /* mapped Port I/O space. */
107 unsigned int mm_rmt_io_apic_start; /* Starting address of this */
108 /* quad's memory mapped */
109 /* remote I/O APIC space. */
110 unsigned int mm_rmt_io_apic_size; /* Size of this quad's memory*/
111 /* mapped remote I/O APIC */
112 /* space. */
113 unsigned int mm_isa_start; /* Starting address of this */
114 /* quad's memory mapped ISA */
115 /* space (contains MDC */
116 /* memory space). */
117 unsigned int mm_isa_size; /* Size of this quad's memory*/
118 /* mapped ISA space (contains*/
119 /* MDC memory space). */
120 unsigned int rmt_qmi_addr; /* Remote addr to access QMI.*/
121 unsigned int lcl_qmi_addr; /* Local addr to access QMI. */
122};
123
124/*
125 * Note: This structure must NOT be changed unless the multiproc and
126 * OS are changed to reflect the new structure.
127 */
128struct sys_cfg_data {
129 unsigned int quad_id;
130 unsigned int bsp_proc_id; /* Boot Strap Processor in this quad. */
131 unsigned int scd_version; /* Version number of this table. */
132 unsigned int first_quad_id;
133 unsigned int quads_present31_0; /* 1 bit for each quad */
134 unsigned int quads_present63_32; /* 1 bit for each quad */
135 unsigned int config_flags;
136 unsigned int boot_flags;
137 unsigned int csr_start_addr; /* Absolute value (not in MB) */
138 unsigned int csr_size; /* Absolute value (not in MB) */
139 unsigned int lcl_apic_start_addr; /* Absolute value (not in MB) */
140 unsigned int lcl_apic_size; /* Absolute value (not in MB) */
141 unsigned int low_shrd_mem_base; /* 0 or 512MB or 1GB */
142 unsigned int low_shrd_mem_quad_offset; /* 0,128M,256M,512M,1G */
143 /* may not be totally populated */
144 unsigned int split_mem_enbl; /* 0 for no low shared memory */
145 unsigned int mmio_sz; /* Size of total system memory mapped I/O */
146 /* (in MB). */
147 unsigned int quad_spin_lock; /* Spare location used for quad */
148 /* bringup. */
149 unsigned int nonzero55; /* For checksumming. */
150 unsigned int nonzeroaa; /* For checksumming. */
151 unsigned int scd_magic_number;
152 unsigned int system_type;
153 unsigned int checksum;
154 /*
155 * memory configuration area for each quad
156 */
157 struct eachquadmem eq[MAX_NUMNODES]; /* indexed by quad id */
158};
159
160void numaq_tsc_disable(void);
161
162#else
163static inline int get_memcfg_numaq(void)
164{
165 return 0;
166}
167#endif /* CONFIG_X86_NUMAQ */
168#endif /* _ASM_X86_NUMAQ_H */
169
diff --git a/arch/x86/include/asm/numaq/apic.h b/arch/x86/include/asm/numaq/apic.h
new file mode 100644
index 000000000000..0bf2a06b7a4e
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apic.h
@@ -0,0 +1,136 @@
1#ifndef __ASM_NUMAQ_APIC_H
2#define __ASM_NUMAQ_APIC_H
3
4#include <asm/io.h>
5#include <linux/mmzone.h>
6#include <linux/nodemask.h>
7
8#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
9
10static inline cpumask_t target_cpus(void)
11{
12 return CPU_MASK_ALL;
13}
14
15#define NO_BALANCE_IRQ (1)
16#define esr_disable (1)
17
18#define INT_DELIVERY_MODE dest_LowestPrio
19#define INT_DEST_MODE 0 /* physical delivery on LOCAL quad */
20
21static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
22{
23 return physid_isset(apicid, bitmap);
24}
25static inline unsigned long check_apicid_present(int bit)
26{
27 return physid_isset(bit, phys_cpu_present_map);
28}
29#define apicid_cluster(apicid) (apicid & 0xF0)
30
31static inline int apic_id_registered(void)
32{
33 return 1;
34}
35
36static inline void init_apic_ldr(void)
37{
38 /* Already done in NUMA-Q firmware */
39}
40
41static inline void setup_apic_routing(void)
42{
43 printk("Enabling APIC mode: %s. Using %d I/O APICs\n",
44 "NUMA-Q", nr_ioapics);
45}
46
47/*
48 * Skip adding the timer int on secondary nodes, which causes
49 * a small but painful rift in the time-space continuum.
50 */
51static inline int multi_timer_check(int apic, int irq)
52{
53 return apic != 0 && irq == 0;
54}
55
56static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map)
57{
58 /* We don't have a good way to do this yet - hack */
59 return physids_promote(0xFUL);
60}
61
62/* Mapping from cpu number to logical apicid */
63extern u8 cpu_2_logical_apicid[];
64static inline int cpu_to_logical_apicid(int cpu)
65{
66 if (cpu >= NR_CPUS)
67 return BAD_APICID;
68 return (int)cpu_2_logical_apicid[cpu];
69}
70
71/*
72 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
73 * cpu to APIC ID relation to properly interact with the intelligent
74 * mode of the cluster controller.
75 */
76static inline int cpu_present_to_apicid(int mps_cpu)
77{
78 if (mps_cpu < 60)
79 return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
80 else
81 return BAD_APICID;
82}
83
84static inline int apicid_to_node(int logical_apicid)
85{
86 return logical_apicid >> 4;
87}
88
89static inline physid_mask_t apicid_to_cpu_present(int logical_apicid)
90{
91 int node = apicid_to_node(logical_apicid);
92 int cpu = __ffs(logical_apicid & 0xf);
93
94 return physid_mask_of_physid(cpu + 4*node);
95}
96
97extern void *xquad_portio;
98
99static inline void setup_portio_remap(void)
100{
101 int num_quads = num_online_nodes();
102
103 if (num_quads <= 1)
104 return;
105
106 printk("Remapping cross-quad port I/O for %d quads\n", num_quads);
107 xquad_portio = ioremap(XQUAD_PORTIO_BASE, num_quads*XQUAD_PORTIO_QUAD);
108 printk("xquad_portio vaddr 0x%08lx, len %08lx\n",
109 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
110}
111
112static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
113{
114 return (1);
115}
116
117static inline void enable_apic_mode(void)
118{
119}
120
121/*
122 * We use physical apicids here, not logical, so just return the default
123 * physical broadcast to stop people from breaking us
124 */
125static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
126{
127 return (int) 0xF;
128}
129
130/* No NUMA-Q box has a HT CPU, but it can't hurt to use the default code. */
131static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
132{
133 return cpuid_apic >> index_msb;
134}
135
136#endif /* __ASM_NUMAQ_APIC_H */
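cpu_present_to_apicid() above packs the quad number into the high nibble of the logical APIC ID and a one-hot CPU bit into the low nibble; apicid_to_node() and apicid_to_cpu_present() undo that packing. A standalone check of the round trip (ffs() stands in for the kernel's __ffs()):

#include <assert.h>
#include <strings.h>    /* ffs() */

static int cpu_to_apicid(int mps_cpu)
{
        return ((mps_cpu >> 2) << 4) | (1 << (mps_cpu & 0x3));
}

int main(void)
{
        int cpu;

        for (cpu = 0; cpu < 16; cpu++) {
                int apicid = cpu_to_apicid(cpu);
                int node   = apicid >> 4;               /* apicid_to_node() */
                int local  = ffs(apicid & 0xf) - 1;     /* __ffs() in the kernel */

                assert(local + 4 * node == cpu);        /* apicid_to_cpu_present() */
        }
        return 0;
}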
diff --git a/arch/x86/include/asm/numaq/apicdef.h b/arch/x86/include/asm/numaq/apicdef.h
new file mode 100644
index 000000000000..e012a46cc22a
--- /dev/null
+++ b/arch/x86/include/asm/numaq/apicdef.h
@@ -0,0 +1,14 @@
1#ifndef __ASM_NUMAQ_APICDEF_H
2#define __ASM_NUMAQ_APICDEF_H
3
4
5#define APIC_ID_MASK (0xF<<24)
6
7static inline unsigned get_apic_id(unsigned long x)
8{
9 return (((x)>>24)&0x0F);
10}
11
12#define GET_APIC_ID(x) get_apic_id(x)
13
14#endif
diff --git a/arch/x86/include/asm/numaq/ipi.h b/arch/x86/include/asm/numaq/ipi.h
new file mode 100644
index 000000000000..935588d286cf
--- /dev/null
+++ b/arch/x86/include/asm/numaq/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_NUMAQ_IPI_H
2#define __ASM_NUMAQ_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_NUMAQ_IPI_H */
diff --git a/arch/x86/include/asm/numaq/mpparse.h b/arch/x86/include/asm/numaq/mpparse.h
new file mode 100644
index 000000000000..252292e077b6
--- /dev/null
+++ b/arch/x86/include/asm/numaq/mpparse.h
@@ -0,0 +1,7 @@
1#ifndef __ASM_NUMAQ_MPPARSE_H
2#define __ASM_NUMAQ_MPPARSE_H
3
4extern void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
5 char *productid);
6
7#endif /* __ASM_NUMAQ_MPPARSE_H */
diff --git a/arch/x86/include/asm/numaq/wakecpu.h b/arch/x86/include/asm/numaq/wakecpu.h
new file mode 100644
index 000000000000..c577bda5b1c5
--- /dev/null
+++ b/arch/x86/include/asm/numaq/wakecpu.h
@@ -0,0 +1,43 @@
1#ifndef __ASM_NUMAQ_WAKECPU_H
2#define __ASM_NUMAQ_WAKECPU_H
3
4/* This file copes with machines that wakeup secondary CPUs by NMIs */
5
6#define WAKE_SECONDARY_VIA_NMI
7
8#define TRAMPOLINE_LOW phys_to_virt(0x8)
9#define TRAMPOLINE_HIGH phys_to_virt(0xa)
10
11#define boot_cpu_apicid boot_cpu_logical_apicid
12
13/* We don't do anything here because we use NMIs to boot instead */
14static inline void wait_for_init_deassert(atomic_t *deassert)
15{
16}
17
18/*
19 * Because we use NMIs rather than the INIT-STARTUP sequence to
20 * bootstrap the CPUs, the APIC may be in a weird state. Kick it.
21 */
22static inline void smp_callin_clear_local_apic(void)
23{
24 clear_local_APIC();
25}
26
27static inline void store_NMI_vector(unsigned short *high, unsigned short *low)
28{
29 printk("Storing NMI vector\n");
30 *high = *((volatile unsigned short *) TRAMPOLINE_HIGH);
31 *low = *((volatile unsigned short *) TRAMPOLINE_LOW);
32}
33
34static inline void restore_NMI_vector(unsigned short *high, unsigned short *low)
35{
36 printk("Restoring NMI vector\n");
37 *((volatile unsigned short *) TRAMPOLINE_HIGH) = *high;
38 *((volatile unsigned short *) TRAMPOLINE_LOW) = *low;
39}
40
41#define inquire_remote_apic(apicid) {}
42
43#endif /* __ASM_NUMAQ_WAKECPU_H */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
new file mode 100644
index 000000000000..834a30295fab
--- /dev/null
+++ b/arch/x86/include/asm/olpc.h
@@ -0,0 +1,132 @@
1/* OLPC machine specific definitions */
2
3#ifndef _ASM_X86_OLPC_H
4#define _ASM_X86_OLPC_H
5
6#include <asm/geode.h>
7
8struct olpc_platform_t {
9 int flags;
10 uint32_t boardrev;
11 int ecver;
12};
13
14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02
16#define OLPC_F_VSA 0x04
17
18#ifdef CONFIG_OLPC
19
20extern struct olpc_platform_t olpc_platform_info;
21
22/*
23 * OLPC board IDs contain the major build number within the mask 0x0ff0,
24 * and the minor build number within 0x000f. Pre-builds have a minor
25 * number less than 8, and normal builds start at 8. For example, 0x0B10
26 * is a PreB1, and 0x0C18 is a C1.
27 */
28
29static inline uint32_t olpc_board(uint8_t id)
30{
31 return (id << 4) | 0x8;
32}
33
34static inline uint32_t olpc_board_pre(uint8_t id)
35{
36 return id << 4;
37}
38
39static inline int machine_is_olpc(void)
40{
41 return (olpc_platform_info.flags & OLPC_F_PRESENT) ? 1 : 0;
42}
43
44/*
45 * The DCON is OLPC's Display Controller. It has a number of unique
46 * features that we might want to take advantage of..
47 */
48static inline int olpc_has_dcon(void)
49{
50 return (olpc_platform_info.flags & OLPC_F_DCON) ? 1 : 0;
51}
52
53/*
54 * The VSA is software from AMD that typical Geode BIOSes will include.
55 * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
56 * not include the VSA; instead, PCI is emulated by the kernel.
57 *
58 * The VSA is described further in arch/x86/pci/olpc.c.
59 */
60static inline int olpc_has_vsa(void)
61{
62 return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
63}
64
65/*
66 * The "Mass Production" version of OLPC's XO is identified as being model
67 * C2. During the prototype phase, the following models (in chronological
68 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
69 * were based on Geode GX CPUs, and models after that were based upon
70 * Geode LX CPUs. There were also some hand-assembled models floating
71 * around, referred to as PreB1, PreB2, etc.
72 */
73static inline int olpc_board_at_least(uint32_t rev)
74{
75 return olpc_platform_info.boardrev >= rev;
76}
77
78#else
79
80static inline int machine_is_olpc(void)
81{
82 return 0;
83}
84
85static inline int olpc_has_dcon(void)
86{
87 return 0;
88}
89
90static inline int olpc_has_vsa(void)
91{
92 return 0;
93}
94
95#endif
96
97/* EC related functions */
98
99extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
100 unsigned char *outbuf, size_t outlen);
101
102extern int olpc_ec_mask_set(uint8_t bits);
103extern int olpc_ec_mask_unset(uint8_t bits);
104
105/* EC commands */
106
107#define EC_FIRMWARE_REV 0x08
108
109/* SCI source values */
110
111#define EC_SCI_SRC_EMPTY 0x00
112#define EC_SCI_SRC_GAME 0x01
113#define EC_SCI_SRC_BATTERY 0x02
114#define EC_SCI_SRC_BATSOC 0x04
115#define EC_SCI_SRC_BATERR 0x08
116#define EC_SCI_SRC_EBOOK 0x10
117#define EC_SCI_SRC_WLAN 0x20
118#define EC_SCI_SRC_ACPWR 0x40
119#define EC_SCI_SRC_ALL 0x7F
120
121/* GPIO assignments */
122
123#define OLPC_GPIO_MIC_AC geode_gpio(1)
124#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
125#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
126#define OLPC_GPIO_SMB_CLK geode_gpio(14)
127#define OLPC_GPIO_SMB_DATA geode_gpio(15)
128#define OLPC_GPIO_WORKAUX geode_gpio(24)
129#define OLPC_GPIO_LID geode_gpio(26)
130#define OLPC_GPIO_ECSCI geode_gpio(27)
131
132#endif /* _ASM_X86_OLPC_H */
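The board-ID helpers above put the build identifier in the 0x0ff0 mask and use the low nibble to separate pre-builds (minor < 8) from normal builds (minor >= 8). The two examples quoted in the comment can be checked directly; this is a standalone illustration, not kernel code:

#include <assert.h>
#include <stdint.h>

static uint32_t olpc_board(uint8_t id)     { return (id << 4) | 0x8; }
static uint32_t olpc_board_pre(uint8_t id) { return id << 4; }

int main(void)
{
        assert(olpc_board_pre(0xB1) == 0x0B10);          /* a PreB1 */
        assert(olpc_board(0xC1)     == 0x0C18);          /* a C1 */
        assert(olpc_board(0xC1) > olpc_board_pre(0xC1)); /* ordering olpc_board_at_least() relies on */
        return 0;
}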
diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h
new file mode 100644
index 000000000000..e9873a2e8695
--- /dev/null
+++ b/arch/x86/include/asm/page.h
@@ -0,0 +1,209 @@
1#ifndef _ASM_X86_PAGE_H
2#define _ASM_X86_PAGE_H
3
4#include <linux/const.h>
5
6/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1))
10
11#ifdef __KERNEL__
12
13#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1)
14#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
15
16/* Cast PAGE_MASK to a signed type so that it is sign-extended if
17 virtual addresses are 32-bits but physical addresses are larger
18 (ie, 32-bit PAE). */
19#define PHYSICAL_PAGE_MASK (((signed long)PAGE_MASK) & __PHYSICAL_MASK)
20
21/* PTE_PFN_MASK extracts the PFN from a (pte|pmd|pud|pgd)val_t */
22#define PTE_PFN_MASK ((pteval_t)PHYSICAL_PAGE_MASK)
23
24/* PTE_FLAGS_MASK extracts the flags from a (pte|pmd|pud|pgd)val_t */
25#define PTE_FLAGS_MASK (~PTE_PFN_MASK)
26
27#define PMD_PAGE_SIZE (_AC(1, UL) << PMD_SHIFT)
28#define PMD_PAGE_MASK (~(PMD_PAGE_SIZE-1))
29
30#define HPAGE_SHIFT PMD_SHIFT
31#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT)
32#define HPAGE_MASK (~(HPAGE_SIZE - 1))
33#define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT)
34
35#define HUGE_MAX_HSTATE 2
36
37#ifndef __ASSEMBLY__
38#include <linux/types.h>
39#endif
40
41#ifdef CONFIG_X86_64
42#include <asm/page_64.h>
43#else
44#include <asm/page_32.h>
45#endif /* CONFIG_X86_64 */
46
47#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET)
48
49#define VM_DATA_DEFAULT_FLAGS \
50 (((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \
51 VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
52
53
54#ifndef __ASSEMBLY__
55
56typedef struct { pgdval_t pgd; } pgd_t;
57typedef struct { pgprotval_t pgprot; } pgprot_t;
58
59extern int page_is_ram(unsigned long pagenr);
60extern int pagerange_is_ram(unsigned long start, unsigned long end);
61extern int devmem_is_allowed(unsigned long pagenr);
62extern void map_devmem(unsigned long pfn, unsigned long size,
63 pgprot_t vma_prot);
64extern void unmap_devmem(unsigned long pfn, unsigned long size,
65 pgprot_t vma_prot);
66
67extern unsigned long max_low_pfn_mapped;
68extern unsigned long max_pfn_mapped;
69
70struct page;
71
72static inline void clear_user_page(void *page, unsigned long vaddr,
73 struct page *pg)
74{
75 clear_page(page);
76}
77
78static inline void copy_user_page(void *to, void *from, unsigned long vaddr,
79 struct page *topage)
80{
81 copy_page(to, from);
82}
83
84#define __alloc_zeroed_user_highpage(movableflags, vma, vaddr) \
85 alloc_page_vma(GFP_HIGHUSER | __GFP_ZERO | movableflags, vma, vaddr)
86#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE
87
88static inline pgd_t native_make_pgd(pgdval_t val)
89{
90 return (pgd_t) { val };
91}
92
93static inline pgdval_t native_pgd_val(pgd_t pgd)
94{
95 return pgd.pgd;
96}
97
98#if PAGETABLE_LEVELS >= 3
99#if PAGETABLE_LEVELS == 4
100typedef struct { pudval_t pud; } pud_t;
101
102static inline pud_t native_make_pud(pmdval_t val)
103{
104 return (pud_t) { val };
105}
106
107static inline pudval_t native_pud_val(pud_t pud)
108{
109 return pud.pud;
110}
111#else /* PAGETABLE_LEVELS == 3 */
112#include <asm-generic/pgtable-nopud.h>
113
114static inline pudval_t native_pud_val(pud_t pud)
115{
116 return native_pgd_val(pud.pgd);
117}
118#endif /* PAGETABLE_LEVELS == 4 */
119
120typedef struct { pmdval_t pmd; } pmd_t;
121
122static inline pmd_t native_make_pmd(pmdval_t val)
123{
124 return (pmd_t) { val };
125}
126
127static inline pmdval_t native_pmd_val(pmd_t pmd)
128{
129 return pmd.pmd;
130}
131#else /* PAGETABLE_LEVELS == 2 */
132#include <asm-generic/pgtable-nopmd.h>
133
134static inline pmdval_t native_pmd_val(pmd_t pmd)
135{
136 return native_pgd_val(pmd.pud.pgd);
137}
138#endif /* PAGETABLE_LEVELS >= 3 */
139
140static inline pte_t native_make_pte(pteval_t val)
141{
142 return (pte_t) { .pte = val };
143}
144
145static inline pteval_t native_pte_val(pte_t pte)
146{
147 return pte.pte;
148}
149
150static inline pteval_t native_pte_flags(pte_t pte)
151{
152 return native_pte_val(pte) & PTE_FLAGS_MASK;
153}
154
155#define pgprot_val(x) ((x).pgprot)
156#define __pgprot(x) ((pgprot_t) { (x) } )
157
158#ifdef CONFIG_PARAVIRT
159#include <asm/paravirt.h>
160#else /* !CONFIG_PARAVIRT */
161
162#define pgd_val(x) native_pgd_val(x)
163#define __pgd(x) native_make_pgd(x)
164
165#ifndef __PAGETABLE_PUD_FOLDED
166#define pud_val(x) native_pud_val(x)
167#define __pud(x) native_make_pud(x)
168#endif
169
170#ifndef __PAGETABLE_PMD_FOLDED
171#define pmd_val(x) native_pmd_val(x)
172#define __pmd(x) native_make_pmd(x)
173#endif
174
175#define pte_val(x) native_pte_val(x)
176#define pte_flags(x) native_pte_flags(x)
177#define __pte(x) native_make_pte(x)
178
179#endif /* CONFIG_PARAVIRT */
180
181#define __pa(x) __phys_addr((unsigned long)(x))
182#define __pa_nodebug(x) __phys_addr_nodebug((unsigned long)(x))
183/* __pa_symbol should be used for C visible symbols.
184 This seems to be the official gcc blessed way to do such arithmetic. */
185#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x)))
186
187#define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET))
188
189#define __boot_va(x) __va(x)
190#define __boot_pa(x) __pa(x)
191
192/*
193 * virt_to_page(kaddr) returns a valid pointer if and only if
194 * virt_addr_valid(kaddr) returns true.
195 */
196#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
197#define pfn_to_kaddr(pfn) __va((pfn) << PAGE_SHIFT)
198extern bool __virt_addr_valid(unsigned long kaddr);
199#define virt_addr_valid(kaddr) __virt_addr_valid((unsigned long) (kaddr))
200
201#endif /* __ASSEMBLY__ */
202
203#include <asm-generic/memory_model.h>
204#include <asm-generic/page.h>
205
206#define __HAVE_ARCH_GATE_AREA 1
207
208#endif /* __KERNEL__ */
209#endif /* _ASM_X86_PAGE_H */
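PAGE_SHIFT, PAGE_SIZE and PAGE_MASK above are plain power-of-two arithmetic, and virt_to_page()/pfn_to_kaddr() are built on the same shift. A standalone illustration of rounding an address down to its page and extracting the page frame number:

#include <assert.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long addr = 0x12345678UL;

        assert((addr & PAGE_MASK)  == 0x12345000UL);    /* start of the page */
        assert((addr & ~PAGE_MASK) == 0x678UL);         /* offset within the page */
        assert((addr >> PAGE_SHIFT) == 0x12345UL);      /* pfn, the shift virt_to_page() applies to __pa() */
        return 0;
}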
diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h
new file mode 100644
index 000000000000..bcde0d7b4325
--- /dev/null
+++ b/arch/x86/include/asm/page_32.h
@@ -0,0 +1,136 @@
1#ifndef _ASM_X86_PAGE_32_H
2#define _ASM_X86_PAGE_32_H
3
4/*
5 * This handles the memory map.
6 *
7 * A __PAGE_OFFSET of 0xC0000000 means that the kernel has
8 * a virtual address space of one gigabyte, which limits the
9 * amount of physical memory you can use to about 950MB.
10 *
11 * If you want more physical memory than this then see the CONFIG_HIGHMEM4G
12 * and CONFIG_HIGHMEM64G options in the kernel configuration.
13 */
14#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
15
16#ifdef CONFIG_4KSTACKS
17#define THREAD_ORDER 0
18#else
19#define THREAD_ORDER 1
20#endif
21#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
22
23#define STACKFAULT_STACK 0
24#define DOUBLEFAULT_STACK 1
25#define NMI_STACK 0
26#define DEBUG_STACK 0
27#define MCE_STACK 0
28#define N_EXCEPTION_STACKS 1
29
30#ifdef CONFIG_X86_PAE
31/* 44=32+12, the limit we can fit into an unsigned long pfn */
32#define __PHYSICAL_MASK_SHIFT 44
33#define __VIRTUAL_MASK_SHIFT 32
34#define PAGETABLE_LEVELS 3
35
36#ifndef __ASSEMBLY__
37typedef u64 pteval_t;
38typedef u64 pmdval_t;
39typedef u64 pudval_t;
40typedef u64 pgdval_t;
41typedef u64 pgprotval_t;
42
43typedef union {
44 struct {
45 unsigned long pte_low, pte_high;
46 };
47 pteval_t pte;
48} pte_t;
49#endif /* __ASSEMBLY__ */
50
51#else /* !CONFIG_X86_PAE */
52#define __PHYSICAL_MASK_SHIFT 32
53#define __VIRTUAL_MASK_SHIFT 32
54#define PAGETABLE_LEVELS 2
55
56#ifndef __ASSEMBLY__
57typedef unsigned long pteval_t;
58typedef unsigned long pmdval_t;
59typedef unsigned long pudval_t;
60typedef unsigned long pgdval_t;
61typedef unsigned long pgprotval_t;
62
63typedef union {
64 pteval_t pte;
65 pteval_t pte_low;
66} pte_t;
67
68#endif /* __ASSEMBLY__ */
69#endif /* CONFIG_X86_PAE */
70
71#ifndef __ASSEMBLY__
72typedef struct page *pgtable_t;
73#endif
74
75#ifdef CONFIG_HUGETLB_PAGE
76#define HAVE_ARCH_HUGETLB_UNMAPPED_AREA
77#endif
78
79#ifndef __ASSEMBLY__
80#define __phys_addr_nodebug(x) ((x) - PAGE_OFFSET)
81#ifdef CONFIG_DEBUG_VIRTUAL
82extern unsigned long __phys_addr(unsigned long);
83#else
84#define __phys_addr(x) __phys_addr_nodebug(x)
85#endif
86#define __phys_reloc_hide(x) RELOC_HIDE((x), 0)
87
88#ifdef CONFIG_FLATMEM
89#define pfn_valid(pfn) ((pfn) < max_mapnr)
90#endif /* CONFIG_FLATMEM */
91
92extern int nx_enabled;
93
94/*
95 * This much address space is reserved for vmalloc() and iomap()
96 * as well as fixmap mappings.
97 */
98extern unsigned int __VMALLOC_RESERVE;
99extern int sysctl_legacy_va_layout;
100
101extern void find_low_pfn_range(void);
102extern unsigned long init_memory_mapping(unsigned long start,
103 unsigned long end);
104extern void initmem_init(unsigned long, unsigned long);
105extern void free_initmem(void);
106extern void setup_bootmem_allocator(void);
107
108
109#ifdef CONFIG_X86_USE_3DNOW
110#include <asm/mmx.h>
111
112static inline void clear_page(void *page)
113{
114 mmx_clear_page(page);
115}
116
117static inline void copy_page(void *to, void *from)
118{
119 mmx_copy_page(to, from);
120}
121#else /* !CONFIG_X86_USE_3DNOW */
122#include <linux/string.h>
123
124static inline void clear_page(void *page)
125{
126 memset(page, 0, PAGE_SIZE);
127}
128
129static inline void copy_page(void *to, void *from)
130{
131 memcpy(to, from, PAGE_SIZE);
132}
133#endif /* CONFIG_X86_USE_3DNOW */
134#endif /* !__ASSEMBLY__ */
135
136#endif /* _ASM_X86_PAGE_32_H */
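Under PAE the pte_t union above lets one 64-bit entry be read either as a single pteval_t or as the pte_low/pte_high words the page-table code writes. A standalone copy of the union with fixed-width types and an illustrative value, showing the two views agree on little-endian x86 (the inner struct is named here for strict C; the kernel uses an anonymous struct):

#include <assert.h>
#include <stdint.h>

typedef uint64_t pteval_t;

typedef union {
        struct {
                uint32_t pte_low, pte_high;
        } split;
        pteval_t pte;
} pte_t;

int main(void)
{
        pte_t pte;

        pte.split.pte_low  = 0xdead1067;        /* low pfn bits plus flag bits */
        pte.split.pte_high = 0x0000000f;        /* pfn bits 32..43 under PAE */

        assert(pte.pte == 0x0000000fdead1067ULL);       /* little-endian layout */
        return 0;
}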
diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h
new file mode 100644
index 000000000000..5ebca29f44f0
--- /dev/null
+++ b/arch/x86/include/asm/page_64.h
@@ -0,0 +1,105 @@
1#ifndef _ASM_X86_PAGE_64_H
2#define _ASM_X86_PAGE_64_H
3
4#define PAGETABLE_LEVELS 4
5
6#define THREAD_ORDER 1
7#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
8#define CURRENT_MASK (~(THREAD_SIZE - 1))
9
10#define EXCEPTION_STACK_ORDER 0
11#define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
12
13#define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
14#define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
15
16#define IRQSTACK_ORDER 2
17#define IRQSTACKSIZE (PAGE_SIZE << IRQSTACK_ORDER)
18
19#define STACKFAULT_STACK 1
20#define DOUBLEFAULT_STACK 2
21#define NMI_STACK 3
22#define DEBUG_STACK 4
23#define MCE_STACK 5
24#define N_EXCEPTION_STACKS 5 /* hw limit: 7 */
25
26#define PUD_PAGE_SIZE (_AC(1, UL) << PUD_SHIFT)
27#define PUD_PAGE_MASK (~(PUD_PAGE_SIZE-1))
28
29/*
30 * Set __PAGE_OFFSET to the most negative possible address +
31 * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a
32 * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's
33 * what Xen requires.
34 */
35#define __PAGE_OFFSET _AC(0xffff880000000000, UL)
36
37#define __PHYSICAL_START CONFIG_PHYSICAL_START
38#define __KERNEL_ALIGN 0x200000
39
40/*
41 * Make sure kernel is aligned to 2MB address. Catching it at compile
42 * time is better. Change your config file and compile the kernel
43 * for a 2MB aligned address (CONFIG_PHYSICAL_START)
44 */
45#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
46#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
47#endif
48
49#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START)
50#define __START_KERNEL_map _AC(0xffffffff80000000, UL)
51
52/* See Documentation/x86_64/mm.txt for a description of the memory map. */
53#define __PHYSICAL_MASK_SHIFT 46
54#define __VIRTUAL_MASK_SHIFT 48
55
56/*
57 * Kernel image size is limited to 512 MB (see level2_kernel_pgt in
58 * arch/x86/kernel/head_64.S), and it is mapped here:
59 */
60#define KERNEL_IMAGE_SIZE (512 * 1024 * 1024)
61#define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL)
62
63#ifndef __ASSEMBLY__
64void clear_page(void *page);
65void copy_page(void *to, void *from);
66
67/* duplicates the one in bootmem.h */
68extern unsigned long max_pfn;
69extern unsigned long phys_base;
70
71extern unsigned long __phys_addr(unsigned long);
72#define __phys_reloc_hide(x) (x)
73
74/*
75 * These are used to make use of C type-checking..
76 */
77typedef unsigned long pteval_t;
78typedef unsigned long pmdval_t;
79typedef unsigned long pudval_t;
80typedef unsigned long pgdval_t;
81typedef unsigned long pgprotval_t;
82
83typedef struct page *pgtable_t;
84
85typedef struct { pteval_t pte; } pte_t;
86
87#define vmemmap ((struct page *)VMEMMAP_START)
88
89extern unsigned long init_memory_mapping(unsigned long start,
90 unsigned long end);
91
92extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
93extern void free_initmem(void);
94
95extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
96extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
97
98#endif /* !__ASSEMBLY__ */
99
100#ifdef CONFIG_FLATMEM
101#define pfn_valid(pfn) ((pfn) < max_pfn)
102#endif
103
104
105#endif /* _ASM_X86_PAGE_64_H */
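For addresses in the 64-bit direct mapping, the __va()/__pa() pair from page.h reduces to adding or subtracting __PAGE_OFFSET defined above. (The real __phys_addr() also handles kernel-text addresses via phys_base; this standalone sketch ignores that case.)

#include <assert.h>

#define __PAGE_OFFSET   0xffff880000000000UL

int main(void)
{
        unsigned long phys = 0x0000000100200000UL;      /* some physical address */
        unsigned long virt = phys + __PAGE_OFFSET;      /* what __va() computes */

        assert(virt == 0xffff880100200000UL);
        assert(virt - __PAGE_OFFSET == phys);           /* direct-map __pa() */
        return 0;
}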
diff --git a/arch/x86/include/asm/param.h b/arch/x86/include/asm/param.h
new file mode 100644
index 000000000000..6f0d0422f4ca
--- /dev/null
+++ b/arch/x86/include/asm/param.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_PARAM_H
2#define _ASM_X86_PARAM_H
3
4#ifdef __KERNEL__
5# define HZ CONFIG_HZ /* Internal kernel timer frequency */
6# define USER_HZ 100 /* some user interfaces are */
7# define CLOCKS_PER_SEC (USER_HZ) /* in "ticks" like times() */
8#endif
9
10#ifndef HZ
11#define HZ 100
12#endif
13
14#define EXEC_PAGESIZE 4096
15
16#ifndef NOGROUP
17#define NOGROUP (-1)
18#endif
19
20#define MAXHOSTNAMELEN 64 /* max length of hostname */
21
22#endif /* _ASM_X86_PARAM_H */
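HZ is the internal tick rate while USER_HZ is what user-visible interfaces such as times() report, so jiffy counts are rescaled on the way out. A simplified standalone illustration, assuming an internal CONFIG_HZ of 1000 (the real conversion helpers also handle HZ values that do not divide evenly):

#include <assert.h>

#define HZ      1000    /* assumed CONFIG_HZ for this example */
#define USER_HZ 100

int main(void)
{
        unsigned long jiffies    = 2500;                /* 2.5 s of internal ticks */
        unsigned long user_ticks = jiffies / (HZ / USER_HZ);

        assert(user_ticks == 250);                      /* 2.5 s at 100 user ticks/s */
        return 0;
}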
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
new file mode 100644
index 000000000000..ba3e2ff6aedc
--- /dev/null
+++ b/arch/x86/include/asm/paravirt.h
@@ -0,0 +1,1650 @@
1#ifndef _ASM_X86_PARAVIRT_H
2#define _ASM_X86_PARAVIRT_H
3/* Various instructions on x86 need to be replaced for
4 * para-virtualization: those hooks are defined here. */
5
6#ifdef CONFIG_PARAVIRT
7#include <asm/page.h>
8#include <asm/asm.h>
9
10/* Bitmask of what can be clobbered: usually at least eax. */
11#define CLBR_NONE 0
12#define CLBR_EAX (1 << 0)
13#define CLBR_ECX (1 << 1)
14#define CLBR_EDX (1 << 2)
15
16#ifdef CONFIG_X86_64
17#define CLBR_RSI (1 << 3)
18#define CLBR_RDI (1 << 4)
19#define CLBR_R8 (1 << 5)
20#define CLBR_R9 (1 << 6)
21#define CLBR_R10 (1 << 7)
22#define CLBR_R11 (1 << 8)
23#define CLBR_ANY ((1 << 9) - 1)
24#include <asm/desc_defs.h>
25#else
26/* CLBR_ANY should match all regs platform has. For i386, that's just it */
27#define CLBR_ANY ((1 << 3) - 1)
28#endif /* X86_64 */
29
30#ifndef __ASSEMBLY__
31#include <linux/types.h>
32#include <linux/cpumask.h>
33#include <asm/kmap_types.h>
34#include <asm/desc_defs.h>
35
36struct page;
37struct thread_struct;
38struct desc_ptr;
39struct tss_struct;
40struct mm_struct;
41struct desc_struct;
42
43/* general info */
44struct pv_info {
45 unsigned int kernel_rpl;
46 int shared_kernel_pmd;
47 int paravirt_enabled;
48 const char *name;
49};
50
51struct pv_init_ops {
52 /*
53 * Patch may replace one of the defined code sequences with
54 * arbitrary code, subject to the same register constraints.
55 * This generally means the code is not free to clobber any
56 * registers other than EAX. The patch function should return
57 * the number of bytes of code generated, as we nop pad the
58 * rest in generic code.
59 */
60 unsigned (*patch)(u8 type, u16 clobber, void *insnbuf,
61 unsigned long addr, unsigned len);
62
63 /* Basic arch-specific setup */
64 void (*arch_setup)(void);
65 char *(*memory_setup)(void);
66 void (*post_allocator_init)(void);
67
68 /* Print a banner to identify the environment */
69 void (*banner)(void);
70};
71
72
73struct pv_lazy_ops {
74 /* Set deferred update mode, used for batching operations. */
75 void (*enter)(void);
76 void (*leave)(void);
77};
78
79struct pv_time_ops {
80 void (*time_init)(void);
81
82	/* Get and set the time of day */
83 unsigned long (*get_wallclock)(void);
84 int (*set_wallclock)(unsigned long);
85
86 unsigned long long (*sched_clock)(void);
87 unsigned long (*get_tsc_khz)(void);
88};
89
90struct pv_cpu_ops {
91 /* hooks for various privileged instructions */
92 unsigned long (*get_debugreg)(int regno);
93 void (*set_debugreg)(int regno, unsigned long value);
94
95 void (*clts)(void);
96
97 unsigned long (*read_cr0)(void);
98 void (*write_cr0)(unsigned long);
99
100 unsigned long (*read_cr4_safe)(void);
101 unsigned long (*read_cr4)(void);
102 void (*write_cr4)(unsigned long);
103
104#ifdef CONFIG_X86_64
105 unsigned long (*read_cr8)(void);
106 void (*write_cr8)(unsigned long);
107#endif
108
109 /* Segment descriptor handling */
110 void (*load_tr_desc)(void);
111 void (*load_gdt)(const struct desc_ptr *);
112 void (*load_idt)(const struct desc_ptr *);
113 void (*store_gdt)(struct desc_ptr *);
114 void (*store_idt)(struct desc_ptr *);
115 void (*set_ldt)(const void *desc, unsigned entries);
116 unsigned long (*store_tr)(void);
117 void (*load_tls)(struct thread_struct *t, unsigned int cpu);
118#ifdef CONFIG_X86_64
119 void (*load_gs_index)(unsigned int idx);
120#endif
121 void (*write_ldt_entry)(struct desc_struct *ldt, int entrynum,
122 const void *desc);
123 void (*write_gdt_entry)(struct desc_struct *,
124 int entrynum, const void *desc, int size);
125 void (*write_idt_entry)(gate_desc *,
126 int entrynum, const gate_desc *gate);
127 void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
128 void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
129
130 void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
131
132 void (*set_iopl_mask)(unsigned mask);
133
134 void (*wbinvd)(void);
135 void (*io_delay)(void);
136
137 /* cpuid emulation, mostly so that caps bits can be disabled */
138 void (*cpuid)(unsigned int *eax, unsigned int *ebx,
139 unsigned int *ecx, unsigned int *edx);
140
141	/* MSR, PMC and TSC operations.
142	   rdmsr sets *err to 0 or -EFAULT; wrmsr returns 0 or -EFAULT. */
143 u64 (*read_msr_amd)(unsigned int msr, int *err);
144 u64 (*read_msr)(unsigned int msr, int *err);
145 int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
146
147 u64 (*read_tsc)(void);
148 u64 (*read_pmc)(int counter);
149 unsigned long long (*read_tscp)(unsigned int *aux);
150
151 /*
152 * Atomically enable interrupts and return to userspace. This
153 * is only ever used to return to 32-bit processes; in a
154 * 64-bit kernel, it's used for 32-on-64 compat processes, but
155 * never native 64-bit processes. (Jump, not call.)
156 */
157 void (*irq_enable_sysexit)(void);
158
159 /*
160 * Switch to usermode gs and return to 64-bit usermode using
161 * sysret. Only used in 64-bit kernels to return to 64-bit
162 * processes. Usermode register state, including %rsp, must
163 * already be restored.
164 */
165 void (*usergs_sysret64)(void);
166
167 /*
168 * Switch to usermode gs and return to 32-bit usermode using
169 * sysret. Used to return to 32-on-64 compat processes.
170 * Other usermode register state, including %esp, must already
171 * be restored.
172 */
173 void (*usergs_sysret32)(void);
174
175 /* Normal iret. Jump to this with the standard iret stack
176 frame set up. */
177 void (*iret)(void);
178
179 void (*swapgs)(void);
180
181 struct pv_lazy_ops lazy_mode;
182};
183
184struct pv_irq_ops {
185 void (*init_IRQ)(void);
186
187 /*
188 * Get/set interrupt state. save_fl and restore_fl are only
189 * expected to use X86_EFLAGS_IF; all other bits
190 * returned from save_fl are undefined, and may be ignored by
191 * restore_fl.
192 */
193 unsigned long (*save_fl)(void);
194 void (*restore_fl)(unsigned long);
195 void (*irq_disable)(void);
196 void (*irq_enable)(void);
197 void (*safe_halt)(void);
198 void (*halt)(void);
199
200#ifdef CONFIG_X86_64
201 void (*adjust_exception_frame)(void);
202#endif
203};
204
205struct pv_apic_ops {
206#ifdef CONFIG_X86_LOCAL_APIC
207 void (*setup_boot_clock)(void);
208 void (*setup_secondary_clock)(void);
209
210 void (*startup_ipi_hook)(int phys_apicid,
211 unsigned long start_eip,
212 unsigned long start_esp);
213#endif
214};
215
216struct pv_mmu_ops {
217 /*
218 * Called before/after init_mm pagetable setup. setup_start
219 * may reset %cr3, and may pre-install parts of the pagetable;
220 * pagetable setup is expected to preserve any existing
221 * mapping.
222 */
223 void (*pagetable_setup_start)(pgd_t *pgd_base);
224 void (*pagetable_setup_done)(pgd_t *pgd_base);
225
226 unsigned long (*read_cr2)(void);
227 void (*write_cr2)(unsigned long);
228
229 unsigned long (*read_cr3)(void);
230 void (*write_cr3)(unsigned long);
231
232 /*
233 * Hooks for intercepting the creation/use/destruction of an
234 * mm_struct.
235 */
236 void (*activate_mm)(struct mm_struct *prev,
237 struct mm_struct *next);
238 void (*dup_mmap)(struct mm_struct *oldmm,
239 struct mm_struct *mm);
240 void (*exit_mmap)(struct mm_struct *mm);
241
242
243 /* TLB operations */
244 void (*flush_tlb_user)(void);
245 void (*flush_tlb_kernel)(void);
246 void (*flush_tlb_single)(unsigned long addr);
247 void (*flush_tlb_others)(const cpumask_t *cpus, struct mm_struct *mm,
248 unsigned long va);
249
250 /* Hooks for allocating and freeing a pagetable top-level */
251 int (*pgd_alloc)(struct mm_struct *mm);
252 void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
253
254 /*
255 * Hooks for allocating/releasing pagetable pages when they're
256 * attached to a pagetable
257 */
258 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
259 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
260 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
261 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
262 void (*release_pte)(unsigned long pfn);
263 void (*release_pmd)(unsigned long pfn);
264 void (*release_pud)(unsigned long pfn);
265
266 /* Pagetable manipulation functions */
267 void (*set_pte)(pte_t *ptep, pte_t pteval);
268 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
269 pte_t *ptep, pte_t pteval);
270 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
271 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
272 pte_t *ptep);
273 void (*pte_update_defer)(struct mm_struct *mm,
274 unsigned long addr, pte_t *ptep);
275
276 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
277 pte_t *ptep);
278 void (*ptep_modify_prot_commit)(struct mm_struct *mm, unsigned long addr,
279 pte_t *ptep, pte_t pte);
280
281 pteval_t (*pte_val)(pte_t);
282 pteval_t (*pte_flags)(pte_t);
283 pte_t (*make_pte)(pteval_t pte);
284
285 pgdval_t (*pgd_val)(pgd_t);
286 pgd_t (*make_pgd)(pgdval_t pgd);
287
288#if PAGETABLE_LEVELS >= 3
289#ifdef CONFIG_X86_PAE
290 void (*set_pte_atomic)(pte_t *ptep, pte_t pteval);
291 void (*set_pte_present)(struct mm_struct *mm, unsigned long addr,
292 pte_t *ptep, pte_t pte);
293 void (*pte_clear)(struct mm_struct *mm, unsigned long addr,
294 pte_t *ptep);
295 void (*pmd_clear)(pmd_t *pmdp);
296
297#endif /* CONFIG_X86_PAE */
298
299 void (*set_pud)(pud_t *pudp, pud_t pudval);
300
301 pmdval_t (*pmd_val)(pmd_t);
302 pmd_t (*make_pmd)(pmdval_t pmd);
303
304#if PAGETABLE_LEVELS == 4
305 pudval_t (*pud_val)(pud_t);
306 pud_t (*make_pud)(pudval_t pud);
307
308	void (*set_pgd)(pgd_t *pgdp, pgd_t pgdval);
309#endif /* PAGETABLE_LEVELS == 4 */
310#endif /* PAGETABLE_LEVELS >= 3 */
311
312#ifdef CONFIG_HIGHPTE
313 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
314#endif
315
316 struct pv_lazy_ops lazy_mode;
317
318 /* dom0 ops */
319
320	/* Sometimes the physical address is a pfn, and sometimes it's
321	   an mfn. We can tell which is which from the index. */
322 void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
323 unsigned long phys, pgprot_t flags);
324};
325
326struct raw_spinlock;
327struct pv_lock_ops {
328 int (*spin_is_locked)(struct raw_spinlock *lock);
329 int (*spin_is_contended)(struct raw_spinlock *lock);
330 void (*spin_lock)(struct raw_spinlock *lock);
331 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags);
332 int (*spin_trylock)(struct raw_spinlock *lock);
333 void (*spin_unlock)(struct raw_spinlock *lock);
334};
335
336/* This contains all the paravirt structures: each function gets a
337 * convenient number, derived from its offset, which we use to indicate
338 * what to patch. */
339struct paravirt_patch_template {
340 struct pv_init_ops pv_init_ops;
341 struct pv_time_ops pv_time_ops;
342 struct pv_cpu_ops pv_cpu_ops;
343 struct pv_irq_ops pv_irq_ops;
344 struct pv_apic_ops pv_apic_ops;
345 struct pv_mmu_ops pv_mmu_ops;
346 struct pv_lock_ops pv_lock_ops;
347};
348
349extern struct pv_info pv_info;
350extern struct pv_init_ops pv_init_ops;
351extern struct pv_time_ops pv_time_ops;
352extern struct pv_cpu_ops pv_cpu_ops;
353extern struct pv_irq_ops pv_irq_ops;
354extern struct pv_apic_ops pv_apic_ops;
355extern struct pv_mmu_ops pv_mmu_ops;
356extern struct pv_lock_ops pv_lock_ops;
357
358#define PARAVIRT_PATCH(x) \
359 (offsetof(struct paravirt_patch_template, x) / sizeof(void *))
360
361#define paravirt_type(op) \
362 [paravirt_typenum] "i" (PARAVIRT_PATCH(op)), \
363 [paravirt_opptr] "m" (op)
364#define paravirt_clobber(clobber) \
365 [paravirt_clobber] "i" (clobber)
366
367/*
368 * Generate some code, and mark it as patchable by the
369 * apply_paravirt() alternate instruction patcher.
370 */
371#define _paravirt_alt(insn_string, type, clobber) \
372 "771:\n\t" insn_string "\n" "772:\n" \
373 ".pushsection .parainstructions,\"a\"\n" \
374 _ASM_ALIGN "\n" \
375 _ASM_PTR " 771b\n" \
376 " .byte " type "\n" \
377 " .byte 772b-771b\n" \
378 " .short " clobber "\n" \
379 ".popsection\n"
380
381/* Generate patchable code, with the default asm parameters. */
382#define paravirt_alt(insn_string) \
383 _paravirt_alt(insn_string, "%c[paravirt_typenum]", "%c[paravirt_clobber]")
384
385/* Simple instruction patching code. */
386#define DEF_NATIVE(ops, name, code) \
387 extern const char start_##ops##_##name[], end_##ops##_##name[]; \
388 asm("start_" #ops "_" #name ": " code "; end_" #ops "_" #name ":")
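/*
 * Illustrative sketch, not part of this header: this is the shape of a
 * DEF_NATIVE() use in the native patching code (see
 * arch/x86/kernel/paravirt_patch_32.c and _64.c), which records the raw
 * native instruction sequence for a hook.  native_patch() can then copy
 * start_pv_irq_ops_irq_disable..end_pv_irq_ops_irq_disable over the
 * indirect call site instead of leaving the call in place.
 */
#if 0	/* example only */
DEF_NATIVE(pv_irq_ops, irq_disable, "cli");
#endif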
389
390unsigned paravirt_patch_nop(void);
391unsigned paravirt_patch_ignore(unsigned len);
392unsigned paravirt_patch_call(void *insnbuf,
393 const void *target, u16 tgt_clobbers,
394 unsigned long addr, u16 site_clobbers,
395 unsigned len);
396unsigned paravirt_patch_jmp(void *insnbuf, const void *target,
397 unsigned long addr, unsigned len);
398unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf,
399 unsigned long addr, unsigned len);
400
401unsigned paravirt_patch_insns(void *insnbuf, unsigned len,
402 const char *start, const char *end);
403
404unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
405 unsigned long addr, unsigned len);
406
407int paravirt_disable_iospace(void);
408
409/*
410 * This generates an indirect call based on the operation type number.
411 * The type number, computed in PARAVIRT_PATCH, is derived from the
412 * offset into the paravirt_patch_template structure, and can therefore be
413 * freely converted back into a structure offset.
414 */
415#define PARAVIRT_CALL "call *%[paravirt_opptr];"
416
417/*
418 * These macros are intended to wrap calls through one of the paravirt
419 * ops structs, so that they can be later identified and patched at
420 * runtime.
421 *
422 * Normally, a call to a pv_op function is a simple indirect call:
423 * (pv_op_struct.operations)(args...).
424 *
425 * Unfortunately, this is a relatively slow operation for modern CPUs,
426 * because the CPU cannot necessarily predict what the destination
427 * address is. In this case, the address is a runtime constant, so at
428 * the very least we can patch the call to be a simple direct call, or
429 * ideally, patch an inline implementation into the callsite. (Direct
430 * calls are essentially free, because the call and return addresses
431 * are completely predictable.)
432 *
433 * For i386, these macros rely on the standard gcc "regparm(3)" calling
434 * convention, in which the first three arguments are placed in %eax,
435 * %edx, %ecx (in that order), and the remaining arguments are placed
436 * on the stack. All caller-save registers (eax,edx,ecx) are expected
437 * to be modified (either clobbered or used for return values).
438 * X86_64, on the other hand, already specifies a register-based calling
439 * convention, returning in %rax, with parameters passed in %rdi, %rsi,
440 * %rdx, and %rcx. Note that for this reason, x86_64 does not need any
441 * special handling for dealing with 4 arguments, unlike i386.
442 * However, x86_64 also has to clobber all caller-saved registers, which
443 * unfortunately are quite a few (r8 - r11).
444 *
445 * The call instruction itself is marked by placing its start address
446 * and size into the .parainstructions section, so that
447 * apply_paravirt() in arch/i386/kernel/alternative.c can do the
448 * appropriate patching under the control of the backend pv_init_ops
449 * implementation.
450 *
451 * Unfortunately there's no way to get gcc to generate the args setup
452 * for the call, and then allow the call itself to be generated by an
453 * inline asm. Because of this, we must do the complete arg setup and
454 * return value handling from within these macros. This is fairly
455 * cumbersome.
456 *
457 * There are 5 sets of PVOP_* macros for dealing with 0-4 arguments.
458 * It could be extended to more arguments, but there would be little
459 * to be gained from that. For each number of arguments, there are
460 * the two VCALL and CALL variants for void and non-void functions.
461 *
462 * When there is a return value, the invoker of the macro must specify
463 * the return type. The macro then uses sizeof() on that type to
464 * determine whether it's a 32 or 64 bit value, and places the return
465 * in the right register(s) (just %eax for 32-bit, and %edx:%eax for
466 * 64-bit). For x86_64 machines, it just returns in %rax regardless of
467 * the return value size.
468 *
469 * 64-bit arguments are passed as a pair of adjacent 32-bit arguments.
470 * On i386 such a pair is passed in low,high order, i.e. the low half
471 * occupies the first of the two argument slots.
472 *
473 * Small structures are passed and returned in registers. The macro
474 * calling convention can't directly deal with this, so the wrapper
475 * functions must do this.
476 *
477 * These PVOP_* macros are only defined within this header. This
478 * means that all uses must be wrapped in inline functions. This also
479 * makes sure the incoming and outgoing types are always correct.
480 */
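/*
 * Illustrative sketch, not part of this header: every PVOP_* use below is
 * wrapped in a thin inline like this one (compare the real write_cr3()
 * further down), so argument and return types are checked by the C
 * compiler rather than by the asm glue.
 */
#if 0	/* example only */
static inline void example_write_cr3(unsigned long x)
{
	PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
}
#endif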
481#ifdef CONFIG_X86_32
482#define PVOP_VCALL_ARGS unsigned long __eax, __edx, __ecx
483#define PVOP_CALL_ARGS PVOP_VCALL_ARGS
484#define PVOP_VCALL_CLOBBERS "=a" (__eax), "=d" (__edx), \
485 "=c" (__ecx)
486#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS
487#define EXTRA_CLOBBERS
488#define VEXTRA_CLOBBERS
489#else
490#define PVOP_VCALL_ARGS unsigned long __edi, __esi, __edx, __ecx
491#define PVOP_CALL_ARGS PVOP_VCALL_ARGS, __eax
492#define PVOP_VCALL_CLOBBERS "=D" (__edi), \
493 "=S" (__esi), "=d" (__edx), \
494 "=c" (__ecx)
495
496#define PVOP_CALL_CLOBBERS PVOP_VCALL_CLOBBERS, "=a" (__eax)
497
498#define EXTRA_CLOBBERS , "r8", "r9", "r10", "r11"
499#define VEXTRA_CLOBBERS , "rax", "r8", "r9", "r10", "r11"
500#endif
501
502#ifdef CONFIG_PARAVIRT_DEBUG
503#define PVOP_TEST_NULL(op) BUG_ON(op == NULL)
504#else
505#define PVOP_TEST_NULL(op) ((void)op)
506#endif
507
508#define __PVOP_CALL(rettype, op, pre, post, ...) \
509 ({ \
510 rettype __ret; \
511 PVOP_CALL_ARGS; \
512 PVOP_TEST_NULL(op); \
513 /* This is 32-bit specific, but is okay in 64-bit */ \
514 /* since this condition will never hold */ \
515 if (sizeof(rettype) > sizeof(unsigned long)) { \
516 asm volatile(pre \
517 paravirt_alt(PARAVIRT_CALL) \
518 post \
519 : PVOP_CALL_CLOBBERS \
520 : paravirt_type(op), \
521 paravirt_clobber(CLBR_ANY), \
522 ##__VA_ARGS__ \
523 : "memory", "cc" EXTRA_CLOBBERS); \
524 __ret = (rettype)((((u64)__edx) << 32) | __eax); \
525 } else { \
526 asm volatile(pre \
527 paravirt_alt(PARAVIRT_CALL) \
528 post \
529 : PVOP_CALL_CLOBBERS \
530 : paravirt_type(op), \
531 paravirt_clobber(CLBR_ANY), \
532 ##__VA_ARGS__ \
533 : "memory", "cc" EXTRA_CLOBBERS); \
534 __ret = (rettype)__eax; \
535 } \
536 __ret; \
537 })
538#define __PVOP_VCALL(op, pre, post, ...) \
539 ({ \
540 PVOP_VCALL_ARGS; \
541 PVOP_TEST_NULL(op); \
542 asm volatile(pre \
543 paravirt_alt(PARAVIRT_CALL) \
544 post \
545 : PVOP_VCALL_CLOBBERS \
546 : paravirt_type(op), \
547 paravirt_clobber(CLBR_ANY), \
548 ##__VA_ARGS__ \
549 : "memory", "cc" VEXTRA_CLOBBERS); \
550 })
551
552#define PVOP_CALL0(rettype, op) \
553 __PVOP_CALL(rettype, op, "", "")
554#define PVOP_VCALL0(op) \
555 __PVOP_VCALL(op, "", "")
556
557#define PVOP_CALL1(rettype, op, arg1) \
558 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)))
559#define PVOP_VCALL1(op, arg1) \
560 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)))
561
562#define PVOP_CALL2(rettype, op, arg1, arg2) \
563 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
564 "1" ((unsigned long)(arg2)))
565#define PVOP_VCALL2(op, arg1, arg2) \
566 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
567 "1" ((unsigned long)(arg2)))
568
569#define PVOP_CALL3(rettype, op, arg1, arg2, arg3) \
570 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
571 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
572#define PVOP_VCALL3(op, arg1, arg2, arg3) \
573 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
574 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)))
575
576/* The 4-argument case is the only one where x86_64 differs; there it is much simpler */
577#ifdef CONFIG_X86_32
578#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
579 __PVOP_CALL(rettype, op, \
580 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
581 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
582 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
583#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
584 __PVOP_VCALL(op, \
585 "push %[_arg4];", "lea 4(%%esp),%%esp;", \
586 "0" ((u32)(arg1)), "1" ((u32)(arg2)), \
587 "2" ((u32)(arg3)), [_arg4] "mr" ((u32)(arg4)))
588#else
589#define PVOP_CALL4(rettype, op, arg1, arg2, arg3, arg4) \
590 __PVOP_CALL(rettype, op, "", "", "0" ((unsigned long)(arg1)), \
591 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
592 "3"((unsigned long)(arg4)))
593#define PVOP_VCALL4(op, arg1, arg2, arg3, arg4) \
594 __PVOP_VCALL(op, "", "", "0" ((unsigned long)(arg1)), \
595 "1"((unsigned long)(arg2)), "2"((unsigned long)(arg3)), \
596 "3"((unsigned long)(arg4)))
597#endif
598
599static inline int paravirt_enabled(void)
600{
601 return pv_info.paravirt_enabled;
602}
603
604static inline void load_sp0(struct tss_struct *tss,
605 struct thread_struct *thread)
606{
607 PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
608}
609
610#define ARCH_SETUP pv_init_ops.arch_setup();
611static inline unsigned long get_wallclock(void)
612{
613 return PVOP_CALL0(unsigned long, pv_time_ops.get_wallclock);
614}
615
616static inline int set_wallclock(unsigned long nowtime)
617{
618 return PVOP_CALL1(int, pv_time_ops.set_wallclock, nowtime);
619}
620
621static inline void (*choose_time_init(void))(void)
622{
623 return pv_time_ops.time_init;
624}
625
626/* The paravirtualized CPUID instruction. */
627static inline void __cpuid(unsigned int *eax, unsigned int *ebx,
628 unsigned int *ecx, unsigned int *edx)
629{
630 PVOP_VCALL4(pv_cpu_ops.cpuid, eax, ebx, ecx, edx);
631}
632
633/*
634 * These special macros can be used to get or set a debugging register
635 */
636static inline unsigned long paravirt_get_debugreg(int reg)
637{
638 return PVOP_CALL1(unsigned long, pv_cpu_ops.get_debugreg, reg);
639}
640#define get_debugreg(var, reg) var = paravirt_get_debugreg(reg)
641static inline void set_debugreg(unsigned long val, int reg)
642{
643 PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val);
644}
645
646static inline void clts(void)
647{
648 PVOP_VCALL0(pv_cpu_ops.clts);
649}
650
651static inline unsigned long read_cr0(void)
652{
653 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0);
654}
655
656static inline void write_cr0(unsigned long x)
657{
658 PVOP_VCALL1(pv_cpu_ops.write_cr0, x);
659}
660
661static inline unsigned long read_cr2(void)
662{
663 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr2);
664}
665
666static inline void write_cr2(unsigned long x)
667{
668 PVOP_VCALL1(pv_mmu_ops.write_cr2, x);
669}
670
671static inline unsigned long read_cr3(void)
672{
673 return PVOP_CALL0(unsigned long, pv_mmu_ops.read_cr3);
674}
675
676static inline void write_cr3(unsigned long x)
677{
678 PVOP_VCALL1(pv_mmu_ops.write_cr3, x);
679}
680
681static inline unsigned long read_cr4(void)
682{
683 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4);
684}
685static inline unsigned long read_cr4_safe(void)
686{
687 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe);
688}
689
690static inline void write_cr4(unsigned long x)
691{
692 PVOP_VCALL1(pv_cpu_ops.write_cr4, x);
693}
694
695#ifdef CONFIG_X86_64
696static inline unsigned long read_cr8(void)
697{
698 return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr8);
699}
700
701static inline void write_cr8(unsigned long x)
702{
703 PVOP_VCALL1(pv_cpu_ops.write_cr8, x);
704}
705#endif
706
707static inline void raw_safe_halt(void)
708{
709 PVOP_VCALL0(pv_irq_ops.safe_halt);
710}
711
712static inline void halt(void)
713{
714	PVOP_VCALL0(pv_irq_ops.halt);
715}
716
717static inline void wbinvd(void)
718{
719 PVOP_VCALL0(pv_cpu_ops.wbinvd);
720}
721
722#define get_kernel_rpl() (pv_info.kernel_rpl)
723
724static inline u64 paravirt_read_msr(unsigned msr, int *err)
725{
726 return PVOP_CALL2(u64, pv_cpu_ops.read_msr, msr, err);
727}
728static inline u64 paravirt_read_msr_amd(unsigned msr, int *err)
729{
730 return PVOP_CALL2(u64, pv_cpu_ops.read_msr_amd, msr, err);
731}
732static inline int paravirt_write_msr(unsigned msr, unsigned low, unsigned high)
733{
734 return PVOP_CALL3(int, pv_cpu_ops.write_msr, msr, low, high);
735}
736
737/* These should all do BUG_ON(_err), but our headers are too tangled. */
738#define rdmsr(msr, val1, val2) \
739do { \
740 int _err; \
741 u64 _l = paravirt_read_msr(msr, &_err); \
742 val1 = (u32)_l; \
743 val2 = _l >> 32; \
744} while (0)
745
746#define wrmsr(msr, val1, val2) \
747do { \
748 paravirt_write_msr(msr, val1, val2); \
749} while (0)
750
751#define rdmsrl(msr, val) \
752do { \
753 int _err; \
754 val = paravirt_read_msr(msr, &_err); \
755} while (0)
756
757#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
758#define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b)
759
760/* rdmsr with exception handling */
761#define rdmsr_safe(msr, a, b) \
762({ \
763 int _err; \
764 u64 _l = paravirt_read_msr(msr, &_err); \
765 (*a) = (u32)_l; \
766 (*b) = _l >> 32; \
767 _err; \
768})
769
770static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
771{
772 int err;
773
774 *p = paravirt_read_msr(msr, &err);
775 return err;
776}
777static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p)
778{
779 int err;
780
781 *p = paravirt_read_msr_amd(msr, &err);
782 return err;
783}
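/*
 * Illustrative sketch, not part of this header: these wrappers keep the
 * same names as the native msr.h versions, so callers are unchanged.
 * The MSR index below (0xc0000080, EFER) is only an example value.
 */
#if 0	/* example only */
static inline int example_read_efer(unsigned long long *val)
{
	return rdmsrl_safe(0xc0000080, val);	/* 0 on success, -EFAULT on #GP */
}
#endif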
784
785static inline u64 paravirt_read_tsc(void)
786{
787 return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
788}
789
790#define rdtscl(low) \
791do { \
792 u64 _l = paravirt_read_tsc(); \
793 low = (int)_l; \
794} while (0)
795
796#define rdtscll(val) (val = paravirt_read_tsc())
797
798static inline unsigned long long paravirt_sched_clock(void)
799{
800 return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
801}
802#define calibrate_tsc() (pv_time_ops.get_tsc_khz())
803
804static inline unsigned long long paravirt_read_pmc(int counter)
805{
806 return PVOP_CALL1(u64, pv_cpu_ops.read_pmc, counter);
807}
808
809#define rdpmc(counter, low, high) \
810do { \
811 u64 _l = paravirt_read_pmc(counter); \
812 low = (u32)_l; \
813 high = _l >> 32; \
814} while (0)
815
816static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
817{
818 return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
819}
820
821#define rdtscp(low, high, aux) \
822do { \
823 int __aux; \
824 unsigned long __val = paravirt_rdtscp(&__aux); \
825 (low) = (u32)__val; \
826 (high) = (u32)(__val >> 32); \
827 (aux) = __aux; \
828} while (0)
829
830#define rdtscpll(val, aux) \
831do { \
832 unsigned long __aux; \
833 val = paravirt_rdtscp(&__aux); \
834 (aux) = __aux; \
835} while (0)
836
837static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
838{
839 PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
840}
841
842static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
843{
844 PVOP_VCALL2(pv_cpu_ops.free_ldt, ldt, entries);
845}
846
847static inline void load_TR_desc(void)
848{
849 PVOP_VCALL0(pv_cpu_ops.load_tr_desc);
850}
851static inline void load_gdt(const struct desc_ptr *dtr)
852{
853 PVOP_VCALL1(pv_cpu_ops.load_gdt, dtr);
854}
855static inline void load_idt(const struct desc_ptr *dtr)
856{
857 PVOP_VCALL1(pv_cpu_ops.load_idt, dtr);
858}
859static inline void set_ldt(const void *addr, unsigned entries)
860{
861 PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
862}
863static inline void store_gdt(struct desc_ptr *dtr)
864{
865 PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
866}
867static inline void store_idt(struct desc_ptr *dtr)
868{
869 PVOP_VCALL1(pv_cpu_ops.store_idt, dtr);
870}
871static inline unsigned long paravirt_store_tr(void)
872{
873 return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr);
874}
875#define store_tr(tr) ((tr) = paravirt_store_tr())
876static inline void load_TLS(struct thread_struct *t, unsigned cpu)
877{
878 PVOP_VCALL2(pv_cpu_ops.load_tls, t, cpu);
879}
880
881#ifdef CONFIG_X86_64
882static inline void load_gs_index(unsigned int gs)
883{
884 PVOP_VCALL1(pv_cpu_ops.load_gs_index, gs);
885}
886#endif
887
888static inline void write_ldt_entry(struct desc_struct *dt, int entry,
889 const void *desc)
890{
891 PVOP_VCALL3(pv_cpu_ops.write_ldt_entry, dt, entry, desc);
892}
893
894static inline void write_gdt_entry(struct desc_struct *dt, int entry,
895 void *desc, int type)
896{
897 PVOP_VCALL4(pv_cpu_ops.write_gdt_entry, dt, entry, desc, type);
898}
899
900static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
901{
902 PVOP_VCALL3(pv_cpu_ops.write_idt_entry, dt, entry, g);
903}
904static inline void set_iopl_mask(unsigned mask)
905{
906 PVOP_VCALL1(pv_cpu_ops.set_iopl_mask, mask);
907}
908
909/* The paravirtualized I/O functions */
910static inline void slow_down_io(void)
911{
912 pv_cpu_ops.io_delay();
913#ifdef REALLY_SLOW_IO
914 pv_cpu_ops.io_delay();
915 pv_cpu_ops.io_delay();
916 pv_cpu_ops.io_delay();
917#endif
918}
919
920#ifdef CONFIG_X86_LOCAL_APIC
921static inline void setup_boot_clock(void)
922{
923 PVOP_VCALL0(pv_apic_ops.setup_boot_clock);
924}
925
926static inline void setup_secondary_clock(void)
927{
928 PVOP_VCALL0(pv_apic_ops.setup_secondary_clock);
929}
930#endif
931
932static inline void paravirt_post_allocator_init(void)
933{
934 if (pv_init_ops.post_allocator_init)
935 (*pv_init_ops.post_allocator_init)();
936}
937
938static inline void paravirt_pagetable_setup_start(pgd_t *base)
939{
940 (*pv_mmu_ops.pagetable_setup_start)(base);
941}
942
943static inline void paravirt_pagetable_setup_done(pgd_t *base)
944{
945 (*pv_mmu_ops.pagetable_setup_done)(base);
946}
947
948#ifdef CONFIG_SMP
949static inline void startup_ipi_hook(int phys_apicid, unsigned long start_eip,
950 unsigned long start_esp)
951{
952 PVOP_VCALL3(pv_apic_ops.startup_ipi_hook,
953 phys_apicid, start_eip, start_esp);
954}
955#endif
956
957static inline void paravirt_activate_mm(struct mm_struct *prev,
958 struct mm_struct *next)
959{
960 PVOP_VCALL2(pv_mmu_ops.activate_mm, prev, next);
961}
962
963static inline void arch_dup_mmap(struct mm_struct *oldmm,
964 struct mm_struct *mm)
965{
966 PVOP_VCALL2(pv_mmu_ops.dup_mmap, oldmm, mm);
967}
968
969static inline void arch_exit_mmap(struct mm_struct *mm)
970{
971 PVOP_VCALL1(pv_mmu_ops.exit_mmap, mm);
972}
973
974static inline void __flush_tlb(void)
975{
976 PVOP_VCALL0(pv_mmu_ops.flush_tlb_user);
977}
978static inline void __flush_tlb_global(void)
979{
980 PVOP_VCALL0(pv_mmu_ops.flush_tlb_kernel);
981}
982static inline void __flush_tlb_single(unsigned long addr)
983{
984 PVOP_VCALL1(pv_mmu_ops.flush_tlb_single, addr);
985}
986
987static inline void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm,
988 unsigned long va)
989{
990 PVOP_VCALL3(pv_mmu_ops.flush_tlb_others, &cpumask, mm, va);
991}
992
993static inline int paravirt_pgd_alloc(struct mm_struct *mm)
994{
995 return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
996}
997
998static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd)
999{
1000 PVOP_VCALL2(pv_mmu_ops.pgd_free, mm, pgd);
1001}
1002
1003static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn)
1004{
1005 PVOP_VCALL2(pv_mmu_ops.alloc_pte, mm, pfn);
1006}
1007static inline void paravirt_release_pte(unsigned long pfn)
1008{
1009 PVOP_VCALL1(pv_mmu_ops.release_pte, pfn);
1010}
1011
1012static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
1013{
1014 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
1015}
1016
1017static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
1018 unsigned long start, unsigned long count)
1019{
1020 PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
1021}
1022static inline void paravirt_release_pmd(unsigned long pfn)
1023{
1024 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
1025}
1026
1027static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn)
1028{
1029 PVOP_VCALL2(pv_mmu_ops.alloc_pud, mm, pfn);
1030}
1031static inline void paravirt_release_pud(unsigned long pfn)
1032{
1033 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
1034}
1035
1036#ifdef CONFIG_HIGHPTE
1037static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
1038{
1039 unsigned long ret;
1040 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
1041 return (void *)ret;
1042}
1043#endif
1044
1045static inline void pte_update(struct mm_struct *mm, unsigned long addr,
1046 pte_t *ptep)
1047{
1048 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
1049}
1050
1051static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
1052 pte_t *ptep)
1053{
1054 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
1055}
1056
1057static inline pte_t __pte(pteval_t val)
1058{
1059 pteval_t ret;
1060
1061 if (sizeof(pteval_t) > sizeof(long))
1062 ret = PVOP_CALL2(pteval_t,
1063 pv_mmu_ops.make_pte,
1064 val, (u64)val >> 32);
1065 else
1066 ret = PVOP_CALL1(pteval_t,
1067 pv_mmu_ops.make_pte,
1068 val);
1069
1070 return (pte_t) { .pte = ret };
1071}
1072
1073static inline pteval_t pte_val(pte_t pte)
1074{
1075 pteval_t ret;
1076
1077 if (sizeof(pteval_t) > sizeof(long))
1078 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_val,
1079 pte.pte, (u64)pte.pte >> 32);
1080 else
1081 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_val,
1082 pte.pte);
1083
1084 return ret;
1085}
1086
1087static inline pteval_t pte_flags(pte_t pte)
1088{
1089 pteval_t ret;
1090
1091 if (sizeof(pteval_t) > sizeof(long))
1092 ret = PVOP_CALL2(pteval_t, pv_mmu_ops.pte_flags,
1093 pte.pte, (u64)pte.pte >> 32);
1094 else
1095 ret = PVOP_CALL1(pteval_t, pv_mmu_ops.pte_flags,
1096 pte.pte);
1097
1098#ifdef CONFIG_PARAVIRT_DEBUG
1099 BUG_ON(ret & PTE_PFN_MASK);
1100#endif
1101 return ret;
1102}
1103
1104static inline pgd_t __pgd(pgdval_t val)
1105{
1106 pgdval_t ret;
1107
1108 if (sizeof(pgdval_t) > sizeof(long))
1109 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.make_pgd,
1110 val, (u64)val >> 32);
1111 else
1112 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.make_pgd,
1113 val);
1114
1115 return (pgd_t) { ret };
1116}
1117
1118static inline pgdval_t pgd_val(pgd_t pgd)
1119{
1120 pgdval_t ret;
1121
1122 if (sizeof(pgdval_t) > sizeof(long))
1123 ret = PVOP_CALL2(pgdval_t, pv_mmu_ops.pgd_val,
1124 pgd.pgd, (u64)pgd.pgd >> 32);
1125 else
1126 ret = PVOP_CALL1(pgdval_t, pv_mmu_ops.pgd_val,
1127 pgd.pgd);
1128
1129 return ret;
1130}
1131
1132#define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION
1133static inline pte_t ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr,
1134 pte_t *ptep)
1135{
1136 pteval_t ret;
1137
1138 ret = PVOP_CALL3(pteval_t, pv_mmu_ops.ptep_modify_prot_start,
1139 mm, addr, ptep);
1140
1141 return (pte_t) { .pte = ret };
1142}
1143
1144static inline void ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
1145 pte_t *ptep, pte_t pte)
1146{
1147 if (sizeof(pteval_t) > sizeof(long))
1148 /* 5 arg words */
1149 pv_mmu_ops.ptep_modify_prot_commit(mm, addr, ptep, pte);
1150 else
1151 PVOP_VCALL4(pv_mmu_ops.ptep_modify_prot_commit,
1152 mm, addr, ptep, pte.pte);
1153}
1154
1155static inline void set_pte(pte_t *ptep, pte_t pte)
1156{
1157 if (sizeof(pteval_t) > sizeof(long))
1158 PVOP_VCALL3(pv_mmu_ops.set_pte, ptep,
1159 pte.pte, (u64)pte.pte >> 32);
1160 else
1161 PVOP_VCALL2(pv_mmu_ops.set_pte, ptep,
1162 pte.pte);
1163}
1164
1165static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
1166 pte_t *ptep, pte_t pte)
1167{
1168 if (sizeof(pteval_t) > sizeof(long))
1169 /* 5 arg words */
1170 pv_mmu_ops.set_pte_at(mm, addr, ptep, pte);
1171 else
1172 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
1173}
1174
1175static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
1176{
1177 pmdval_t val = native_pmd_val(pmd);
1178
1179 if (sizeof(pmdval_t) > sizeof(long))
1180 PVOP_VCALL3(pv_mmu_ops.set_pmd, pmdp, val, (u64)val >> 32);
1181 else
1182 PVOP_VCALL2(pv_mmu_ops.set_pmd, pmdp, val);
1183}
1184
1185#if PAGETABLE_LEVELS >= 3
1186static inline pmd_t __pmd(pmdval_t val)
1187{
1188 pmdval_t ret;
1189
1190 if (sizeof(pmdval_t) > sizeof(long))
1191 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.make_pmd,
1192 val, (u64)val >> 32);
1193 else
1194 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.make_pmd,
1195 val);
1196
1197 return (pmd_t) { ret };
1198}
1199
1200static inline pmdval_t pmd_val(pmd_t pmd)
1201{
1202 pmdval_t ret;
1203
1204 if (sizeof(pmdval_t) > sizeof(long))
1205 ret = PVOP_CALL2(pmdval_t, pv_mmu_ops.pmd_val,
1206 pmd.pmd, (u64)pmd.pmd >> 32);
1207 else
1208 ret = PVOP_CALL1(pmdval_t, pv_mmu_ops.pmd_val,
1209 pmd.pmd);
1210
1211 return ret;
1212}
1213
1214static inline void set_pud(pud_t *pudp, pud_t pud)
1215{
1216 pudval_t val = native_pud_val(pud);
1217
1218 if (sizeof(pudval_t) > sizeof(long))
1219 PVOP_VCALL3(pv_mmu_ops.set_pud, pudp,
1220 val, (u64)val >> 32);
1221 else
1222 PVOP_VCALL2(pv_mmu_ops.set_pud, pudp,
1223 val);
1224}
1225#if PAGETABLE_LEVELS == 4
1226static inline pud_t __pud(pudval_t val)
1227{
1228 pudval_t ret;
1229
1230 if (sizeof(pudval_t) > sizeof(long))
1231 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.make_pud,
1232 val, (u64)val >> 32);
1233 else
1234 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.make_pud,
1235 val);
1236
1237 return (pud_t) { ret };
1238}
1239
1240static inline pudval_t pud_val(pud_t pud)
1241{
1242 pudval_t ret;
1243
1244 if (sizeof(pudval_t) > sizeof(long))
1245 ret = PVOP_CALL2(pudval_t, pv_mmu_ops.pud_val,
1246 pud.pud, (u64)pud.pud >> 32);
1247 else
1248 ret = PVOP_CALL1(pudval_t, pv_mmu_ops.pud_val,
1249 pud.pud);
1250
1251 return ret;
1252}
1253
1254static inline void set_pgd(pgd_t *pgdp, pgd_t pgd)
1255{
1256 pgdval_t val = native_pgd_val(pgd);
1257
1258 if (sizeof(pgdval_t) > sizeof(long))
1259 PVOP_VCALL3(pv_mmu_ops.set_pgd, pgdp,
1260 val, (u64)val >> 32);
1261 else
1262 PVOP_VCALL2(pv_mmu_ops.set_pgd, pgdp,
1263 val);
1264}
1265
1266static inline void pgd_clear(pgd_t *pgdp)
1267{
1268 set_pgd(pgdp, __pgd(0));
1269}
1270
1271static inline void pud_clear(pud_t *pudp)
1272{
1273 set_pud(pudp, __pud(0));
1274}
1275
1276#endif /* PAGETABLE_LEVELS == 4 */
1277
1278#endif /* PAGETABLE_LEVELS >= 3 */
1279
1280#ifdef CONFIG_X86_PAE
1281/* Special-case pte-setting operations for PAE, which can't update a
1282 64-bit pte atomically */
1283static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1284{
1285 PVOP_VCALL3(pv_mmu_ops.set_pte_atomic, ptep,
1286 pte.pte, pte.pte >> 32);
1287}
1288
1289static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1290 pte_t *ptep, pte_t pte)
1291{
1292 /* 5 arg words */
1293 pv_mmu_ops.set_pte_present(mm, addr, ptep, pte);
1294}
1295
1296static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1297 pte_t *ptep)
1298{
1299 PVOP_VCALL3(pv_mmu_ops.pte_clear, mm, addr, ptep);
1300}
1301
1302static inline void pmd_clear(pmd_t *pmdp)
1303{
1304 PVOP_VCALL1(pv_mmu_ops.pmd_clear, pmdp);
1305}
1306#else /* !CONFIG_X86_PAE */
1307static inline void set_pte_atomic(pte_t *ptep, pte_t pte)
1308{
1309 set_pte(ptep, pte);
1310}
1311
1312static inline void set_pte_present(struct mm_struct *mm, unsigned long addr,
1313 pte_t *ptep, pte_t pte)
1314{
1315 set_pte(ptep, pte);
1316}
1317
1318static inline void pte_clear(struct mm_struct *mm, unsigned long addr,
1319 pte_t *ptep)
1320{
1321 set_pte_at(mm, addr, ptep, __pte(0));
1322}
1323
1324static inline void pmd_clear(pmd_t *pmdp)
1325{
1326 set_pmd(pmdp, __pmd(0));
1327}
1328#endif /* CONFIG_X86_PAE */
1329
1330/* Lazy mode for batching updates / context switch */
1331enum paravirt_lazy_mode {
1332 PARAVIRT_LAZY_NONE,
1333 PARAVIRT_LAZY_MMU,
1334 PARAVIRT_LAZY_CPU,
1335};
1336
1337enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
1338void paravirt_enter_lazy_cpu(void);
1339void paravirt_leave_lazy_cpu(void);
1340void paravirt_enter_lazy_mmu(void);
1341void paravirt_leave_lazy_mmu(void);
1342void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
1343
1344#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE
1345static inline void arch_enter_lazy_cpu_mode(void)
1346{
1347 PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
1348}
1349
1350static inline void arch_leave_lazy_cpu_mode(void)
1351{
1352 PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
1353}
1354
1355static inline void arch_flush_lazy_cpu_mode(void)
1356{
1357 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU)) {
1358 arch_leave_lazy_cpu_mode();
1359 arch_enter_lazy_cpu_mode();
1360 }
1361}
1362
1363
1364#define __HAVE_ARCH_ENTER_LAZY_MMU_MODE
1365static inline void arch_enter_lazy_mmu_mode(void)
1366{
1367 PVOP_VCALL0(pv_mmu_ops.lazy_mode.enter);
1368}
1369
1370static inline void arch_leave_lazy_mmu_mode(void)
1371{
1372 PVOP_VCALL0(pv_mmu_ops.lazy_mode.leave);
1373}
1374
1375static inline void arch_flush_lazy_mmu_mode(void)
1376{
1377 if (unlikely(paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU)) {
1378 arch_leave_lazy_mmu_mode();
1379 arch_enter_lazy_mmu_mode();
1380 }
1381}
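/*
 * Illustrative sketch, not part of this header: generic mm code brackets
 * batches of page-table updates with the lazy-mode hooks, so a backend
 * can queue the individual set_pte_at() calls and issue them as one
 * hypercall when the batch is flushed.  This function only shows the
 * calling pattern.
 */
#if 0	/* example only */
static void example_set_range(struct mm_struct *mm, unsigned long addr,
			      pte_t *ptep, pte_t pte, int n)
{
	int i;

	arch_enter_lazy_mmu_mode();
	for (i = 0; i < n; i++)
		set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, pte);
	arch_leave_lazy_mmu_mode();
}
#endif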
1382
1383static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
1384 unsigned long phys, pgprot_t flags)
1385{
1386 pv_mmu_ops.set_fixmap(idx, phys, flags);
1387}
1388
1389void _paravirt_nop(void);
1390#define paravirt_nop ((void *)_paravirt_nop)
1391
1392void paravirt_use_bytelocks(void);
1393
1394#ifdef CONFIG_SMP
1395
1396static inline int __raw_spin_is_locked(struct raw_spinlock *lock)
1397{
1398 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
1399}
1400
1401static inline int __raw_spin_is_contended(struct raw_spinlock *lock)
1402{
1403 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
1404}
1405
1406static __always_inline void __raw_spin_lock(struct raw_spinlock *lock)
1407{
1408 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
1409}
1410
1411static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock,
1412 unsigned long flags)
1413{
1414 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
1415}
1416
1417static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock)
1418{
1419 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
1420}
1421
1422static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock)
1423{
1424 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
1425}
1426
1427#endif
1428
1429/* These all sit in the .parainstructions section to tell us what to patch. */
1430struct paravirt_patch_site {
1431 u8 *instr; /* original instructions */
1432 u8 instrtype; /* type of this instruction */
1433 u8 len; /* length of original instruction */
1434 u16 clobbers; /* what registers you may clobber */
1435};
1436
1437extern struct paravirt_patch_site __parainstructions[],
1438 __parainstructions_end[];
1439
1440#ifdef CONFIG_X86_32
1441#define PV_SAVE_REGS "pushl %%ecx; pushl %%edx;"
1442#define PV_RESTORE_REGS "popl %%edx; popl %%ecx"
1443#define PV_FLAGS_ARG "0"
1444#define PV_EXTRA_CLOBBERS
1445#define PV_VEXTRA_CLOBBERS
1446#else
1447/* We save some registers, but not all of them; saving all would be too much.
1448 * We clobber all caller-saved registers except the argument register. */
1449#define PV_SAVE_REGS "pushq %%rdi;"
1450#define PV_RESTORE_REGS "popq %%rdi;"
1451#define PV_EXTRA_CLOBBERS EXTRA_CLOBBERS, "rcx" , "rdx", "rsi"
1452#define PV_VEXTRA_CLOBBERS EXTRA_CLOBBERS, "rdi", "rcx" , "rdx", "rsi"
1453#define PV_FLAGS_ARG "D"
1454#endif
1455
1456static inline unsigned long __raw_local_save_flags(void)
1457{
1458 unsigned long f;
1459
1460 asm volatile(paravirt_alt(PV_SAVE_REGS
1461 PARAVIRT_CALL
1462 PV_RESTORE_REGS)
1463 : "=a"(f)
1464 : paravirt_type(pv_irq_ops.save_fl),
1465 paravirt_clobber(CLBR_EAX)
1466 : "memory", "cc" PV_VEXTRA_CLOBBERS);
1467 return f;
1468}
1469
1470static inline void raw_local_irq_restore(unsigned long f)
1471{
1472 asm volatile(paravirt_alt(PV_SAVE_REGS
1473 PARAVIRT_CALL
1474 PV_RESTORE_REGS)
1475 : "=a"(f)
1476 : PV_FLAGS_ARG(f),
1477 paravirt_type(pv_irq_ops.restore_fl),
1478 paravirt_clobber(CLBR_EAX)
1479 : "memory", "cc" PV_EXTRA_CLOBBERS);
1480}
1481
1482static inline void raw_local_irq_disable(void)
1483{
1484 asm volatile(paravirt_alt(PV_SAVE_REGS
1485 PARAVIRT_CALL
1486 PV_RESTORE_REGS)
1487 :
1488 : paravirt_type(pv_irq_ops.irq_disable),
1489 paravirt_clobber(CLBR_EAX)
1490 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1491}
1492
1493static inline void raw_local_irq_enable(void)
1494{
1495 asm volatile(paravirt_alt(PV_SAVE_REGS
1496 PARAVIRT_CALL
1497 PV_RESTORE_REGS)
1498 :
1499 : paravirt_type(pv_irq_ops.irq_enable),
1500 paravirt_clobber(CLBR_EAX)
1501 : "memory", "eax", "cc" PV_EXTRA_CLOBBERS);
1502}
1503
1504static inline unsigned long __raw_local_irq_save(void)
1505{
1506 unsigned long f;
1507
1508 f = __raw_local_save_flags();
1509 raw_local_irq_disable();
1510 return f;
1511}
1512
1513
1514/* Make sure as little as possible of this mess escapes. */
1515#undef PARAVIRT_CALL
1516#undef __PVOP_CALL
1517#undef __PVOP_VCALL
1518#undef PVOP_VCALL0
1519#undef PVOP_CALL0
1520#undef PVOP_VCALL1
1521#undef PVOP_CALL1
1522#undef PVOP_VCALL2
1523#undef PVOP_CALL2
1524#undef PVOP_VCALL3
1525#undef PVOP_CALL3
1526#undef PVOP_VCALL4
1527#undef PVOP_CALL4
1528
1529#else /* __ASSEMBLY__ */
1530
1531#define _PVSITE(ptype, clobbers, ops, word, algn) \
1532771:; \
1533 ops; \
1534772:; \
1535 .pushsection .parainstructions,"a"; \
1536 .align algn; \
1537 word 771b; \
1538 .byte ptype; \
1539 .byte 772b-771b; \
1540 .short clobbers; \
1541 .popsection
1542
1543
1544#ifdef CONFIG_X86_64
1545#define PV_SAVE_REGS \
1546 push %rax; \
1547 push %rcx; \
1548 push %rdx; \
1549 push %rsi; \
1550 push %rdi; \
1551 push %r8; \
1552 push %r9; \
1553 push %r10; \
1554 push %r11
1555#define PV_RESTORE_REGS \
1556 pop %r11; \
1557 pop %r10; \
1558 pop %r9; \
1559 pop %r8; \
1560 pop %rdi; \
1561 pop %rsi; \
1562 pop %rdx; \
1563 pop %rcx; \
1564 pop %rax
1565#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 8)
1566#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .quad, 8)
1567#define PARA_INDIRECT(addr) *addr(%rip)
1568#else
1569#define PV_SAVE_REGS pushl %eax; pushl %edi; pushl %ecx; pushl %edx
1570#define PV_RESTORE_REGS popl %edx; popl %ecx; popl %edi; popl %eax
1571#define PARA_PATCH(struct, off) ((PARAVIRT_PATCH_##struct + (off)) / 4)
1572#define PARA_SITE(ptype, clobbers, ops) _PVSITE(ptype, clobbers, ops, .long, 4)
1573#define PARA_INDIRECT(addr) *%cs:addr
1574#endif
1575
1576#define INTERRUPT_RETURN \
1577 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \
1578 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret))
1579
1580#define DISABLE_INTERRUPTS(clobbers) \
1581 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \
1582 PV_SAVE_REGS; \
1583 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \
1584 PV_RESTORE_REGS;) \
1585
1586#define ENABLE_INTERRUPTS(clobbers) \
1587 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \
1588 PV_SAVE_REGS; \
1589 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \
1590 PV_RESTORE_REGS;)
1591
1592#define USERGS_SYSRET32 \
1593 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \
1594 CLBR_NONE, \
1595 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32))
1596
1597#ifdef CONFIG_X86_32
1598#define GET_CR0_INTO_EAX \
1599 push %ecx; push %edx; \
1600 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \
1601 pop %edx; pop %ecx
1602
1603#define ENABLE_INTERRUPTS_SYSEXIT \
1604 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
1605 CLBR_NONE, \
1606 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
1607
1608
1609#else /* !CONFIG_X86_32 */
1610
1611/*
1612 * If swapgs is used while the userspace stack is still current,
1613 * there's no way to call a pvop. The PV replacement *must* be
1614 * inlined, or the swapgs instruction must be trapped and emulated.
1615 */
1616#define SWAPGS_UNSAFE_STACK \
1617 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1618 swapgs)
1619
1620#define SWAPGS \
1621 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \
1622 PV_SAVE_REGS; \
1623 call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \
1624 PV_RESTORE_REGS \
1625 )
1626
1627#define GET_CR2_INTO_RCX \
1628 call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); \
1629 movq %rax, %rcx; \
1630 xorq %rax, %rax;
1631
1632#define PARAVIRT_ADJUST_EXCEPTION_FRAME \
1633 PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \
1634 CLBR_NONE, \
1635 call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame))
1636
1637#define USERGS_SYSRET64 \
1638 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \
1639 CLBR_NONE, \
1640 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64))
1641
1642#define ENABLE_INTERRUPTS_SYSEXIT32 \
1643 PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \
1644 CLBR_NONE, \
1645 jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit))
1646#endif /* CONFIG_X86_32 */
1647
1648#endif /* __ASSEMBLY__ */
1649#endif /* CONFIG_PARAVIRT */
1650#endif /* _ASM_X86_PARAVIRT_H */
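Everything declared above defaults to the native_* implementations filled in by arch/x86/kernel/paravirt.c; a hypervisor backend overrides only the hooks it cares about during early boot. A rough sketch of that pattern, with made-up my_hv_* names standing in for a real backend such as Xen or lguest:

#include <linux/init.h>
#include <asm/paravirt.h>

/* Hypothetical backend functions; only the assignment pattern is real. */
static void my_hv_safe_halt(void)
{
	/* e.g. a "block until next event" hypercall instead of sti; hlt */
}

static unsigned long my_hv_save_fl(void)
{
	return 0;	/* e.g. read a per-vcpu virtual interrupt-enable flag */
}

static void __init my_hv_init(void)
{
	pv_info.name = "my_hv";
	pv_info.paravirt_enabled = 1;

	pv_irq_ops.save_fl = my_hv_save_fl;
	pv_irq_ops.safe_halt = my_hv_safe_halt;
	/* hooks that are not touched keep their native_* defaults */
}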
diff --git a/arch/x86/include/asm/parport.h b/arch/x86/include/asm/parport.h
new file mode 100644
index 000000000000..3c4ffeb467e9
--- /dev/null
+++ b/arch/x86/include/asm/parport.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_PARPORT_H
2#define _ASM_X86_PARPORT_H
3
4static int __devinit parport_pc_find_isa_ports(int autoirq, int autodma);
5static int __devinit parport_pc_find_nonpci_ports(int autoirq, int autodma)
6{
7 return parport_pc_find_isa_ports(autoirq, autodma);
8}
9
10#endif /* _ASM_X86_PARPORT_H */
diff --git a/arch/x86/include/asm/pat.h b/arch/x86/include/asm/pat.h
new file mode 100644
index 000000000000..b8493b3b9890
--- /dev/null
+++ b/arch/x86/include/asm/pat.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_PAT_H
2#define _ASM_X86_PAT_H
3
4#include <linux/types.h>
5
6#ifdef CONFIG_X86_PAT
7extern int pat_enabled;
8extern void validate_pat_support(struct cpuinfo_x86 *c);
9#else
10static const int pat_enabled;
11static inline void validate_pat_support(struct cpuinfo_x86 *c) { }
12#endif
13
14extern void pat_init(void);
15
16extern int reserve_memtype(u64 start, u64 end,
17 unsigned long req_type, unsigned long *ret_type);
18extern int free_memtype(u64 start, u64 end);
19
20extern void pat_disable(char *reason);
21
22#endif /* _ASM_X86_PAT_H */
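reserve_memtype()/free_memtype() are the bookkeeping half of PAT: each physical range is handed out with at most one caching attribute at a time, and a request may come back with a different type than asked for (returned through ret_type). A rough sketch of the calling pattern; the req_type value and error handling are left abstract, and the actual cache-type constants live in the pgtable headers:

#include <linux/errno.h>
#include <asm/pat.h>

/* Illustrative only: claim [start, end) with the requested cache type. */
static int example_claim_range(u64 start, u64 end, unsigned long req_type)
{
	unsigned long got_type;
	int ret;

	ret = reserve_memtype(start, end, req_type, &got_type);
	if (ret)
		return ret;		/* conflicting reservation already exists */

	if (got_type != req_type) {
		/* PAT gave us a different type; either use it or back out. */
		free_memtype(start, end);
		return -EINVAL;
	}
	return 0;
}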
diff --git a/arch/x86/include/asm/pci-direct.h b/arch/x86/include/asm/pci-direct.h
new file mode 100644
index 000000000000..b1e7a45d868a
--- /dev/null
+++ b/arch/x86/include/asm/pci-direct.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_PCI_DIRECT_H
2#define _ASM_X86_PCI_DIRECT_H
3
4#include <linux/types.h>
5
6/* Direct PCI access. This is used for PCI accesses in early boot before
7 the PCI subsystem works. */
8
9extern u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset);
10extern u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset);
11extern u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset);
12extern void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val);
13extern void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val);
14extern void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val);
15
16extern int early_pci_allowed(void);
17
18extern unsigned int pci_early_dump_regs;
19extern void early_dump_pci_device(u8 bus, u8 slot, u8 func);
20extern void early_dump_pci_devices(void);
21#endif /* _ASM_X86_PCI_DIRECT_H */
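read_pci_config() is a raw type-1 (CF8/CFC) dword read, which is all the early quirk code has before the PCI core is up. The usual pattern is to walk bus 0 and look at the vendor/device pair at config offset 0, where an all-ones value means the slot is empty; the quirk body below is a placeholder:

#include <linux/init.h>
#include <linux/pci_ids.h>	/* PCI_VENDOR_ID_* */
#include <asm/pci-direct.h>

/* Illustrative only: early-boot walk of bus 0. */
static void __init example_scan_bus0(void)
{
	int slot;

	for (slot = 0; slot < 32; slot++) {
		u32 id = read_pci_config(0, slot, 0, 0);  /* device << 16 | vendor */

		if (id == 0xffffffff || id == 0x00000000)
			continue;			  /* empty slot */

		if ((id & 0xffff) == PCI_VENDOR_ID_INTEL) {
			/* apply an early quirk here */
		}
	}
}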
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
new file mode 100644
index 000000000000..647781298e7e
--- /dev/null
+++ b/arch/x86/include/asm/pci.h
@@ -0,0 +1,116 @@
1#ifndef _ASM_X86_PCI_H
2#define _ASM_X86_PCI_H
3
4#include <linux/mm.h> /* for struct page */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <asm/scatterlist.h>
9#include <asm/io.h>
10
11#ifdef __KERNEL__
12
13struct pci_sysdata {
14 int domain; /* PCI domain */
15 int node; /* NUMA node */
16#ifdef CONFIG_X86_64
17 void *iommu; /* IOMMU private data */
18#endif
19};
20
21extern int pci_routeirq;
22extern int noioapicquirk;
23extern int noioapicreroute;
24
25/* scan a bus after allocating a pci_sysdata for it */
26extern struct pci_bus *pci_scan_bus_on_node(int busno, struct pci_ops *ops,
27 int node);
28extern struct pci_bus *pci_scan_bus_with_sysdata(int busno);
29
30static inline int pci_domain_nr(struct pci_bus *bus)
31{
32 struct pci_sysdata *sd = bus->sysdata;
33 return sd->domain;
34}
35
36static inline int pci_proc_domain(struct pci_bus *bus)
37{
38 return pci_domain_nr(bus);
39}
40
41
42/* Can be used to override the logic in pci_scan_bus for skipping
43 already-configured bus numbers - to be used for buggy BIOSes
44 or architectures with incomplete PCI setup by the loader */
45
46#ifdef CONFIG_PCI
47extern unsigned int pcibios_assign_all_busses(void);
48#else
49#define pcibios_assign_all_busses() 0
50#endif
51#define pcibios_scan_all_fns(a, b) 0
52
53extern unsigned long pci_mem_start;
54#define PCIBIOS_MIN_IO 0x1000
55#define PCIBIOS_MIN_MEM (pci_mem_start)
56
57#define PCIBIOS_MIN_CARDBUS_IO 0x4000
58
59void pcibios_config_init(void);
60struct pci_bus *pcibios_scan_root(int bus);
61
62void pcibios_set_master(struct pci_dev *dev);
63void pcibios_penalize_isa_irq(int irq, int active);
64struct irq_routing_table *pcibios_get_irq_routing_table(void);
65int pcibios_set_irq_routing(struct pci_dev *dev, int pin, int irq);
66
67
68#define HAVE_PCI_MMAP
69extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
70 enum pci_mmap_state mmap_state,
71 int write_combine);
72
73
74#ifdef CONFIG_PCI
75extern void early_quirks(void);
76static inline void pci_dma_burst_advice(struct pci_dev *pdev,
77 enum pci_dma_burst_strategy *strat,
78 unsigned long *strategy_parameter)
79{
80 *strat = PCI_DMA_BURST_INFINITY;
81 *strategy_parameter = ~0UL;
82}
83#else
84static inline void early_quirks(void) { }
85#endif
86
87#endif /* __KERNEL__ */
88
89#ifdef CONFIG_X86_32
90# include "pci_32.h"
91#else
92# include "pci_64.h"
93#endif
94
95/* implement the pci_ DMA API in terms of the generic device dma_ one */
96#include <asm-generic/pci-dma-compat.h>
97
98/* generic pci stuff */
99#include <asm-generic/pci.h>
100
101#ifdef CONFIG_NUMA
102/* Returns the node based on pci bus */
103static inline int __pcibus_to_node(struct pci_bus *bus)
104{
105 struct pci_sysdata *sd = bus->sysdata;
106
107 return sd->node;
108}
109
110static inline cpumask_t __pcibus_to_cpumask(struct pci_bus *bus)
111{
112 return node_to_cpumask(__pcibus_to_node(bus));
113}
114#endif
115
116#endif /* _ASM_X86_PCI_H */
diff --git a/arch/x86/include/asm/pci_32.h b/arch/x86/include/asm/pci_32.h
new file mode 100644
index 000000000000..6f1213a6ef4f
--- /dev/null
+++ b/arch/x86/include/asm/pci_32.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_PCI_32_H
2#define _ASM_X86_PCI_32_H
3
4
5#ifdef __KERNEL__
6
7
8/* Dynamic DMA mapping stuff.
9 * i386 has everything mapped statically.
10 */
11
12struct pci_dev;
13
14/* The PCI address space does equal the physical memory
15 * address space. The networking and block device layers use
16 * this boolean for bounce buffer decisions.
17 */
18#define PCI_DMA_BUS_IS_PHYS (1)
19
20/* pci_unmap_{page,single} is a nop so... */
21#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
22#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
23#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
24#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
25 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
26#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
27#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
28 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
29
30
31#endif /* __KERNEL__ */
32
33
34#endif /* _ASM_X86_PCI_32_H */
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
new file mode 100644
index 000000000000..5b28995d664e
--- /dev/null
+++ b/arch/x86/include/asm/pci_64.h
@@ -0,0 +1,66 @@
1#ifndef _ASM_X86_PCI_64_H
2#define _ASM_X86_PCI_64_H
3
4#ifdef __KERNEL__
5
6#ifdef CONFIG_CALGARY_IOMMU
7static inline void *pci_iommu(struct pci_bus *bus)
8{
9 struct pci_sysdata *sd = bus->sysdata;
10 return sd->iommu;
11}
12
13static inline void set_pci_iommu(struct pci_bus *bus, void *val)
14{
15 struct pci_sysdata *sd = bus->sysdata;
16 sd->iommu = val;
17}
18#endif /* CONFIG_CALGARY_IOMMU */
19
20extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
21 int reg, int len, u32 *value);
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value);
24
25extern void dma32_reserve_bootmem(void);
26extern void pci_iommu_alloc(void);
27
28/* The PCI address space does equal the physical memory
29 * address space. The networking and block device layers use
30 * this boolean for bounce buffer decisions
31 *
32 * On AMD64 this mostly holds, but we set it to zero if a hardware
33 * IOMMU (gart) or a software IOMMU (swiotlb) is available.
34 */
35#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
36
37#if defined(CONFIG_GART_IOMMU) || defined(CONFIG_CALGARY_IOMMU)
38
39#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
40 dma_addr_t ADDR_NAME;
41#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
42 __u32 LEN_NAME;
43#define pci_unmap_addr(PTR, ADDR_NAME) \
44 ((PTR)->ADDR_NAME)
45#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
46 (((PTR)->ADDR_NAME) = (VAL))
47#define pci_unmap_len(PTR, LEN_NAME) \
48 ((PTR)->LEN_NAME)
49#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
50 (((PTR)->LEN_NAME) = (VAL))
51
52#else
53/* No IOMMU */
54
55#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME)
56#define DECLARE_PCI_UNMAP_LEN(LEN_NAME)
57#define pci_unmap_addr(PTR, ADDR_NAME) (0)
58#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0)
59#define pci_unmap_len(PTR, LEN_NAME) (0)
60#define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0)
61
62#endif
63
64#endif /* __KERNEL__ */
65
66#endif /* _ASM_X86_PCI_64_H */
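The point of the empty no-IOMMU variants is that a driver can always keep unmap information in its per-buffer bookkeeping without paying for the storage when no IOMMU is in use. The canonical usage pattern (as described in Documentation/DMA-mapping.txt) looks roughly like this; the structure and function names are illustrative:

#include <linux/pci.h>

struct example_rx_entry {
	void *buf;
	DECLARE_PCI_UNMAP_ADDR(mapping)
	DECLARE_PCI_UNMAP_LEN(len)
};

static void example_map(struct pci_dev *pdev, struct example_rx_entry *e,
			void *buf, size_t size)
{
	dma_addr_t dma = pci_map_single(pdev, buf, size, PCI_DMA_FROMDEVICE);

	e->buf = buf;
	pci_unmap_addr_set(e, mapping, dma);
	pci_unmap_len_set(e, len, size);
}

static void example_unmap(struct pci_dev *pdev, struct example_rx_entry *e)
{
	pci_unmap_single(pdev, pci_unmap_addr(e, mapping),
			 pci_unmap_len(e, len), PCI_DMA_FROMDEVICE);
}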
diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h
new file mode 100644
index 000000000000..2fbfff88df37
--- /dev/null
+++ b/arch/x86/include/asm/pda.h
@@ -0,0 +1,137 @@
1#ifndef _ASM_X86_PDA_H
2#define _ASM_X86_PDA_H
3
4#ifndef __ASSEMBLY__
5#include <linux/stddef.h>
6#include <linux/types.h>
7#include <linux/cache.h>
8#include <asm/page.h>
9
10/* Per-processor data structure. %gs points to it while the kernel runs */
11struct x8664_pda {
12 struct task_struct *pcurrent; /* 0 Current process */
13 unsigned long data_offset; /* 8 Per cpu data offset from linker
14 address */
15 unsigned long kernelstack; /* 16 top of kernel stack for current */
16 unsigned long oldrsp; /* 24 user rsp for system call */
17 int irqcount; /* 32 Irq nesting counter. Starts -1 */
18 unsigned int cpunumber; /* 36 Logical CPU number */
19#ifdef CONFIG_CC_STACKPROTECTOR
20 unsigned long stack_canary; /* 40 stack canary value */
21 /* gcc-ABI: this canary MUST be at
22 offset 40!!! */
23#endif
24 char *irqstackptr;
25 short nodenumber; /* number of current node (32k max) */
26 short in_bootmem; /* pda lives in bootmem */
27 unsigned int __softirq_pending;
28 unsigned int __nmi_count; /* number of NMIs on this CPU */
29 short mmu_state;
30 short isidle;
31 struct mm_struct *active_mm;
32 unsigned apic_timer_irqs;
33 unsigned irq0_irqs;
34 unsigned irq_resched_count;
35 unsigned irq_call_count;
36 unsigned irq_tlb_count;
37 unsigned irq_thermal_count;
38 unsigned irq_threshold_count;
39 unsigned irq_spurious_count;
40} ____cacheline_aligned_in_smp;
41
42extern struct x8664_pda **_cpu_pda;
43extern void pda_init(int);
44
45#define cpu_pda(i) (_cpu_pda[i])
46
47/*
48 * There is no fast way to get the base address of the PDA, all the accesses
49 * have to mention %fs/%gs. So it needs to be done this Torvaldian way.
50 */
51extern void __bad_pda_field(void) __attribute__((noreturn));
52
53/*
54 * proxy_pda doesn't actually exist, but tell gcc it is accessed for
55 * all PDA accesses so it gets read/write dependencies right.
56 */
57extern struct x8664_pda _proxy_pda;
58
59#define pda_offset(field) offsetof(struct x8664_pda, field)
60
61#define pda_to_op(op, field, val) \
62do { \
63 typedef typeof(_proxy_pda.field) T__; \
64 if (0) { T__ tmp__; tmp__ = (val); } /* type checking */ \
65 switch (sizeof(_proxy_pda.field)) { \
66 case 2: \
67 asm(op "w %1,%%gs:%c2" : \
68 "+m" (_proxy_pda.field) : \
69 "ri" ((T__)val), \
70 "i"(pda_offset(field))); \
71 break; \
72 case 4: \
73 asm(op "l %1,%%gs:%c2" : \
74 "+m" (_proxy_pda.field) : \
75 "ri" ((T__)val), \
76 "i" (pda_offset(field))); \
77 break; \
78 case 8: \
79 asm(op "q %1,%%gs:%c2": \
80 "+m" (_proxy_pda.field) : \
81 "ri" ((T__)val), \
82 "i"(pda_offset(field))); \
83 break; \
84 default: \
85 __bad_pda_field(); \
86 } \
87} while (0)
88
89#define pda_from_op(op, field) \
90({ \
91 typeof(_proxy_pda.field) ret__; \
92 switch (sizeof(_proxy_pda.field)) { \
93 case 2: \
94 asm(op "w %%gs:%c1,%0" : \
95 "=r" (ret__) : \
96 "i" (pda_offset(field)), \
97 "m" (_proxy_pda.field)); \
98 break; \
99 case 4: \
100 asm(op "l %%gs:%c1,%0": \
101 "=r" (ret__): \
102 "i" (pda_offset(field)), \
103 "m" (_proxy_pda.field)); \
104 break; \
105 case 8: \
106 asm(op "q %%gs:%c1,%0": \
107 "=r" (ret__) : \
108 "i" (pda_offset(field)), \
109 "m" (_proxy_pda.field)); \
110 break; \
111 default: \
112 __bad_pda_field(); \
113 } \
114 ret__; \
115})
116
117#define read_pda(field) pda_from_op("mov", field)
118#define write_pda(field, val) pda_to_op("mov", field, val)
119#define add_pda(field, val) pda_to_op("add", field, val)
120#define sub_pda(field, val) pda_to_op("sub", field, val)
121#define or_pda(field, val) pda_to_op("or", field, val)
122
123/* This is not atomic against other CPUs -- CPU preemption needs to be off */
124#define test_and_clear_bit_pda(bit, field) \
125({ \
126 int old__; \
127 asm volatile("btr %2,%%gs:%c3\n\tsbbl %0,%0" \
128 : "=r" (old__), "+m" (_proxy_pda.field) \
129 : "dIr" (bit), "i" (pda_offset(field)) : "memory");\
130 old__; \
131})
132
133#endif
134
135#define PDA_STACKOFFSET (5*8)
136
137#endif /* _ASM_X86_PDA_H */
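pda_to_op()/pda_from_op() above switch on sizeof(field), which is resolved at compile time, so each access becomes a single correctly sized mov to or from the %gs-relative PDA, with a throwaway assignment providing type checking. A hypothetical sketch of the same dispatch trick on an ordinary struct, without the %gs segment; the names are invented and x86-64 GCC is assumed.

#include <stdio.h>
#include <stdint.h>

struct demo_pda {
	uint16_t mmu_state;
	uint32_t cpunumber;
	uint64_t kernelstack;
};

static struct demo_pda the_pda;

#define demo_write(field, val)						\
do {									\
	typeof(the_pda.field) v__ = (val);	/* type check, like T__ */ \
	switch (sizeof(the_pda.field)) {				\
	case 2:								\
		asm("movw %1,%0" : "=m" (the_pda.field) : "r" (v__));	\
		break;							\
	case 4:								\
		asm("movl %1,%0" : "=m" (the_pda.field) : "r" (v__));	\
		break;							\
	case 8:								\
		asm("movq %1,%0" : "=m" (the_pda.field) : "r" (v__));	\
		break;							\
	default:							\
		__builtin_trap();	/* stand-in for __bad_pda_field() */ \
	}								\
} while (0)

int main(void)
{
	demo_write(mmu_state, 3);
	demo_write(cpunumber, 7);
	demo_write(kernelstack, 0x1234abcd);
	printf("%u %u %lu\n", the_pda.mmu_state, the_pda.cpunumber,
	       (unsigned long)the_pda.kernelstack);
	return 0;
}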
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
new file mode 100644
index 000000000000..ece72053ba63
--- /dev/null
+++ b/arch/x86/include/asm/percpu.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_PERCPU_H
2#define _ASM_X86_PERCPU_H
3
4#ifdef CONFIG_X86_64
5#include <linux/compiler.h>
6
7/* Same as asm-generic/percpu.h, except that we store the per cpu offset
8 in the PDA. Longer term the PDA and every per cpu variable
9 should be just put into a single section and referenced directly
10 from %gs */
11
12#ifdef CONFIG_SMP
13#include <asm/pda.h>
14
15#define __per_cpu_offset(cpu) (cpu_pda(cpu)->data_offset)
16#define __my_cpu_offset read_pda(data_offset)
17
18#define per_cpu_offset(x) (__per_cpu_offset(x))
19
20#endif
21#include <asm-generic/percpu.h>
22
23DECLARE_PER_CPU(struct x8664_pda, pda);
24
25/*
26 * These are supposed to be implemented as a single instruction which
27 * operates on the per-cpu data base segment. x86-64 doesn't have
28 * that yet, so this is a fairly inefficient workaround for the
29 * meantime. The single instruction is atomic with respect to
30 * preemption and interrupts, so we need to explicitly disable
31 * interrupts here to achieve the same effect. However, because it
32 * can be used from within interrupt-disable/enable, we can't actually
33 * disable interrupts; disabling preemption is enough.
34 */
35#define x86_read_percpu(var) \
36 ({ \
37 typeof(per_cpu_var(var)) __tmp; \
38 preempt_disable(); \
39 __tmp = __get_cpu_var(var); \
40 preempt_enable(); \
41 __tmp; \
42 })
43
44#define x86_write_percpu(var, val) \
45 do { \
46 preempt_disable(); \
47 __get_cpu_var(var) = (val); \
48 preempt_enable(); \
49 } while(0)
50
51#else /* CONFIG_X86_64 */
52
53#ifdef __ASSEMBLY__
54
55/*
56 * PER_CPU finds an address of a per-cpu variable.
57 *
58 * Args:
59 * var - variable name
60 * reg - 32bit register
61 *
62 * The resulting address is stored in the "reg" argument.
63 *
64 * Example:
65 * PER_CPU(cpu_gdt_descr, %ebx)
66 */
67#ifdef CONFIG_SMP
68#define PER_CPU(var, reg) \
69 movl %fs:per_cpu__##this_cpu_off, reg; \
70 lea per_cpu__##var(reg), reg
71#define PER_CPU_VAR(var) %fs:per_cpu__##var
72#else /* ! SMP */
73#define PER_CPU(var, reg) \
74 movl $per_cpu__##var, reg
75#define PER_CPU_VAR(var) per_cpu__##var
76#endif /* SMP */
77
78#else /* ...!ASSEMBLY */
79
80/*
81 * PER_CPU finds an address of a per-cpu variable.
82 *
83 * Args:
84 * var - variable name
85 * cpu - 32bit register containing the current CPU number
86 *
87 * The resulting address is stored in the "cpu" argument.
88 *
89 * Example:
90 * PER_CPU(cpu_gdt_descr, %ebx)
91 */
92#ifdef CONFIG_SMP
93
94#define __my_cpu_offset x86_read_percpu(this_cpu_off)
95
96/* fs segment starts at (positive) offset == __per_cpu_offset[cpu] */
97#define __percpu_seg "%%fs:"
98
99#else /* !SMP */
100
101#define __percpu_seg ""
102
103#endif /* SMP */
104
105#include <asm-generic/percpu.h>
106
107/* We can use this directly for local CPU (faster). */
108DECLARE_PER_CPU(unsigned long, this_cpu_off);
109
110/* For arch-specific code, we can use direct single-insn ops (they
111 * don't give an lvalue though). */
112extern void __bad_percpu_size(void);
113
114#define percpu_to_op(op, var, val) \
115do { \
116 typedef typeof(var) T__; \
117 if (0) { \
118 T__ tmp__; \
119 tmp__ = (val); \
120 } \
121 switch (sizeof(var)) { \
122 case 1: \
123 asm(op "b %1,"__percpu_seg"%0" \
124 : "+m" (var) \
125 : "ri" ((T__)val)); \
126 break; \
127 case 2: \
128 asm(op "w %1,"__percpu_seg"%0" \
129 : "+m" (var) \
130 : "ri" ((T__)val)); \
131 break; \
132 case 4: \
133 asm(op "l %1,"__percpu_seg"%0" \
134 : "+m" (var) \
135 : "ri" ((T__)val)); \
136 break; \
137 default: __bad_percpu_size(); \
138 } \
139} while (0)
140
141#define percpu_from_op(op, var) \
142({ \
143 typeof(var) ret__; \
144 switch (sizeof(var)) { \
145 case 1: \
146 asm(op "b "__percpu_seg"%1,%0" \
147 : "=r" (ret__) \
148 : "m" (var)); \
149 break; \
150 case 2: \
151 asm(op "w "__percpu_seg"%1,%0" \
152 : "=r" (ret__) \
153 : "m" (var)); \
154 break; \
155 case 4: \
156 asm(op "l "__percpu_seg"%1,%0" \
157 : "=r" (ret__) \
158 : "m" (var)); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162 ret__; \
163})
164
165#define x86_read_percpu(var) percpu_from_op("mov", per_cpu__##var)
166#define x86_write_percpu(var, val) percpu_to_op("mov", per_cpu__##var, val)
167#define x86_add_percpu(var, val) percpu_to_op("add", per_cpu__##var, val)
168#define x86_sub_percpu(var, val) percpu_to_op("sub", per_cpu__##var, val)
169#define x86_or_percpu(var, val) percpu_to_op("or", per_cpu__##var, val)
170#endif /* !__ASSEMBLY__ */
171#endif /* !CONFIG_X86_64 */
172
173#ifdef CONFIG_SMP
174
175/*
176 * Define the "EARLY_PER_CPU" macros. These are used for some per_cpu
177 * variables that are initialized and accessed before there are per_cpu
178 * areas allocated.
179 */
180
181#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
182 DEFINE_PER_CPU(_type, _name) = _initvalue; \
183 __typeof__(_type) _name##_early_map[NR_CPUS] __initdata = \
184 { [0 ... NR_CPUS-1] = _initvalue }; \
185 __typeof__(_type) *_name##_early_ptr __refdata = _name##_early_map
186
187#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
188 EXPORT_PER_CPU_SYMBOL(_name)
189
190#define DECLARE_EARLY_PER_CPU(_type, _name) \
191 DECLARE_PER_CPU(_type, _name); \
192 extern __typeof__(_type) *_name##_early_ptr; \
193 extern __typeof__(_type) _name##_early_map[]
194
195#define early_per_cpu_ptr(_name) (_name##_early_ptr)
196#define early_per_cpu_map(_name, _idx) (_name##_early_map[_idx])
197#define early_per_cpu(_name, _cpu) \
198 (early_per_cpu_ptr(_name) ? \
199 early_per_cpu_ptr(_name)[_cpu] : \
200 per_cpu(_name, _cpu))
201
202#else /* !CONFIG_SMP */
203#define DEFINE_EARLY_PER_CPU(_type, _name, _initvalue) \
204 DEFINE_PER_CPU(_type, _name) = _initvalue
205
206#define EXPORT_EARLY_PER_CPU_SYMBOL(_name) \
207 EXPORT_PER_CPU_SYMBOL(_name)
208
209#define DECLARE_EARLY_PER_CPU(_type, _name) \
210 DECLARE_PER_CPU(_type, _name)
211
212#define early_per_cpu(_name, _cpu) per_cpu(_name, _cpu)
213#define early_per_cpu_ptr(_name) NULL
214/* no early_per_cpu_map() */
215
216#endif /* !CONFIG_SMP */
217
218#endif /* _ASM_X86_PERCPU_H */
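The EARLY_PER_CPU machinery keeps a boot-time NR_CPUS-sized array plus a pointer to it; early_per_cpu() reads through that array until setup code copies the values into the real per-cpu areas and clears the pointer, after which it falls back to per_cpu(). A hypothetical userspace sketch of that fallback pattern, with invented variable names and the per-cpu area faked by a plain array.

#include <stdio.h>

#define NR_CPUS 4

/* Boot-time storage, usable before the per-cpu areas are set up. */
static int cpu_to_node_early_map[NR_CPUS] = { [0 ... NR_CPUS - 1] = 0 };
static int *cpu_to_node_early_ptr = cpu_to_node_early_map;

/* Stand-in for the real per-cpu copies of the variable. */
static int cpu_to_node_percpu[NR_CPUS];

#define early_per_cpu(name, cpu) \
	(name##_early_ptr ? name##_early_ptr[cpu] : name##_percpu[cpu])

static void setup_per_cpu_areas(void)
{
	/* Copy the early values into the "real" per-cpu storage and
	 * disable the early path, as the kernel does during boot. */
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		cpu_to_node_percpu[cpu] = cpu_to_node_early_ptr[cpu];
	cpu_to_node_early_ptr = NULL;
}

int main(void)
{
	cpu_to_node_early_map[2] = 1;		/* set before setup */
	printf("early: cpu2 -> node %d\n", early_per_cpu(cpu_to_node, 2));
	setup_per_cpu_areas();
	printf("late:  cpu2 -> node %d\n", early_per_cpu(cpu_to_node, 2));
	return 0;
}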
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
new file mode 100644
index 000000000000..cb7c151a8bff
--- /dev/null
+++ b/arch/x86/include/asm/pgalloc.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_PGALLOC_H
2#define _ASM_X86_PGALLOC_H
3
4#include <linux/threads.h>
5#include <linux/mm.h> /* for struct page */
6#include <linux/pagemap.h>
7
8static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm)
14static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {}
15static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {}
16static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {}
17static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
18 unsigned long start, unsigned long count) {}
19static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {}
20static inline void paravirt_release_pte(unsigned long pfn) {}
21static inline void paravirt_release_pmd(unsigned long pfn) {}
22static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif
24
25/*
26 * Allocate and free page tables.
27 */
28extern pgd_t *pgd_alloc(struct mm_struct *);
29extern void pgd_free(struct mm_struct *mm, pgd_t *pgd);
30
31extern pte_t *pte_alloc_one_kernel(struct mm_struct *, unsigned long);
32extern pgtable_t pte_alloc_one(struct mm_struct *, unsigned long);
33
34/* Should really implement gc for free page table pages. This could be
35 done with a reference count in struct page. */
36
37static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte)
38{
39 BUG_ON((unsigned long)pte & (PAGE_SIZE-1));
40 free_page((unsigned long)pte);
41}
42
43static inline void pte_free(struct mm_struct *mm, struct page *pte)
44{
45 __free_page(pte);
46}
47
48extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte);
49
50static inline void pmd_populate_kernel(struct mm_struct *mm,
51 pmd_t *pmd, pte_t *pte)
52{
53 paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT);
54 set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE));
55}
56
57static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
58 struct page *pte)
59{
60 unsigned long pfn = page_to_pfn(pte);
61
62 paravirt_alloc_pte(mm, pfn);
63 set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE));
64}
65
66#define pmd_pgtable(pmd) pmd_page(pmd)
67
68#if PAGETABLE_LEVELS > 2
69static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
70{
71 return (pmd_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
72}
73
74static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
75{
76 BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
77 free_page((unsigned long)pmd);
78}
79
80extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
81
82#ifdef CONFIG_X86_PAE
83extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd);
84#else /* !CONFIG_X86_PAE */
85static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
86{
87 paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT);
88 set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd)));
89}
90#endif /* CONFIG_X86_PAE */
91
92#if PAGETABLE_LEVELS > 3
93static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
94{
95 paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT);
96 set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(pud)));
97}
98
99static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
100{
101 return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
102}
103
104static inline void pud_free(struct mm_struct *mm, pud_t *pud)
105{
106 BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
107 free_page((unsigned long)pud);
108}
109
110extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud);
111#endif /* PAGETABLE_LEVELS > 3 */
112#endif /* PAGETABLE_LEVELS > 2 */
113
114#endif /* _ASM_X86_PGALLOC_H */
diff --git a/arch/x86/include/asm/pgtable-2level-defs.h b/arch/x86/include/asm/pgtable-2level-defs.h
new file mode 100644
index 000000000000..d77db8990eaa
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level-defs.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_PGTABLE_2LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_2LEVEL_DEFS_H
3
4#define SHARED_KERNEL_PMD 0
5
6/*
7 * traditional i386 two-level paging structure:
8 */
9
10#define PGDIR_SHIFT 22
11#define PTRS_PER_PGD 1024
12
13/*
14 * the i386 is two-level, so we don't really have any
15 * PMD directory physically.
16 */
17
18#define PTRS_PER_PTE 1024
19
20#endif /* _ASM_X86_PGTABLE_2LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
new file mode 100644
index 000000000000..b17edfd23628
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -0,0 +1,79 @@
1#ifndef _ASM_X86_PGTABLE_2LEVEL_H
2#define _ASM_X86_PGTABLE_2LEVEL_H
3
4#define pte_ERROR(e) \
5 printk("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, (e).pte_low)
6#define pgd_ERROR(e) \
7 printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e))
8
9/*
10 * Certain architectures need to do special things when PTEs
11 * within a page table are directly modified. Thus, the following
12 * hook is made available.
13 */
14static inline void native_set_pte(pte_t *ptep, pte_t pte)
15{
16 *ptep = pte;
17}
18
19static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
20{
21 *pmdp = pmd;
22}
23
24static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
25{
26 native_set_pte(ptep, pte);
27}
28
29static inline void native_set_pte_present(struct mm_struct *mm,
30 unsigned long addr,
31 pte_t *ptep, pte_t pte)
32{
33 native_set_pte(ptep, pte);
34}
35
36static inline void native_pmd_clear(pmd_t *pmdp)
37{
38 native_set_pmd(pmdp, __pmd(0));
39}
40
41static inline void native_pte_clear(struct mm_struct *mm,
42 unsigned long addr, pte_t *xp)
43{
44 *xp = native_make_pte(0);
45}
46
47#ifdef CONFIG_SMP
48static inline pte_t native_ptep_get_and_clear(pte_t *xp)
49{
50 return __pte(xchg(&xp->pte_low, 0));
51}
52#else
53#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
54#endif
55
56#define pte_none(x) (!(x).pte_low)
57
58/*
59 * Bits 0, 6 and 7 are taken, split up the 29 bits of offset
60 * into this range:
61 */
62#define PTE_FILE_MAX_BITS 29
63
64#define pte_to_pgoff(pte) \
65 ((((pte).pte_low >> 1) & 0x1f) + (((pte).pte_low >> 8) << 5))
66
67#define pgoff_to_pte(off) \
68 ((pte_t) { .pte_low = (((off) & 0x1f) << 1) + \
69 (((off) >> 5) << 8) + _PAGE_FILE })
70
71/* Encode and de-code a swap entry */
72#define __swp_type(x) (((x).val >> 1) & 0x1f)
73#define __swp_offset(x) ((x).val >> 8)
74#define __swp_entry(type, offset) \
75 ((swp_entry_t) { ((type) << 1) | ((offset) << 8) })
76#define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low })
77#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
78
79#endif /* _ASM_X86_PGTABLE_2LEVEL_H */
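In the non-PAE case the file offset of a nonlinear mapping has to fit around the reserved PTE bits: bit 0 (present) stays clear and bit 6 carries _PAGE_FILE, so offset bits 0-4 land in PTE bits 1-5 and the rest start at bit 8, giving the 29 usable bits noted above. A hypothetical userspace round-trip check of that packing, using plain uint32_t instead of pte_t.

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

#define PAGE_FILE (1u << 6)

static uint32_t pgoff_to_pte_low(uint32_t off)
{
	return ((off & 0x1f) << 1) + ((off >> 5) << 8) + PAGE_FILE;
}

static uint32_t pte_low_to_pgoff(uint32_t pte_low)
{
	return ((pte_low >> 1) & 0x1f) + ((pte_low >> 8) << 5);
}

int main(void)
{
	for (uint32_t off = 0; off < (1u << 29); off += 12345u) {
		uint32_t pte = pgoff_to_pte_low(off);

		assert((pte & 1) == 0);		/* not present */
		assert(pte & PAGE_FILE);	/* marked as a file pte */
		assert(pte_low_to_pgoff(pte) == off);
	}
	printf("29-bit pgoff round-trip OK\n");
	return 0;
}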
diff --git a/arch/x86/include/asm/pgtable-3level-defs.h b/arch/x86/include/asm/pgtable-3level-defs.h
new file mode 100644
index 000000000000..62561367653c
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level-defs.h
@@ -0,0 +1,28 @@
1#ifndef _ASM_X86_PGTABLE_3LEVEL_DEFS_H
2#define _ASM_X86_PGTABLE_3LEVEL_DEFS_H
3
4#ifdef CONFIG_PARAVIRT
5#define SHARED_KERNEL_PMD (pv_info.shared_kernel_pmd)
6#else
7#define SHARED_KERNEL_PMD 1
8#endif
9
10/*
11 * PGDIR_SHIFT determines what a top-level page table entry can map
12 */
13#define PGDIR_SHIFT 30
14#define PTRS_PER_PGD 4
15
16/*
17 * PMD_SHIFT determines the size of the area a middle-level
18 * page table can map
19 */
20#define PMD_SHIFT 21
21#define PTRS_PER_PMD 512
22
23/*
24 * entries per page directory level
25 */
26#define PTRS_PER_PTE 512
27
28#endif /* _ASM_X86_PGTABLE_3LEVEL_DEFS_H */
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
new file mode 100644
index 000000000000..fb16cec702e4
--- /dev/null
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -0,0 +1,175 @@
1#ifndef _ASM_X86_PGTABLE_3LEVEL_H
2#define _ASM_X86_PGTABLE_3LEVEL_H
3
4/*
5 * Intel Physical Address Extension (PAE) Mode - three-level page
6 * tables on PPro+ CPUs.
7 *
8 * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com>
9 */
10
11#define pte_ERROR(e) \
12 printk("%s:%d: bad pte %p(%08lx%08lx).\n", \
13 __FILE__, __LINE__, &(e), (e).pte_high, (e).pte_low)
14#define pmd_ERROR(e) \
15 printk("%s:%d: bad pmd %p(%016Lx).\n", \
16 __FILE__, __LINE__, &(e), pmd_val(e))
17#define pgd_ERROR(e) \
18 printk("%s:%d: bad pgd %p(%016Lx).\n", \
19 __FILE__, __LINE__, &(e), pgd_val(e))
20
21static inline int pud_none(pud_t pud)
22{
23 return pud_val(pud) == 0;
24}
25
26static inline int pud_bad(pud_t pud)
27{
28 return (pud_val(pud) & ~(PTE_PFN_MASK | _KERNPG_TABLE | _PAGE_USER)) != 0;
29}
30
31static inline int pud_present(pud_t pud)
32{
33 return pud_val(pud) & _PAGE_PRESENT;
34}
35
36/* Rules for using set_pte: the pte being assigned *must* be
37 * either not present or in a state where the hardware will
38 * not attempt to update the pte. In places where this is
39 * not possible, use pte_get_and_clear to obtain the old pte
40 * value and then use set_pte to update it. -ben
41 */
42static inline void native_set_pte(pte_t *ptep, pte_t pte)
43{
44 ptep->pte_high = pte.pte_high;
45 smp_wmb();
46 ptep->pte_low = pte.pte_low;
47}
48
49/*
50 * Since this is only called on user PTEs, and the page fault handler
51 * must handle the already racy situation of simultaneous page faults,
52 * we are justified in merely clearing the PTE present bit, followed
53 * by a set. The ordering here is important.
54 */
55static inline void native_set_pte_present(struct mm_struct *mm,
56 unsigned long addr,
57 pte_t *ptep, pte_t pte)
58{
59 ptep->pte_low = 0;
60 smp_wmb();
61 ptep->pte_high = pte.pte_high;
62 smp_wmb();
63 ptep->pte_low = pte.pte_low;
64}
65
66static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
67{
68 set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
69}
70
71static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
72{
73 set_64bit((unsigned long long *)(pmdp), native_pmd_val(pmd));
74}
75
76static inline void native_set_pud(pud_t *pudp, pud_t pud)
77{
78 set_64bit((unsigned long long *)(pudp), native_pud_val(pud));
79}
80
81/*
82 * For PTEs and PDEs, we must clear the P-bit first when clearing a page table
83 * entry, so clear the bottom half first and enforce ordering with a compiler
84 * barrier.
85 */
86static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
87 pte_t *ptep)
88{
89 ptep->pte_low = 0;
90 smp_wmb();
91 ptep->pte_high = 0;
92}
93
94static inline void native_pmd_clear(pmd_t *pmd)
95{
96 u32 *tmp = (u32 *)pmd;
97 *tmp = 0;
98 smp_wmb();
99 *(tmp + 1) = 0;
100}
101
102static inline void pud_clear(pud_t *pudp)
103{
104 unsigned long pgd;
105
106 set_pud(pudp, __pud(0));
107
108 /*
109 * According to Intel App note "TLBs, Paging-Structure Caches,
110 * and Their Invalidation", April 2007, document 317080-001,
111 * section 8.1: in PAE mode we explicitly have to flush the
112 * TLB via cr3 if the top-level pgd is changed...
113 *
114 * Make sure the pud entry we're updating is within the
115 * current pgd to avoid unnecessary TLB flushes.
116 */
117 pgd = read_cr3();
118 if (__pa(pudp) >= pgd && __pa(pudp) <
119 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
120 write_cr3(pgd);
121}
122
123#define pud_page(pud) ((struct page *) __va(pud_val(pud) & PTE_PFN_MASK))
124
125#define pud_page_vaddr(pud) ((unsigned long) __va(pud_val(pud) & PTE_PFN_MASK))
126
127
128/* Find an entry in the second-level page table.. */
129#define pmd_offset(pud, address) ((pmd_t *)pud_page(*(pud)) + \
130 pmd_index(address))
131
132#ifdef CONFIG_SMP
133static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
134{
135 pte_t res;
136
137 /* xchg acts as a barrier before the setting of the high bits */
138 res.pte_low = xchg(&ptep->pte_low, 0);
139 res.pte_high = ptep->pte_high;
140 ptep->pte_high = 0;
141
142 return res;
143}
144#else
145#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
146#endif
147
148#define __HAVE_ARCH_PTE_SAME
149static inline int pte_same(pte_t a, pte_t b)
150{
151 return a.pte_low == b.pte_low && a.pte_high == b.pte_high;
152}
153
154static inline int pte_none(pte_t pte)
155{
156 return !pte.pte_low && !pte.pte_high;
157}
158
159/*
160 * Bits 0, 6 and 7 are taken in the low part of the pte,
161 * put the 32 bits of offset into the high part.
162 */
163#define pte_to_pgoff(pte) ((pte).pte_high)
164#define pgoff_to_pte(off) \
165 ((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
166#define PTE_FILE_MAX_BITS 32
167
168/* Encode and de-code a swap entry */
169#define __swp_type(x) (((x).val) & 0x1f)
170#define __swp_offset(x) ((x).val >> 5)
171#define __swp_entry(type, offset) ((swp_entry_t){(type) | (offset) << 5})
172#define __pte_to_swp_entry(pte) ((swp_entry_t){ (pte).pte_high })
173#define __swp_entry_to_pte(x) ((pte_t){ { .pte_high = (x).val } })
174
175#endif /* _ASM_X86_PGTABLE_3LEVEL_H */
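native_ptep_get_and_clear() on PAE takes the low word with xchg first, so the entry becomes non-present atomically before the high word is read and cleared, and a concurrent hardware walk can never use a half-cleared PTE. A hypothetical userspace sketch of that ordering, using a GCC atomic builtin in place of the kernel's xchg().

#include <stdio.h>
#include <stdint.h>

struct pae_pte { uint32_t pte_low, pte_high; };

static struct pae_pte demo_ptep_get_and_clear(struct pae_pte *p)
{
	struct pae_pte res;

	/* xchg acts as a barrier; the entry is non-present from here on */
	res.pte_low  = __atomic_exchange_n(&p->pte_low, 0, __ATOMIC_SEQ_CST);
	res.pte_high = p->pte_high;	/* safe: no longer visible as present */
	p->pte_high  = 0;
	return res;
}

int main(void)
{
	struct pae_pte pte = { .pte_low = 0x1067, .pte_high = 0x12345 };
	struct pae_pte old = demo_ptep_get_and_clear(&pte);

	printf("old %08x:%08x  now %08x:%08x\n",
	       (unsigned)old.pte_high, (unsigned)old.pte_low,
	       (unsigned)pte.pte_high, (unsigned)pte.pte_low);
	return 0;
}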
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
new file mode 100644
index 000000000000..c012f3b11671
--- /dev/null
+++ b/arch/x86/include/asm/pgtable.h
@@ -0,0 +1,562 @@
1#ifndef _ASM_X86_PGTABLE_H
2#define _ASM_X86_PGTABLE_H
3
4#define FIRST_USER_ADDRESS 0
5
6#define _PAGE_BIT_PRESENT 0 /* is present */
7#define _PAGE_BIT_RW 1 /* writeable */
8#define _PAGE_BIT_USER 2 /* userspace addressable */
9#define _PAGE_BIT_PWT 3 /* page write through */
10#define _PAGE_BIT_PCD 4 /* page cache disabled */
11#define _PAGE_BIT_ACCESSED 5 /* was accessed (raised by CPU) */
12#define _PAGE_BIT_DIRTY 6 /* was written to (raised by CPU) */
13#define _PAGE_BIT_FILE 6
14#define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */
15#define _PAGE_BIT_PAT 7 /* on 4KB pages */
16#define _PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */
17#define _PAGE_BIT_UNUSED1 9 /* available for programmer */
18#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */
19#define _PAGE_BIT_UNUSED3 11
20#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
21#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
22#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
23#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
24
25#define _PAGE_PRESENT (_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
26#define _PAGE_RW (_AT(pteval_t, 1) << _PAGE_BIT_RW)
27#define _PAGE_USER (_AT(pteval_t, 1) << _PAGE_BIT_USER)
28#define _PAGE_PWT (_AT(pteval_t, 1) << _PAGE_BIT_PWT)
29#define _PAGE_PCD (_AT(pteval_t, 1) << _PAGE_BIT_PCD)
30#define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
31#define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
32#define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE)
33#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
34#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1)
35#define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
36#define _PAGE_UNUSED3 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED3)
37#define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT)
38#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
39#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
40#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
41#define __HAVE_ARCH_PTE_SPECIAL
42
43#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
44#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
45#else
46#define _PAGE_NX (_AT(pteval_t, 0))
47#endif
48
49/* If _PAGE_PRESENT is clear, we use these: */
50#define _PAGE_FILE _PAGE_DIRTY /* nonlinear file mapping,
51 * saved PTE; unset:swap */
52#define _PAGE_PROTNONE _PAGE_PSE /* if the user mapped it with PROT_NONE;
53 pte_present gives true */
54
55#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
56 _PAGE_ACCESSED | _PAGE_DIRTY)
57#define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \
58 _PAGE_DIRTY)
59
60/* Set of bits not changed in pte_modify */
61#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
62 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
63
64#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
65#define _PAGE_CACHE_WB (0)
66#define _PAGE_CACHE_WC (_PAGE_PWT)
67#define _PAGE_CACHE_UC_MINUS (_PAGE_PCD)
68#define _PAGE_CACHE_UC (_PAGE_PCD | _PAGE_PWT)
69
70#define PAGE_NONE __pgprot(_PAGE_PROTNONE | _PAGE_ACCESSED)
71#define PAGE_SHARED __pgprot(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \
72 _PAGE_ACCESSED | _PAGE_NX)
73
74#define PAGE_SHARED_EXEC __pgprot(_PAGE_PRESENT | _PAGE_RW | \
75 _PAGE_USER | _PAGE_ACCESSED)
76#define PAGE_COPY_NOEXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
77 _PAGE_ACCESSED | _PAGE_NX)
78#define PAGE_COPY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
79 _PAGE_ACCESSED)
80#define PAGE_COPY PAGE_COPY_NOEXEC
81#define PAGE_READONLY __pgprot(_PAGE_PRESENT | _PAGE_USER | \
82 _PAGE_ACCESSED | _PAGE_NX)
83#define PAGE_READONLY_EXEC __pgprot(_PAGE_PRESENT | _PAGE_USER | \
84 _PAGE_ACCESSED)
85
86#define __PAGE_KERNEL_EXEC \
87 (_PAGE_PRESENT | _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_GLOBAL)
88#define __PAGE_KERNEL (__PAGE_KERNEL_EXEC | _PAGE_NX)
89
90#define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW)
91#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW)
92#define __PAGE_KERNEL_EXEC_NOCACHE (__PAGE_KERNEL_EXEC | _PAGE_PCD | _PAGE_PWT)
93#define __PAGE_KERNEL_WC (__PAGE_KERNEL | _PAGE_CACHE_WC)
94#define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD | _PAGE_PWT)
95#define __PAGE_KERNEL_UC_MINUS (__PAGE_KERNEL | _PAGE_PCD)
96#define __PAGE_KERNEL_VSYSCALL (__PAGE_KERNEL_RX | _PAGE_USER)
97#define __PAGE_KERNEL_VSYSCALL_NOCACHE (__PAGE_KERNEL_VSYSCALL | _PAGE_PCD | _PAGE_PWT)
98#define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE)
99#define __PAGE_KERNEL_LARGE_NOCACHE (__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
100#define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE)
101
102#define __PAGE_KERNEL_IO (__PAGE_KERNEL | _PAGE_IOMAP)
103#define __PAGE_KERNEL_IO_NOCACHE (__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
104#define __PAGE_KERNEL_IO_UC_MINUS (__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
105#define __PAGE_KERNEL_IO_WC (__PAGE_KERNEL_WC | _PAGE_IOMAP)
106
107#define PAGE_KERNEL __pgprot(__PAGE_KERNEL)
108#define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO)
109#define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC)
110#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX)
111#define PAGE_KERNEL_WC __pgprot(__PAGE_KERNEL_WC)
112#define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE)
113#define PAGE_KERNEL_UC_MINUS __pgprot(__PAGE_KERNEL_UC_MINUS)
114#define PAGE_KERNEL_EXEC_NOCACHE __pgprot(__PAGE_KERNEL_EXEC_NOCACHE)
115#define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE)
116#define PAGE_KERNEL_LARGE_NOCACHE __pgprot(__PAGE_KERNEL_LARGE_NOCACHE)
117#define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC)
118#define PAGE_KERNEL_VSYSCALL __pgprot(__PAGE_KERNEL_VSYSCALL)
119#define PAGE_KERNEL_VSYSCALL_NOCACHE __pgprot(__PAGE_KERNEL_VSYSCALL_NOCACHE)
120
121#define PAGE_KERNEL_IO __pgprot(__PAGE_KERNEL_IO)
122#define PAGE_KERNEL_IO_NOCACHE __pgprot(__PAGE_KERNEL_IO_NOCACHE)
123#define PAGE_KERNEL_IO_UC_MINUS __pgprot(__PAGE_KERNEL_IO_UC_MINUS)
124#define PAGE_KERNEL_IO_WC __pgprot(__PAGE_KERNEL_IO_WC)
125
126/* xwr */
127#define __P000 PAGE_NONE
128#define __P001 PAGE_READONLY
129#define __P010 PAGE_COPY
130#define __P011 PAGE_COPY
131#define __P100 PAGE_READONLY_EXEC
132#define __P101 PAGE_READONLY_EXEC
133#define __P110 PAGE_COPY_EXEC
134#define __P111 PAGE_COPY_EXEC
135
136#define __S000 PAGE_NONE
137#define __S001 PAGE_READONLY
138#define __S010 PAGE_SHARED
139#define __S011 PAGE_SHARED
140#define __S100 PAGE_READONLY_EXEC
141#define __S101 PAGE_READONLY_EXEC
142#define __S110 PAGE_SHARED_EXEC
143#define __S111 PAGE_SHARED_EXEC
144
145/*
146 * early identity mapping pte attrib macros.
147 */
148#ifdef CONFIG_X86_64
149#define __PAGE_KERNEL_IDENT_LARGE_EXEC __PAGE_KERNEL_LARGE_EXEC
150#else
151/*
152 * For PDE_IDENT_ATTR include USER bit. As the PDE and PTE protection
153 * bits are combined, this will allow the user to access the high address mapped
154 * VDSO in the presence of CONFIG_COMPAT_VDSO
155 */
156#define PTE_IDENT_ATTR 0x003 /* PRESENT+RW */
157#define PDE_IDENT_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
158#define PGD_IDENT_ATTR 0x001 /* PRESENT (no other attributes) */
159#endif
160
161#ifndef __ASSEMBLY__
162
163/*
164 * ZERO_PAGE is a global shared page that is always zero: used
165 * for zero-mapped memory areas etc..
166 */
167extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
168#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
169
170extern spinlock_t pgd_lock;
171extern struct list_head pgd_list;
172
173/*
174 * The following only work if pte_present() is true.
175 * Undefined behaviour if not..
176 */
177static inline int pte_dirty(pte_t pte)
178{
179 return pte_flags(pte) & _PAGE_DIRTY;
180}
181
182static inline int pte_young(pte_t pte)
183{
184 return pte_flags(pte) & _PAGE_ACCESSED;
185}
186
187static inline int pte_write(pte_t pte)
188{
189 return pte_flags(pte) & _PAGE_RW;
190}
191
192static inline int pte_file(pte_t pte)
193{
194 return pte_flags(pte) & _PAGE_FILE;
195}
196
197static inline int pte_huge(pte_t pte)
198{
199 return pte_flags(pte) & _PAGE_PSE;
200}
201
202static inline int pte_global(pte_t pte)
203{
204 return pte_flags(pte) & _PAGE_GLOBAL;
205}
206
207static inline int pte_exec(pte_t pte)
208{
209 return !(pte_flags(pte) & _PAGE_NX);
210}
211
212static inline int pte_special(pte_t pte)
213{
214 return pte_flags(pte) & _PAGE_SPECIAL;
215}
216
217static inline unsigned long pte_pfn(pte_t pte)
218{
219 return (pte_val(pte) & PTE_PFN_MASK) >> PAGE_SHIFT;
220}
221
222#define pte_page(pte) pfn_to_page(pte_pfn(pte))
223
224static inline int pmd_large(pmd_t pte)
225{
226 return (pmd_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
227 (_PAGE_PSE | _PAGE_PRESENT);
228}
229
230static inline pte_t pte_mkclean(pte_t pte)
231{
232 return __pte(pte_val(pte) & ~_PAGE_DIRTY);
233}
234
235static inline pte_t pte_mkold(pte_t pte)
236{
237 return __pte(pte_val(pte) & ~_PAGE_ACCESSED);
238}
239
240static inline pte_t pte_wrprotect(pte_t pte)
241{
242 return __pte(pte_val(pte) & ~_PAGE_RW);
243}
244
245static inline pte_t pte_mkexec(pte_t pte)
246{
247 return __pte(pte_val(pte) & ~_PAGE_NX);
248}
249
250static inline pte_t pte_mkdirty(pte_t pte)
251{
252 return __pte(pte_val(pte) | _PAGE_DIRTY);
253}
254
255static inline pte_t pte_mkyoung(pte_t pte)
256{
257 return __pte(pte_val(pte) | _PAGE_ACCESSED);
258}
259
260static inline pte_t pte_mkwrite(pte_t pte)
261{
262 return __pte(pte_val(pte) | _PAGE_RW);
263}
264
265static inline pte_t pte_mkhuge(pte_t pte)
266{
267 return __pte(pte_val(pte) | _PAGE_PSE);
268}
269
270static inline pte_t pte_clrhuge(pte_t pte)
271{
272 return __pte(pte_val(pte) & ~_PAGE_PSE);
273}
274
275static inline pte_t pte_mkglobal(pte_t pte)
276{
277 return __pte(pte_val(pte) | _PAGE_GLOBAL);
278}
279
280static inline pte_t pte_clrglobal(pte_t pte)
281{
282 return __pte(pte_val(pte) & ~_PAGE_GLOBAL);
283}
284
285static inline pte_t pte_mkspecial(pte_t pte)
286{
287 return __pte(pte_val(pte) | _PAGE_SPECIAL);
288}
289
290extern pteval_t __supported_pte_mask;
291
292static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot)
293{
294 return __pte((((phys_addr_t)page_nr << PAGE_SHIFT) |
295 pgprot_val(pgprot)) & __supported_pte_mask);
296}
297
298static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
299{
300 return __pmd((((phys_addr_t)page_nr << PAGE_SHIFT) |
301 pgprot_val(pgprot)) & __supported_pte_mask);
302}
303
304static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
305{
306 pteval_t val = pte_val(pte);
307
308 /*
309 * Chop off the NX bit (if present), and add the NX portion of
310 * the newprot (if present):
311 */
312 val &= _PAGE_CHG_MASK;
313 val |= pgprot_val(newprot) & (~_PAGE_CHG_MASK) & __supported_pte_mask;
314
315 return __pte(val);
316}
317
318/* mprotect needs to preserve PAT bits when updating vm_page_prot */
319#define pgprot_modify pgprot_modify
320static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
321{
322 pgprotval_t preservebits = pgprot_val(oldprot) & _PAGE_CHG_MASK;
323 pgprotval_t addbits = pgprot_val(newprot);
324 return __pgprot(preservebits | addbits);
325}
326
327#define pte_pgprot(x) __pgprot(pte_flags(x) & PTE_FLAGS_MASK)
328
329#define canon_pgprot(p) __pgprot(pgprot_val(p) & __supported_pte_mask)
330
331#ifndef __ASSEMBLY__
332#define __HAVE_PHYS_MEM_ACCESS_PROT
333struct file;
334pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
335 unsigned long size, pgprot_t vma_prot);
336int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
337 unsigned long size, pgprot_t *vma_prot);
338#endif
339
340/* Install a pte for a particular vaddr in kernel space. */
341void set_pte_vaddr(unsigned long vaddr, pte_t pte);
342
343#ifdef CONFIG_X86_32
344extern void native_pagetable_setup_start(pgd_t *base);
345extern void native_pagetable_setup_done(pgd_t *base);
346#else
347static inline void native_pagetable_setup_start(pgd_t *base) {}
348static inline void native_pagetable_setup_done(pgd_t *base) {}
349#endif
350
351struct seq_file;
352extern void arch_report_meminfo(struct seq_file *m);
353
354#ifdef CONFIG_PARAVIRT
355#include <asm/paravirt.h>
356#else /* !CONFIG_PARAVIRT */
357#define set_pte(ptep, pte) native_set_pte(ptep, pte)
358#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
359
360#define set_pte_present(mm, addr, ptep, pte) \
361 native_set_pte_present(mm, addr, ptep, pte)
362#define set_pte_atomic(ptep, pte) \
363 native_set_pte_atomic(ptep, pte)
364
365#define set_pmd(pmdp, pmd) native_set_pmd(pmdp, pmd)
366
367#ifndef __PAGETABLE_PUD_FOLDED
368#define set_pgd(pgdp, pgd) native_set_pgd(pgdp, pgd)
369#define pgd_clear(pgd) native_pgd_clear(pgd)
370#endif
371
372#ifndef set_pud
373# define set_pud(pudp, pud) native_set_pud(pudp, pud)
374#endif
375
376#ifndef __PAGETABLE_PMD_FOLDED
377#define pud_clear(pud) native_pud_clear(pud)
378#endif
379
380#define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep)
381#define pmd_clear(pmd) native_pmd_clear(pmd)
382
383#define pte_update(mm, addr, ptep) do { } while (0)
384#define pte_update_defer(mm, addr, ptep) do { } while (0)
385
386static inline void __init paravirt_pagetable_setup_start(pgd_t *base)
387{
388 native_pagetable_setup_start(base);
389}
390
391static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
392{
393 native_pagetable_setup_done(base);
394}
395#endif /* CONFIG_PARAVIRT */
396
397#endif /* __ASSEMBLY__ */
398
399#ifdef CONFIG_X86_32
400# include "pgtable_32.h"
401#else
402# include "pgtable_64.h"
403#endif
404
405/*
406 * the pgd page can be thought of as an array like this: pgd_t[PTRS_PER_PGD]
407 *
408 * this macro returns the index of the entry in the pgd page which would
409 * control the given virtual address
410 */
411#define pgd_index(address) (((address) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))
412
413/*
414 * pgd_offset() returns a (pgd_t *)
415 * pgd_index() is used to get the offset into the pgd page's array of pgd_t's;
416 */
417#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address)))
418/*
419 * a shortcut which implies the use of the kernel's pgd, instead
420 * of a process's
421 */
422#define pgd_offset_k(address) pgd_offset(&init_mm, (address))
423
424
425#define KERNEL_PGD_BOUNDARY pgd_index(PAGE_OFFSET)
426#define KERNEL_PGD_PTRS (PTRS_PER_PGD - KERNEL_PGD_BOUNDARY)
427
428#ifndef __ASSEMBLY__
429
430enum {
431 PG_LEVEL_NONE,
432 PG_LEVEL_4K,
433 PG_LEVEL_2M,
434 PG_LEVEL_1G,
435 PG_LEVEL_NUM
436};
437
438#ifdef CONFIG_PROC_FS
439extern void update_page_count(int level, unsigned long pages);
440#else
441static inline void update_page_count(int level, unsigned long pages) { }
442#endif
443
444/*
445 * Helper function that returns the kernel pagetable entry controlling
446 * the virtual address 'address'. NULL means no pagetable entry present.
447 * NOTE: the return type is pte_t but if the pmd is PSE then we return it
448 * as a pte too.
449 */
450extern pte_t *lookup_address(unsigned long address, unsigned int *level);
451
452/* local pte updates need not use xchg for locking */
453static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
454{
455 pte_t res = *ptep;
456
457 /* Pure native function needs no input for mm, addr */
458 native_pte_clear(NULL, 0, ptep);
459 return res;
460}
461
462static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
463 pte_t *ptep, pte_t pte)
464{
465 native_set_pte(ptep, pte);
466}
467
468#ifndef CONFIG_PARAVIRT
469/*
470 * Rules for using pte_update - it must be called after any PTE update which
471 * has not been done using the set_pte / clear_pte interfaces. It is used by
472 * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE
473 * updates should either be sets, clears, or set_pte_atomic for P->P
474 * transitions, which means this hook should only be called for user PTEs.
475 * This hook implies a P->P protection or access change has taken place, which
476 * requires a subsequent TLB flush. The notification can optionally be delayed
477 * until the TLB flush event by using the pte_update_defer form of the
478 * interface, but care must be taken to assure that the flush happens while
479 * still holding the same page table lock so that the shadow and primary pages
480 * do not become out of sync on SMP.
481 */
482#define pte_update(mm, addr, ptep) do { } while (0)
483#define pte_update_defer(mm, addr, ptep) do { } while (0)
484#endif
485
486/*
487 * We only update the dirty/accessed state if we set
488 * the dirty bit by hand in the kernel, since the hardware
489 * will do the accessed bit for us, and we don't want to
490 * race with other CPU's that might be updating the dirty
491 * bit at the same time.
492 */
493struct vm_area_struct;
494
495#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
496extern int ptep_set_access_flags(struct vm_area_struct *vma,
497 unsigned long address, pte_t *ptep,
498 pte_t entry, int dirty);
499
500#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
501extern int ptep_test_and_clear_young(struct vm_area_struct *vma,
502 unsigned long addr, pte_t *ptep);
503
504#define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
505extern int ptep_clear_flush_young(struct vm_area_struct *vma,
506 unsigned long address, pte_t *ptep);
507
508#define __HAVE_ARCH_PTEP_GET_AND_CLEAR
509static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
510 pte_t *ptep)
511{
512 pte_t pte = native_ptep_get_and_clear(ptep);
513 pte_update(mm, addr, ptep);
514 return pte;
515}
516
517#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL
518static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
519 unsigned long addr, pte_t *ptep,
520 int full)
521{
522 pte_t pte;
523 if (full) {
524 /*
525 * Full address destruction in progress; paravirt does not
526 * care about updates and native needs no locking
527 */
528 pte = native_local_ptep_get_and_clear(ptep);
529 } else {
530 pte = ptep_get_and_clear(mm, addr, ptep);
531 }
532 return pte;
533}
534
535#define __HAVE_ARCH_PTEP_SET_WRPROTECT
536static inline void ptep_set_wrprotect(struct mm_struct *mm,
537 unsigned long addr, pte_t *ptep)
538{
539 clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
540 pte_update(mm, addr, ptep);
541}
542
543/*
544 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
545 *
546 * dst - pointer to pgd range anywhere on a pgd page
547 * src - ""
548 * count - the number of pgds to copy.
549 *
550 * dst and src can be on the same page, but the range must not overlap,
551 * and must not cross a page boundary.
552 */
553static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
554{
555 memcpy(dst, src, count * sizeof(pgd_t));
556}
557
558
559#include <asm-generic/pgtable.h>
560#endif /* __ASSEMBLY__ */
561
562#endif /* _ASM_X86_PGTABLE_H */
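pte_modify() keeps everything in _PAGE_CHG_MASK (the pfn plus the caching, accessed and dirty bits) and takes the remaining protection bits from the new pgprot, so accessed/dirty state and the pfn survive a protection change. A hypothetical userspace check with a simplified bit layout; the _PAGE_SPECIAL bit and the __supported_pte_mask filtering are omitted here.

#include <stdio.h>
#include <stdint.h>

#define P_PRESENT	(1ULL << 0)
#define P_RW		(1ULL << 1)
#define P_USER		(1ULL << 2)
#define P_PWT		(1ULL << 3)
#define P_PCD		(1ULL << 4)
#define P_ACCESSED	(1ULL << 5)
#define P_DIRTY		(1ULL << 6)
#define P_NX		(1ULL << 63)
#define PFN_MASK	0x000ffffffffff000ULL
#define CHG_MASK	(PFN_MASK | P_PCD | P_PWT | P_ACCESSED | P_DIRTY)

/* simplified pte_modify(): keep CHG_MASK bits, take the rest from newprot */
static uint64_t demo_pte_modify(uint64_t pte, uint64_t newprot)
{
	return (pte & CHG_MASK) | (newprot & ~CHG_MASK);
}

int main(void)
{
	uint64_t pte = 0x12345000ULL | P_PRESENT | P_RW | P_USER |
		       P_ACCESSED | P_DIRTY;
	uint64_t readonly = P_PRESENT | P_USER | P_NX;	/* like PAGE_READONLY */
	uint64_t out = demo_pte_modify(pte, readonly);

	printf("pfn kept:   %d\n", (out & PFN_MASK) == (pte & PFN_MASK));
	printf("dirty kept: %d\n", !!(out & P_DIRTY));
	printf("rw dropped: %d\n", !(out & P_RW));
	printf("nx added:   %d\n", !!(out & P_NX));
	return 0;
}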
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
new file mode 100644
index 000000000000..f9d5889b336b
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -0,0 +1,191 @@
1#ifndef _ASM_X86_PGTABLE_32_H
2#define _ASM_X86_PGTABLE_32_H
3
4
5/*
6 * The Linux memory management assumes a three-level page table setup. On
7 * the i386, we use that, but "fold" the mid level into the top-level page
8 * table, so that we physically have the same two-level page table as the
9 * i386 mmu expects.
10 *
11 * This file contains the functions and defines necessary to modify and use
12 * the i386 page table tree.
13 */
14#ifndef __ASSEMBLY__
15#include <asm/processor.h>
16#include <asm/fixmap.h>
17#include <linux/threads.h>
18#include <asm/paravirt.h>
19
20#include <linux/bitops.h>
21#include <linux/slab.h>
22#include <linux/list.h>
23#include <linux/spinlock.h>
24
25struct mm_struct;
26struct vm_area_struct;
27
28extern pgd_t swapper_pg_dir[1024];
29
30static inline void pgtable_cache_init(void) { }
31static inline void check_pgt_cache(void) { }
32void paging_init(void);
33
34extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
35
36/*
37 * The Linux x86 paging architecture is 'compile-time dual-mode', it
38 * implements both the traditional 2-level x86 page tables and the
39 * newer 3-level PAE-mode page tables.
40 */
41#ifdef CONFIG_X86_PAE
42# include <asm/pgtable-3level-defs.h>
43# define PMD_SIZE (1UL << PMD_SHIFT)
44# define PMD_MASK (~(PMD_SIZE - 1))
45#else
46# include <asm/pgtable-2level-defs.h>
47#endif
48
49#define PGDIR_SIZE (1UL << PGDIR_SHIFT)
50#define PGDIR_MASK (~(PGDIR_SIZE - 1))
51
52/* Just any arbitrary offset to the start of the vmalloc VM area: the
53 * current 8MB value just means that there will be an 8MB "hole" after the
54 * physical memory until the kernel virtual memory starts. That means that
55 * any out-of-bounds memory accesses will hopefully be caught.
56 * The vmalloc() routines leave a hole of 4kB between each vmalloced
57 * area for the same reason. ;)
58 */
59#define VMALLOC_OFFSET (8 * 1024 * 1024)
60#define VMALLOC_START ((unsigned long)high_memory + VMALLOC_OFFSET)
61#ifdef CONFIG_X86_PAE
62#define LAST_PKMAP 512
63#else
64#define LAST_PKMAP 1024
65#endif
66
67#define PKMAP_BASE ((FIXADDR_BOOT_START - PAGE_SIZE * (LAST_PKMAP + 1)) \
68 & PMD_MASK)
69
70#ifdef CONFIG_HIGHMEM
71# define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE)
72#else
73# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE)
74#endif
75
76#define MAXMEM (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE)
77
78/*
79 * Define this if things work differently on an i386 and an i486:
80 * it will (on an i486) warn about kernel memory accesses that are
81 * done without an 'access_ok(VERIFY_WRITE,..)'
82 */
83#undef TEST_ACCESS_OK
84
85/* The boot page tables (all created as a single array) */
86extern unsigned long pg0[];
87
88#define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE))
89
90/* To avoid harmful races, pmd_none(x) should check only the lower word when PAE */
91#define pmd_none(x) (!(unsigned long)pmd_val((x)))
92#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
93#define pmd_bad(x) ((pmd_val(x) & (PTE_FLAGS_MASK & ~_PAGE_USER)) != _KERNPG_TABLE)
94
95#define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT))
96
97#ifdef CONFIG_X86_PAE
98# include <asm/pgtable-3level.h>
99#else
100# include <asm/pgtable-2level.h>
101#endif
102
103/*
104 * Macro to mark a page protection value as "uncacheable".
105 * On processors which do not support it, this is a no-op.
106 */
107#define pgprot_noncached(prot) \
108 ((boot_cpu_data.x86 > 3) \
109 ? (__pgprot(pgprot_val(prot) | _PAGE_PCD | _PAGE_PWT)) \
110 : (prot))
111
112/*
113 * Conversion functions: convert a page and protection to a page entry,
114 * and a page entry and page directory to the page they refer to.
115 */
116#define mk_pte(page, pgprot) pfn_pte(page_to_pfn(page), (pgprot))
117
118
119static inline int pud_large(pud_t pud) { return 0; }
120
121/*
122 * the pmd page can be thought of as an array like this: pmd_t[PTRS_PER_PMD]
123 *
124 * this macro returns the index of the entry in the pmd page which would
125 * control the given virtual address
126 */
127#define pmd_index(address) \
128 (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
129
130/*
131 * the pte page can be thought of as an array like this: pte_t[PTRS_PER_PTE]
132 *
133 * this macro returns the index of the entry in the pte page which would
134 * control the given virtual address
135 */
136#define pte_index(address) \
137 (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
138#define pte_offset_kernel(dir, address) \
139 ((pte_t *)pmd_page_vaddr(*(dir)) + pte_index((address)))
140
141#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
142
143#define pmd_page_vaddr(pmd) \
144 ((unsigned long)__va(pmd_val((pmd)) & PTE_PFN_MASK))
145
146#if defined(CONFIG_HIGHPTE)
147#define pte_offset_map(dir, address) \
148 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE0) + \
149 pte_index((address)))
150#define pte_offset_map_nested(dir, address) \
151 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \
152 pte_index((address)))
153#define pte_unmap(pte) kunmap_atomic((pte), KM_PTE0)
154#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
155#else
156#define pte_offset_map(dir, address) \
157 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
158#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
159#define pte_unmap(pte) do { } while (0)
160#define pte_unmap_nested(pte) do { } while (0)
161#endif
162
163/* Clear a kernel PTE and flush it from the TLB */
164#define kpte_clear_flush(ptep, vaddr) \
165do { \
166 pte_clear(&init_mm, (vaddr), (ptep)); \
167 __flush_tlb_one((vaddr)); \
168} while (0)
169
170/*
171 * The i386 doesn't have any external MMU info: the kernel page
172 * tables contain all the necessary information.
173 */
174#define update_mmu_cache(vma, address, pte) do { } while (0)
175
176#endif /* !__ASSEMBLY__ */
177
178/*
179 * kern_addr_valid() is (1) for FLATMEM and (0) for
180 * SPARSEMEM and DISCONTIGMEM
181 */
182#ifdef CONFIG_FLATMEM
183#define kern_addr_valid(addr) (1)
184#else
185#define kern_addr_valid(kaddr) (0)
186#endif
187
188#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
189 remap_pfn_range(vma, vaddr, pfn, size, prot)
190
191#endif /* _ASM_X86_PGTABLE_32_H */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
new file mode 100644
index 000000000000..545a0e042bb2
--- /dev/null
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -0,0 +1,285 @@
1#ifndef _ASM_X86_PGTABLE_64_H
2#define _ASM_X86_PGTABLE_64_H
3
4#include <linux/const.h>
5#ifndef __ASSEMBLY__
6
7/*
8 * This file contains the functions and defines necessary to modify and use
9 * the x86-64 page table tree.
10 */
11#include <asm/processor.h>
12#include <linux/bitops.h>
13#include <linux/threads.h>
14#include <asm/pda.h>
15
16extern pud_t level3_kernel_pgt[512];
17extern pud_t level3_ident_pgt[512];
18extern pmd_t level2_kernel_pgt[512];
19extern pmd_t level2_fixmap_pgt[512];
20extern pmd_t level2_ident_pgt[512];
21extern pgd_t init_level4_pgt[];
22
23#define swapper_pg_dir init_level4_pgt
24
25extern void paging_init(void);
26
27#endif /* !__ASSEMBLY__ */
28
29#define SHARED_KERNEL_PMD 0
30
31/*
32 * PGDIR_SHIFT determines what a top-level page table entry can map
33 */
34#define PGDIR_SHIFT 39
35#define PTRS_PER_PGD 512
36
37/*
38 * 3rd level page
39 */
40#define PUD_SHIFT 30
41#define PTRS_PER_PUD 512
42
43/*
44 * PMD_SHIFT determines the size of the area a middle-level
45 * page table can map
46 */
47#define PMD_SHIFT 21
48#define PTRS_PER_PMD 512
49
50/*
51 * entries per page directory level
52 */
53#define PTRS_PER_PTE 512
54
55#ifndef __ASSEMBLY__
56
57#define pte_ERROR(e) \
58 printk("%s:%d: bad pte %p(%016lx).\n", \
59 __FILE__, __LINE__, &(e), pte_val(e))
60#define pmd_ERROR(e) \
61 printk("%s:%d: bad pmd %p(%016lx).\n", \
62 __FILE__, __LINE__, &(e), pmd_val(e))
63#define pud_ERROR(e) \
64 printk("%s:%d: bad pud %p(%016lx).\n", \
65 __FILE__, __LINE__, &(e), pud_val(e))
66#define pgd_ERROR(e) \
67 printk("%s:%d: bad pgd %p(%016lx).\n", \
68 __FILE__, __LINE__, &(e), pgd_val(e))
69
70#define pgd_none(x) (!pgd_val(x))
71#define pud_none(x) (!pud_val(x))
72
73struct mm_struct;
74
75void set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte);
76
77
78static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
79 pte_t *ptep)
80{
81 *ptep = native_make_pte(0);
82}
83
84static inline void native_set_pte(pte_t *ptep, pte_t pte)
85{
86 *ptep = pte;
87}
88
89static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
90{
91 native_set_pte(ptep, pte);
92}
93
94static inline pte_t native_ptep_get_and_clear(pte_t *xp)
95{
96#ifdef CONFIG_SMP
97 return native_make_pte(xchg(&xp->pte, 0));
98#else
99 /* native_local_ptep_get_and_clear,
100 but duplicated because of cyclic dependency */
101 pte_t ret = *xp;
102 native_pte_clear(NULL, 0, xp);
103 return ret;
104#endif
105}
106
107static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
108{
109 *pmdp = pmd;
110}
111
112static inline void native_pmd_clear(pmd_t *pmd)
113{
114 native_set_pmd(pmd, native_make_pmd(0));
115}
116
117static inline void native_set_pud(pud_t *pudp, pud_t pud)
118{
119 *pudp = pud;
120}
121
122static inline void native_pud_clear(pud_t *pud)
123{
124 native_set_pud(pud, native_make_pud(0));
125}
126
127static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
128{
129 *pgdp = pgd;
130}
131
132static inline void native_pgd_clear(pgd_t *pgd)
133{
134 native_set_pgd(pgd, native_make_pgd(0));
135}
136
137#define pte_same(a, b) ((a).pte == (b).pte)
138
139#endif /* !__ASSEMBLY__ */
140
141#define PMD_SIZE (_AC(1, UL) << PMD_SHIFT)
142#define PMD_MASK (~(PMD_SIZE - 1))
143#define PUD_SIZE (_AC(1, UL) << PUD_SHIFT)
144#define PUD_MASK (~(PUD_SIZE - 1))
145#define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT)
146#define PGDIR_MASK (~(PGDIR_SIZE - 1))
147
148
149#define MAXMEM _AC(0x00003fffffffffff, UL)
150#define VMALLOC_START _AC(0xffffc20000000000, UL)
151#define VMALLOC_END _AC(0xffffe1ffffffffff, UL)
152#define VMEMMAP_START _AC(0xffffe20000000000, UL)
153#define MODULES_VADDR _AC(0xffffffffa0000000, UL)
154#define MODULES_END _AC(0xffffffffff000000, UL)
155#define MODULES_LEN (MODULES_END - MODULES_VADDR)
156
157#ifndef __ASSEMBLY__
158
159static inline int pgd_bad(pgd_t pgd)
160{
161 return (pgd_val(pgd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
162}
163
164static inline int pud_bad(pud_t pud)
165{
166 return (pud_val(pud) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
167}
168
169static inline int pmd_bad(pmd_t pmd)
170{
171 return (pmd_val(pmd) & ~(PTE_PFN_MASK | _PAGE_USER)) != _KERNPG_TABLE;
172}
173
174#define pte_none(x) (!pte_val((x)))
175#define pte_present(x) (pte_val((x)) & (_PAGE_PRESENT | _PAGE_PROTNONE))
176
177#define pages_to_mb(x) ((x) >> (20 - PAGE_SHIFT)) /* FIXME: is this right? */
178
179/*
180 * Macro to mark a page protection value as "uncacheable".
181 */
182#define pgprot_noncached(prot) \
183 (__pgprot(pgprot_val((prot)) | _PAGE_PCD | _PAGE_PWT))
184
185/*
186 * Conversion functions: convert a page and protection to a page entry,
187 * and a page entry and page directory to the page they refer to.
188 */
189
190/*
191 * Level 4 access.
192 */
193#define pgd_page_vaddr(pgd) \
194 ((unsigned long)__va((unsigned long)pgd_val((pgd)) & PTE_PFN_MASK))
195#define pgd_page(pgd) (pfn_to_page(pgd_val((pgd)) >> PAGE_SHIFT))
196#define pgd_present(pgd) (pgd_val(pgd) & _PAGE_PRESENT)
197static inline int pgd_large(pgd_t pgd) { return 0; }
198#define mk_kernel_pgd(address) __pgd((address) | _KERNPG_TABLE)
199
200/* PUD - Level3 access */
201/* to find an entry in a page-table-directory. */
202#define pud_page_vaddr(pud) \
203 ((unsigned long)__va(pud_val((pud)) & PHYSICAL_PAGE_MASK))
204#define pud_page(pud) (pfn_to_page(pud_val((pud)) >> PAGE_SHIFT))
205#define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
206#define pud_offset(pgd, address) \
207 ((pud_t *)pgd_page_vaddr(*(pgd)) + pud_index((address)))
208#define pud_present(pud) (pud_val((pud)) & _PAGE_PRESENT)
209
210static inline int pud_large(pud_t pte)
211{
212 return (pud_val(pte) & (_PAGE_PSE | _PAGE_PRESENT)) ==
213 (_PAGE_PSE | _PAGE_PRESENT);
214}
215
216/* PMD - Level 2 access */
217#define pmd_page_vaddr(pmd) ((unsigned long) __va(pmd_val((pmd)) & PTE_PFN_MASK))
218#define pmd_page(pmd) (pfn_to_page(pmd_val((pmd)) >> PAGE_SHIFT))
219
220#define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
221#define pmd_offset(dir, address) ((pmd_t *)pud_page_vaddr(*(dir)) + \
222 pmd_index(address))
223#define pmd_none(x) (!pmd_val((x)))
224#define pmd_present(x) (pmd_val((x)) & _PAGE_PRESENT)
225#define pfn_pmd(nr, prot) (__pmd(((nr) << PAGE_SHIFT) | pgprot_val((prot))))
226#define pmd_pfn(x) ((pmd_val((x)) & __PHYSICAL_MASK) >> PAGE_SHIFT)
227
228#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
229#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) | \
230 _PAGE_FILE })
231#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
232
233/* PTE - Level 1 access. */
234
235/* page, protection -> pte */
236#define mk_pte(page, pgprot) pfn_pte(page_to_pfn((page)), (pgprot))
237
238#define pte_index(address) (((address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
239#define pte_offset_kernel(dir, address) ((pte_t *) pmd_page_vaddr(*(dir)) + \
240 pte_index((address)))
241
242/* x86-64 always has all page tables mapped. */
243#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
244#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
245#define pte_unmap(pte) /* NOP */
246#define pte_unmap_nested(pte) /* NOP */
247
248#define update_mmu_cache(vma, address, pte) do { } while (0)
249
250extern int direct_gbpages;
251
252/* Encode and de-code a swap entry */
253#define __swp_type(x) (((x).val >> 1) & 0x3f)
254#define __swp_offset(x) ((x).val >> 8)
255#define __swp_entry(type, offset) ((swp_entry_t) { ((type) << 1) | \
256 ((offset) << 8) })
257#define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) })
258#define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val })
259
260extern int kern_addr_valid(unsigned long addr);
261extern void cleanup_highmap(void);
262
263#define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \
264 remap_pfn_range(vma, vaddr, pfn, size, prot)
265
266#define HAVE_ARCH_UNMAPPED_AREA
267#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
268
269#define pgtable_cache_init() do { } while (0)
270#define check_pgt_cache() do { } while (0)
271
272#define PAGE_AGP PAGE_KERNEL_NOCACHE
273#define HAVE_PAGE_AGP 1
274
275/* fs/proc/kcore.c */
276#define kc_vaddr_to_offset(v) ((v) & __VIRTUAL_MASK)
277#define kc_offset_to_vaddr(o) \
278 (((o) & (1UL << (__VIRTUAL_MASK_SHIFT - 1))) \
279 ? ((o) | ~__VIRTUAL_MASK) \
280 : (o))
281
282#define __HAVE_ARCH_PTE_SAME
283#endif /* !__ASSEMBLY__ */
284
285#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/poll.h b/arch/x86/include/asm/poll.h
new file mode 100644
index 000000000000..c98509d3149e
--- /dev/null
+++ b/arch/x86/include/asm/poll.h
@@ -0,0 +1 @@
#include <asm-generic/poll.h>
diff --git a/arch/x86/include/asm/posix_types.h b/arch/x86/include/asm/posix_types.h
new file mode 100644
index 000000000000..bb7133dc155d
--- /dev/null
+++ b/arch/x86/include/asm/posix_types.h
@@ -0,0 +1,13 @@
1#ifdef __KERNEL__
2# ifdef CONFIG_X86_32
3# include "posix_types_32.h"
4# else
5# include "posix_types_64.h"
6# endif
7#else
8# ifdef __i386__
9# include "posix_types_32.h"
10# else
11# include "posix_types_64.h"
12# endif
13#endif
diff --git a/arch/x86/include/asm/posix_types_32.h b/arch/x86/include/asm/posix_types_32.h
new file mode 100644
index 000000000000..f7d9adf82e53
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_32.h
@@ -0,0 +1,85 @@
1#ifndef _ASM_X86_POSIX_TYPES_32_H
2#define _ASM_X86_POSIX_TYPES_32_H
3
4/*
5 * This file is generally used by user-level software, so you need to
6 * be a little careful about namespace pollution etc. Also, we cannot
7 * assume GCC is being used.
8 */
9
10typedef unsigned long __kernel_ino_t;
11typedef unsigned short __kernel_mode_t;
12typedef unsigned short __kernel_nlink_t;
13typedef long __kernel_off_t;
14typedef int __kernel_pid_t;
15typedef unsigned short __kernel_ipc_pid_t;
16typedef unsigned short __kernel_uid_t;
17typedef unsigned short __kernel_gid_t;
18typedef unsigned int __kernel_size_t;
19typedef int __kernel_ssize_t;
20typedef int __kernel_ptrdiff_t;
21typedef long __kernel_time_t;
22typedef long __kernel_suseconds_t;
23typedef long __kernel_clock_t;
24typedef int __kernel_timer_t;
25typedef int __kernel_clockid_t;
26typedef int __kernel_daddr_t;
27typedef char * __kernel_caddr_t;
28typedef unsigned short __kernel_uid16_t;
29typedef unsigned short __kernel_gid16_t;
30typedef unsigned int __kernel_uid32_t;
31typedef unsigned int __kernel_gid32_t;
32
33typedef unsigned short __kernel_old_uid_t;
34typedef unsigned short __kernel_old_gid_t;
35typedef unsigned short __kernel_old_dev_t;
36
37#ifdef __GNUC__
38typedef long long __kernel_loff_t;
39#endif
40
41typedef struct {
42 int val[2];
43} __kernel_fsid_t;
44
45#if defined(__KERNEL__)
46
47#undef __FD_SET
48#define __FD_SET(fd,fdsetp) \
49 asm volatile("btsl %1,%0": \
50 "+m" (*(__kernel_fd_set *)(fdsetp)) \
51 : "r" ((int)(fd)))
52
53#undef __FD_CLR
54#define __FD_CLR(fd,fdsetp) \
55 asm volatile("btrl %1,%0": \
56 "+m" (*(__kernel_fd_set *)(fdsetp)) \
57 : "r" ((int) (fd)))
58
59#undef __FD_ISSET
60#define __FD_ISSET(fd,fdsetp) \
61 (__extension__ \
62 ({ \
63 unsigned char __result; \
64 asm volatile("btl %1,%2 ; setb %0" \
65 : "=q" (__result) \
66 : "r" ((int)(fd)), \
67 "m" (*(__kernel_fd_set *)(fdsetp))); \
68 __result; \
69}))
70
71#undef __FD_ZERO
72#define __FD_ZERO(fdsetp) \
73do { \
74 int __d0, __d1; \
75 asm volatile("cld ; rep ; stosl" \
76 : "=m" (*(__kernel_fd_set *)(fdsetp)), \
77 "=&c" (__d0), "=&D" (__d1) \
78 : "a" (0), "1" (__FDSET_LONGS), \
79 "2" ((__kernel_fd_set *)(fdsetp)) \
80 : "memory"); \
81} while (0)
82
83#endif /* defined(__KERNEL__) */
84
85#endif /* _ASM_X86_POSIX_TYPES_32_H */
diff --git a/arch/x86/include/asm/posix_types_64.h b/arch/x86/include/asm/posix_types_64.h
new file mode 100644
index 000000000000..eb8d2d92b63e
--- /dev/null
+++ b/arch/x86/include/asm/posix_types_64.h
@@ -0,0 +1,119 @@
1#ifndef _ASM_X86_POSIX_TYPES_64_H
2#define _ASM_X86_POSIX_TYPES_64_H
3
4/*
5 * This file is generally used by user-level software, so you need to
6 * be a little careful about namespace pollution etc. Also, we cannot
7 * assume GCC is being used.
8 */
9
10typedef unsigned long __kernel_ino_t;
11typedef unsigned int __kernel_mode_t;
12typedef unsigned long __kernel_nlink_t;
13typedef long __kernel_off_t;
14typedef int __kernel_pid_t;
15typedef int __kernel_ipc_pid_t;
16typedef unsigned int __kernel_uid_t;
17typedef unsigned int __kernel_gid_t;
18typedef unsigned long __kernel_size_t;
19typedef long __kernel_ssize_t;
20typedef long __kernel_ptrdiff_t;
21typedef long __kernel_time_t;
22typedef long __kernel_suseconds_t;
23typedef long __kernel_clock_t;
24typedef int __kernel_timer_t;
25typedef int __kernel_clockid_t;
26typedef int __kernel_daddr_t;
27typedef char * __kernel_caddr_t;
28typedef unsigned short __kernel_uid16_t;
29typedef unsigned short __kernel_gid16_t;
30
31#ifdef __GNUC__
32typedef long long __kernel_loff_t;
33#endif
34
35typedef struct {
36 int val[2];
37} __kernel_fsid_t;
38
39typedef unsigned short __kernel_old_uid_t;
40typedef unsigned short __kernel_old_gid_t;
41typedef __kernel_uid_t __kernel_uid32_t;
42typedef __kernel_gid_t __kernel_gid32_t;
43
44typedef unsigned long __kernel_old_dev_t;
45
46#ifdef __KERNEL__
47
48#undef __FD_SET
49static inline void __FD_SET(unsigned long fd, __kernel_fd_set *fdsetp)
50{
51 unsigned long _tmp = fd / __NFDBITS;
52 unsigned long _rem = fd % __NFDBITS;
53 fdsetp->fds_bits[_tmp] |= (1UL<<_rem);
54}
55
56#undef __FD_CLR
57static inline void __FD_CLR(unsigned long fd, __kernel_fd_set *fdsetp)
58{
59 unsigned long _tmp = fd / __NFDBITS;
60 unsigned long _rem = fd % __NFDBITS;
61 fdsetp->fds_bits[_tmp] &= ~(1UL<<_rem);
62}
63
64#undef __FD_ISSET
65static inline int __FD_ISSET(unsigned long fd, __const__ __kernel_fd_set *p)
66{
67 unsigned long _tmp = fd / __NFDBITS;
68 unsigned long _rem = fd % __NFDBITS;
69 return (p->fds_bits[_tmp] & (1UL<<_rem)) != 0;
70}
71
72/*
73 * This will unroll the loop for the normal constant cases (8 or 32 longs,
74 * for 256 and 1024-bit fd_sets respectively)
75 */
76#undef __FD_ZERO
77static inline void __FD_ZERO(__kernel_fd_set *p)
78{
79 unsigned long *tmp = p->fds_bits;
80 int i;
81
82 if (__builtin_constant_p(__FDSET_LONGS)) {
83 switch (__FDSET_LONGS) {
84 case 32:
85 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
86 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
87 tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
88 tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
89 tmp[16] = 0; tmp[17] = 0; tmp[18] = 0; tmp[19] = 0;
90 tmp[20] = 0; tmp[21] = 0; tmp[22] = 0; tmp[23] = 0;
91 tmp[24] = 0; tmp[25] = 0; tmp[26] = 0; tmp[27] = 0;
92 tmp[28] = 0; tmp[29] = 0; tmp[30] = 0; tmp[31] = 0;
93 return;
94 case 16:
95 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
96 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
97 tmp[ 8] = 0; tmp[ 9] = 0; tmp[10] = 0; tmp[11] = 0;
98 tmp[12] = 0; tmp[13] = 0; tmp[14] = 0; tmp[15] = 0;
99 return;
100 case 8:
101 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
102 tmp[ 4] = 0; tmp[ 5] = 0; tmp[ 6] = 0; tmp[ 7] = 0;
103 return;
104 case 4:
105 tmp[ 0] = 0; tmp[ 1] = 0; tmp[ 2] = 0; tmp[ 3] = 0;
106 return;
107 }
108 }
109 i = __FDSET_LONGS;
110 while (i) {
111 i--;
112 *tmp = 0;
113 tmp++;
114 }
115}
116
117#endif /* defined(__KERNEL__) */
118
119#endif /* _ASM_X86_POSIX_TYPES_64_H */
diff --git a/arch/x86/include/asm/prctl.h b/arch/x86/include/asm/prctl.h
new file mode 100644
index 000000000000..fe681147a4f7
--- /dev/null
+++ b/arch/x86/include/asm/prctl.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_PRCTL_H
2#define _ASM_X86_PRCTL_H
3
4#define ARCH_SET_GS 0x1001
5#define ARCH_SET_FS 0x1002
6#define ARCH_GET_FS 0x1003
7#define ARCH_GET_GS 0x1004
8
9
10#endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/include/asm/processor-cyrix.h b/arch/x86/include/asm/processor-cyrix.h
new file mode 100644
index 000000000000..1198f2a0e42c
--- /dev/null
+++ b/arch/x86/include/asm/processor-cyrix.h
@@ -0,0 +1,38 @@
1/*
2 * NSC/Cyrix CPU indexed register access. Must be inlined instead of
3 * macros to ensure correct access ordering
4 * Access order is always 0x22 (=offset), 0x23 (=value)
5 *
6 * When using the old macros a line like
7 * setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88);
8 * gets expanded to:
9 * do {
10 * outb((CX86_CCR2), 0x22);
11 * outb((({
12 * outb((CX86_CCR2), 0x22);
13 * inb(0x23);
14 * }) | 0x88), 0x23);
15 * } while (0);
16 *
17 * which in fact violates the access order (= 0x22, 0x22, 0x23, 0x23).
18 */
19
20static inline u8 getCx86(u8 reg)
21{
22 outb(reg, 0x22);
23 return inb(0x23);
24}
25
26static inline void setCx86(u8 reg, u8 data)
27{
28 outb(reg, 0x22);
29 outb(data, 0x23);
30}
31
32#define getCx86_old(reg) ({ outb((reg), 0x22); inb(0x23); })
33
34#define setCx86_old(reg, data) do { \
35 outb((reg), 0x22); \
36 outb((data), 0x23); \
37} while (0)
38
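/*
 * Informal usage sketch (added for illustration, not part of the original
 * header): the inline helpers keep the mandatory 0x22-then-0x23 ordering
 * for each individual access, so a read-modify-write of CCR2 becomes
 *
 *	u8 ccr2 = getCx86(CX86_CCR2);
 *	setCx86(CX86_CCR2, ccr2 | 0x88);
 *
 * which performs 0x22, 0x23, 0x22, 0x23 as required, instead of the broken
 * 0x22, 0x22, 0x23, 0x23 sequence produced by nesting the old macros.
 */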
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
new file mode 100644
index 000000000000..7a3e836eb2a9
--- /dev/null
+++ b/arch/x86/include/asm/processor-flags.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_PROCESSOR_FLAGS_H
2#define _ASM_X86_PROCESSOR_FLAGS_H
3/* Various flags defined: can be included from assembler. */
4
5/*
6 * EFLAGS bits
7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF	0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
14#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */
15#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */
16#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */
17#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */
18#define X86_EFLAGS_NT 0x00004000 /* Nested Task */
19#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */
20#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */
21#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */
22#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */
23#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */
24#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */
25
26/*
27 * Basic CPU control in CR0
28 */
29#define X86_CR0_PE 0x00000001 /* Protection Enable */
30#define X86_CR0_MP 0x00000002 /* Monitor Coprocessor */
31#define X86_CR0_EM 0x00000004 /* Emulation */
32#define X86_CR0_TS 0x00000008 /* Task Switched */
33#define X86_CR0_ET 0x00000010 /* Extension Type */
34#define X86_CR0_NE 0x00000020 /* Numeric Error */
35#define X86_CR0_WP 0x00010000 /* Write Protect */
36#define X86_CR0_AM 0x00040000 /* Alignment Mask */
37#define X86_CR0_NW 0x20000000 /* Not Write-through */
38#define X86_CR0_CD 0x40000000 /* Cache Disable */
39#define X86_CR0_PG 0x80000000 /* Paging */
40
41/*
42 * Paging options in CR3
43 */
44#define X86_CR3_PWT 0x00000008 /* Page Write Through */
45#define X86_CR3_PCD 0x00000010 /* Page Cache Disable */
46
47/*
48 * Intel CPU features in CR4
49 */
50#define X86_CR4_VME 0x00000001 /* enable vm86 extensions */
51#define X86_CR4_PVI 0x00000002 /* virtual interrupts flag enable */
52#define X86_CR4_TSD 0x00000004 /* disable time stamp at ipl 3 */
53#define X86_CR4_DE 0x00000008 /* enable debugging extensions */
54#define X86_CR4_PSE 0x00000010 /* enable page size extensions */
55#define X86_CR4_PAE 0x00000020 /* enable physical address extensions */
56#define X86_CR4_MCE 0x00000040 /* Machine check enable */
57#define X86_CR4_PGE 0x00000080 /* enable global pages */
58#define X86_CR4_PCE 0x00000100 /* enable performance counters at ipl 3 */
59#define X86_CR4_OSFXSR 0x00000200 /* enable fast FPU save and restore */
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63
64/*
65 * x86-64 Task Priority Register, CR8
66 */
67#define X86_CR8_TPR 0x0000000F /* task priority register */
68
69/*
70 * AMD and Transmeta use MSRs for configuration; see <asm/msr-index.h>
71 */
72
73/*
74 * NSC/Cyrix CPU configuration register indexes
75 */
76#define CX86_PCR0 0x20
77#define CX86_GCR 0xb8
78#define CX86_CCR0 0xc0
79#define CX86_CCR1 0xc1
80#define CX86_CCR2 0xc2
81#define CX86_CCR3 0xc3
82#define CX86_CCR4 0xe8
83#define CX86_CCR5 0xe9
84#define CX86_CCR6 0xea
85#define CX86_CCR7 0xeb
86#define CX86_PCR1 0xf0
87#define CX86_DIR0 0xfe
88#define CX86_DIR1 0xff
89#define CX86_ARR_BASE 0xc4
90#define CX86_RCR_BASE 0xdc
91
92#ifdef __KERNEL__
93#ifdef CONFIG_VM86
94#define X86_VM_MASK X86_EFLAGS_VM
95#else
96#define X86_VM_MASK 0 /* No VM86 support */
97#endif
98#endif
99
100#endif /* _ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
new file mode 100644
index 000000000000..5ca01e383269
--- /dev/null
+++ b/arch/x86/include/asm/processor.h
@@ -0,0 +1,936 @@
1#ifndef _ASM_X86_PROCESSOR_H
2#define _ASM_X86_PROCESSOR_H
3
4#include <asm/processor-flags.h>
5
6/* Forward declaration, a strange C thing */
7struct task_struct;
8struct mm_struct;
9
10#include <asm/vm86.h>
11#include <asm/math_emu.h>
12#include <asm/segment.h>
13#include <asm/types.h>
14#include <asm/sigcontext.h>
15#include <asm/current.h>
16#include <asm/cpufeature.h>
17#include <asm/system.h>
18#include <asm/page.h>
19#include <asm/percpu.h>
20#include <asm/msr.h>
21#include <asm/desc_defs.h>
22#include <asm/nops.h>
23#include <asm/ds.h>
24
25#include <linux/personality.h>
26#include <linux/cpumask.h>
27#include <linux/cache.h>
28#include <linux/threads.h>
29#include <linux/init.h>
30
31/*
32 * Default implementation of macro that returns current
33 * instruction pointer ("program counter").
34 */
35static inline void *current_text_addr(void)
36{
37 void *pc;
38
39 asm volatile("mov $1f, %0; 1:":"=r" (pc));
40
41 return pc;
42}
43
44#ifdef CONFIG_X86_VSMP
45# define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT)
46# define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT)
47#else
48# define ARCH_MIN_TASKALIGN 16
49# define ARCH_MIN_MMSTRUCT_ALIGN 0
50#endif
51
52/*
53 * CPU type and hardware bug flags. Kept separately for each CPU.
54 * Members of this structure are referenced in head.S, so think twice
55 * before touching them. [mj]
56 */
57
58struct cpuinfo_x86 {
59 __u8 x86; /* CPU family */
60 __u8 x86_vendor; /* CPU vendor */
61 __u8 x86_model;
62 __u8 x86_mask;
63#ifdef CONFIG_X86_32
64 char wp_works_ok; /* It doesn't on 386's */
65
66 /* Problems on some 486Dx4's and old 386's: */
67 char hlt_works_ok;
68 char hard_math;
69 char rfu;
70 char fdiv_bug;
71 char f00f_bug;
72 char coma_bug;
73 char pad0;
74#else
75	/* Number of 4K pages in DTLB/ITLB combined (in pages): */
76 int x86_tlbsize;
77 __u8 x86_virt_bits;
78 __u8 x86_phys_bits;
79#endif
80 /* CPUID returned core id bits: */
81 __u8 x86_coreid_bits;
82 /* Max extended CPUID function supported: */
83 __u32 extended_cpuid_level;
84 /* Maximum supported CPUID level, -1=no CPUID: */
85 int cpuid_level;
86 __u32 x86_capability[NCAPINTS];
87 char x86_vendor_id[16];
88 char x86_model_id[64];
89	/* in KB - valid for CPUs which support this call: */
90 int x86_cache_size;
91 int x86_cache_alignment; /* In bytes */
92 int x86_power;
93 unsigned long loops_per_jiffy;
94#ifdef CONFIG_SMP
95 /* cpus sharing the last level cache: */
96 cpumask_t llc_shared_map;
97#endif
98 /* cpuid returned max cores value: */
99 u16 x86_max_cores;
100 u16 apicid;
101 u16 initial_apicid;
102 u16 x86_clflush_size;
103#ifdef CONFIG_SMP
104 /* number of cores as seen by the OS: */
105 u16 booted_cores;
106 /* Physical processor id: */
107 u16 phys_proc_id;
108 /* Core id: */
109 u16 cpu_core_id;
110 /* Index into per_cpu list: */
111 u16 cpu_index;
112#endif
113} __attribute__((__aligned__(SMP_CACHE_BYTES)));
114
115#define X86_VENDOR_INTEL 0
116#define X86_VENDOR_CYRIX 1
117#define X86_VENDOR_AMD 2
118#define X86_VENDOR_UMC 3
119#define X86_VENDOR_CENTAUR 5
120#define X86_VENDOR_TRANSMETA 7
121#define X86_VENDOR_NSC 8
122#define X86_VENDOR_NUM 9
123
124#define X86_VENDOR_UNKNOWN 0xff
125
126/*
127 * capabilities of CPUs
128 */
129extern struct cpuinfo_x86 boot_cpu_data;
130extern struct cpuinfo_x86 new_cpu_data;
131
132extern struct tss_struct doublefault_tss;
133extern __u32 cleared_cpu_caps[NCAPINTS];
134
135#ifdef CONFIG_SMP
136DECLARE_PER_CPU(struct cpuinfo_x86, cpu_info);
137#define cpu_data(cpu) per_cpu(cpu_info, cpu)
138#define current_cpu_data __get_cpu_var(cpu_info)
139#else
140#define cpu_data(cpu) boot_cpu_data
141#define current_cpu_data boot_cpu_data
142#endif
143
144extern const struct seq_operations cpuinfo_op;
145
146static inline int hlt_works(int cpu)
147{
148#ifdef CONFIG_X86_32
149 return cpu_data(cpu).hlt_works_ok;
150#else
151 return 1;
152#endif
153}
154
155#define cache_line_size() (boot_cpu_data.x86_cache_alignment)
156
157extern void cpu_detect(struct cpuinfo_x86 *c);
158
159extern struct pt_regs *idle_regs(struct pt_regs *);
160
161extern void early_cpu_init(void);
162extern void identify_boot_cpu(void);
163extern void identify_secondary_cpu(struct cpuinfo_x86 *);
164extern void print_cpu_info(struct cpuinfo_x86 *);
165extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
166extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
167extern unsigned short num_cache_leaves;
168
169extern void detect_extended_topology(struct cpuinfo_x86 *c);
170extern void detect_ht(struct cpuinfo_x86 *c);
171
172static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
173 unsigned int *ecx, unsigned int *edx)
174{
175 /* ecx is often an input as well as an output. */
176 asm("cpuid"
177 : "=a" (*eax),
178 "=b" (*ebx),
179 "=c" (*ecx),
180 "=d" (*edx)
181 : "0" (*eax), "2" (*ecx));
182}
183
184static inline void load_cr3(pgd_t *pgdir)
185{
186 write_cr3(__pa(pgdir));
187}
188
189#ifdef CONFIG_X86_32
190/* This is the TSS defined by the hardware. */
191struct x86_hw_tss {
192 unsigned short back_link, __blh;
193 unsigned long sp0;
194 unsigned short ss0, __ss0h;
195 unsigned long sp1;
196 /* ss1 caches MSR_IA32_SYSENTER_CS: */
197 unsigned short ss1, __ss1h;
198 unsigned long sp2;
199 unsigned short ss2, __ss2h;
200 unsigned long __cr3;
201 unsigned long ip;
202 unsigned long flags;
203 unsigned long ax;
204 unsigned long cx;
205 unsigned long dx;
206 unsigned long bx;
207 unsigned long sp;
208 unsigned long bp;
209 unsigned long si;
210 unsigned long di;
211 unsigned short es, __esh;
212 unsigned short cs, __csh;
213 unsigned short ss, __ssh;
214 unsigned short ds, __dsh;
215 unsigned short fs, __fsh;
216 unsigned short gs, __gsh;
217 unsigned short ldt, __ldth;
218 unsigned short trace;
219 unsigned short io_bitmap_base;
220
221} __attribute__((packed));
222#else
223struct x86_hw_tss {
224 u32 reserved1;
225 u64 sp0;
226 u64 sp1;
227 u64 sp2;
228 u64 reserved2;
229 u64 ist[7];
230 u32 reserved3;
231 u32 reserved4;
232 u16 reserved5;
233 u16 io_bitmap_base;
234
235} __attribute__((packed)) ____cacheline_aligned;
236#endif
237
238/*
239 * IO-bitmap sizes:
240 */
241#define IO_BITMAP_BITS 65536
242#define IO_BITMAP_BYTES (IO_BITMAP_BITS/8)
243#define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long))
244#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap)
245#define INVALID_IO_BITMAP_OFFSET 0x8000
246#define INVALID_IO_BITMAP_OFFSET_LAZY 0x9000
247
248struct tss_struct {
249 /*
250 * The hardware state:
251 */
252 struct x86_hw_tss x86_tss;
253
254 /*
255 * The extra 1 is there because the CPU will access an
256 * additional byte beyond the end of the IO permission
257 * bitmap. The extra byte must be all 1 bits, and must
258 * be within the limit.
259 */
260 unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
261 /*
262 * Cache the current maximum and the last task that used the bitmap:
263 */
264 unsigned long io_bitmap_max;
265 struct thread_struct *io_bitmap_owner;
266
267 /*
268 * .. and then another 0x100 bytes for the emergency kernel stack:
269 */
270 unsigned long stack[64];
271
272} ____cacheline_aligned;
273
274DECLARE_PER_CPU(struct tss_struct, init_tss);
275
276/*
277 * Save the original ist values for checking stack pointers during debugging
278 */
279struct orig_ist {
280 unsigned long ist[7];
281};
282
283#define MXCSR_DEFAULT 0x1f80
284
285struct i387_fsave_struct {
286 u32 cwd; /* FPU Control Word */
287 u32 swd; /* FPU Status Word */
288 u32 twd; /* FPU Tag Word */
289 u32 fip; /* FPU IP Offset */
290 u32 fcs; /* FPU IP Selector */
291 u32 foo; /* FPU Operand Pointer Offset */
292 u32 fos; /* FPU Operand Pointer Selector */
293
294 /* 8*10 bytes for each FP-reg = 80 bytes: */
295 u32 st_space[20];
296
297 /* Software status information [not touched by FSAVE ]: */
298 u32 status;
299};
300
301struct i387_fxsave_struct {
302 u16 cwd; /* Control Word */
303 u16 swd; /* Status Word */
304 u16 twd; /* Tag Word */
305 u16 fop; /* Last Instruction Opcode */
306 union {
307 struct {
308 u64 rip; /* Instruction Pointer */
309 u64 rdp; /* Data Pointer */
310 };
311 struct {
312 u32 fip; /* FPU IP Offset */
313 u32 fcs; /* FPU IP Selector */
314 u32 foo; /* FPU Operand Offset */
315 u32 fos; /* FPU Operand Selector */
316 };
317 };
318 u32 mxcsr; /* MXCSR Register State */
319 u32 mxcsr_mask; /* MXCSR Mask */
320
321 /* 8*16 bytes for each FP-reg = 128 bytes: */
322 u32 st_space[32];
323
324 /* 16*16 bytes for each XMM-reg = 256 bytes: */
325 u32 xmm_space[64];
326
327 u32 padding[12];
328
329 union {
330 u32 padding1[12];
331 u32 sw_reserved[12];
332 };
333
334} __attribute__((aligned(16)));
335
336struct i387_soft_struct {
337 u32 cwd;
338 u32 swd;
339 u32 twd;
340 u32 fip;
341 u32 fcs;
342 u32 foo;
343 u32 fos;
344 /* 8*10 bytes for each FP-reg = 80 bytes: */
345 u32 st_space[20];
346 u8 ftop;
347 u8 changed;
348 u8 lookahead;
349 u8 no_update;
350 u8 rm;
351 u8 alimit;
352 struct info *info;
353 u32 entry_eip;
354};
355
356struct xsave_hdr_struct {
357 u64 xstate_bv;
358 u64 reserved1[2];
359 u64 reserved2[5];
360} __attribute__((packed));
361
362struct xsave_struct {
363 struct i387_fxsave_struct i387;
364 struct xsave_hdr_struct xsave_hdr;
365 /* new processor state extensions will go here */
366} __attribute__ ((packed, aligned (64)));
367
368union thread_xstate {
369 struct i387_fsave_struct fsave;
370 struct i387_fxsave_struct fxsave;
371 struct i387_soft_struct soft;
372 struct xsave_struct xsave;
373};
374
375#ifdef CONFIG_X86_64
376DECLARE_PER_CPU(struct orig_ist, orig_ist);
377#endif
378
379extern void print_cpu_info(struct cpuinfo_x86 *);
380extern unsigned int xstate_size;
381extern void free_thread_xstate(struct task_struct *);
382extern struct kmem_cache *task_xstate_cachep;
383extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
384extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
385extern unsigned short num_cache_leaves;
386
387struct thread_struct {
388 /* Cached TLS descriptors: */
389 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
390 unsigned long sp0;
391 unsigned long sp;
392#ifdef CONFIG_X86_32
393 unsigned long sysenter_cs;
394#else
395 unsigned long usersp; /* Copy from PDA */
396 unsigned short es;
397 unsigned short ds;
398 unsigned short fsindex;
399 unsigned short gsindex;
400#endif
401 unsigned long ip;
402 unsigned long fs;
403 unsigned long gs;
404 /* Hardware debugging registers: */
405 unsigned long debugreg0;
406 unsigned long debugreg1;
407 unsigned long debugreg2;
408 unsigned long debugreg3;
409 unsigned long debugreg6;
410 unsigned long debugreg7;
411 /* Fault info: */
412 unsigned long cr2;
413 unsigned long trap_no;
414 unsigned long error_code;
415 /* floating point and extended processor state */
416 union thread_xstate *xstate;
417#ifdef CONFIG_X86_32
418 /* Virtual 86 mode info */
419 struct vm86_struct __user *vm86_info;
420 unsigned long screen_bitmap;
421 unsigned long v86flags;
422 unsigned long v86mask;
423 unsigned long saved_sp0;
424 unsigned int saved_fs;
425 unsigned int saved_gs;
426#endif
427 /* IO permissions: */
428 unsigned long *io_bitmap_ptr;
429 unsigned long iopl;
430 /* Max allowed port in the bitmap, in bytes: */
431 unsigned io_bitmap_max;
432/* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */
433 unsigned long debugctlmsr;
434#ifdef CONFIG_X86_DS
435/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
436 struct ds_context *ds_ctx;
437#endif /* CONFIG_X86_DS */
438#ifdef CONFIG_X86_PTRACE_BTS
439/* the signal to send on a bts buffer overflow */
440 unsigned int bts_ovfl_signal;
441#endif /* CONFIG_X86_PTRACE_BTS */
442};
443
444static inline unsigned long native_get_debugreg(int regno)
445{
446 unsigned long val = 0; /* Damn you, gcc! */
447
448 switch (regno) {
449 case 0:
450 asm("mov %%db0, %0" :"=r" (val));
451 break;
452 case 1:
453 asm("mov %%db1, %0" :"=r" (val));
454 break;
455 case 2:
456 asm("mov %%db2, %0" :"=r" (val));
457 break;
458 case 3:
459 asm("mov %%db3, %0" :"=r" (val));
460 break;
461 case 6:
462 asm("mov %%db6, %0" :"=r" (val));
463 break;
464 case 7:
465 asm("mov %%db7, %0" :"=r" (val));
466 break;
467 default:
468 BUG();
469 }
470 return val;
471}
472
473static inline void native_set_debugreg(int regno, unsigned long value)
474{
475 switch (regno) {
476 case 0:
477 asm("mov %0, %%db0" ::"r" (value));
478 break;
479 case 1:
480 asm("mov %0, %%db1" ::"r" (value));
481 break;
482 case 2:
483 asm("mov %0, %%db2" ::"r" (value));
484 break;
485 case 3:
486 asm("mov %0, %%db3" ::"r" (value));
487 break;
488 case 6:
489 asm("mov %0, %%db6" ::"r" (value));
490 break;
491 case 7:
492 asm("mov %0, %%db7" ::"r" (value));
493 break;
494 default:
495 BUG();
496 }
497}
498
499/*
500 * Set IOPL bits in EFLAGS from given mask
501 */
502static inline void native_set_iopl_mask(unsigned mask)
503{
504#ifdef CONFIG_X86_32
505 unsigned int reg;
506
507 asm volatile ("pushfl;"
508 "popl %0;"
509 "andl %1, %0;"
510 "orl %2, %0;"
511 "pushl %0;"
512 "popfl"
513 : "=&r" (reg)
514 : "i" (~X86_EFLAGS_IOPL), "r" (mask));
515#endif
516}
517
518static inline void
519native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
520{
521 tss->x86_tss.sp0 = thread->sp0;
522#ifdef CONFIG_X86_32
523 /* Only happens when SEP is enabled, no need to test "SEP"arately: */
524 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
525 tss->x86_tss.ss1 = thread->sysenter_cs;
526 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
527 }
528#endif
529}
530
531static inline void native_swapgs(void)
532{
533#ifdef CONFIG_X86_64
534 asm volatile("swapgs" ::: "memory");
535#endif
536}
537
538#ifdef CONFIG_PARAVIRT
539#include <asm/paravirt.h>
540#else
541#define __cpuid native_cpuid
542#define paravirt_enabled() 0
543
544/*
545 * These special macros can be used to get or set a debugging register
546 */
547#define get_debugreg(var, register) \
548 (var) = native_get_debugreg(register)
549#define set_debugreg(value, register) \
550 native_set_debugreg(register, value)
551
552static inline void load_sp0(struct tss_struct *tss,
553 struct thread_struct *thread)
554{
555 native_load_sp0(tss, thread);
556}
557
558#define set_iopl_mask native_set_iopl_mask
559#endif /* CONFIG_PARAVIRT */
560
561/*
562 * Save the cr4 feature set we're using (ie
563 * Pentium 4MB enable and PPro Global page
564 * enable), so that any CPU's that boot up
565 * after us can get the correct flags.
566 */
567extern unsigned long mmu_cr4_features;
568
569static inline void set_in_cr4(unsigned long mask)
570{
571 unsigned cr4;
572
573 mmu_cr4_features |= mask;
574 cr4 = read_cr4();
575 cr4 |= mask;
576 write_cr4(cr4);
577}
578
579static inline void clear_in_cr4(unsigned long mask)
580{
581 unsigned cr4;
582
583 mmu_cr4_features &= ~mask;
584 cr4 = read_cr4();
585 cr4 &= ~mask;
586 write_cr4(cr4);
587}
588
589typedef struct {
590 unsigned long seg;
591} mm_segment_t;
592
593
594/*
595 * create a kernel thread without removing it from tasklists
596 */
597extern int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
598
599/* Free all resources held by a thread. */
600extern void release_thread(struct task_struct *);
601
602/* Prepare to copy thread state - unlazy all lazy state */
603extern void prepare_to_copy(struct task_struct *tsk);
604
605unsigned long get_wchan(struct task_struct *p);
606
607/*
608 * Generic CPUID function
609 * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx
610 * resulting in stale register contents being returned.
611 */
612static inline void cpuid(unsigned int op,
613 unsigned int *eax, unsigned int *ebx,
614 unsigned int *ecx, unsigned int *edx)
615{
616 *eax = op;
617 *ecx = 0;
618 __cpuid(eax, ebx, ecx, edx);
619}
620
621/* Some CPUID calls want 'count' to be placed in ecx */
622static inline void cpuid_count(unsigned int op, int count,
623 unsigned int *eax, unsigned int *ebx,
624 unsigned int *ecx, unsigned int *edx)
625{
626 *eax = op;
627 *ecx = count;
628 __cpuid(eax, ebx, ecx, edx);
629}
630
631/*
632 * CPUID functions returning a single datum
633 */
634static inline unsigned int cpuid_eax(unsigned int op)
635{
636 unsigned int eax, ebx, ecx, edx;
637
638 cpuid(op, &eax, &ebx, &ecx, &edx);
639
640 return eax;
641}
642
643static inline unsigned int cpuid_ebx(unsigned int op)
644{
645 unsigned int eax, ebx, ecx, edx;
646
647 cpuid(op, &eax, &ebx, &ecx, &edx);
648
649 return ebx;
650}
651
652static inline unsigned int cpuid_ecx(unsigned int op)
653{
654 unsigned int eax, ebx, ecx, edx;
655
656 cpuid(op, &eax, &ebx, &ecx, &edx);
657
658 return ecx;
659}
660
661static inline unsigned int cpuid_edx(unsigned int op)
662{
663 unsigned int eax, ebx, ecx, edx;
664
665 cpuid(op, &eax, &ebx, &ecx, &edx);
666
667 return edx;
668}
669
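/*
 * Illustrative sketch (added for this write-up, not part of the original
 * header): the helpers above compose naturally.  Leaf 0 returns the highest
 * basic leaf in EAX and the 12-byte vendor string split across EBX, EDX,
 * ECX (in that order), so the vendor can be recovered with:
 *
 *	unsigned int eax, ebx, ecx, edx;
 *	char vendor[13];
 *
 *	cpuid(0, &eax, &ebx, &ecx, &edx);
 *	memcpy(vendor + 0, &ebx, 4);
 *	memcpy(vendor + 4, &edx, 4);
 *	memcpy(vendor + 8, &ecx, 4);
 *	vendor[12] = '\0';
 */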
670/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
671static inline void rep_nop(void)
672{
673 asm volatile("rep; nop" ::: "memory");
674}
675
676static inline void cpu_relax(void)
677{
678 rep_nop();
679}
680
681/* Stop speculative execution: */
682static inline void sync_core(void)
683{
684 int tmp;
685
686 asm volatile("cpuid" : "=a" (tmp) : "0" (1)
687 : "ebx", "ecx", "edx", "memory");
688}
689
690static inline void __monitor(const void *eax, unsigned long ecx,
691 unsigned long edx)
692{
693 /* "monitor %eax, %ecx, %edx;" */
694 asm volatile(".byte 0x0f, 0x01, 0xc8;"
695 :: "a" (eax), "c" (ecx), "d"(edx));
696}
697
698static inline void __mwait(unsigned long eax, unsigned long ecx)
699{
700 /* "mwait %eax, %ecx;" */
701 asm volatile(".byte 0x0f, 0x01, 0xc9;"
702 :: "a" (eax), "c" (ecx));
703}
704
705static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
706{
707 trace_hardirqs_on();
708 /* "mwait %eax, %ecx;" */
709 asm volatile("sti; .byte 0x0f, 0x01, 0xc9;"
710 :: "a" (eax), "c" (ecx));
711}
712
713extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
714
715extern void select_idle_routine(const struct cpuinfo_x86 *c);
716
717extern unsigned long boot_option_idle_override;
718extern unsigned long idle_halt;
719extern unsigned long idle_nomwait;
720
721/*
722 * On systems with caches, caches must be flushed as the absolute
723 * last instruction before going into a suspended halt. Otherwise,
724 * dirty data can linger in the cache and become stale on resume,
725 * leading to strange errors.
726 *
727 * perform a variety of operations to guarantee that the compiler
728 * will not reorder instructions. wbinvd itself is serializing
729 * so the processor will not reorder.
730 *
731 * Systems without cache can just go into halt.
732 */
733static inline void wbinvd_halt(void)
734{
735 mb();
736 /* check for clflush to determine if wbinvd is legal */
737 if (cpu_has_clflush)
738 asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
739 else
740 while (1)
741 halt();
742}
743
744extern void enable_sep_cpu(void);
745extern int sysenter_setup(void);
746
747/* Defined in head.S */
748extern struct desc_ptr early_gdt_descr;
749
750extern void cpu_set_gdt(int);
751extern void switch_to_new_gdt(void);
752extern void cpu_init(void);
753extern void init_gdt(int cpu);
754
755static inline void update_debugctlmsr(unsigned long debugctlmsr)
756{
757#ifndef CONFIG_X86_DEBUGCTLMSR
758 if (boot_cpu_data.x86 < 6)
759 return;
760#endif
761 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
762}
763
764/*
765 * from system description table in BIOS. Mostly for MCA use, but
766 * others may find it useful:
767 */
768extern unsigned int machine_id;
769extern unsigned int machine_submodel_id;
770extern unsigned int BIOS_revision;
771
772/* Boot loader type from the setup header: */
773extern int bootloader_type;
774
775extern char ignore_fpu_irq;
776
777#define HAVE_ARCH_PICK_MMAP_LAYOUT 1
778#define ARCH_HAS_PREFETCHW
779#define ARCH_HAS_SPINLOCK_PREFETCH
780
781#ifdef CONFIG_X86_32
782# define BASE_PREFETCH ASM_NOP4
783# define ARCH_HAS_PREFETCH
784#else
785# define BASE_PREFETCH "prefetcht0 (%1)"
786#endif
787
788/*
789 * Prefetch instructions for Pentium III (+) and AMD Athlon (+)
790 *
791 * It's not worth caring about 3dnow prefetches for the K6
792 * because they are microcoded there and very slow.
793 */
794static inline void prefetch(const void *x)
795{
796 alternative_input(BASE_PREFETCH,
797 "prefetchnta (%1)",
798 X86_FEATURE_XMM,
799 "r" (x));
800}
801
802/*
803 * 3dnow prefetch to get an exclusive cache line.
804 * Useful for spinlocks to avoid one state transition in the
805 * cache coherency protocol:
806 */
807static inline void prefetchw(const void *x)
808{
809 alternative_input(BASE_PREFETCH,
810 "prefetchw (%1)",
811 X86_FEATURE_3DNOW,
812 "r" (x));
813}
814
815static inline void spin_lock_prefetch(const void *x)
816{
817 prefetchw(x);
818}
819
820#ifdef CONFIG_X86_32
821/*
822 * User space process size: 3GB (default).
823 */
824#define TASK_SIZE PAGE_OFFSET
825#define STACK_TOP TASK_SIZE
826#define STACK_TOP_MAX STACK_TOP
827
828#define INIT_THREAD { \
829 .sp0 = sizeof(init_stack) + (long)&init_stack, \
830 .vm86_info = NULL, \
831 .sysenter_cs = __KERNEL_CS, \
832 .io_bitmap_ptr = NULL, \
833 .fs = __KERNEL_PERCPU, \
834}
835
836/*
837 * Note that the .io_bitmap member must be extra-big. This is because
838 * the CPU will access an additional byte beyond the end of the IO
839 * permission bitmap. The extra byte must be all 1 bits, and must
840 * be within the limit.
841 */
842#define INIT_TSS { \
843 .x86_tss = { \
844 .sp0 = sizeof(init_stack) + (long)&init_stack, \
845 .ss0 = __KERNEL_DS, \
846 .ss1 = __KERNEL_CS, \
847 .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
848 }, \
849 .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
850}
851
852extern unsigned long thread_saved_pc(struct task_struct *tsk);
853
854#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
855#define KSTK_TOP(info) \
856({ \
857 unsigned long *__ptr = (unsigned long *)(info); \
858 (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
859})
860
861/*
862 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
863 * This is necessary to guarantee that the entire "struct pt_regs"
864 * is accessible even if the CPU hasn't stored the SS/ESP registers
865 * on the stack (interrupt gate does not save these registers
866 * when switching to the same priv ring).
867 * Therefore beware: accessing the ss/esp fields of the
868 * "struct pt_regs" is possible, but they may contain the
869 * completely wrong values.
870 */
871#define task_pt_regs(task) \
872({ \
873 struct pt_regs *__regs__; \
874 __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
875 __regs__ - 1; \
876})
877
878#define KSTK_ESP(task) (task_pt_regs(task)->sp)
879
880#else
881/*
882 * User space process size. 47bits minus one guard page.
883 */
884#define TASK_SIZE64 ((1UL << 47) - PAGE_SIZE)
885
886/* This decides where the kernel will search for a free chunk of vm
887 * space during mmap's.
888 */
889#define IA32_PAGE_OFFSET ((current->personality & ADDR_LIMIT_3GB) ? \
890 0xc0000000 : 0xFFFFe000)
891
892#define TASK_SIZE (test_thread_flag(TIF_IA32) ? \
893 IA32_PAGE_OFFSET : TASK_SIZE64)
894#define TASK_SIZE_OF(child) ((test_tsk_thread_flag(child, TIF_IA32)) ? \
895 IA32_PAGE_OFFSET : TASK_SIZE64)
896
897#define STACK_TOP TASK_SIZE
898#define STACK_TOP_MAX TASK_SIZE64
899
900#define INIT_THREAD { \
901 .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
902}
903
904#define INIT_TSS { \
905 .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
906}
907
908/*
909 * Return saved PC of a blocked thread.
910 * What is this good for? It will always be the scheduler or ret_from_fork.
911 */
912#define thread_saved_pc(t) (*(unsigned long *)((t)->thread.sp - 8))
913
914#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
915#define KSTK_ESP(tsk) -1 /* sorry. doesn't work for syscall. */
916#endif /* CONFIG_X86_64 */
917
918extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
919 unsigned long new_sp);
920
921/*
922 * This decides where the kernel will search for a free chunk of vm
923 * space during mmap's.
924 */
925#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3))
926
927#define KSTK_EIP(task) (task_pt_regs(task)->ip)
928
929/* Get/set a process' ability to use the timestamp counter instruction */
930#define GET_TSC_CTL(adr) get_tsc_mode((adr))
931#define SET_TSC_CTL(val) set_tsc_mode((val))
932
933extern int get_tsc_mode(unsigned long adr);
934extern int set_tsc_mode(unsigned int val);
935
936#endif /* _ASM_X86_PROCESSOR_H */
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
new file mode 100644
index 000000000000..d6a22f92ba77
--- /dev/null
+++ b/arch/x86/include/asm/proto.h
@@ -0,0 +1,32 @@
1#ifndef _ASM_X86_PROTO_H
2#define _ASM_X86_PROTO_H
3
4#include <asm/ldt.h>
5
6/* misc architecture specific prototypes */
7
8extern void early_idt_handler(void);
9
10extern void system_call(void);
11extern void syscall_init(void);
12
13extern void ia32_syscall(void);
14extern void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void);
16
17extern void syscall32_cpu_init(void);
18
19extern void check_efer(void);
20
21#ifdef CONFIG_X86_BIOS_REBOOT
22extern int reboot_force;
23#else
24static const int reboot_force = 0;
25#endif
26
27long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
28
29#define round_up(x, y) (((x) + (y) - 1) & ~((y) - 1))
30#define round_down(x, y) ((x) & ~((y) - 1))
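/*
 * Note (added for illustration): these masks only work when 'y' is a power
 * of two, e.g. round_up(10, 8) == (10 + 7) & ~7 == 16 and
 * round_down(10, 8) == 10 & ~7 == 8.
 */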
31
32#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
new file mode 100644
index 000000000000..25f1bb8fc626
--- /dev/null
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -0,0 +1,145 @@
1#ifndef _ASM_X86_PTRACE_ABI_H
2#define _ASM_X86_PTRACE_ABI_H
3
4#ifdef __i386__
5
6#define EBX 0
7#define ECX 1
8#define EDX 2
9#define ESI 3
10#define EDI 4
11#define EBP 5
12#define EAX 6
13#define DS 7
14#define ES 8
15#define FS 9
16#define GS 10
17#define ORIG_EAX 11
18#define EIP 12
19#define CS 13
20#define EFL 14
21#define UESP 15
22#define SS 16
23#define FRAME_SIZE 17
24
25#else /* __i386__ */
26
27#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
28#define R15 0
29#define R14 8
30#define R13 16
31#define R12 24
32#define RBP 32
33#define RBX 40
34/* arguments: interrupts/non-tracing syscalls only save up to here */
35#define R11 48
36#define R10 56
37#define R9 64
38#define R8 72
39#define RAX 80
40#define RCX 88
41#define RDX 96
42#define RSI 104
43#define RDI 112
44#define ORIG_RAX 120 /* = ERROR */
45/* end of arguments */
46/* cpu exception frame or undefined in case of fast syscall. */
47#define RIP 128
48#define CS 136
49#define EFLAGS 144
50#define RSP 152
51#define SS 160
52#define ARGOFFSET R11
53#endif /* __ASSEMBLY__ */
54
55/* top of stack page */
56#define FRAME_SIZE 168
57
58#endif /* !__i386__ */
59
60/* Arbitrarily choose the same ptrace numbers as used by the Sparc code. */
61#define PTRACE_GETREGS 12
62#define PTRACE_SETREGS 13
63#define PTRACE_GETFPREGS 14
64#define PTRACE_SETFPREGS 15
65#define PTRACE_GETFPXREGS 18
66#define PTRACE_SETFPXREGS 19
67
68#define PTRACE_OLDSETOPTIONS 21
69
70/* only useful for accessing 32-bit programs / kernels */
71#define PTRACE_GET_THREAD_AREA 25
72#define PTRACE_SET_THREAD_AREA 26
73
74#ifdef __x86_64__
75# define PTRACE_ARCH_PRCTL 30
76#endif
77
78#define PTRACE_SYSEMU 31
79#define PTRACE_SYSEMU_SINGLESTEP 32
80
81#define PTRACE_SINGLEBLOCK 33 /* resume execution until next branch */
82
83#ifdef CONFIG_X86_PTRACE_BTS
84
85#ifndef __ASSEMBLY__
86#include <asm/types.h>
87
88/* configuration/status structure used in PTRACE_BTS_CONFIG and
89 PTRACE_BTS_STATUS commands.
90*/
91struct ptrace_bts_config {
92 /* requested or actual size of BTS buffer in bytes */
93 __u32 size;
94 /* bitmask of below flags */
95 __u32 flags;
96 /* buffer overflow signal */
97 __u32 signal;
98 /* actual size of bts_struct in bytes */
99 __u32 bts_size;
100};
101#endif /* __ASSEMBLY__ */
102
103#define PTRACE_BTS_O_TRACE 0x1 /* branch trace */
104#define PTRACE_BTS_O_SCHED 0x2 /* scheduling events w/ jiffies */
105#define PTRACE_BTS_O_SIGNAL 0x4 /* send SIG<signal> on buffer overflow
106 instead of wrapping around */
107#define PTRACE_BTS_O_ALLOC 0x8 /* (re)allocate buffer */
108
109#define PTRACE_BTS_CONFIG 40
110/* Configure branch trace recording.
111 ADDR points to a struct ptrace_bts_config.
112 DATA gives the size of that buffer.
113 A new buffer is allocated, if requested in the flags.
114 An overflow signal may only be requested for new buffers.
115 Returns the number of bytes read.
116*/
117#define PTRACE_BTS_STATUS 41
118/* Return the current configuration in a struct ptrace_bts_config
119 pointed to by ADDR; DATA gives the size of that buffer.
120 Returns the number of bytes written.
121*/
122#define PTRACE_BTS_SIZE 42
123/* Return the number of available BTS records for draining.
124 DATA and ADDR are ignored.
125*/
126#define PTRACE_BTS_GET 43
127/* Get a single BTS record.
128 DATA defines the index into the BTS array, where 0 is the newest
129 entry, and higher indices refer to older entries.
130 ADDR is pointing to struct bts_struct (see asm/ds.h).
131*/
132#define PTRACE_BTS_CLEAR 44
133/* Clear the BTS buffer.
134 DATA and ADDR are ignored.
135*/
136#define PTRACE_BTS_DRAIN 45
137/* Read all available BTS records and clear the buffer.
138 ADDR points to an array of struct bts_struct.
139 DATA gives the size of that buffer.
140 BTS records are read from oldest to newest.
141 Returns number of BTS records drained.
142*/
143#endif /* CONFIG_X86_PTRACE_BTS */
144
145#endif /* _ASM_X86_PTRACE_ABI_H */
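/*
 * Illustrative user-space sketch (added for this write-up, not part of the
 * original header; error handling omitted, and the tracee 'pid' is assumed
 * to be ptrace-attached and stopped).  Following the request descriptions
 * above, a tracer would ask for a 4 KiB trace buffer and later drain it:
 *
 *	struct ptrace_bts_config cfg = {
 *		.size  = 4096,
 *		.flags = PTRACE_BTS_O_TRACE | PTRACE_BTS_O_ALLOC,
 *	};
 *	struct bts_struct records[64];
 *	long drained;
 *
 *	ptrace(PTRACE_BTS_CONFIG, pid, &cfg, sizeof(cfg));
 *	... let the tracee run, then stop it again ...
 *	drained = ptrace(PTRACE_BTS_DRAIN, pid, records, sizeof(records));
 */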
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
new file mode 100644
index 000000000000..d1531c8480b7
--- /dev/null
+++ b/arch/x86/include/asm/ptrace.h
@@ -0,0 +1,280 @@
1#ifndef _ASM_X86_PTRACE_H
2#define _ASM_X86_PTRACE_H
3
4#include <linux/compiler.h> /* For __user */
5#include <asm/ptrace-abi.h>
6#include <asm/processor-flags.h>
7
8#ifdef __KERNEL__
9#include <asm/ds.h> /* the DS BTS struct is used for ptrace too */
10#include <asm/segment.h>
11#endif
12
13#ifndef __ASSEMBLY__
14
15#ifdef __i386__
16/* this struct defines the way the registers are stored on the
17 stack during a system call. */
18
19#ifndef __KERNEL__
20
21struct pt_regs {
22 long ebx;
23 long ecx;
24 long edx;
25 long esi;
26 long edi;
27 long ebp;
28 long eax;
29 int xds;
30 int xes;
31 int xfs;
32 /* int gs; */
33 long orig_eax;
34 long eip;
35 int xcs;
36 long eflags;
37 long esp;
38 int xss;
39};
40
41#else /* __KERNEL__ */
42
43struct pt_regs {
44 unsigned long bx;
45 unsigned long cx;
46 unsigned long dx;
47 unsigned long si;
48 unsigned long di;
49 unsigned long bp;
50 unsigned long ax;
51 unsigned long ds;
52 unsigned long es;
53 unsigned long fs;
54 /* int gs; */
55 unsigned long orig_ax;
56 unsigned long ip;
57 unsigned long cs;
58 unsigned long flags;
59 unsigned long sp;
60 unsigned long ss;
61};
62
63#endif /* __KERNEL__ */
64
65#else /* __i386__ */
66
67#ifndef __KERNEL__
68
69struct pt_regs {
70 unsigned long r15;
71 unsigned long r14;
72 unsigned long r13;
73 unsigned long r12;
74 unsigned long rbp;
75 unsigned long rbx;
76/* arguments: non-interrupt/non-tracing syscalls only save up to here */
77 unsigned long r11;
78 unsigned long r10;
79 unsigned long r9;
80 unsigned long r8;
81 unsigned long rax;
82 unsigned long rcx;
83 unsigned long rdx;
84 unsigned long rsi;
85 unsigned long rdi;
86 unsigned long orig_rax;
87/* end of arguments */
88/* cpu exception frame or undefined */
89 unsigned long rip;
90 unsigned long cs;
91 unsigned long eflags;
92 unsigned long rsp;
93 unsigned long ss;
94/* top of stack page */
95};
96
97#else /* __KERNEL__ */
98
99struct pt_regs {
100 unsigned long r15;
101 unsigned long r14;
102 unsigned long r13;
103 unsigned long r12;
104 unsigned long bp;
105 unsigned long bx;
106/* arguments: non-interrupt/non-tracing syscalls only save up to here */
107 unsigned long r11;
108 unsigned long r10;
109 unsigned long r9;
110 unsigned long r8;
111 unsigned long ax;
112 unsigned long cx;
113 unsigned long dx;
114 unsigned long si;
115 unsigned long di;
116 unsigned long orig_ax;
117/* end of arguments */
118/* cpu exception frame or undefined */
119 unsigned long ip;
120 unsigned long cs;
121 unsigned long flags;
122 unsigned long sp;
123 unsigned long ss;
124/* top of stack page */
125};
126
127#endif /* __KERNEL__ */
128#endif /* !__i386__ */
129
130
131#ifdef CONFIG_X86_PTRACE_BTS
132/* a branch trace record entry
133 *
134 * In order to unify the interface between various processor versions,
135 * we use the below data structure for all processors.
136 */
137enum bts_qualifier {
138 BTS_INVALID = 0,
139 BTS_BRANCH,
140 BTS_TASK_ARRIVES,
141 BTS_TASK_DEPARTS
142};
143
144struct bts_struct {
145 __u64 qualifier;
146 union {
147 /* BTS_BRANCH */
148 struct {
149 __u64 from_ip;
150 __u64 to_ip;
151 } lbr;
152 /* BTS_TASK_ARRIVES or
153 BTS_TASK_DEPARTS */
154 __u64 jiffies;
155 } variant;
156};
157#endif /* CONFIG_X86_PTRACE_BTS */
158
159#ifdef __KERNEL__
160
161#include <linux/init.h>
162
163struct cpuinfo_x86;
164struct task_struct;
165
166#ifdef CONFIG_X86_PTRACE_BTS
167extern void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *);
168extern void ptrace_bts_take_timestamp(struct task_struct *, enum bts_qualifier);
169#else
170#define ptrace_bts_init_intel(config) do {} while (0)
171#endif /* CONFIG_X86_PTRACE_BTS */
172
173extern unsigned long profile_pc(struct pt_regs *regs);
174
175extern unsigned long
176convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
177extern void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
178 int error_code, int si_code);
179void signal_fault(struct pt_regs *regs, void __user *frame, char *where);
180
181extern long syscall_trace_enter(struct pt_regs *);
182extern void syscall_trace_leave(struct pt_regs *);
183
184static inline unsigned long regs_return_value(struct pt_regs *regs)
185{
186 return regs->ax;
187}
188
189/*
190 * user_mode_vm(regs) determines whether a register set came from user mode.
191 * This is true if V8086 mode was enabled OR if the register set was from
192 * protected mode with RPL-3 CS value. This tricky test checks that with
193 * one comparison. Many places in the kernel can bypass this full check
194 * if they have already ruled out V8086 mode, so user_mode(regs) can be used.
195 */
196static inline int user_mode(struct pt_regs *regs)
197{
198#ifdef CONFIG_X86_32
199 return (regs->cs & SEGMENT_RPL_MASK) == USER_RPL;
200#else
201 return !!(regs->cs & 3);
202#endif
203}
204
205static inline int user_mode_vm(struct pt_regs *regs)
206{
207#ifdef CONFIG_X86_32
208 return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >=
209 USER_RPL;
210#else
211 return user_mode(regs);
212#endif
213}
214
215static inline int v8086_mode(struct pt_regs *regs)
216{
217#ifdef CONFIG_X86_32
218 return (regs->flags & X86_VM_MASK);
219#else
220 return 0; /* No V86 mode support in long mode */
221#endif
222}
223
224/*
225 * X86_32 CPUs don't save ss and esp if the CPU is already in kernel mode
226 * when it traps. So regs will be the current sp.
227 *
228 * This is valid only for kernel mode traps.
229 */
230static inline unsigned long kernel_trap_sp(struct pt_regs *regs)
231{
232#ifdef CONFIG_X86_32
233 return (unsigned long)regs;
234#else
235 return regs->sp;
236#endif
237}
238
239static inline unsigned long instruction_pointer(struct pt_regs *regs)
240{
241 return regs->ip;
242}
243
244static inline unsigned long frame_pointer(struct pt_regs *regs)
245{
246 return regs->bp;
247}
248
249static inline unsigned long user_stack_pointer(struct pt_regs *regs)
250{
251 return regs->sp;
252}
253
254/*
255 * These are defined as per linux/ptrace.h, which see.
256 */
257#define arch_has_single_step() (1)
258extern void user_enable_single_step(struct task_struct *);
259extern void user_disable_single_step(struct task_struct *);
260
261extern void user_enable_block_step(struct task_struct *);
262#ifdef CONFIG_X86_DEBUGCTLMSR
263#define arch_has_block_step() (1)
264#else
265#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
266#endif
267
268struct user_desc;
269extern int do_get_thread_area(struct task_struct *p, int idx,
270 struct user_desc __user *info);
271extern int do_set_thread_area(struct task_struct *p, int idx,
272 struct user_desc __user *info, int can_allocate);
273
274#define __ARCH_WANT_COMPAT_SYS_PTRACE
275
276#endif /* __KERNEL__ */
277
278#endif /* !__ASSEMBLY__ */
279
280#endif /* _ASM_X86_PTRACE_H */
diff --git a/arch/x86/include/asm/pvclock-abi.h b/arch/x86/include/asm/pvclock-abi.h
new file mode 100644
index 000000000000..6d93508f2626
--- /dev/null
+++ b/arch/x86/include/asm/pvclock-abi.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_PVCLOCK_ABI_H
2#define _ASM_X86_PVCLOCK_ABI_H
3#ifndef __ASSEMBLY__
4
5/*
6 * These structs MUST NOT be changed.
7 * They are the ABI between hypervisor and guest OS.
8 * Both Xen and KVM are using this.
9 *
10 * pvclock_vcpu_time_info holds the system time and the tsc timestamp
11 * of the last update. So the guest can use the tsc delta to get a
12 * more precise system time. There is one per virtual cpu.
13 *
14 * pvclock_wall_clock references the point in time when the system
15 * time was zero (usually boot time), thus the guest calculates the
16 * current wall clock by adding the system time.
17 *
18 * Protocol for the "version" fields: the hypervisor increments it (making
19 * it odd) before it starts updating the fields and increments it again
20 * (making it even) when it is done. Thus the guest can make sure the
21 * time values it got are consistent by checking the version before
22 * and after reading them.
23 */
24
25struct pvclock_vcpu_time_info {
26 u32 version;
27 u32 pad0;
28 u64 tsc_timestamp;
29 u64 system_time;
30 u32 tsc_to_system_mul;
31 s8 tsc_shift;
32 u8 pad[3];
33} __attribute__((__packed__)); /* 32 bytes */
34
35struct pvclock_wall_clock {
36 u32 version;
37 u32 sec;
38 u32 nsec;
39} __attribute__((__packed__));
40
41#endif /* __ASSEMBLY__ */
42#endif /* _ASM_X86_PVCLOCK_ABI_H */
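/*
 * Illustrative sketch (added for this write-up, not part of the original
 * header): a guest reading pvclock_vcpu_time_info follows the version
 * protocol described above, retrying while the version is odd (update in
 * progress) or changed between the two reads:
 *
 *	u32 version;
 *	u64 tsc, system_time;
 *
 *	do {
 *		version = src->version;
 *		rmb();
 *		tsc = src->tsc_timestamp;
 *		system_time = src->system_time;
 *		rmb();
 *	} while ((version & 1) || (version != src->version));
 */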
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
new file mode 100644
index 000000000000..53235fd5f8ce
--- /dev/null
+++ b/arch/x86/include/asm/pvclock.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_PVCLOCK_H
2#define _ASM_X86_PVCLOCK_H
3
4#include <linux/clocksource.h>
5#include <asm/pvclock-abi.h>
6
7/* some helper functions for xen and kvm pv clock sources */
8cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src);
9unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
10void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
11 struct pvclock_vcpu_time_info *vcpu,
12 struct timespec *ts);
13
14#endif /* _ASM_X86_PVCLOCK_H */
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
new file mode 100644
index 000000000000..df7710354f85
--- /dev/null
+++ b/arch/x86/include/asm/reboot.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_REBOOT_H
2#define _ASM_X86_REBOOT_H
3
4struct pt_regs;
5
6struct machine_ops {
7 void (*restart)(char *cmd);
8 void (*halt)(void);
9 void (*power_off)(void);
10 void (*shutdown)(void);
11 void (*crash_shutdown)(struct pt_regs *);
12 void (*emergency_restart)(void);
13};
14
15extern struct machine_ops machine_ops;
16
17void native_machine_crash_shutdown(struct pt_regs *regs);
18void native_machine_shutdown(void);
19void machine_real_restart(const unsigned char *code, int length);
20
21#endif /* _ASM_X86_REBOOT_H */
diff --git a/arch/x86/include/asm/reboot_fixups.h b/arch/x86/include/asm/reboot_fixups.h
new file mode 100644
index 000000000000..765debe4c54c
--- /dev/null
+++ b/arch/x86/include/asm/reboot_fixups.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_REBOOT_FIXUPS_H
2#define _ASM_X86_REBOOT_FIXUPS_H
3
4extern void mach_reboot_fixups(void);
5
6#endif /* _ASM_X86_REBOOT_FIXUPS_H */
diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h
new file mode 100644
index 000000000000..d5cd6c586881
--- /dev/null
+++ b/arch/x86/include/asm/required-features.h
@@ -0,0 +1,82 @@
1#ifndef _ASM_X86_REQUIRED_FEATURES_H
2#define _ASM_X86_REQUIRED_FEATURES_H
3
4/* Define the minimum CPUID feature set for the kernel. These bits are checked
5 really early to actually display a visible error message before the
6 kernel dies. Make sure to assign features to the proper mask!
7
8 Some requirements that are not in CPUID yet are also in the
9 CONFIG_X86_MINIMUM_CPU_FAMILY which is checked too.
10
11 The real information is in arch/x86/Kconfig.cpu; this just converts
12 the CONFIGs into a bitmask */
13
14#ifndef CONFIG_MATH_EMULATION
15# define NEED_FPU (1<<(X86_FEATURE_FPU & 31))
16#else
17# define NEED_FPU 0
18#endif
19
20#if defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64)
21# define NEED_PAE (1<<(X86_FEATURE_PAE & 31))
22#else
23# define NEED_PAE 0
24#endif
25
26#ifdef CONFIG_X86_CMPXCHG64
27# define NEED_CX8 (1<<(X86_FEATURE_CX8 & 31))
28#else
29# define NEED_CX8 0
30#endif
31
32#if defined(CONFIG_X86_CMOV) || defined(CONFIG_X86_64)
33# define NEED_CMOV (1<<(X86_FEATURE_CMOV & 31))
34#else
35# define NEED_CMOV 0
36#endif
37
38#ifdef CONFIG_X86_USE_3DNOW
39# define NEED_3DNOW (1<<(X86_FEATURE_3DNOW & 31))
40#else
41# define NEED_3DNOW 0
42#endif
43
44#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64)
45# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31))
46#else
47# define NEED_NOPL 0
48#endif
49
50#ifdef CONFIG_X86_64
51#define NEED_PSE 0
52#define NEED_MSR (1<<(X86_FEATURE_MSR & 31))
53#define NEED_PGE (1<<(X86_FEATURE_PGE & 31))
54#define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31))
55#define NEED_XMM (1<<(X86_FEATURE_XMM & 31))
56#define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31))
57#define NEED_LM (1<<(X86_FEATURE_LM & 31))
58#else
59#define NEED_PSE 0
60#define NEED_MSR 0
61#define NEED_PGE 0
62#define NEED_FXSR 0
63#define NEED_XMM 0
64#define NEED_XMM2 0
65#define NEED_LM 0
66#endif
67
68#define REQUIRED_MASK0 (NEED_FPU|NEED_PSE|NEED_MSR|NEED_PAE|\
69 NEED_CX8|NEED_PGE|NEED_FXSR|NEED_CMOV|\
70 NEED_XMM|NEED_XMM2)
71#define SSE_MASK (NEED_XMM|NEED_XMM2)
72
73#define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW)
74
75#define REQUIRED_MASK2 0
76#define REQUIRED_MASK3 (NEED_NOPL)
77#define REQUIRED_MASK4 0
78#define REQUIRED_MASK5 0
79#define REQUIRED_MASK6 0
80#define REQUIRED_MASK7 0
81
82#endif /* _ASM_X86_REQUIRED_FEATURES_H */
diff --git a/arch/x86/include/asm/resource.h b/arch/x86/include/asm/resource.h
new file mode 100644
index 000000000000..04bc4db8921b
--- /dev/null
+++ b/arch/x86/include/asm/resource.h
@@ -0,0 +1 @@
#include <asm-generic/resource.h>
diff --git a/arch/x86/include/asm/resume-trace.h b/arch/x86/include/asm/resume-trace.h
new file mode 100644
index 000000000000..3ff1c2cb1da5
--- /dev/null
+++ b/arch/x86/include/asm/resume-trace.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_RESUME_TRACE_H
2#define _ASM_X86_RESUME_TRACE_H
3
4#include <asm/asm.h>
5
6#define TRACE_RESUME(user) \
7do { \
8 if (pm_trace_enabled) { \
9 const void *tracedata; \
10 asm volatile(_ASM_MOV " $1f,%0\n" \
11 ".section .tracedata,\"a\"\n" \
12 "1:\t.word %c1\n\t" \
13 _ASM_PTR " %c2\n" \
14 ".previous" \
15 :"=r" (tracedata) \
16 : "i" (__LINE__), "i" (__FILE__)); \
17 generate_resume_trace(tracedata, user); \
18 } \
19} while (0)
20
21#endif /* _ASM_X86_RESUME_TRACE_H */
diff --git a/arch/x86/include/asm/rio.h b/arch/x86/include/asm/rio.h
new file mode 100644
index 000000000000..97bab6388a92
--- /dev/null
+++ b/arch/x86/include/asm/rio.h
@@ -0,0 +1,63 @@
1/*
2 * Derived from include/asm-x86/mach-summit/mach_mpparse.h
3 * and include/asm-x86/mach-default/bios_ebda.h
4 *
5 * Author: Laurent Vivier <Laurent.Vivier@bull.net>
6 */
7
8#ifndef _ASM_X86_RIO_H
9#define _ASM_X86_RIO_H
10
11#define RIO_TABLE_VERSION 3
12
13struct rio_table_hdr {
14 u8 version; /* Version number of this data structure */
15 u8 num_scal_dev; /* # of Scalability devices */
16 u8 num_rio_dev; /* # of RIO I/O devices */
17} __attribute__((packed));
18
19struct scal_detail {
20 u8 node_id; /* Scalability Node ID */
21 u32 CBAR; /* Address of 1MB register space */
22 u8 port0node; /* Node ID port connected to: 0xFF=None */
23 u8 port0port; /* Port num port connected to: 0,1,2, or */
24 /* 0xFF=None */
25 u8 port1node; /* Node ID port connected to: 0xFF = None */
26 u8 port1port; /* Port num port connected to: 0,1,2, or */
27 /* 0xFF=None */
28 u8 port2node; /* Node ID port connected to: 0xFF = None */
29 u8 port2port; /* Port num port connected to: 0,1,2, or */
30 /* 0xFF=None */
31 u8 chassis_num; /* 1 based Chassis number (1 = boot node) */
32} __attribute__((packed));
33
34struct rio_detail {
35 u8 node_id; /* RIO Node ID */
36 u32 BBAR; /* Address of 1MB register space */
37 u8 type; /* Type of device */
38 u8 owner_id; /* Node ID of Hurricane that owns this */
39 /* node */
40 u8 port0node; /* Node ID port connected to: 0xFF=None */
41 u8 port0port; /* Port num port connected to: 0,1,2, or */
42 /* 0xFF=None */
43 u8 port1node; /* Node ID port connected to: 0xFF=None */
44 u8 port1port; /* Port num port connected to: 0,1,2, or */
45 /* 0xFF=None */
46 u8 first_slot; /* Lowest slot number below this Calgary */
47 u8 status; /* Bit 0 = 1 : the XAPIC is used */
48 /* = 0 : the XAPIC is not used, ie: */
49 /* ints fwded to another XAPIC */
50 /* Bits1:7 Reserved */
51 u8 WP_index; /* instance index - lower ones have */
52 /* lower slot numbers/PCI bus numbers */
53 u8 chassis_num; /* 1 based Chassis number */
54} __attribute__((packed));
55
56enum {
57 HURR_SCALABILTY = 0, /* Hurricane Scalability info */
58 HURR_RIOIB = 2, /* Hurricane RIOIB info */
59 COMPAT_CALGARY = 4, /* Compatibility Calgary */
60 ALT_CALGARY = 5, /* Second Planar Calgary */
61};
62
63#endif /* _ASM_X86_RIO_H */
diff --git a/arch/x86/include/asm/rtc.h b/arch/x86/include/asm/rtc.h
new file mode 100644
index 000000000000..f71c3b0ed360
--- /dev/null
+++ b/arch/x86/include/asm/rtc.h
@@ -0,0 +1 @@
#include <asm-generic/rtc.h>
diff --git a/arch/x86/include/asm/rwlock.h b/arch/x86/include/asm/rwlock.h
new file mode 100644
index 000000000000..6a8c0d645108
--- /dev/null
+++ b/arch/x86/include/asm/rwlock.h
@@ -0,0 +1,8 @@
1#ifndef _ASM_X86_RWLOCK_H
2#define _ASM_X86_RWLOCK_H
3
4#define RW_LOCK_BIAS 0x01000000
5
6/* Actual code is in asm/spinlock.h or in arch/x86/lib/rwlock.S */
7
8#endif /* _ASM_X86_RWLOCK_H */
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
new file mode 100644
index 000000000000..ca7517d33776
--- /dev/null
+++ b/arch/x86/include/asm/rwsem.h
@@ -0,0 +1,265 @@
1/* rwsem.h: R/W semaphores implemented using XADD/CMPXCHG for i486+
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 *
5 * Derived from asm-x86/semaphore.h
6 *
7 *
8 * The MSW of the count is the negated number of active writers and waiting
9 * lockers, and the LSW is the total number of active locks
10 *
11 * The lock count is initialized to 0 (no active and no waiting lockers).
12 *
13 * When a writer subtracts WRITE_BIAS, it'll get 0xffff0001 for the case of an
14 * uncontended lock. This can be determined because XADD returns the old value.
15 * Readers increment by 1 and see a positive value when uncontended, or a
16 * negative value if there are writers (and possibly waiting readers), in
17 * which case the reader goes to sleep.
18 *
19 * The value of WAITING_BIAS supports up to 32766 waiting processes. This can
20 * be extended to 65534 by manually checking the whole MSW rather than relying
21 * on the S flag.
22 *
23 * The value of ACTIVE_BIAS supports up to 65535 active processes.
24 *
25 * This should be totally fair - if anything is waiting, a process that wants a
26 * lock will go to the back of the queue. When the currently active lock is
27 * released, if there's a writer at the front of the queue, then that and only
28 * that will be woken up; if there's a bunch of consecutive readers at the
29 * front, then they'll all be woken up, but no other readers will be.
30 */
31
32#ifndef _ASM_X86_RWSEM_H
33#define _ASM_X86_RWSEM_H
34
35#ifndef _LINUX_RWSEM_H
36#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
37#endif
38
39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44
45struct rwsem_waiter;
46
47extern asmregparm struct rw_semaphore *
48 rwsem_down_read_failed(struct rw_semaphore *sem);
49extern asmregparm struct rw_semaphore *
50 rwsem_down_write_failed(struct rw_semaphore *sem);
51extern asmregparm struct rw_semaphore *
52 rwsem_wake(struct rw_semaphore *);
53extern asmregparm struct rw_semaphore *
54 rwsem_downgrade_wake(struct rw_semaphore *sem);
55
56/*
57 * the semaphore definition
58 */
59
60#define RWSEM_UNLOCKED_VALUE 0x00000000
61#define RWSEM_ACTIVE_BIAS 0x00000001
62#define RWSEM_ACTIVE_MASK 0x0000ffff
63#define RWSEM_WAITING_BIAS (-0x00010000)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66
67struct rw_semaphore {
68 signed long count;
69 spinlock_t wait_lock;
70 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC
72 struct lockdep_map dep_map;
73#endif
74};
75
76#ifdef CONFIG_DEBUG_LOCK_ALLOC
77# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
78#else
79# define __RWSEM_DEP_MAP_INIT(lockname)
80#endif
81
82
83#define __RWSEM_INITIALIZER(name) \
84{ \
85 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
86 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
87}
88
89#define DECLARE_RWSEM(name) \
90 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
91
92extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
93 struct lock_class_key *key);
94
95#define init_rwsem(sem) \
96do { \
97 static struct lock_class_key __key; \
98 \
99 __init_rwsem((sem), #sem, &__key); \
100} while (0)
101
102/*
103 * lock for reading
104 */
105static inline void __down_read(struct rw_semaphore *sem)
106{
107 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " incl (%%eax)\n\t"
109 /* adds 0x00000001, returns the old value */
110 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n"
112 "1:\n\t"
113 "# ending down_read\n\t"
114 : "+m" (sem->count)
115 : "a" (sem)
116 : "memory", "cc");
117}
118
119/*
120 * trylock for reading -- returns 1 if successful, 0 if contention
121 */
122static inline int __down_read_trylock(struct rw_semaphore *sem)
123{
124 __s32 result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t"
126 " movl %0,%1\n\t"
127 "1:\n\t"
128 " movl %1,%2\n\t"
129 " addl %3,%2\n\t"
130 " jle 2f\n\t"
131 LOCK_PREFIX " cmpxchgl %2,%0\n\t"
132 " jnz 1b\n\t"
133 "2:\n\t"
134 "# ending __down_read_trylock\n\t"
135 : "+m" (sem->count), "=&a" (result), "=&r" (tmp)
136 : "i" (RWSEM_ACTIVE_READ_BIAS)
137 : "memory", "cc");
138 return result >= 0 ? 1 : 0;
139}
140
141/*
142 * lock for writing
143 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{
146 int tmp;
147
148 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t"
150 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
151 /* subtract 0x0000ffff, returns the old value */
152 " testl %%edx,%%edx\n\t"
153 /* was the count 0 before? */
154 " jz 1f\n"
155 " call call_rwsem_down_write_failed\n"
156 "1:\n"
157 "# ending down_write"
158 : "+m" (sem->count), "=d" (tmp)
159 : "a" (sem), "1" (tmp)
160 : "memory", "cc");
161}
162
163static inline void __down_write(struct rw_semaphore *sem)
164{
165 __down_write_nested(sem, 0);
166}
167
168/*
169 * trylock for writing -- returns 1 if successful, 0 if contention
170 */
171static inline int __down_write_trylock(struct rw_semaphore *sem)
172{
173 signed long ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1;
178 return 0;
179}
180
181/*
182 * unlock after reading
183 */
184static inline void __up_read(struct rw_semaphore *sem)
185{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t"
189 /* subtracts 1, returns the old value */
190 " jns 1f\n\t"
191 " call call_rwsem_wake\n"
192 "1:\n"
193 "# ending __up_read\n"
194 : "+m" (sem->count), "=d" (tmp)
195 : "a" (sem), "1" (tmp)
196 : "memory", "cc");
197}
198
199/*
200 * unlock after writing
201 */
202static inline void __up_write(struct rw_semaphore *sem)
203{
204 asm volatile("# beginning __up_write\n\t"
205 " movl %2,%%edx\n\t"
206 LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
207 /* tries to transition
208 0xffff0001 -> 0x00000000 */
209 " jz 1f\n"
210 " call call_rwsem_wake\n"
211 "1:\n\t"
212 "# ending __up_write\n"
213 : "+m" (sem->count)
214 : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS)
215 : "memory", "cc", "edx");
216}
217
218/*
219 * downgrade write lock to read lock
220 */
221static inline void __downgrade_write(struct rw_semaphore *sem)
222{
223 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " addl %2,(%%eax)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */
226 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t"
229 "# ending __downgrade_write\n"
230 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc");
233}
234
235/*
236 * implement atomic add functionality
237 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
239{
240 asm volatile(LOCK_PREFIX "addl %1,%0"
241 : "+m" (sem->count)
242 : "ir" (delta));
243}
244
245/*
246 * implement exchange and add functionality
247 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
249{
250 int tmp = delta;
251
252 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count)
254 : : "memory");
255
256 return tmp + delta;
257}
258
259static inline int rwsem_is_locked(struct rw_semaphore *sem)
260{
261 return (sem->count != 0);
262}
263
264#endif /* __KERNEL__ */
265#endif /* _ASM_X86_RWSEM_H */
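
A worked example of the count arithmetic described in the header comment above (user-space, illustrative only, using the same bias constants):

#include <stdio.h>

#define RWSEM_UNLOCKED_VALUE    0x00000000
#define RWSEM_ACTIVE_BIAS       0x00000001
#define RWSEM_WAITING_BIAS      (-0x00010000)
#define RWSEM_ACTIVE_READ_BIAS  RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
        int count = RWSEM_UNLOCKED_VALUE;

        count += RWSEM_ACTIVE_WRITE_BIAS;       /* uncontended down_write */
        printf("after down_write: %#010x\n", (unsigned int)count); /* 0xffff0001 */

        count -= RWSEM_ACTIVE_WRITE_BIAS;       /* up_write */
        count += RWSEM_ACTIVE_READ_BIAS;        /* uncontended down_read */
        printf("after down_read:  %#010x\n", (unsigned int)count); /* 0x00000001 */
        return 0;
}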
diff --git a/arch/x86/include/asm/scatterlist.h b/arch/x86/include/asm/scatterlist.h
new file mode 100644
index 000000000000..263d397d2eef
--- /dev/null
+++ b/arch/x86/include/asm/scatterlist.h
@@ -0,0 +1,33 @@
1#ifndef _ASM_X86_SCATTERLIST_H
2#define _ASM_X86_SCATTERLIST_H
3
4#include <asm/types.h>
5
6struct scatterlist {
7#ifdef CONFIG_DEBUG_SG
8 unsigned long sg_magic;
9#endif
10 unsigned long page_link;
11 unsigned int offset;
12 unsigned int length;
13 dma_addr_t dma_address;
14 unsigned int dma_length;
15};
16
17#define ARCH_HAS_SG_CHAIN
18#define ISA_DMA_THRESHOLD (0x00ffffff)
19
20/*
21 * These macros should be used after a pci_map_sg call has been done
22 * to get bus addresses of each of the SG entries and their lengths.
23 * You should only work with the number of sg entries pci_map_sg
24 * returns.
25 */
26#define sg_dma_address(sg) ((sg)->dma_address)
27#ifdef CONFIG_X86_32
28# define sg_dma_len(sg) ((sg)->length)
29#else
30# define sg_dma_len(sg) ((sg)->dma_length)
31#endif
32
33#endif /* _ASM_X86_SCATTERLIST_H */
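
A sketch of the usage pattern the comment above describes (illustrative only, not part of the patch; error handling is elided and the surrounding driver context is assumed):

#include <linux/pci.h>
#include <linux/scatterlist.h>

static void demo_map_sg(struct pci_dev *dev, struct scatterlist *sg, int nents)
{
        int i, count;

        /* only the entries pci_map_sg() returns are valid for DMA */
        count = pci_map_sg(dev, sg, nents, PCI_DMA_TODEVICE);
        for (i = 0; i < count; i++) {
                dma_addr_t addr = sg_dma_address(&sg[i]);
                unsigned int len = sg_dma_len(&sg[i]);

                /* program addr/len into the device's DMA engine here */
                (void)addr;
                (void)len;
        }
        pci_unmap_sg(dev, sg, nents, PCI_DMA_TODEVICE);
}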
diff --git a/arch/x86/include/asm/seccomp.h b/arch/x86/include/asm/seccomp.h
new file mode 100644
index 000000000000..c62e58a5a90d
--- /dev/null
+++ b/arch/x86/include/asm/seccomp.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "seccomp_32.h"
3#else
4# include "seccomp_64.h"
5#endif
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
new file mode 100644
index 000000000000..a6ad87b352c4
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -0,0 +1,17 @@
1#ifndef _ASM_X86_SECCOMP_32_H
2#define _ASM_X86_SECCOMP_32_H
3
4#include <linux/thread_info.h>
5
6#ifdef TIF_32BIT
7#error "unexpected TIF_32BIT on i386"
8#endif
9
10#include <linux/unistd.h>
11
12#define __NR_seccomp_read __NR_read
13#define __NR_seccomp_write __NR_write
14#define __NR_seccomp_exit __NR_exit
15#define __NR_seccomp_sigreturn __NR_sigreturn
16
17#endif /* _ASM_X86_SECCOMP_32_H */
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
new file mode 100644
index 000000000000..4171bb794e9e
--- /dev/null
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -0,0 +1,25 @@
1#ifndef _ASM_X86_SECCOMP_64_H
2#define _ASM_X86_SECCOMP_64_H
3
4#include <linux/thread_info.h>
5
6#ifdef TIF_32BIT
7#error "unexpected TIF_32BIT on x86_64"
8#else
9#define TIF_32BIT TIF_IA32
10#endif
11
12#include <linux/unistd.h>
13#include <asm/ia32_unistd.h>
14
15#define __NR_seccomp_read __NR_read
16#define __NR_seccomp_write __NR_write
17#define __NR_seccomp_exit __NR_exit
18#define __NR_seccomp_sigreturn __NR_rt_sigreturn
19
20#define __NR_seccomp_read_32 __NR_ia32_read
21#define __NR_seccomp_write_32 __NR_ia32_write
22#define __NR_seccomp_exit_32 __NR_ia32_exit
23#define __NR_seccomp_sigreturn_32 __NR_ia32_sigreturn
24
25#endif /* _ASM_X86_SECCOMP_64_H */
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
new file mode 100644
index 000000000000..2b8c5160388f
--- /dev/null
+++ b/arch/x86/include/asm/sections.h
@@ -0,0 +1 @@
#include <asm-generic/sections.h>
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
new file mode 100644
index 000000000000..1dc1b51ac623
--- /dev/null
+++ b/arch/x86/include/asm/segment.h
@@ -0,0 +1,209 @@
1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H
3
4/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \
11 (((limit) & 0x0000ffffULL)))
12
13/* Simple and small GDT entries for booting only */
14
15#define GDT_ENTRY_BOOT_CS 2
16#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8)
17
18#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1)
19#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8)
20
21#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2)
22#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8)
23
24#ifdef CONFIG_X86_32
25/*
26 * The layout of the per-CPU GDT under Linux:
27 *
28 * 0 - null
29 * 1 - reserved
30 * 2 - reserved
31 * 3 - reserved
32 *
33 * 4 - unused <==== new cacheline
34 * 5 - unused
35 *
36 * ------- start of TLS (Thread-Local Storage) segments:
37 *
38 * 6 - TLS segment #1 [ glibc's TLS segment ]
39 * 7 - TLS segment #2 [ Wine's %fs Win32 segment ]
40 * 8 - TLS segment #3
41 * 9 - reserved
42 * 10 - reserved
43 * 11 - reserved
44 *
45 * ------- start of kernel segments:
46 *
47 * 12 - kernel code segment <==== new cacheline
48 * 13 - kernel data segment
49 * 14 - default user CS
50 * 15 - default user DS
51 * 16 - TSS
52 * 17 - LDT
53 * 18 - PNPBIOS support (16->32 gate)
54 * 19 - PNPBIOS support
55 * 20 - PNPBIOS support
56 * 21 - PNPBIOS support
57 * 22 - PNPBIOS support
58 * 23 - APM BIOS support
59 * 24 - APM BIOS support
60 * 25 - APM BIOS support
61 *
62 * 26 - ESPFIX small SS
63 * 27 - per-cpu [ offset to per-cpu data area ]
64 * 28 - unused
65 * 29 - unused
66 * 30 - unused
67 * 31 - TSS for double fault handler
68 */
69#define GDT_ENTRY_TLS_MIN 6
70#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1)
71
72#define GDT_ENTRY_DEFAULT_USER_CS 14
73
74#define GDT_ENTRY_DEFAULT_USER_DS 15
75
76#define GDT_ENTRY_KERNEL_BASE 12
77
78#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0)
79
80#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1)
81
82#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4)
83#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5)
84
85#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6)
86#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11)
87
88#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14)
89#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8)
90
91#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15)
92#ifdef CONFIG_SMP
93#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
94#else
95#define __KERNEL_PERCPU 0
96#endif
97
98#define GDT_ENTRY_DOUBLEFAULT_TSS 31
99
100/*
101 * The GDT has 32 entries
102 */
103#define GDT_ENTRIES 32
104
105/* The PnP BIOS entries in the GDT */
106#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0)
107#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1)
108#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2)
109#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3)
110#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4)
111
112/* The PnP BIOS selectors */
113#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */
114#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */
115#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */
116#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */
117#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */
118
119/* Bottom two bits of selector give the ring privilege level */
120#define SEGMENT_RPL_MASK 0x3
121/* Bit 2 is table indicator (LDT/GDT) */
122#define SEGMENT_TI_MASK 0x4
123
124/* User mode is privilege level 3 */
125#define USER_RPL 0x3
126/* LDT segment has TI set, GDT has it cleared */
127#define SEGMENT_LDT 0x4
128#define SEGMENT_GDT 0x0
129
130/*
131 * Matching rules for certain types of segments.
132 */
133
134/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */
135#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8)
136
137
138#else
139#include <asm/cache.h>
140
141#define GDT_ENTRY_KERNEL32_CS 1
142#define GDT_ENTRY_KERNEL_CS 2
143#define GDT_ENTRY_KERNEL_DS 3
144
145#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8)
146
147/*
148 * we cannot use the same code segment descriptor for user and kernel
149 * -- not even in the long flat mode, because of different DPL /kkeil
150 * The segment offset needs to contain an RPL. Grr. -AK
151 * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)
152 */
153#define GDT_ENTRY_DEFAULT_USER32_CS 4
154#define GDT_ENTRY_DEFAULT_USER_DS 5
155#define GDT_ENTRY_DEFAULT_USER_CS 6
156#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS * 8 + 3)
157#define __USER32_DS __USER_DS
158
159#define GDT_ENTRY_TSS 8 /* needs two entries */
160#define GDT_ENTRY_LDT 10 /* needs two entries */
161#define GDT_ENTRY_TLS_MIN 12
162#define GDT_ENTRY_TLS_MAX 14
163
164#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */
165#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3)
166
167/* TLS indexes for 64bit - hardcoded in arch_prctl */
168#define FS_TLS 0
169#define GS_TLS 1
170
171#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3)
172#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3)
173
174#define GDT_ENTRIES 16
175
176#endif
177
178#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8)
179#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8)
180#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3)
181#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3)
182#ifndef CONFIG_PARAVIRT
183#define get_kernel_rpl() 0
184#endif
185
186/* User mode is privilege level 3 */
187#define USER_RPL 0x3
188/* LDT segment has TI set, GDT has it cleared */
189#define SEGMENT_LDT 0x4
190#define SEGMENT_GDT 0x0
191
192/* Bottom two bits of selector give the ring privilege level */
193#define SEGMENT_RPL_MASK 0x3
194/* Bit 2 is table indicator (LDT/GDT) */
195#define SEGMENT_TI_MASK 0x4
196
197#define IDT_ENTRIES 256
198#define NUM_EXCEPTION_VECTORS 32
199#define GDT_SIZE (GDT_ENTRIES * 8)
200#define GDT_ENTRY_TLS_ENTRIES 3
201#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
202
203#ifdef __KERNEL__
204#ifndef __ASSEMBLY__
205extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10];
206#endif
207#endif
208
209#endif /* _ASM_X86_SEGMENT_H */
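
As a quick sanity check of the GDT_ENTRY() constructor above (illustrative, user-space): packing flags 0xc09a, base 0 and limit 0xfffff yields 0x00cf9a000000ffff, the familiar flat 4 GiB code-segment descriptor.

#include <stdio.h>

#define GDT_ENTRY(flags, base, limit)                   \
        ((((base)  & 0xff000000ULL) << (56-24)) |       \
         (((flags) & 0x0000f0ffULL) << 40)      |       \
         (((limit) & 0x000f0000ULL) << (48-16)) |       \
         (((base)  & 0x00ffffffULL) << 16)      |       \
         (((limit) & 0x0000ffffULL)))

int main(void)
{
        /* flat 4 GiB code segment: access byte 0x9a, granularity/size bits 0xc */
        printf("%#018llx\n", GDT_ENTRY(0xc09a, 0, 0xfffff));
        return 0;
}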
diff --git a/arch/x86/include/asm/sembuf.h b/arch/x86/include/asm/sembuf.h
new file mode 100644
index 000000000000..ee50c801f7b7
--- /dev/null
+++ b/arch/x86/include/asm/sembuf.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_SEMBUF_H
2#define _ASM_X86_SEMBUF_H
3
4/*
5 * The semid64_ds structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 */
13struct semid64_ds {
14 struct ipc64_perm sem_perm; /* permissions .. see ipc.h */
15 __kernel_time_t sem_otime; /* last semop time */
16 unsigned long __unused1;
17 __kernel_time_t sem_ctime; /* last change time */
18 unsigned long __unused2;
19 unsigned long sem_nsems; /* no. of semaphores in array */
20 unsigned long __unused3;
21 unsigned long __unused4;
22};
23
24#endif /* _ASM_X86_SEMBUF_H */
diff --git a/arch/x86/include/asm/serial.h b/arch/x86/include/asm/serial.h
new file mode 100644
index 000000000000..628c801535ea
--- /dev/null
+++ b/arch/x86/include/asm/serial.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_SERIAL_H
2#define _ASM_X86_SERIAL_H
3
4/*
5 * This assumes you have a 1.8432 MHz clock for your UART.
6 *
7 * It'd be nice if someone built a serial card with a 24.576 MHz
8 * clock, since the 16550A is capable of handling a top speed of 1.5
9 * megabits/second; but this requires the faster clock.
10 */
11#define BASE_BAUD ( 1843200 / 16 )
12
13/* Standard COM flags (except for COM4, because of the 8514 problem) */
14#ifdef CONFIG_SERIAL_DETECT_IRQ
15#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST | ASYNC_AUTO_IRQ)
16#define STD_COM4_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_AUTO_IRQ)
17#else
18#define STD_COM_FLAGS (ASYNC_BOOT_AUTOCONF | ASYNC_SKIP_TEST)
19#define STD_COM4_FLAGS ASYNC_BOOT_AUTOCONF
20#endif
21
22#define SERIAL_PORT_DFNS \
23 /* UART CLK PORT IRQ FLAGS */ \
24 { 0, BASE_BAUD, 0x3F8, 4, STD_COM_FLAGS }, /* ttyS0 */ \
25 { 0, BASE_BAUD, 0x2F8, 3, STD_COM_FLAGS }, /* ttyS1 */ \
26 { 0, BASE_BAUD, 0x3E8, 4, STD_COM_FLAGS }, /* ttyS2 */ \
27 { 0, BASE_BAUD, 0x2E8, 3, STD_COM4_FLAGS }, /* ttyS3 */
28
29#endif /* _ASM_X86_SERIAL_H */
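
Since BASE_BAUD is the UART input clock pre-divided by 16 (1843200 / 16 = 115200), the 16550 divisor-latch value for a given line speed is simply BASE_BAUD / baud. A trivial illustration:

#include <stdio.h>

int main(void)
{
        unsigned int base_baud = 1843200 / 16;

        printf("divisor for 9600 baud:   %u\n", base_baud / 9600);   /* 12 */
        printf("divisor for 115200 baud: %u\n", base_baud / 115200); /* 1  */
        return 0;
}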
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
new file mode 100644
index 000000000000..f12d37237465
--- /dev/null
+++ b/arch/x86/include/asm/setup.h
@@ -0,0 +1,105 @@
1#ifndef _ASM_X86_SETUP_H
2#define _ASM_X86_SETUP_H
3
4#define COMMAND_LINE_SIZE 2048
5
6#ifndef __ASSEMBLY__
7
8/* Interrupt control for vSMPowered x86_64 systems */
9void vsmp_init(void);
10
11#ifdef CONFIG_X86_VISWS
12extern void visws_early_detect(void);
13extern int is_visws_box(void);
14#else
15static inline void visws_early_detect(void) { }
16static inline int is_visws_box(void) { return 0; }
17#endif
18
19/*
20 * Any setup quirks to be performed?
21 */
22struct mpc_config_processor;
23struct mpc_config_bus;
24struct mp_config_oemtable;
25struct x86_quirks {
26 int (*arch_pre_time_init)(void);
27 int (*arch_time_init)(void);
28 int (*arch_pre_intr_init)(void);
29 int (*arch_intr_init)(void);
30 int (*arch_trap_init)(void);
31 char * (*arch_memory_setup)(void);
32 int (*mach_get_smp_config)(unsigned int early);
33 int (*mach_find_smp_config)(unsigned int reserve);
34
35 int *mpc_record;
36 int (*mpc_apic_id)(struct mpc_config_processor *m);
37 void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name);
38 void (*mpc_oem_pci_bus)(struct mpc_config_bus *m);
39 void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable,
40 unsigned short oemsize);
41 int (*setup_ioapic_ids)(void);
42};
43
44extern struct x86_quirks *x86_quirks;
45extern unsigned long saved_video_mode;
46
47#ifndef CONFIG_PARAVIRT
48#define paravirt_post_allocator_init() do {} while (0)
49#endif
50#endif /* __ASSEMBLY__ */
51
52#ifdef __KERNEL__
53
54#ifdef __i386__
55
56#include <linux/pfn.h>
57/*
58 * Reserved space for vmalloc and iomap - defined in asm/page.h
59 */
60#define MAXMEM_PFN PFN_DOWN(MAXMEM)
61#define MAX_NONPAE_PFN (1 << 20)
62
63#endif /* __i386__ */
64
65#define PARAM_SIZE 4096 /* sizeof(struct boot_params) */
66
67#define OLD_CL_MAGIC 0xA33F
68#define OLD_CL_ADDRESS 0x020 /* Relative to real mode data */
69#define NEW_CL_POINTER 0x228 /* Relative to real mode data */
70
71#ifndef __ASSEMBLY__
72#include <asm/bootparam.h>
73
74#ifndef _SETUP
75
76/*
77 * This is set up by the setup-routine at boot-time
78 */
79extern struct boot_params boot_params;
80
81/*
82 * Do NOT EVER look at the BIOS memory size location.
83 * It does not work on many machines.
84 */
85#define LOWMEMSIZE() (0x9f000)
86
87#ifdef __i386__
88
89void __init i386_start_kernel(void);
90extern void probe_roms(void);
91
92extern unsigned long init_pg_tables_start;
93extern unsigned long init_pg_tables_end;
94
95#else
96void __init x86_64_init_pda(void);
97void __init x86_64_start_kernel(char *real_mode);
98void __init x86_64_start_reservations(char *real_mode_data);
99
100#endif /* __i386__ */
101#endif /* _SETUP */
102#endif /* __ASSEMBLY__ */
103#endif /* __KERNEL__ */
104
105#endif /* _ASM_X86_SETUP_H */
diff --git a/arch/x86/include/asm/shmbuf.h b/arch/x86/include/asm/shmbuf.h
new file mode 100644
index 000000000000..b51413b74971
--- /dev/null
+++ b/arch/x86/include/asm/shmbuf.h
@@ -0,0 +1,51 @@
1#ifndef _ASM_X86_SHMBUF_H
2#define _ASM_X86_SHMBUF_H
3
4/*
5 * The shmid64_ds structure for x86 architecture.
6 * Note extra padding because this structure is passed back and forth
7 * between kernel and user space.
8 *
9 * Pad space on 32 bit is left for:
10 * - 64-bit time_t to solve y2038 problem
11 * - 2 miscellaneous 32-bit values
12 *
13 * Pad space on 64 bit is left for:
14 * - 2 miscellaneous 64-bit values
15 */
16
17struct shmid64_ds {
18 struct ipc64_perm shm_perm; /* operation perms */
19 size_t shm_segsz; /* size of segment (bytes) */
20 __kernel_time_t shm_atime; /* last attach time */
21#ifdef __i386__
22 unsigned long __unused1;
23#endif
24 __kernel_time_t shm_dtime; /* last detach time */
25#ifdef __i386__
26 unsigned long __unused2;
27#endif
28 __kernel_time_t shm_ctime; /* last change time */
29#ifdef __i386__
30 unsigned long __unused3;
31#endif
32 __kernel_pid_t shm_cpid; /* pid of creator */
33 __kernel_pid_t shm_lpid; /* pid of last operator */
34 unsigned long shm_nattch; /* no. of current attaches */
35 unsigned long __unused4;
36 unsigned long __unused5;
37};
38
39struct shminfo64 {
40 unsigned long shmmax;
41 unsigned long shmmin;
42 unsigned long shmmni;
43 unsigned long shmseg;
44 unsigned long shmall;
45 unsigned long __unused1;
46 unsigned long __unused2;
47 unsigned long __unused3;
48 unsigned long __unused4;
49};
50
51#endif /* _ASM_X86_SHMBUF_H */
diff --git a/arch/x86/include/asm/shmparam.h b/arch/x86/include/asm/shmparam.h
new file mode 100644
index 000000000000..0880cf0917b9
--- /dev/null
+++ b/arch/x86/include/asm/shmparam.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_X86_SHMPARAM_H
2#define _ASM_X86_SHMPARAM_H
3
4#define SHMLBA PAGE_SIZE /* attach addr a multiple of this */
5
6#endif /* _ASM_X86_SHMPARAM_H */
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
new file mode 100644
index 000000000000..0afcb5e58acc
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext.h
@@ -0,0 +1,284 @@
1#ifndef _ASM_X86_SIGCONTEXT_H
2#define _ASM_X86_SIGCONTEXT_H
3
4#include <linux/compiler.h>
5#include <asm/types.h>
6
7#define FP_XSTATE_MAGIC1 0x46505853U
8#define FP_XSTATE_MAGIC2 0x46505845U
9#define FP_XSTATE_MAGIC2_SIZE sizeof(FP_XSTATE_MAGIC2)
10
11/*
12 * bytes 464..511 in the current 512byte layout of fxsave/fxrstor frame
13 * are reserved for SW usage. On CPUs supporting xsave/xrstor, these bytes
14 * are used to extend the fpstate pointer in the sigcontext, which now
15 * includes the extended state information along with fpstate information.
16 *
17 * Presence of FP_XSTATE_MAGIC1 at the beginning of this SW reserved
18 * area and FP_XSTATE_MAGIC2 at the end of memory layout
19 * (extended_size - FP_XSTATE_MAGIC2_SIZE) indicates the presence of the
20 * extended state information in the memory layout pointed by the fpstate
21 * pointer in sigcontext.
22 */
23struct _fpx_sw_bytes {
24 __u32 magic1; /* FP_XSTATE_MAGIC1 */
25 __u32 extended_size; /* total size of the layout referred by
26 * fpstate pointer in the sigcontext.
27 */
28 __u64 xstate_bv;
29 /* feature bit mask (including fp/sse/extended
30 * state) that is present in the memory
31 * layout.
32 */
33 __u32 xstate_size; /* actual xsave state size, based on the
34 * features saved in the layout.
35 * 'extended_size' will be greater than
36 * 'xstate_size'.
37 */
38 __u32 padding[7]; /* for future use. */
39};
40
41#ifdef __i386__
42/*
43 * As documented in the iBCS2 standard..
44 *
45 * The first part of "struct _fpstate" is just the normal i387
46 * hardware setup, the extra "status" word is used to save the
47 * coprocessor status word before entering the handler.
48 *
49 * Pentium III FXSR, SSE support
50 * Gareth Hughes <gareth@valinux.com>, May 2000
51 *
52 * The FPU state data structure has had to grow to accommodate the
53 * extended FPU state required by the Streaming SIMD Extensions.
54 * There is no documented standard to accomplish this at the moment.
55 */
56struct _fpreg {
57 unsigned short significand[4];
58 unsigned short exponent;
59};
60
61struct _fpxreg {
62 unsigned short significand[4];
63 unsigned short exponent;
64 unsigned short padding[3];
65};
66
67struct _xmmreg {
68 unsigned long element[4];
69};
70
71struct _fpstate {
72 /* Regular FPU environment */
73 unsigned long cw;
74 unsigned long sw;
75 unsigned long tag;
76 unsigned long ipoff;
77 unsigned long cssel;
78 unsigned long dataoff;
79 unsigned long datasel;
80 struct _fpreg _st[8];
81 unsigned short status;
82 unsigned short magic; /* 0xffff = regular FPU data only */
83
84 /* FXSR FPU environment */
85 unsigned long _fxsr_env[6]; /* FXSR FPU env is ignored */
86 unsigned long mxcsr;
87 unsigned long reserved;
88 struct _fpxreg _fxsr_st[8]; /* FXSR FPU reg data is ignored */
89 struct _xmmreg _xmm[8];
90 unsigned long padding1[44];
91
92 union {
93 unsigned long padding2[12];
94 struct _fpx_sw_bytes sw_reserved; /* represents the extended
95 * state info */
96 };
97};
98
99#define X86_FXSR_MAGIC 0x0000
100
101#ifdef __KERNEL__
102struct sigcontext {
103 unsigned short gs, __gsh;
104 unsigned short fs, __fsh;
105 unsigned short es, __esh;
106 unsigned short ds, __dsh;
107 unsigned long di;
108 unsigned long si;
109 unsigned long bp;
110 unsigned long sp;
111 unsigned long bx;
112 unsigned long dx;
113 unsigned long cx;
114 unsigned long ax;
115 unsigned long trapno;
116 unsigned long err;
117 unsigned long ip;
118 unsigned short cs, __csh;
119 unsigned long flags;
120 unsigned long sp_at_signal;
121 unsigned short ss, __ssh;
122
123 /*
124 * fpstate is really (struct _fpstate *) or (struct _xstate *)
125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
127 * of extended memory layout. See comments at the definition of
128 * (struct _fpx_sw_bytes)
129 */
130 void __user *fpstate; /* zero when no FPU/extended context */
131 unsigned long oldmask;
132 unsigned long cr2;
133};
134#else /* __KERNEL__ */
135/*
136 * User-space might still rely on the old definition:
137 */
138struct sigcontext {
139 unsigned short gs, __gsh;
140 unsigned short fs, __fsh;
141 unsigned short es, __esh;
142 unsigned short ds, __dsh;
143 unsigned long edi;
144 unsigned long esi;
145 unsigned long ebp;
146 unsigned long esp;
147 unsigned long ebx;
148 unsigned long edx;
149 unsigned long ecx;
150 unsigned long eax;
151 unsigned long trapno;
152 unsigned long err;
153 unsigned long eip;
154 unsigned short cs, __csh;
155 unsigned long eflags;
156 unsigned long esp_at_signal;
157 unsigned short ss, __ssh;
158 struct _fpstate __user *fpstate;
159 unsigned long oldmask;
160 unsigned long cr2;
161};
162#endif /* !__KERNEL__ */
163
164#else /* __i386__ */
165
166/* FXSAVE frame */
167/* Note: reserved1/2 may someday contain valuable data. Always save/restore
168 them when you change signal frames. */
169struct _fpstate {
170 __u16 cwd;
171 __u16 swd;
172 __u16 twd; /* Note this is not the same as the
173 32bit/x87/FSAVE twd */
174 __u16 fop;
175 __u64 rip;
176 __u64 rdp;
177 __u32 mxcsr;
178 __u32 mxcsr_mask;
179 __u32 st_space[32]; /* 8*16 bytes for each FP-reg */
180 __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg */
181 __u32 reserved2[12];
182 union {
183 __u32 reserved3[12];
184 struct _fpx_sw_bytes sw_reserved; /* represents the extended
185 * state information */
186 };
187};
188
189#ifdef __KERNEL__
190struct sigcontext {
191 unsigned long r8;
192 unsigned long r9;
193 unsigned long r10;
194 unsigned long r11;
195 unsigned long r12;
196 unsigned long r13;
197 unsigned long r14;
198 unsigned long r15;
199 unsigned long di;
200 unsigned long si;
201 unsigned long bp;
202 unsigned long bx;
203 unsigned long dx;
204 unsigned long ax;
205 unsigned long cx;
206 unsigned long sp;
207 unsigned long ip;
208 unsigned long flags;
209 unsigned short cs;
210 unsigned short gs;
211 unsigned short fs;
212 unsigned short __pad0;
213 unsigned long err;
214 unsigned long trapno;
215 unsigned long oldmask;
216 unsigned long cr2;
217
218 /*
219 * fpstate is really (struct _fpstate *) or (struct _xstate *)
220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
222 * of extended memory layout. See comments at the definition of
223 * (struct _fpx_sw_bytes)
224 */
225 void __user *fpstate; /* zero when no FPU/extended context */
226 unsigned long reserved1[8];
227};
228#else /* __KERNEL__ */
229/*
230 * User-space might still rely on the old definition:
231 */
232struct sigcontext {
233 unsigned long r8;
234 unsigned long r9;
235 unsigned long r10;
236 unsigned long r11;
237 unsigned long r12;
238 unsigned long r13;
239 unsigned long r14;
240 unsigned long r15;
241 unsigned long rdi;
242 unsigned long rsi;
243 unsigned long rbp;
244 unsigned long rbx;
245 unsigned long rdx;
246 unsigned long rax;
247 unsigned long rcx;
248 unsigned long rsp;
249 unsigned long rip;
250 unsigned long eflags; /* RFLAGS */
251 unsigned short cs;
252 unsigned short gs;
253 unsigned short fs;
254 unsigned short __pad0;
255 unsigned long err;
256 unsigned long trapno;
257 unsigned long oldmask;
258 unsigned long cr2;
259 struct _fpstate __user *fpstate; /* zero when no FPU context */
260 unsigned long reserved1[8];
261};
262#endif /* !__KERNEL__ */
263
264#endif /* !__i386__ */
265
266struct _xsave_hdr {
267 __u64 xstate_bv;
268 __u64 reserved1[2];
269 __u64 reserved2[5];
270};
271
272/*
273 * Extended state pointed by the fpstate pointer in the sigcontext.
274 * In addition to the fpstate, information encoded in the xstate_hdr
275 * indicates the presence of other extended state information
276 * supported by the processor and OS.
277 */
278struct _xstate {
279 struct _fpstate fpstate;
280 struct _xsave_hdr xstate_hdr;
281 /* new processor state extensions go here */
282};
283
284#endif /* _ASM_X86_SIGCONTEXT_H */
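
A sketch (illustrative, not part of the patch) of how a signal handler could use the magic values above to decide whether the fpstate area carries extended xstate; it assumes the struct _fpx_sw_bytes definition from this header and elides error handling:

#include <stdint.h>
#include <string.h>

static int fpstate_has_xstate(const void *fpstate, const struct _fpx_sw_bytes *sw)
{
        uint32_t magic2;

        if (sw->magic1 != FP_XSTATE_MAGIC1)
                return 0;               /* legacy FXSAVE-only frame */

        /* FP_XSTATE_MAGIC2 sits at the very end of the extended layout */
        memcpy(&magic2,
               (const char *)fpstate + sw->extended_size - FP_XSTATE_MAGIC2_SIZE,
               sizeof(magic2));
        return magic2 == FP_XSTATE_MAGIC2;
}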
diff --git a/arch/x86/include/asm/sigcontext32.h b/arch/x86/include/asm/sigcontext32.h
new file mode 100644
index 000000000000..6126188cf3a9
--- /dev/null
+++ b/arch/x86/include/asm/sigcontext32.h
@@ -0,0 +1,75 @@
1#ifndef _ASM_X86_SIGCONTEXT32_H
2#define _ASM_X86_SIGCONTEXT32_H
3
4/* signal context for 32bit programs. */
5
6#define X86_FXSR_MAGIC 0x0000
7
8struct _fpreg {
9 unsigned short significand[4];
10 unsigned short exponent;
11};
12
13struct _fpxreg {
14 unsigned short significand[4];
15 unsigned short exponent;
16 unsigned short padding[3];
17};
18
19struct _xmmreg {
20 __u32 element[4];
21};
22
23/* FSAVE frame with extensions */
24struct _fpstate_ia32 {
25 /* Regular FPU environment */
26 __u32 cw;
27 __u32 sw;
28 __u32 tag; /* not compatible to 64bit twd */
29 __u32 ipoff;
30 __u32 cssel;
31 __u32 dataoff;
32 __u32 datasel;
33 struct _fpreg _st[8];
34 unsigned short status;
35 unsigned short magic; /* 0xffff = regular FPU data only */
36
37 /* FXSR FPU environment */
38 __u32 _fxsr_env[6];
39 __u32 mxcsr;
40 __u32 reserved;
41 struct _fpxreg _fxsr_st[8];
42 struct _xmmreg _xmm[8]; /* It's actually 16 */
43 __u32 padding[44];
44 union {
45 __u32 padding2[12];
46 struct _fpx_sw_bytes sw_reserved;
47 };
48};
49
50struct sigcontext_ia32 {
51 unsigned short gs, __gsh;
52 unsigned short fs, __fsh;
53 unsigned short es, __esh;
54 unsigned short ds, __dsh;
55 unsigned int di;
56 unsigned int si;
57 unsigned int bp;
58 unsigned int sp;
59 unsigned int bx;
60 unsigned int dx;
61 unsigned int cx;
62 unsigned int ax;
63 unsigned int trapno;
64 unsigned int err;
65 unsigned int ip;
66 unsigned short cs, __csh;
67 unsigned int flags;
68 unsigned int sp_at_signal;
69 unsigned short ss, __ssh;
70 unsigned int fpstate; /* really (struct _fpstate_ia32 *) */
71 unsigned int oldmask;
72 unsigned int cr2;
73};
74
75#endif /* _ASM_X86_SIGCONTEXT32_H */
diff --git a/arch/x86/include/asm/siginfo.h b/arch/x86/include/asm/siginfo.h
new file mode 100644
index 000000000000..fc1aa5535646
--- /dev/null
+++ b/arch/x86/include/asm/siginfo.h
@@ -0,0 +1,10 @@
1#ifndef _ASM_X86_SIGINFO_H
2#define _ASM_X86_SIGINFO_H
3
4#ifdef __x86_64__
5# define __ARCH_SI_PREAMBLE_SIZE (4 * sizeof(int))
6#endif
7
8#include <asm-generic/siginfo.h>
9
10#endif /* _ASM_X86_SIGINFO_H */
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
new file mode 100644
index 000000000000..96ac44f275da
--- /dev/null
+++ b/arch/x86/include/asm/signal.h
@@ -0,0 +1,262 @@
1#ifndef _ASM_X86_SIGNAL_H
2#define _ASM_X86_SIGNAL_H
3
4#ifndef __ASSEMBLY__
5#include <linux/types.h>
6#include <linux/time.h>
7#include <linux/compiler.h>
8
9/* Avoid too many header ordering problems. */
10struct siginfo;
11
12#ifdef __KERNEL__
13#include <linux/linkage.h>
14
15/* Most things should be clean enough to redefine this at will, if care
16 is taken to make libc match. */
17
18#define _NSIG 64
19
20#ifdef __i386__
21# define _NSIG_BPW 32
22#else
23# define _NSIG_BPW 64
24#endif
25
26#define _NSIG_WORDS (_NSIG / _NSIG_BPW)
27
28typedef unsigned long old_sigset_t; /* at least 32 bits */
29
30typedef struct {
31 unsigned long sig[_NSIG_WORDS];
32} sigset_t;
33
34#else
35/* Here we must cater to libcs that poke about in kernel headers. */
36
37#define NSIG 32
38typedef unsigned long sigset_t;
39
40#endif /* __KERNEL__ */
41#endif /* __ASSEMBLY__ */
42
43#define SIGHUP 1
44#define SIGINT 2
45#define SIGQUIT 3
46#define SIGILL 4
47#define SIGTRAP 5
48#define SIGABRT 6
49#define SIGIOT 6
50#define SIGBUS 7
51#define SIGFPE 8
52#define SIGKILL 9
53#define SIGUSR1 10
54#define SIGSEGV 11
55#define SIGUSR2 12
56#define SIGPIPE 13
57#define SIGALRM 14
58#define SIGTERM 15
59#define SIGSTKFLT 16
60#define SIGCHLD 17
61#define SIGCONT 18
62#define SIGSTOP 19
63#define SIGTSTP 20
64#define SIGTTIN 21
65#define SIGTTOU 22
66#define SIGURG 23
67#define SIGXCPU 24
68#define SIGXFSZ 25
69#define SIGVTALRM 26
70#define SIGPROF 27
71#define SIGWINCH 28
72#define SIGIO 29
73#define SIGPOLL SIGIO
74/*
75#define SIGLOST 29
76*/
77#define SIGPWR 30
78#define SIGSYS 31
79#define SIGUNUSED 31
80
81/* These should not be considered constants from userland. */
82#define SIGRTMIN 32
83#define SIGRTMAX _NSIG
84
85/*
86 * SA_FLAGS values:
87 *
88 * SA_ONSTACK indicates that a registered stack_t will be used.
89 * SA_RESTART flag to get restarting signals (which were the default long ago)
90 * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop.
91 * SA_RESETHAND clears the handler when the signal is delivered.
92 * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies.
93 * SA_NODEFER prevents the current signal from being masked in the handler.
94 *
95 * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single
96 * Unix names RESETHAND and NODEFER respectively.
97 */
98#define SA_NOCLDSTOP 0x00000001u
99#define SA_NOCLDWAIT 0x00000002u
100#define SA_SIGINFO 0x00000004u
101#define SA_ONSTACK 0x08000000u
102#define SA_RESTART 0x10000000u
103#define SA_NODEFER 0x40000000u
104#define SA_RESETHAND 0x80000000u
105
106#define SA_NOMASK SA_NODEFER
107#define SA_ONESHOT SA_RESETHAND
108
109#define SA_RESTORER 0x04000000
110
111/*
112 * sigaltstack controls
113 */
114#define SS_ONSTACK 1
115#define SS_DISABLE 2
116
117#define MINSIGSTKSZ 2048
118#define SIGSTKSZ 8192
119
120#include <asm-generic/signal.h>
121
122#ifndef __ASSEMBLY__
123
124#ifdef __i386__
125# ifdef __KERNEL__
126struct old_sigaction {
127 __sighandler_t sa_handler;
128 old_sigset_t sa_mask;
129 unsigned long sa_flags;
130 __sigrestore_t sa_restorer;
131};
132
133struct sigaction {
134 __sighandler_t sa_handler;
135 unsigned long sa_flags;
136 __sigrestore_t sa_restorer;
137 sigset_t sa_mask; /* mask last for extensibility */
138};
139
140struct k_sigaction {
141 struct sigaction sa;
142};
143
144extern void do_notify_resume(struct pt_regs *, void *, __u32);
145
146# else /* __KERNEL__ */
147/* Here we must cater to libcs that poke about in kernel headers. */
148
149struct sigaction {
150 union {
151 __sighandler_t _sa_handler;
152 void (*_sa_sigaction)(int, struct siginfo *, void *);
153 } _u;
154 sigset_t sa_mask;
155 unsigned long sa_flags;
156 void (*sa_restorer)(void);
157};
158
159#define sa_handler _u._sa_handler
160#define sa_sigaction _u._sa_sigaction
161
162# endif /* ! __KERNEL__ */
163#else /* __i386__ */
164
165struct sigaction {
166 __sighandler_t sa_handler;
167 unsigned long sa_flags;
168 __sigrestore_t sa_restorer;
169 sigset_t sa_mask; /* mask last for extensibility */
170};
171
172struct k_sigaction {
173 struct sigaction sa;
174};
175
176#endif /* !__i386__ */
177
178typedef struct sigaltstack {
179 void __user *ss_sp;
180 int ss_flags;
181 size_t ss_size;
182} stack_t;
183
184#ifdef __KERNEL__
185#include <asm/sigcontext.h>
186
187#ifdef __i386__
188
189#define __HAVE_ARCH_SIG_BITOPS
190
191#define sigaddset(set,sig) \
192 (__builtin_constant_p(sig) \
193 ? __const_sigaddset((set), (sig)) \
194 : __gen_sigaddset((set), (sig)))
195
196static inline void __gen_sigaddset(sigset_t *set, int _sig)
197{
198 asm("btsl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
199}
200
201static inline void __const_sigaddset(sigset_t *set, int _sig)
202{
203 unsigned long sig = _sig - 1;
204 set->sig[sig / _NSIG_BPW] |= 1 << (sig % _NSIG_BPW);
205}
206
207#define sigdelset(set, sig) \
208 (__builtin_constant_p(sig) \
209 ? __const_sigdelset((set), (sig)) \
210 : __gen_sigdelset((set), (sig)))
211
212
213static inline void __gen_sigdelset(sigset_t *set, int _sig)
214{
215 asm("btrl %1,%0" : "+m"(*set) : "Ir"(_sig - 1) : "cc");
216}
217
218static inline void __const_sigdelset(sigset_t *set, int _sig)
219{
220 unsigned long sig = _sig - 1;
221 set->sig[sig / _NSIG_BPW] &= ~(1 << (sig % _NSIG_BPW));
222}
223
224static inline int __const_sigismember(sigset_t *set, int _sig)
225{
226 unsigned long sig = _sig - 1;
227 return 1 & (set->sig[sig / _NSIG_BPW] >> (sig % _NSIG_BPW));
228}
229
230static inline int __gen_sigismember(sigset_t *set, int _sig)
231{
232 int ret;
233 asm("btl %2,%1\n\tsbbl %0,%0"
234 : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc");
235 return ret;
236}
237
238#define sigismember(set, sig) \
239 (__builtin_constant_p(sig) \
240 ? __const_sigismember((set), (sig)) \
241 : __gen_sigismember((set), (sig)))
242
243static inline int sigfindinword(unsigned long word)
244{
245 asm("bsfl %1,%0" : "=r"(word) : "rm"(word) : "cc");
246 return word;
247}
248
249struct pt_regs;
250
251#else /* __i386__ */
252
253#undef __HAVE_ARCH_SIG_BITOPS
254
255#endif /* !__i386__ */
256
257#define ptrace_signal_deliver(regs, cookie) do { } while (0)
258
259#endif /* __KERNEL__ */
260#endif /* __ASSEMBLY__ */
261
262#endif /* _ASM_X86_SIGNAL_H */
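
For reference, the word/bit arithmetic behind __const_sigaddset() and friends above, shown for SIGRTMIN (32) with the 32-bit _NSIG_BPW used on i386 (illustrative user-space snippet):

#include <stdio.h>

#define _NSIG_BPW 32

int main(void)
{
        unsigned long sig = 32 - 1;     /* SIGRTMIN, converted to 0-based */

        printf("word %lu, bit %lu\n",
               sig / _NSIG_BPW, sig % _NSIG_BPW);       /* word 0, bit 31 */
        return 0;
}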
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
new file mode 100644
index 000000000000..2766021aef80
--- /dev/null
+++ b/arch/x86/include/asm/smp.h
@@ -0,0 +1,229 @@
1#ifndef _ASM_X86_SMP_H
2#define _ASM_X86_SMP_H
3#ifndef __ASSEMBLY__
4#include <linux/cpumask.h>
5#include <linux/init.h>
6#include <asm/percpu.h>
7
8/*
9 * We need the APIC definitions automatically as part of 'smp.h'
10 */
11#ifdef CONFIG_X86_LOCAL_APIC
12# include <asm/mpspec.h>
13# include <asm/apic.h>
14# ifdef CONFIG_X86_IO_APIC
15# include <asm/io_apic.h>
16# endif
17#endif
18#include <asm/pda.h>
19#include <asm/thread_info.h>
20
21extern cpumask_t cpu_callout_map;
22extern cpumask_t cpu_initialized;
23extern cpumask_t cpu_callin_map;
24
25extern void (*mtrr_hook)(void);
26extern void zap_low_mappings(void);
27
28extern int __cpuinit get_local_pda(int cpu);
29
30extern int smp_num_siblings;
31extern unsigned int num_processors;
32extern cpumask_t cpu_initialized;
33
34DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
35DECLARE_PER_CPU(cpumask_t, cpu_core_map);
36DECLARE_PER_CPU(u16, cpu_llc_id);
37#ifdef CONFIG_X86_32
38DECLARE_PER_CPU(int, cpu_number);
39#endif
40
41DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
42DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
43
44/* Static state in head.S used to set up a CPU */
45extern struct {
46 void *sp;
47 unsigned short ss;
48} stack_start;
49
50struct smp_ops {
51 void (*smp_prepare_boot_cpu)(void);
52 void (*smp_prepare_cpus)(unsigned max_cpus);
53 void (*smp_cpus_done)(unsigned max_cpus);
54
55 void (*smp_send_stop)(void);
56 void (*smp_send_reschedule)(int cpu);
57
58 int (*cpu_up)(unsigned cpu);
59 int (*cpu_disable)(void);
60 void (*cpu_die)(unsigned int cpu);
61 void (*play_dead)(void);
62
63 void (*send_call_func_ipi)(cpumask_t mask);
64 void (*send_call_func_single_ipi)(int cpu);
65};
66
67/* Globals due to paravirt */
68extern void set_cpu_sibling_map(int cpu);
69
70#ifdef CONFIG_SMP
71#ifndef CONFIG_PARAVIRT
72#define startup_ipi_hook(phys_apicid, start_eip, start_esp) do { } while (0)
73#endif
74extern struct smp_ops smp_ops;
75
76static inline void smp_send_stop(void)
77{
78 smp_ops.smp_send_stop();
79}
80
81static inline void smp_prepare_boot_cpu(void)
82{
83 smp_ops.smp_prepare_boot_cpu();
84}
85
86static inline void smp_prepare_cpus(unsigned int max_cpus)
87{
88 smp_ops.smp_prepare_cpus(max_cpus);
89}
90
91static inline void smp_cpus_done(unsigned int max_cpus)
92{
93 smp_ops.smp_cpus_done(max_cpus);
94}
95
96static inline int __cpu_up(unsigned int cpu)
97{
98 return smp_ops.cpu_up(cpu);
99}
100
101static inline int __cpu_disable(void)
102{
103 return smp_ops.cpu_disable();
104}
105
106static inline void __cpu_die(unsigned int cpu)
107{
108 smp_ops.cpu_die(cpu);
109}
110
111static inline void play_dead(void)
112{
113 smp_ops.play_dead();
114}
115
116static inline void smp_send_reschedule(int cpu)
117{
118 smp_ops.smp_send_reschedule(cpu);
119}
120
121static inline void arch_send_call_function_single_ipi(int cpu)
122{
123 smp_ops.send_call_func_single_ipi(cpu);
124}
125
126static inline void arch_send_call_function_ipi(cpumask_t mask)
127{
128 smp_ops.send_call_func_ipi(mask);
129}
130
131void cpu_disable_common(void);
132void native_smp_prepare_boot_cpu(void);
133void native_smp_prepare_cpus(unsigned int max_cpus);
134void native_smp_cpus_done(unsigned int max_cpus);
135int native_cpu_up(unsigned int cpunum);
136int native_cpu_disable(void);
137void native_cpu_die(unsigned int cpu);
138void native_play_dead(void);
139void play_dead_common(void);
140
141void native_send_call_func_ipi(cpumask_t mask);
142void native_send_call_func_single_ipi(int cpu);
143
144extern void prefill_possible_map(void);
145
146void smp_store_cpu_info(int id);
147#define cpu_physical_id(cpu) per_cpu(x86_cpu_to_apicid, cpu)
148
149/* We don't mark CPUs online until __cpu_up(), so we need another measure */
150static inline int num_booting_cpus(void)
151{
152 return cpus_weight(cpu_callout_map);
153}
154#else
155static inline void prefill_possible_map(void)
156{
157}
158#endif /* CONFIG_SMP */
159
160extern unsigned disabled_cpus __cpuinitdata;
161
162#ifdef CONFIG_X86_32_SMP
163/*
164 * This function is needed by all SMP systems. It must _always_ be valid
165 * from the initial startup. We map APIC_BASE very early in page_setup(),
166 * so this is correct in the x86 case.
167 */
168#define raw_smp_processor_id() (x86_read_percpu(cpu_number))
169extern int safe_smp_processor_id(void);
170
171#elif defined(CONFIG_X86_64_SMP)
172#define raw_smp_processor_id() read_pda(cpunumber)
173
174#define stack_smp_processor_id() \
175({ \
176 struct thread_info *ti; \
177 __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \
178 ti->cpu; \
179})
180#define safe_smp_processor_id() smp_processor_id()
181
182#else /* !CONFIG_X86_32_SMP && !CONFIG_X86_64_SMP */
183#define cpu_physical_id(cpu) boot_cpu_physical_apicid
184#define safe_smp_processor_id() 0
185#define stack_smp_processor_id() 0
186#endif
187
188#ifdef CONFIG_X86_LOCAL_APIC
189
190#ifndef CONFIG_X86_64
191static inline int logical_smp_processor_id(void)
192{
193 /* we don't want to mark this access volatile - bad code generation */
194 return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
195}
196
197#include <mach_apicdef.h>
198static inline unsigned int read_apic_id(void)
199{
200 unsigned int reg;
201
202 reg = *(u32 *)(APIC_BASE + APIC_ID);
203
204 return GET_APIC_ID(reg);
205}
206#endif
207
208
209# if defined(APIC_DEFINITION) || defined(CONFIG_X86_64)
210extern int hard_smp_processor_id(void);
211# else
212#include <mach_apicdef.h>
213static inline int hard_smp_processor_id(void)
214{
215 /* we don't want to mark this access volatile - bad code generation */
216 return read_apic_id();
217}
218# endif /* APIC_DEFINITION */
219
220#else /* CONFIG_X86_LOCAL_APIC */
221
222# ifndef CONFIG_SMP
223# define hard_smp_processor_id() 0
224# endif
225
226#endif /* CONFIG_X86_LOCAL_APIC */
227
228#endif /* __ASSEMBLY__ */
229#endif /* _ASM_X86_SMP_H */
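
The smp_ops structure above is the usual ops-table indirection: a struct of function pointers that the native implementation fills in and paravirt back ends can override. A minimal stand-alone sketch of the pattern (names are hypothetical, not kernel symbols):

#include <stdio.h>

struct demo_smp_ops {
        void (*send_reschedule)(int cpu);
};

static void native_send_reschedule(int cpu)
{
        printf("reschedule IPI to cpu %d\n", cpu);
}

static struct demo_smp_ops demo_ops = {
        .send_reschedule = native_send_reschedule,
};

int main(void)
{
        demo_ops.send_reschedule(1);    /* callers only ever go through the ops table */
        return 0;
}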
diff --git a/arch/x86/include/asm/socket.h b/arch/x86/include/asm/socket.h
new file mode 100644
index 000000000000..8ab9cc8b2ecc
--- /dev/null
+++ b/arch/x86/include/asm/socket.h
@@ -0,0 +1,57 @@
1#ifndef _ASM_X86_SOCKET_H
2#define _ASM_X86_SOCKET_H
3
4#include <asm/sockios.h>
5
6/* For setsockopt(2) */
7#define SOL_SOCKET 1
8
9#define SO_DEBUG 1
10#define SO_REUSEADDR 2
11#define SO_TYPE 3
12#define SO_ERROR 4
13#define SO_DONTROUTE 5
14#define SO_BROADCAST 6
15#define SO_SNDBUF 7
16#define SO_RCVBUF 8
17#define SO_SNDBUFFORCE 32
18#define SO_RCVBUFFORCE 33
19#define SO_KEEPALIVE 9
20#define SO_OOBINLINE 10
21#define SO_NO_CHECK 11
22#define SO_PRIORITY 12
23#define SO_LINGER 13
24#define SO_BSDCOMPAT 14
25/* To add :#define SO_REUSEPORT 15 */
26#define SO_PASSCRED 16
27#define SO_PEERCRED 17
28#define SO_RCVLOWAT 18
29#define SO_SNDLOWAT 19
30#define SO_RCVTIMEO 20
31#define SO_SNDTIMEO 21
32
33/* Security levels - as per NRL IPv6 - don't actually do anything */
34#define SO_SECURITY_AUTHENTICATION 22
35#define SO_SECURITY_ENCRYPTION_TRANSPORT 23
36#define SO_SECURITY_ENCRYPTION_NETWORK 24
37
38#define SO_BINDTODEVICE 25
39
40/* Socket filtering */
41#define SO_ATTACH_FILTER 26
42#define SO_DETACH_FILTER 27
43
44#define SO_PEERNAME 28
45#define SO_TIMESTAMP 29
46#define SCM_TIMESTAMP SO_TIMESTAMP
47
48#define SO_ACCEPTCONN 30
49
50#define SO_PEERSEC 31
51#define SO_PASSSEC 34
52#define SO_TIMESTAMPNS 35
53#define SCM_TIMESTAMPNS SO_TIMESTAMPNS
54
55#define SO_MARK 36
56
57#endif /* _ASM_X86_SOCKET_H */
diff --git a/arch/x86/include/asm/sockios.h b/arch/x86/include/asm/sockios.h
new file mode 100644
index 000000000000..49cc72b5d3c9
--- /dev/null
+++ b/arch/x86/include/asm/sockios.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_X86_SOCKIOS_H
2#define _ASM_X86_SOCKIOS_H
3
4/* Socket-level I/O control calls. */
5#define FIOSETOWN 0x8901
6#define SIOCSPGRP 0x8902
7#define FIOGETOWN 0x8903
8#define SIOCGPGRP 0x8904
9#define SIOCATMARK 0x8905
10#define SIOCGSTAMP 0x8906 /* Get stamp (timeval) */
11#define SIOCGSTAMPNS 0x8907 /* Get stamp (timespec) */
12
13#endif /* _ASM_X86_SOCKIOS_H */
diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h
new file mode 100644
index 000000000000..be44f7dab395
--- /dev/null
+++ b/arch/x86/include/asm/sparsemem.h
@@ -0,0 +1,34 @@
1#ifndef _ASM_X86_SPARSEMEM_H
2#define _ASM_X86_SPARSEMEM_H
3
4#ifdef CONFIG_SPARSEMEM
5/*
6 * generic non-linear memory support:
7 *
8 * 1) we will not split memory into more chunks than will fit into the flags
9 * field of the struct page
10 *
11 * SECTION_SIZE_BITS 2^n: size of each section
12 * MAX_PHYSADDR_BITS 2^n: max size of physical address space
13 * MAX_PHYSMEM_BITS 2^n: how much memory we can have in that space
14 *
15 */
16
17#ifdef CONFIG_X86_32
18# ifdef CONFIG_X86_PAE
19# define SECTION_SIZE_BITS 29
20# define MAX_PHYSADDR_BITS 36
21# define MAX_PHYSMEM_BITS 36
22# else
23# define SECTION_SIZE_BITS 26
24# define MAX_PHYSADDR_BITS 32
25# define MAX_PHYSMEM_BITS 32
26# endif
27#else /* CONFIG_X86_32 */
28# define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */
29# define MAX_PHYSADDR_BITS 44
30# define MAX_PHYSMEM_BITS 44
31#endif
32
33#endif /* CONFIG_SPARSEMEM */
34#endif /* _ASM_X86_SPARSEMEM_H */
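
With the 64-bit values above (MAX_PHYSMEM_BITS 44, SECTION_SIZE_BITS 27), sparsemem divides the physical address space into 2^(44-27) = 131072 sections of 128 MB each; a one-line check:

#include <stdio.h>

int main(void)
{
        int max_physmem_bits = 44, section_size_bits = 27;

        printf("%lu sections of %lu MB\n",
               1UL << (max_physmem_bits - section_size_bits),   /* 131072 */
               (1UL << section_size_bits) >> 20);               /* 128    */
        return 0;
}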
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
new file mode 100644
index 000000000000..d17c91981da2
--- /dev/null
+++ b/arch/x86/include/asm/spinlock.h
@@ -0,0 +1,364 @@
1#ifndef _ASM_X86_SPINLOCK_H
2#define _ASM_X86_SPINLOCK_H
3
4#include <asm/atomic.h>
5#include <asm/rwlock.h>
6#include <asm/page.h>
7#include <asm/processor.h>
8#include <linux/compiler.h>
9#include <asm/paravirt.h>
10/*
11 * Your basic SMP spinlocks, allowing only a single CPU anywhere
12 *
13 * Simple spin lock operations. There are two variants, one clears IRQ's
14 * on the local processor, one does not.
15 *
16 * These are fair FIFO ticket locks, which are currently limited to 256
17 * CPUs.
18 *
19 * (the type definitions are in asm/spinlock_types.h)
20 */
21
22#ifdef CONFIG_X86_32
23# define LOCK_PTR_REG "a"
24# define REG_PTR_MODE "k"
25#else
26# define LOCK_PTR_REG "D"
27# define REG_PTR_MODE "q"
28#endif
29
30#if defined(CONFIG_X86_32) && \
31 (defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE))
32/*
33 * On PPro SMP or if we are using OOSTORE, we use a locked operation to unlock
34 * (PPro errata 66, 92)
35 */
36# define UNLOCK_LOCK_PREFIX LOCK_PREFIX
37#else
38# define UNLOCK_LOCK_PREFIX
39#endif
40
41/*
42 * Ticket locks are conceptually two parts, one indicating the current head of
43 * the queue, and the other indicating the current tail. The lock is acquired
44 * by atomically noting the tail and incrementing it by one (thus adding
45 * ourself to the queue and noting our position), then waiting until the head
46 * becomes equal to the initial value of the tail.
47 *
48 * We use an xadd covering *both* parts of the lock, to increment the tail and
49 * also load the position of the head, which takes care of memory ordering
50 * issues and should be optimal for the uncontended case. Note the tail must be
51 * in the high part, because a wide xadd increment of the low part would carry
52 * up and contaminate the high part.
53 *
54 * With fewer than 2^8 possible CPUs, we can use x86's partial registers to
55 * save some instructions and make the code more elegant. There really isn't
56 * much between them in performance though, especially as locks are out of line.
57 */
58#if (NR_CPUS < 256)
59#define TICKET_SHIFT 8
60
61static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
62{
63 short inc = 0x0100;
64
65 asm volatile (
66 LOCK_PREFIX "xaddw %w0, %1\n"
67 "1:\t"
68 "cmpb %h0, %b0\n\t"
69 "je 2f\n\t"
70 "rep ; nop\n\t"
71 "movb %1, %b0\n\t"
72 /* don't need lfence here, because loads are in-order */
73 "jmp 1b\n"
74 "2:"
75 : "+Q" (inc), "+m" (lock->slock)
76 :
77 : "memory", "cc");
78}
79
80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
81{
82 int tmp, new;
83
84 asm volatile("movzwl %2, %0\n\t"
85 "cmpb %h0,%b0\n\t"
86 "leal 0x100(%" REG_PTR_MODE "0), %1\n\t"
87 "jne 1f\n\t"
88 LOCK_PREFIX "cmpxchgw %w1,%2\n\t"
89 "1:"
90 "sete %b1\n\t"
91 "movzbl %b1,%0\n\t"
92 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
93 :
94 : "memory", "cc");
95
96 return tmp;
97}
98
99static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
100{
101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
102 : "+m" (lock->slock)
103 :
104 : "memory", "cc");
105}
106#else
107#define TICKET_SHIFT 16
108
109static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
110{
111 int inc = 0x00010000;
112 int tmp;
113
114 asm volatile(LOCK_PREFIX "xaddl %0, %1\n"
115 "movzwl %w0, %2\n\t"
116 "shrl $16, %0\n\t"
117 "1:\t"
118 "cmpl %0, %2\n\t"
119 "je 2f\n\t"
120 "rep ; nop\n\t"
121 "movzwl %1, %2\n\t"
122 /* don't need lfence here, because loads are in-order */
123 "jmp 1b\n"
124 "2:"
125 : "+r" (inc), "+m" (lock->slock), "=&r" (tmp)
126 :
127 : "memory", "cc");
128}
129
130static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
131{
132 int tmp;
133 int new;
134
135 asm volatile("movl %2,%0\n\t"
136 "movl %0,%1\n\t"
137 "roll $16, %0\n\t"
138 "cmpl %0,%1\n\t"
139 "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t"
140 "jne 1f\n\t"
141 LOCK_PREFIX "cmpxchgl %1,%2\n\t"
142 "1:"
143 "sete %b1\n\t"
144 "movzbl %b1,%0\n\t"
145 : "=&a" (tmp), "=&q" (new), "+m" (lock->slock)
146 :
147 : "memory", "cc");
148
149 return tmp;
150}
151
152static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
153{
154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
155 : "+m" (lock->slock)
156 :
157 : "memory", "cc");
158}
159#endif
160
161static inline int __ticket_spin_is_locked(raw_spinlock_t *lock)
162{
163 int tmp = ACCESS_ONCE(lock->slock);
164
165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
166}
167
168static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
169{
170 int tmp = ACCESS_ONCE(lock->slock);
171
172 return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1;
173}
174
175#ifdef CONFIG_PARAVIRT
176/*
177 * Define a virtualization-friendly old-style byte lock, for use in
178 * pv_lock_ops if desired.
179 *
180 * This differs from the pre-2.6.24 spinlock by always using xchgb
181 * rather than decb to take the lock; this allows it to use a
182 * zero-initialized lock structure. It also maintains a 1-byte
183 * contention counter, so that we can implement
184 * __byte_spin_is_contended.
185 */
186struct __byte_spinlock {
187 s8 lock;
188 s8 spinners;
189};
190
191static inline int __byte_spin_is_locked(raw_spinlock_t *lock)
192{
193 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
194 return bl->lock != 0;
195}
196
197static inline int __byte_spin_is_contended(raw_spinlock_t *lock)
198{
199 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
200 return bl->spinners != 0;
201}
202
203static inline void __byte_spin_lock(raw_spinlock_t *lock)
204{
205 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
206 s8 val = 1;
207
208 asm("1: xchgb %1, %0\n"
209 " test %1,%1\n"
210 " jz 3f\n"
211 " " LOCK_PREFIX "incb %2\n"
212 "2: rep;nop\n"
213 " cmpb $1, %0\n"
214 " je 2b\n"
215 " " LOCK_PREFIX "decb %2\n"
216 " jmp 1b\n"
217 "3:"
218 : "+m" (bl->lock), "+q" (val), "+m" (bl->spinners): : "memory");
219}
220
221static inline int __byte_spin_trylock(raw_spinlock_t *lock)
222{
223 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
224 u8 old = 1;
225
226 asm("xchgb %1,%0"
227 : "+m" (bl->lock), "+q" (old) : : "memory");
228
229 return old == 0;
230}
231
232static inline void __byte_spin_unlock(raw_spinlock_t *lock)
233{
234 struct __byte_spinlock *bl = (struct __byte_spinlock *)lock;
235 smp_wmb();
236 bl->lock = 0;
237}
238#else /* !CONFIG_PARAVIRT */
239static inline int __raw_spin_is_locked(raw_spinlock_t *lock)
240{
241 return __ticket_spin_is_locked(lock);
242}
243
244static inline int __raw_spin_is_contended(raw_spinlock_t *lock)
245{
246 return __ticket_spin_is_contended(lock);
247}
248
249static __always_inline void __raw_spin_lock(raw_spinlock_t *lock)
250{
251 __ticket_spin_lock(lock);
252}
253
254static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock)
255{
256 return __ticket_spin_trylock(lock);
257}
258
259static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock)
260{
261 __ticket_spin_unlock(lock);
262}
263
264static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock,
265 unsigned long flags)
266{
267 __raw_spin_lock(lock);
268}
269
270#endif /* CONFIG_PARAVIRT */
271
272static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
273{
274 while (__raw_spin_is_locked(lock))
275 cpu_relax();
276}
277
278/*
279 * Read-write spinlocks, allowing multiple readers
280 * but only one writer.
281 *
282 * NOTE! it is quite common to have readers in interrupts
283 * but no interrupt writers. For those circumstances we
284 * can "mix" irq-safe locks - any writer needs to get an
285 * irq-safe write-lock, but readers can get non-irqsafe
286 * read-locks.
287 *
288 * On x86, we implement read-write locks as a 32-bit counter
289 * with the high bit (sign) being the "contended" bit.
290 */
291
292/**
293 * read_can_lock - would read_trylock() succeed?
294 * @lock: the rwlock in question.
295 */
296static inline int __raw_read_can_lock(raw_rwlock_t *lock)
297{
298 return (int)(lock)->lock > 0;
299}
300
301/**
302 * write_can_lock - would write_trylock() succeed?
303 * @lock: the rwlock in question.
304 */
305static inline int __raw_write_can_lock(raw_rwlock_t *lock)
306{
307 return (lock)->lock == RW_LOCK_BIAS;
308}
309
310static inline void __raw_read_lock(raw_rwlock_t *rw)
311{
312 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
313 "jns 1f\n"
314 "call __read_lock_failed\n\t"
315 "1:\n"
316 ::LOCK_PTR_REG (rw) : "memory");
317}
318
319static inline void __raw_write_lock(raw_rwlock_t *rw)
320{
321 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
322 "jz 1f\n"
323 "call __write_lock_failed\n\t"
324 "1:\n"
325 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
326}
327
328static inline int __raw_read_trylock(raw_rwlock_t *lock)
329{
330 atomic_t *count = (atomic_t *)lock;
331
332 atomic_dec(count);
333 if (atomic_read(count) >= 0)
334 return 1;
335 atomic_inc(count);
336 return 0;
337}
338
339static inline int __raw_write_trylock(raw_rwlock_t *lock)
340{
341 atomic_t *count = (atomic_t *)lock;
342
343 if (atomic_sub_and_test(RW_LOCK_BIAS, count))
344 return 1;
345 atomic_add(RW_LOCK_BIAS, count);
346 return 0;
347}
348
349static inline void __raw_read_unlock(raw_rwlock_t *rw)
350{
351 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
352}
353
354static inline void __raw_write_unlock(raw_rwlock_t *rw)
355{
356 asm volatile(LOCK_PREFIX "addl %1, %0"
357 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
358}
359
360#define _raw_spin_relax(lock) cpu_relax()
361#define _raw_read_relax(lock) cpu_relax()
362#define _raw_write_relax(lock) cpu_relax()
363
364#endif /* _ASM_X86_SPINLOCK_H */
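
The ticket-lock scheme described in the comment block above can be sketched in
user-space C11. This is illustrative only: the kernel packs head and tail into
the single slock word and bumps the tail with one locked xadd, whereas the
sketch below keeps two separate counters for clarity, and all names are
invented.

#include <stdatomic.h>
#include <sched.h>

struct demo_ticket_lock {
	atomic_uint tail;	/* next ticket to hand out */
	atomic_uint head;	/* ticket currently being served */
};

static void demo_ticket_lock_acquire(struct demo_ticket_lock *l)
{
	unsigned int me = atomic_fetch_add(&l->tail, 1);	/* take a ticket */

	while (atomic_load(&l->head) != me)			/* spin for our turn */
		sched_yield();					/* stand-in for cpu_relax() */
}

static void demo_ticket_lock_release(struct demo_ticket_lock *l)
{
	atomic_fetch_add(&l->head, 1);				/* serve the next ticket */
}
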
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
new file mode 100644
index 000000000000..845f81c87091
--- /dev/null
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -0,0 +1,20 @@
1#ifndef _ASM_X86_SPINLOCK_TYPES_H
2#define _ASM_X86_SPINLOCK_TYPES_H
3
4#ifndef __LINUX_SPINLOCK_TYPES_H
5# error "please don't include this file directly"
6#endif
7
8typedef struct raw_spinlock {
9 unsigned int slock;
10} raw_spinlock_t;
11
12#define __RAW_SPIN_LOCK_UNLOCKED { 0 }
13
14typedef struct {
15 unsigned int lock;
16} raw_rwlock_t;
17
18#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
new file mode 100644
index 000000000000..b508d639d1a7
--- /dev/null
+++ b/arch/x86/include/asm/srat.h
@@ -0,0 +1,39 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
new file mode 100644
index 000000000000..f517944b2b17
--- /dev/null
+++ b/arch/x86/include/asm/stacktrace.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_STACKTRACE_H
2#define _ASM_X86_STACKTRACE_H
3
4extern int kstack_depth_to_print;
5
6/* Generic stack tracer with callbacks */
7
8struct stacktrace_ops {
9 void (*warning)(void *data, char *msg);
10 /* msg must contain %s for the symbol */
11 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
12 void (*address)(void *data, unsigned long address, int reliable);
13 /* On negative return stop dumping */
14 int (*stack)(void *data, char *name);
15};
16
17void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
18 unsigned long *stack, unsigned long bp,
19 const struct stacktrace_ops *ops, void *data);
20
21#endif /* _ASM_X86_STACKTRACE_H */
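
The callback interface above is easiest to see with a small consumer. The
sketch below is illustrative: only the struct stacktrace_ops layout and the
dump_trace() prototype come from this header, the callback bodies and the call
site in the trailing comment are hypothetical.

#include <linux/kernel.h>
#include <asm/stacktrace.h>

static void demo_warning(void *data, char *msg)
{
	printk(KERN_WARNING "%s\n", msg);
}

static void demo_warning_symbol(void *data, char *msg, unsigned long symbol)
{
	printk(KERN_WARNING "warning at %lx\n", symbol);
}

static int demo_stack(void *data, char *name)
{
	if (name)
		printk(KERN_INFO " <%s>\n", name);
	return 0;			/* non-negative: keep dumping */
}

static void demo_address(void *data, unsigned long address, int reliable)
{
	printk(KERN_INFO " [<%016lx>]%s\n", address, reliable ? "" : " ?");
}

static const struct stacktrace_ops demo_ops = {
	.warning	= demo_warning,
	.warning_symbol	= demo_warning_symbol,
	.stack		= demo_stack,
	.address	= demo_address,
};

/* e.g. from an exception path: dump_trace(current, regs, NULL, 0, &demo_ops, NULL); */
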
diff --git a/arch/x86/include/asm/stat.h b/arch/x86/include/asm/stat.h
new file mode 100644
index 000000000000..e0b1d9bbcbc6
--- /dev/null
+++ b/arch/x86/include/asm/stat.h
@@ -0,0 +1,114 @@
1#ifndef _ASM_X86_STAT_H
2#define _ASM_X86_STAT_H
3
4#define STAT_HAVE_NSEC 1
5
6#ifdef __i386__
7struct stat {
8 unsigned long st_dev;
9 unsigned long st_ino;
10 unsigned short st_mode;
11 unsigned short st_nlink;
12 unsigned short st_uid;
13 unsigned short st_gid;
14 unsigned long st_rdev;
15 unsigned long st_size;
16 unsigned long st_blksize;
17 unsigned long st_blocks;
18 unsigned long st_atime;
19 unsigned long st_atime_nsec;
20 unsigned long st_mtime;
21 unsigned long st_mtime_nsec;
22 unsigned long st_ctime;
23 unsigned long st_ctime_nsec;
24 unsigned long __unused4;
25 unsigned long __unused5;
26};
27
28#define STAT64_HAS_BROKEN_ST_INO 1
29
30/* This matches struct stat64 in glibc2.1, hence the absolutely
31 * insane amounts of padding around dev_t's.
32 */
33struct stat64 {
34 unsigned long long st_dev;
35 unsigned char __pad0[4];
36
37 unsigned long __st_ino;
38
39 unsigned int st_mode;
40 unsigned int st_nlink;
41
42 unsigned long st_uid;
43 unsigned long st_gid;
44
45 unsigned long long st_rdev;
46 unsigned char __pad3[4];
47
48 long long st_size;
49 unsigned long st_blksize;
50
51 /* Number 512-byte blocks allocated. */
52 unsigned long long st_blocks;
53
54 unsigned long st_atime;
55 unsigned long st_atime_nsec;
56
57 unsigned long st_mtime;
58 unsigned int st_mtime_nsec;
59
60 unsigned long st_ctime;
61 unsigned long st_ctime_nsec;
62
63 unsigned long long st_ino;
64};
65
66#else /* __i386__ */
67
68struct stat {
69 unsigned long st_dev;
70 unsigned long st_ino;
71 unsigned long st_nlink;
72
73 unsigned int st_mode;
74 unsigned int st_uid;
75 unsigned int st_gid;
76 unsigned int __pad0;
77 unsigned long st_rdev;
78 long st_size;
79 long st_blksize;
80 long st_blocks; /* Number 512-byte blocks allocated. */
81
82 unsigned long st_atime;
83 unsigned long st_atime_nsec;
84 unsigned long st_mtime;
85 unsigned long st_mtime_nsec;
86 unsigned long st_ctime;
87 unsigned long st_ctime_nsec;
88 long __unused[3];
89};
90#endif
91
92/* for 32bit emulation and 32 bit kernels */
93struct __old_kernel_stat {
94 unsigned short st_dev;
95 unsigned short st_ino;
96 unsigned short st_mode;
97 unsigned short st_nlink;
98 unsigned short st_uid;
99 unsigned short st_gid;
100 unsigned short st_rdev;
101#ifdef __i386__
102 unsigned long st_size;
103 unsigned long st_atime;
104 unsigned long st_mtime;
105 unsigned long st_ctime;
106#else
107 unsigned int st_size;
108 unsigned int st_atime;
109 unsigned int st_mtime;
110 unsigned int st_ctime;
111#endif
112};
113
114#endif /* _ASM_X86_STAT_H */
diff --git a/arch/x86/include/asm/statfs.h b/arch/x86/include/asm/statfs.h
new file mode 100644
index 000000000000..2d0adbf99a8e
--- /dev/null
+++ b/arch/x86/include/asm/statfs.h
@@ -0,0 +1,12 @@
1#ifndef _ASM_X86_STATFS_H
2#define _ASM_X86_STATFS_H
3
4/*
5 * We need compat_statfs64 to be packed, because the i386 ABI won't
6 * add padding at the end to bring it to a multiple of 8 bytes, but
7 * the x86_64 ABI will.
8 */
9#define ARCH_PACK_COMPAT_STATFS64 __attribute__((packed,aligned(4)))
10
11#include <asm-generic/statfs.h>
12#endif /* _ASM_X86_STATFS_H */
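
The padding difference the comment describes can be seen with a two-member
struct. Illustrative only (demo types, not kernel structures): on i386 a
long long is 4-byte aligned, so the plain struct is 12 bytes with no tail
padding, while on x86-64 it grows to 16 because the ABI pads the tail to a
multiple of 8 -- exactly the growth the packed,aligned(4) attribute suppresses.

#include <stdio.h>

struct demo_plain  { long long blocks; unsigned int flags; };
struct demo_packed { long long blocks; unsigned int flags; }
	__attribute__((packed, aligned(4)));

int main(void)
{
	printf("plain=%zu packed=%zu\n",
	       sizeof(struct demo_plain), sizeof(struct demo_packed));
	return 0;
}
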
diff --git a/arch/x86/include/asm/string.h b/arch/x86/include/asm/string.h
new file mode 100644
index 000000000000..6dfd6d9373a0
--- /dev/null
+++ b/arch/x86/include/asm/string.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "string_32.h"
3#else
4# include "string_64.h"
5#endif
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
new file mode 100644
index 000000000000..0e0e3ba827f7
--- /dev/null
+++ b/arch/x86/include/asm/string_32.h
@@ -0,0 +1,326 @@
1#ifndef _ASM_X86_STRING_32_H
2#define _ASM_X86_STRING_32_H
3
4#ifdef __KERNEL__
5
6/* Let gcc decide whether to inline or use the out of line functions */
7
8#define __HAVE_ARCH_STRCPY
9extern char *strcpy(char *dest, const char *src);
10
11#define __HAVE_ARCH_STRNCPY
12extern char *strncpy(char *dest, const char *src, size_t count);
13
14#define __HAVE_ARCH_STRCAT
15extern char *strcat(char *dest, const char *src);
16
17#define __HAVE_ARCH_STRNCAT
18extern char *strncat(char *dest, const char *src, size_t count);
19
20#define __HAVE_ARCH_STRCMP
21extern int strcmp(const char *cs, const char *ct);
22
23#define __HAVE_ARCH_STRNCMP
24extern int strncmp(const char *cs, const char *ct, size_t count);
25
26#define __HAVE_ARCH_STRCHR
27extern char *strchr(const char *s, int c);
28
29#define __HAVE_ARCH_STRLEN
30extern size_t strlen(const char *s);
31
32static __always_inline void *__memcpy(void *to, const void *from, size_t n)
33{
34 int d0, d1, d2;
35 asm volatile("rep ; movsl\n\t"
36 "movl %4,%%ecx\n\t"
37 "andl $3,%%ecx\n\t"
38 "jz 1f\n\t"
39 "rep ; movsb\n\t"
40 "1:"
41 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
42 : "0" (n / 4), "g" (n), "1" ((long)to), "2" ((long)from)
43 : "memory");
44 return to;
45}
46
47/*
48 * This looks ugly, but the compiler can optimize it totally,
49 * as the count is constant.
50 */
51static __always_inline void *__constant_memcpy(void *to, const void *from,
52 size_t n)
53{
54 long esi, edi;
55 if (!n)
56 return to;
57
58 switch (n) {
59 case 1:
60 *(char *)to = *(char *)from;
61 return to;
62 case 2:
63 *(short *)to = *(short *)from;
64 return to;
65 case 4:
66 *(int *)to = *(int *)from;
67 return to;
68
69 case 3:
70 *(short *)to = *(short *)from;
71 *((char *)to + 2) = *((char *)from + 2);
72 return to;
73 case 5:
74 *(int *)to = *(int *)from;
75 *((char *)to + 4) = *((char *)from + 4);
76 return to;
77 case 6:
78 *(int *)to = *(int *)from;
79 *((short *)to + 2) = *((short *)from + 2);
80 return to;
81 case 8:
82 *(int *)to = *(int *)from;
83 *((int *)to + 1) = *((int *)from + 1);
84 return to;
85 }
86
87 esi = (long)from;
88 edi = (long)to;
89 if (n >= 5 * 4) {
90 /* large block: use rep prefix */
91 int ecx;
92 asm volatile("rep ; movsl"
93 : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
94 : "0" (n / 4), "1" (edi), "2" (esi)
95 : "memory"
96 );
97 } else {
98 /* small block: don't clobber ecx + smaller code */
99 if (n >= 4 * 4)
100 asm volatile("movsl"
101 : "=&D"(edi), "=&S"(esi)
102 : "0"(edi), "1"(esi)
103 : "memory");
104 if (n >= 3 * 4)
105 asm volatile("movsl"
106 : "=&D"(edi), "=&S"(esi)
107 : "0"(edi), "1"(esi)
108 : "memory");
109 if (n >= 2 * 4)
110 asm volatile("movsl"
111 : "=&D"(edi), "=&S"(esi)
112 : "0"(edi), "1"(esi)
113 : "memory");
114 if (n >= 1 * 4)
115 asm volatile("movsl"
116 : "=&D"(edi), "=&S"(esi)
117 : "0"(edi), "1"(esi)
118 : "memory");
119 }
120 switch (n % 4) {
121 /* tail */
122 case 0:
123 return to;
124 case 1:
125 asm volatile("movsb"
126 : "=&D"(edi), "=&S"(esi)
127 : "0"(edi), "1"(esi)
128 : "memory");
129 return to;
130 case 2:
131 asm volatile("movsw"
132 : "=&D"(edi), "=&S"(esi)
133 : "0"(edi), "1"(esi)
134 : "memory");
135 return to;
136 default:
137 asm volatile("movsw\n\tmovsb"
138 : "=&D"(edi), "=&S"(esi)
139 : "0"(edi), "1"(esi)
140 : "memory");
141 return to;
142 }
143}
144
145#define __HAVE_ARCH_MEMCPY
146
147#ifdef CONFIG_X86_USE_3DNOW
148
149#include <asm/mmx.h>
150
151/*
152 * This CPU favours 3DNow strongly (eg AMD Athlon)
153 */
154
155static inline void *__constant_memcpy3d(void *to, const void *from, size_t len)
156{
157 if (len < 512)
158 return __constant_memcpy(to, from, len);
159 return _mmx_memcpy(to, from, len);
160}
161
162static inline void *__memcpy3d(void *to, const void *from, size_t len)
163{
164 if (len < 512)
165 return __memcpy(to, from, len);
166 return _mmx_memcpy(to, from, len);
167}
168
169#define memcpy(t, f, n) \
170 (__builtin_constant_p((n)) \
171 ? __constant_memcpy3d((t), (f), (n)) \
172 : __memcpy3d((t), (f), (n)))
173
174#else
175
176/*
177 * No 3D Now!
178 */
179
180#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n)))
184
185#endif
186
187#define __HAVE_ARCH_MEMMOVE
188void *memmove(void *dest, const void *src, size_t n);
189
190#define memcmp __builtin_memcmp
191
192#define __HAVE_ARCH_MEMCHR
193extern void *memchr(const void *cs, int c, size_t count);
194
195static inline void *__memset_generic(void *s, char c, size_t count)
196{
197 int d0, d1;
198 asm volatile("rep\n\t"
199 "stosb"
200 : "=&c" (d0), "=&D" (d1)
201 : "a" (c), "1" (s), "0" (count)
202 : "memory");
203 return s;
204}
205
206/* we might want to write optimized versions of these later */
207#define __constant_count_memset(s, c, count) __memset_generic((s), (c), (count))
208
209/*
210 * memset(x, 0, y) is a reasonably common thing to do, so we want to fill
211 * things 32 bits at a time even when we don't know the size of the
212 * area at compile-time..
213 */
214static __always_inline
215void *__constant_c_memset(void *s, unsigned long c, size_t count)
216{
217 int d0, d1;
218 asm volatile("rep ; stosl\n\t"
219 "testb $2,%b3\n\t"
220 "je 1f\n\t"
221 "stosw\n"
222 "1:\ttestb $1,%b3\n\t"
223 "je 2f\n\t"
224 "stosb\n"
225 "2:"
226 : "=&c" (d0), "=&D" (d1)
227 : "a" (c), "q" (count), "0" (count/4), "1" ((long)s)
228 : "memory");
229 return s;
230}
231
232/* Added by Gertjan van Wingerde to make minix and sysv modules work */
233#define __HAVE_ARCH_STRNLEN
234extern size_t strnlen(const char *s, size_t count);
235/* end of additional stuff */
236
237#define __HAVE_ARCH_STRSTR
238extern char *strstr(const char *cs, const char *ct);
239
240/*
241 * This looks horribly ugly, but the compiler can optimize it totally,
242 * as by now we know that both pattern and count are constant.
243 */
244static __always_inline
245void *__constant_c_and_count_memset(void *s, unsigned long pattern,
246 size_t count)
247{
248 switch (count) {
249 case 0:
250 return s;
251 case 1:
252 *(unsigned char *)s = pattern & 0xff;
253 return s;
254 case 2:
255 *(unsigned short *)s = pattern & 0xffff;
256 return s;
257 case 3:
258 *(unsigned short *)s = pattern & 0xffff;
259 *((unsigned char *)s + 2) = pattern & 0xff;
260 return s;
261 case 4:
262 *(unsigned long *)s = pattern;
263 return s;
264 }
265
266#define COMMON(x) \
267 asm volatile("rep ; stosl" \
268 x \
269 : "=&c" (d0), "=&D" (d1) \
270 : "a" (eax), "0" (count/4), "1" ((long)s) \
271 : "memory")
272
273 {
274 int d0, d1;
275#if __GNUC__ == 4 && __GNUC_MINOR__ == 0
276 /* Workaround for broken gcc 4.0 */
277 register unsigned long eax asm("%eax") = pattern;
278#else
279 unsigned long eax = pattern;
280#endif
281
282 switch (count % 4) {
283 case 0:
284 COMMON("");
285 return s;
286 case 1:
287 COMMON("\n\tstosb");
288 return s;
289 case 2:
290 COMMON("\n\tstosw");
291 return s;
292 default:
293 COMMON("\n\tstosw\n\tstosb");
294 return s;
295 }
296 }
297
298#undef COMMON
299}
300
301#define __constant_c_x_memset(s, c, count) \
302 (__builtin_constant_p(count) \
303 ? __constant_c_and_count_memset((s), (c), (count)) \
304 : __constant_c_memset((s), (c), (count)))
305
306#define __memset(s, c, count) \
307 (__builtin_constant_p(count) \
308 ? __constant_count_memset((s), (c), (count)) \
309 : __memset_generic((s), (c), (count)))
310
311#define __HAVE_ARCH_MEMSET
312#define memset(s, c, count) \
313 (__builtin_constant_p(c) \
314 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
315 (count)) \
316 : __memset((s), (c), (count)))
317
318/*
319 * find the first occurrence of byte 'c', or 1 past the area if none
320 */
321#define __HAVE_ARCH_MEMSCAN
322extern void *memscan(void *addr, int c, size_t size);
323
324#endif /* __KERNEL__ */
325
326#endif /* _ASM_X86_STRING_32_H */
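
The memset() macro above folds a constant fill byte into a 32-bit pattern
(0x01010101UL * c) and lets __builtin_constant_p() pick a specialised path at
compile time. Below is a user-space sketch of the same dispatch idea with
invented names; the kernel versions above are the real thing.

#include <stddef.h>
#include <string.h>

/* out-of-line fallback for a fill byte that is not a compile-time constant */
static void *demo_fill_var(void *s, int c, size_t n)
{
	return memset(s, c, n);
}

/* word-at-a-time fill once the byte has been replicated into a 32-bit pattern */
static void *demo_fill_pattern(void *s, unsigned int pat, size_t n)
{
	unsigned int *p = s;
	size_t i;

	for (i = 0; i + sizeof(pat) <= n; i += sizeof(pat))
		*p++ = pat;
	memset((char *)s + i, (int)(pat & 0xff), n - i);	/* byte tail */
	return s;
}

#define demo_fill(s, c, n)						\
	(__builtin_constant_p(c)					\
	 ? demo_fill_pattern((s), 0x01010101U * (unsigned char)(c), (n)) \
	 : demo_fill_var((s), (c), (n)))
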
diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
new file mode 100644
index 000000000000..2afe164bf1e6
--- /dev/null
+++ b/arch/x86/include/asm/string_64.h
@@ -0,0 +1,60 @@
1#ifndef _ASM_X86_STRING_64_H
2#define _ASM_X86_STRING_64_H
3
4#ifdef __KERNEL__
5
6/* Written 2002 by Andi Kleen */
7
8/* Only used for special circumstances. Stolen from i386/string.h */
9static __always_inline void *__inline_memcpy(void *to, const void *from, size_t n)
10{
11 unsigned long d0, d1, d2;
12 asm volatile("rep ; movsl\n\t"
13 "testb $2,%b4\n\t"
14 "je 1f\n\t"
15 "movsw\n"
16 "1:\ttestb $1,%b4\n\t"
17 "je 2f\n\t"
18 "movsb\n"
19 "2:"
20 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
21 : "0" (n / 4), "q" (n), "1" ((long)to), "2" ((long)from)
22 : "memory");
23 return to;
24}
25
26/* Even with __builtin_ the compiler may decide to use the out of line
27 function. */
28
29#define __HAVE_ARCH_MEMCPY 1
30#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
31extern void *memcpy(void *to, const void *from, size_t len);
32#else
33extern void *__memcpy(void *to, const void *from, size_t len);
34#define memcpy(dst, src, len) \
35({ \
36 size_t __len = (len); \
37 void *__ret; \
38 if (__builtin_constant_p(len) && __len >= 64) \
39 __ret = __memcpy((dst), (src), __len); \
40 else \
41 __ret = __builtin_memcpy((dst), (src), __len); \
42 __ret; \
43})
44#endif
45
46#define __HAVE_ARCH_MEMSET
47void *memset(void *s, int c, size_t n);
48
49#define __HAVE_ARCH_MEMMOVE
50void *memmove(void *dest, const void *src, size_t count);
51
52int memcmp(const void *cs, const void *ct, size_t count);
53size_t strlen(const char *s);
54char *strcpy(char *dest, const char *src);
55char *strcat(char *dest, const char *src);
56int strcmp(const char *cs, const char *ct);
57
58#endif /* __KERNEL__ */
59
60#endif /* _ASM_X86_STRING_64_H */
diff --git a/arch/x86/include/asm/summit/apic.h b/arch/x86/include/asm/summit/apic.h
new file mode 100644
index 000000000000..9b3070f1c2ac
--- /dev/null
+++ b/arch/x86/include/asm/summit/apic.h
@@ -0,0 +1,184 @@
1#ifndef __ASM_SUMMIT_APIC_H
2#define __ASM_SUMMIT_APIC_H
3
4#include <asm/smp.h>
5
6#define esr_disable (1)
7#define NO_BALANCE_IRQ (0)
8
9/* In clustered mode, the high nibble of APIC ID is a cluster number.
10 * The low nibble is a 4-bit bitmap. */
11#define XAPIC_DEST_CPUS_SHIFT 4
12#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
13#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
14
15#define APIC_DFR_VALUE (APIC_DFR_CLUSTER)
16
17static inline cpumask_t target_cpus(void)
18{
19 /* CPU_MASK_ALL (0xff) has undefined behaviour with
20 * dest_LowestPrio mode logical clustered apic interrupt routing.
21 * Just start on cpu 0. IRQ balancing will spread the load.
22 */
23 return cpumask_of_cpu(0);
24}
25
26#define INT_DELIVERY_MODE (dest_LowestPrio)
27#define INT_DEST_MODE 1 /* logical delivery broadcast to all procs */
28
29static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid)
30{
31 return 0;
32}
33
34/* we don't use the phys_cpu_present_map to indicate apicid presence */
35static inline unsigned long check_apicid_present(int bit)
36{
37 return 1;
38}
39
40#define apicid_cluster(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK)
41
42extern u8 cpu_2_logical_apicid[];
43
44static inline void init_apic_ldr(void)
45{
46 unsigned long val, id;
47 int count = 0;
48 u8 my_id = (u8)hard_smp_processor_id();
49 u8 my_cluster = (u8)apicid_cluster(my_id);
50#ifdef CONFIG_SMP
51 u8 lid;
52 int i;
53
54 /* Create logical APIC IDs by counting CPUs already in cluster. */
55 for (count = 0, i = NR_CPUS; --i >= 0; ) {
56 lid = cpu_2_logical_apicid[i];
57 if (lid != BAD_APICID && apicid_cluster(lid) == my_cluster)
58 ++count;
59 }
60#endif
61 /* We only have a 4-bit-wide bitmap in cluster mode. If a deranged
62 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
63 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
64 id = my_cluster | (1UL << count);
65 apic_write(APIC_DFR, APIC_DFR_VALUE);
66 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
67 val |= SET_APIC_LOGICAL_ID(id);
68 apic_write(APIC_LDR, val);
69}
70
71static inline int multi_timer_check(int apic, int irq)
72{
73 return 0;
74}
75
76static inline int apic_id_registered(void)
77{
78 return 1;
79}
80
81static inline void setup_apic_routing(void)
82{
83 printk("Enabling APIC mode: Summit. Using %d I/O APICs\n",
84 nr_ioapics);
85}
86
87static inline int apicid_to_node(int logical_apicid)
88{
89#ifdef CONFIG_SMP
90 return apicid_2_node[hard_smp_processor_id()];
91#else
92 return 0;
93#endif
94}
95
96/* Mapping from cpu number to logical apicid */
97static inline int cpu_to_logical_apicid(int cpu)
98{
99#ifdef CONFIG_SMP
100 if (cpu >= NR_CPUS)
101 return BAD_APICID;
102 return (int)cpu_2_logical_apicid[cpu];
103#else
104 return logical_smp_processor_id();
105#endif
106}
107
108static inline int cpu_present_to_apicid(int mps_cpu)
109{
110 if (mps_cpu < NR_CPUS)
111 return (int)per_cpu(x86_bios_cpu_apicid, mps_cpu);
112 else
113 return BAD_APICID;
114}
115
116static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_id_map)
117{
118 /* For clustered mode we don't have a good way to do this yet - hack */
119 return physids_promote(0x0F);
120}
121
122static inline physid_mask_t apicid_to_cpu_present(int apicid)
123{
124 return physid_mask_of_physid(0);
125}
126
127static inline void setup_portio_remap(void)
128{
129}
130
131static inline int check_phys_apicid_present(int boot_cpu_physical_apicid)
132{
133 return 1;
134}
135
136static inline void enable_apic_mode(void)
137{
138}
139
140static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask)
141{
142 int num_bits_set;
143 int cpus_found = 0;
144 int cpu;
145 int apicid;
146
147 num_bits_set = cpus_weight(cpumask);
148 /* Return id to all */
149 if (num_bits_set == NR_CPUS)
150 return (int) 0xFF;
151 /*
152 * The cpus in the mask must all be in the same APIC cluster. If they are
153 * not on the same apicid cluster, return the default value of TARGET_CPUS.
154 */
155 cpu = first_cpu(cpumask);
156 apicid = cpu_to_logical_apicid(cpu);
157 while (cpus_found < num_bits_set) {
158 if (cpu_isset(cpu, cpumask)) {
159 int new_apicid = cpu_to_logical_apicid(cpu);
160 if (apicid_cluster(apicid) !=
161 apicid_cluster(new_apicid)){
162 printk ("%s: Not a valid mask!\n", __func__);
163 return 0xFF;
164 }
165 apicid = apicid | new_apicid;
166 cpus_found++;
167 }
168 cpu++;
169 }
170 return apicid;
171}
172
173/* cpuid returns the value latched in the HW at reset, not the APIC ID
174 * register's value. For any box whose BIOS changes APIC IDs, like
175 * clustered APIC systems, we must use hard_smp_processor_id.
176 *
177 * See Intel's IA-32 SW Dev's Manual Vol2 under CPUID.
178 */
179static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb)
180{
181 return hard_smp_processor_id() >> index_msb;
182}
183
184#endif /* __ASM_SUMMIT_APIC_H */
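
init_apic_ldr() and cpu_mask_to_apicid() above both rely on the clustered
encoding stated at the top of the file: the high nibble of an APIC ID selects
the cluster and the low nibble is a one-hot bitmap of CPUs within it. A short
stand-alone illustration of that arithmetic (the values are made up):

#include <stdio.h>

#define DEMO_CPUS_SHIFT		4
#define DEMO_CPUS_MASK		((1u << DEMO_CPUS_SHIFT) - 1)		/* 0x0f */
#define DEMO_CLUSTER_MASK	(DEMO_CPUS_MASK << DEMO_CPUS_SHIFT)	/* 0xf0 */

static unsigned int demo_cluster(unsigned int apicid)
{
	return apicid & DEMO_CLUSTER_MASK;
}

int main(void)
{
	/* third CPU (count == 2) joining the 0x20 cluster gets logical ID 0x24 */
	unsigned int my_cluster = demo_cluster(0x22);
	unsigned int count = 2;

	printf("logical id = 0x%02x\n", my_cluster | (1u << count));
	return 0;
}
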
diff --git a/arch/x86/include/asm/summit/apicdef.h b/arch/x86/include/asm/summit/apicdef.h
new file mode 100644
index 000000000000..f3fbca1f61c1
--- /dev/null
+++ b/arch/x86/include/asm/summit/apicdef.h
@@ -0,0 +1,13 @@
1#ifndef __ASM_SUMMIT_APICDEF_H
2#define __ASM_SUMMIT_APICDEF_H
3
4#define APIC_ID_MASK (0xFF<<24)
5
6static inline unsigned get_apic_id(unsigned long x)
7{
8 return (x>>24)&0xFF;
9}
10
11#define GET_APIC_ID(x) get_apic_id(x)
12
13#endif
diff --git a/arch/x86/include/asm/summit/ipi.h b/arch/x86/include/asm/summit/ipi.h
new file mode 100644
index 000000000000..53bd1e7bd7b4
--- /dev/null
+++ b/arch/x86/include/asm/summit/ipi.h
@@ -0,0 +1,25 @@
1#ifndef __ASM_SUMMIT_IPI_H
2#define __ASM_SUMMIT_IPI_H
3
4void send_IPI_mask_sequence(cpumask_t mask, int vector);
5
6static inline void send_IPI_mask(cpumask_t mask, int vector)
7{
8 send_IPI_mask_sequence(mask, vector);
9}
10
11static inline void send_IPI_allbutself(int vector)
12{
13 cpumask_t mask = cpu_online_map;
14 cpu_clear(smp_processor_id(), mask);
15
16 if (!cpus_empty(mask))
17 send_IPI_mask(mask, vector);
18}
19
20static inline void send_IPI_all(int vector)
21{
22 send_IPI_mask(cpu_online_map, vector);
23}
24
25#endif /* __ASM_SUMMIT_IPI_H */
diff --git a/arch/x86/include/asm/summit/mpparse.h b/arch/x86/include/asm/summit/mpparse.h
new file mode 100644
index 000000000000..013ce6fab2d5
--- /dev/null
+++ b/arch/x86/include/asm/summit/mpparse.h
@@ -0,0 +1,109 @@
1#ifndef __ASM_SUMMIT_MPPARSE_H
2#define __ASM_SUMMIT_MPPARSE_H
3
4#include <asm/tsc.h>
5
6extern int use_cyclone;
7
8#ifdef CONFIG_X86_SUMMIT_NUMA
9extern void setup_summit(void);
10#else
11#define setup_summit() {}
12#endif
13
14static inline int mps_oem_check(struct mp_config_table *mpc, char *oem,
15 char *productid)
16{
17 if (!strncmp(oem, "IBM ENSW", 8) &&
18 (!strncmp(productid, "VIGIL SMP", 9)
19 || !strncmp(productid, "EXA", 3)
20 || !strncmp(productid, "RUTHLESS SMP", 12))){
21 mark_tsc_unstable("Summit based system");
22 use_cyclone = 1; /*enable cyclone-timer*/
23 setup_summit();
24 return 1;
25 }
26 return 0;
27}
28
29/* Hook from generic ACPI tables.c */
30static inline int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
31{
32 if (!strncmp(oem_id, "IBM", 3) &&
33 (!strncmp(oem_table_id, "SERVIGIL", 8)
34 || !strncmp(oem_table_id, "EXA", 3))){
35 mark_tsc_unstable("Summit based system");
36 use_cyclone = 1; /*enable cyclone-timer*/
37 setup_summit();
38 return 1;
39 }
40 return 0;
41}
42
43struct rio_table_hdr {
44 unsigned char version; /* Version number of this data structure */
45 /* Version 3 adds chassis_num & WP_index */
46 unsigned char num_scal_dev; /* # of Scalability devices (Twisters for Vigil) */
47 unsigned char num_rio_dev; /* # of RIO I/O devices (Cyclones and Winnipegs) */
48} __attribute__((packed));
49
50struct scal_detail {
51 unsigned char node_id; /* Scalability Node ID */
52 unsigned long CBAR; /* Address of 1MB register space */
53 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
54 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
55 unsigned char port1node; /* Node ID port connected to: 0xFF = None */
56 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
57 unsigned char port2node; /* Node ID port connected to: 0xFF = None */
58 unsigned char port2port; /* Port num port connected to: 0,1,2, or 0xFF=None */
59 unsigned char chassis_num; /* 1 based Chassis number (1 = boot node) */
60} __attribute__((packed));
61
62struct rio_detail {
63 unsigned char node_id; /* RIO Node ID */
64 unsigned long BBAR; /* Address of 1MB register space */
65 unsigned char type; /* Type of device */
66 unsigned char owner_id; /* For WPEG: Node ID of Cyclone that owns this WPEG*/
67 /* For CYC: Node ID of Twister that owns this CYC */
68 unsigned char port0node; /* Node ID port connected to: 0xFF=None */
69 unsigned char port0port; /* Port num port connected to: 0,1,2, or 0xFF=None */
70 unsigned char port1node; /* Node ID port connected to: 0xFF=None */
71 unsigned char port1port; /* Port num port connected to: 0,1,2, or 0xFF=None */
72 unsigned char first_slot; /* For WPEG: Lowest slot number below this WPEG */
73 /* For CYC: 0 */
74 unsigned char status; /* For WPEG: Bit 0 = 1 : the XAPIC is used */
75 /* = 0 : the XAPIC is not used, ie:*/
76 /* ints fwded to another XAPIC */
77 /* Bits1:7 Reserved */
78 /* For CYC: Bits0:7 Reserved */
79 unsigned char WP_index; /* For WPEG: WPEG instance index - lower ones have */
80 /* lower slot numbers/PCI bus numbers */
81 /* For CYC: No meaning */
82 unsigned char chassis_num; /* 1 based Chassis number */
83 /* For LookOut WPEGs this field indicates the */
84 /* Expansion Chassis #, enumerated from Boot */
85 /* Node WPEG external port, then Boot Node CYC */
86 /* external port, then Next Vigil chassis WPEG */
87 /* external port, etc. */
88 /* Shared Lookouts have only 1 chassis number (the */
89 /* first one assigned) */
90} __attribute__((packed));
91
92
93typedef enum {
94 CompatTwister = 0, /* Compatibility Twister */
95 AltTwister = 1, /* Alternate Twister of internal 8-way */
96 CompatCyclone = 2, /* Compatibility Cyclone */
97 AltCyclone = 3, /* Alternate Cyclone of internal 8-way */
98 CompatWPEG = 4, /* Compatibility WPEG */
99 AltWPEG = 5, /* Second Planar WPEG */
100 LookOutAWPEG = 6, /* LookOut WPEG */
101 LookOutBWPEG = 7, /* LookOut WPEG */
102} node_type;
103
104static inline int is_WPEG(struct rio_detail *rio){
105 return (rio->type == CompatWPEG || rio->type == AltWPEG ||
106 rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
107}
108
109#endif /* __ASM_SUMMIT_MPPARSE_H */
diff --git a/arch/x86/include/asm/suspend.h b/arch/x86/include/asm/suspend.h
new file mode 100644
index 000000000000..9bd521fe4570
--- /dev/null
+++ b/arch/x86/include/asm/suspend.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "suspend_32.h"
3#else
4# include "suspend_64.h"
5#endif
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
new file mode 100644
index 000000000000..a5074bd0f8be
--- /dev/null
+++ b/arch/x86/include/asm/suspend_32.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright 2001-2002 Pavel Machek <pavel@suse.cz>
3 * Based on code
4 * Copyright 2001 Patrick Mochel <mochel@osdl.org>
5 */
6#ifndef _ASM_X86_SUSPEND_32_H
7#define _ASM_X86_SUSPEND_32_H
8
9#include <asm/desc.h>
10#include <asm/i387.h>
11
12static inline int arch_prepare_suspend(void) { return 0; }
13
14/* image of the saved processor state */
15struct saved_context {
16 u16 es, fs, gs, ss;
17 unsigned long cr0, cr2, cr3, cr4;
18 struct desc_ptr gdt;
19 struct desc_ptr idt;
20 u16 ldt;
21 u16 tss;
22 unsigned long tr;
23 unsigned long safety;
24 unsigned long return_address;
25} __attribute__((packed));
26
27#ifdef CONFIG_ACPI
28extern unsigned long saved_eip;
29extern unsigned long saved_esp;
30extern unsigned long saved_ebp;
31extern unsigned long saved_ebx;
32extern unsigned long saved_esi;
33extern unsigned long saved_edi;
34
35static inline void acpi_save_register_state(unsigned long return_point)
36{
37 saved_eip = return_point;
38 asm volatile("movl %%esp,%0" : "=m" (saved_esp));
39 asm volatile("movl %%ebp,%0" : "=m" (saved_ebp));
40 asm volatile("movl %%ebx,%0" : "=m" (saved_ebx));
41 asm volatile("movl %%edi,%0" : "=m" (saved_edi));
42 asm volatile("movl %%esi,%0" : "=m" (saved_esi));
43}
44
45#define acpi_restore_register_state() do {} while (0)
46
47/* routines for saving/restoring kernel state */
48extern int acpi_save_state_mem(void);
49#endif
50
51#endif /* _ASM_X86_SUSPEND_32_H */
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
new file mode 100644
index 000000000000..06284f42b759
--- /dev/null
+++ b/arch/x86/include/asm/suspend_64.h
@@ -0,0 +1,52 @@
1/*
2 * Copyright 2001-2003 Pavel Machek <pavel@suse.cz>
3 * Based on code
4 * Copyright 2001 Patrick Mochel <mochel@osdl.org>
5 */
6#ifndef _ASM_X86_SUSPEND_64_H
7#define _ASM_X86_SUSPEND_64_H
8
9#include <asm/desc.h>
10#include <asm/i387.h>
11
12static inline int arch_prepare_suspend(void)
13{
14 return 0;
15}
16
17/*
18 * Image of the saved processor state, used by the low level ACPI suspend to
19 * RAM code and by the low level hibernation code.
20 *
21 * If you modify it, fix arch/x86/kernel/acpi/wakeup_64.S and make sure that
22 * __save/__restore_processor_state(), defined in arch/x86/kernel/suspend_64.c,
23 * still work as required.
24 */
25struct saved_context {
26 struct pt_regs regs;
27 u16 ds, es, fs, gs, ss;
28 unsigned long gs_base, gs_kernel_base, fs_base;
29 unsigned long cr0, cr2, cr3, cr4, cr8;
30 unsigned long efer;
31 u16 gdt_pad;
32 u16 gdt_limit;
33 unsigned long gdt_base;
34 u16 idt_pad;
35 u16 idt_limit;
36 unsigned long idt_base;
37 u16 ldt;
38 u16 tss;
39 unsigned long tr;
40 unsigned long safety;
41 unsigned long return_address;
42} __attribute__((packed));
43
44#define loaddebug(thread,register) \
45 set_debugreg((thread)->debugreg##register, register)
46
47/* routines for saving/restoring kernel state */
48extern int acpi_save_state_mem(void);
49extern char core_restore_code;
50extern char restore_registers;
51
52#endif /* _ASM_X86_SUSPEND_64_H */
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
new file mode 100644
index 000000000000..51fb2c76ad74
--- /dev/null
+++ b/arch/x86/include/asm/swiotlb.h
@@ -0,0 +1,58 @@
1#ifndef _ASM_X86_SWIOTLB_H
2#define _ASM_X86_SWIOTLB_H
3
4#include <asm/dma-mapping.h>
5
6/* SWIOTLB interface */
7
8extern dma_addr_t swiotlb_map_single(struct device *hwdev, void *ptr,
9 size_t size, int dir);
10extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size,
11 dma_addr_t *dma_handle, gfp_t flags);
12extern void swiotlb_unmap_single(struct device *hwdev, dma_addr_t dev_addr,
13 size_t size, int dir);
14extern void swiotlb_sync_single_for_cpu(struct device *hwdev,
15 dma_addr_t dev_addr,
16 size_t size, int dir);
17extern void swiotlb_sync_single_for_device(struct device *hwdev,
18 dma_addr_t dev_addr,
19 size_t size, int dir);
20extern void swiotlb_sync_single_range_for_cpu(struct device *hwdev,
21 dma_addr_t dev_addr,
22 unsigned long offset,
23 size_t size, int dir);
24extern void swiotlb_sync_single_range_for_device(struct device *hwdev,
25 dma_addr_t dev_addr,
26 unsigned long offset,
27 size_t size, int dir);
28extern void swiotlb_sync_sg_for_cpu(struct device *hwdev,
29 struct scatterlist *sg, int nelems,
30 int dir);
31extern void swiotlb_sync_sg_for_device(struct device *hwdev,
32 struct scatterlist *sg, int nelems,
33 int dir);
34extern int swiotlb_map_sg(struct device *hwdev, struct scatterlist *sg,
35 int nents, int direction);
36extern void swiotlb_unmap_sg(struct device *hwdev, struct scatterlist *sg,
37 int nents, int direction);
38extern int swiotlb_dma_mapping_error(struct device *hwdev, dma_addr_t dma_addr);
39extern void swiotlb_free_coherent(struct device *hwdev, size_t size,
40 void *vaddr, dma_addr_t dma_handle);
41extern int swiotlb_dma_supported(struct device *hwdev, u64 mask);
42extern void swiotlb_init(void);
43
44extern int swiotlb_force;
45
46#ifdef CONFIG_SWIOTLB
47extern int swiotlb;
48extern void pci_swiotlb_init(void);
49#else
50#define swiotlb 0
51static inline void pci_swiotlb_init(void)
52{
53}
54#endif
55
56static inline void dma_mark_clean(void *addr, size_t size) {}
57
58#endif /* _ASM_X86_SWIOTLB_H */
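
The streaming entry points declared above pair up in the usual map / use /
unmap pattern. A rough sketch follows; the device pointer, buffer and the
DMA_TO_DEVICE direction value are assumed to come from the caller and from
<linux/dma-mapping.h>, and only the swiotlb_* prototypes are taken from this
header.

#include <linux/errno.h>
#include <asm/swiotlb.h>

static int demo_send_buffer(struct device *dev, void *buf, size_t len)
{
	dma_addr_t bus = swiotlb_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (swiotlb_dma_mapping_error(dev, bus))
		return -ENOMEM;

	/* ... hand 'bus' to the hardware and wait for it to finish ... */

	swiotlb_unmap_single(dev, bus, len, DMA_TO_DEVICE);
	return 0;
}
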
diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h
new file mode 100644
index 000000000000..9d09b4073b60
--- /dev/null
+++ b/arch/x86/include/asm/sync_bitops.h
@@ -0,0 +1,130 @@
1#ifndef _ASM_X86_SYNC_BITOPS_H
2#define _ASM_X86_SYNC_BITOPS_H
3
4/*
5 * Copyright 1992, Linus Torvalds.
6 */
7
8/*
9 * These have to be done with inline assembly: that way the bit-setting
10 * is guaranteed to be atomic. All bit operations return 0 if the bit
11 * was cleared before the operation and != 0 if it was not.
12 *
13 * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1).
14 */
15
16#define ADDR (*(volatile long *)addr)
17
18/**
19 * sync_set_bit - Atomically set a bit in memory
20 * @nr: the bit to set
21 * @addr: the address to start counting from
22 *
23 * This function is atomic and may not be reordered. See __set_bit()
24 * if you do not require the atomic guarantees.
25 *
26 * Note that @nr may be almost arbitrarily large; this function is not
27 * restricted to acting on a single-word quantity.
28 */
29static inline void sync_set_bit(int nr, volatile unsigned long *addr)
30{
31 asm volatile("lock; btsl %1,%0"
32 : "+m" (ADDR)
33 : "Ir" (nr)
34 : "memory");
35}
36
37/**
38 * sync_clear_bit - Clears a bit in memory
39 * @nr: Bit to clear
40 * @addr: Address to start counting from
41 *
42 * sync_clear_bit() is atomic and may not be reordered. However, it does
43 * not contain a memory barrier, so if it is used for locking purposes,
44 * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit()
45 * in order to ensure changes are visible on other processors.
46 */
47static inline void sync_clear_bit(int nr, volatile unsigned long *addr)
48{
49 asm volatile("lock; btrl %1,%0"
50 : "+m" (ADDR)
51 : "Ir" (nr)
52 : "memory");
53}
54
55/**
56 * sync_change_bit - Toggle a bit in memory
57 * @nr: Bit to change
58 * @addr: Address to start counting from
59 *
60 * sync_change_bit() is atomic and may not be reordered.
61 * Note that @nr may be almost arbitrarily large; this function is not
62 * restricted to acting on a single-word quantity.
63 */
64static inline void sync_change_bit(int nr, volatile unsigned long *addr)
65{
66 asm volatile("lock; btcl %1,%0"
67 : "+m" (ADDR)
68 : "Ir" (nr)
69 : "memory");
70}
71
72/**
73 * sync_test_and_set_bit - Set a bit and return its old value
74 * @nr: Bit to set
75 * @addr: Address to count from
76 *
77 * This operation is atomic and cannot be reordered.
78 * It also implies a memory barrier.
79 */
80static inline int sync_test_and_set_bit(int nr, volatile unsigned long *addr)
81{
82 int oldbit;
83
84 asm volatile("lock; btsl %2,%1\n\tsbbl %0,%0"
85 : "=r" (oldbit), "+m" (ADDR)
86 : "Ir" (nr) : "memory");
87 return oldbit;
88}
89
90/**
91 * sync_test_and_clear_bit - Clear a bit and return its old value
92 * @nr: Bit to clear
93 * @addr: Address to count from
94 *
95 * This operation is atomic and cannot be reordered.
96 * It also implies a memory barrier.
97 */
98static inline int sync_test_and_clear_bit(int nr, volatile unsigned long *addr)
99{
100 int oldbit;
101
102 asm volatile("lock; btrl %2,%1\n\tsbbl %0,%0"
103 : "=r" (oldbit), "+m" (ADDR)
104 : "Ir" (nr) : "memory");
105 return oldbit;
106}
107
108/**
109 * sync_test_and_change_bit - Change a bit and return its old value
110 * @nr: Bit to change
111 * @addr: Address to count from
112 *
113 * This operation is atomic and cannot be reordered.
114 * It also implies a memory barrier.
115 */
116static inline int sync_test_and_change_bit(int nr, volatile unsigned long *addr)
117{
118 int oldbit;
119
120 asm volatile("lock; btcl %2,%1\n\tsbbl %0,%0"
121 : "=r" (oldbit), "+m" (ADDR)
122 : "Ir" (nr) : "memory");
123 return oldbit;
124}
125
126#define sync_test_bit(nr, addr) test_bit(nr, addr)
127
128#undef ADDR
129
130#endif /* _ASM_X86_SYNC_BITOPS_H */
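
The comment at the top of the file spells out the return-value convention:
zero if the bit was clear beforehand, non-zero otherwise. A user-space
analogue of sync_test_and_set_bit() built on GCC's __atomic builtins makes
that concrete; this is illustrative only, the kernel versions above use the
lock-prefixed bt* instructions directly.

#define DEMO_BITS_PER_LONG	(8 * sizeof(unsigned long))

static inline int demo_test_and_set_bit(int nr, volatile unsigned long *addr)
{
	unsigned long mask = 1UL << (nr % DEMO_BITS_PER_LONG);
	unsigned long old;

	old = __atomic_fetch_or(addr + nr / DEMO_BITS_PER_LONG, mask,
				__ATOMIC_SEQ_CST);
	return (old & mask) != 0;	/* non-zero iff the bit was already set */
}
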
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
new file mode 100644
index 000000000000..d82f39bb7905
--- /dev/null
+++ b/arch/x86/include/asm/syscall.h
@@ -0,0 +1,213 @@
1/*
2 * Access to user system call parameters and results
3 *
4 * Copyright (C) 2008 Red Hat, Inc. All rights reserved.
5 *
6 * This copyrighted material is made available to anyone wishing to use,
7 * modify, copy, or redistribute it subject to the terms and conditions
8 * of the GNU General Public License v.2.
9 *
10 * See asm-generic/syscall.h for descriptions of what we must do here.
11 */
12
13#ifndef _ASM_X86_SYSCALL_H
14#define _ASM_X86_SYSCALL_H
15
16#include <linux/sched.h>
17#include <linux/err.h>
18
19static inline long syscall_get_nr(struct task_struct *task,
20 struct pt_regs *regs)
21{
22 /*
23 * We always sign-extend a -1 value being set here,
24 * so this is always either -1L or a syscall number.
25 */
26 return regs->orig_ax;
27}
28
29static inline void syscall_rollback(struct task_struct *task,
30 struct pt_regs *regs)
31{
32 regs->ax = regs->orig_ax;
33}
34
35static inline long syscall_get_error(struct task_struct *task,
36 struct pt_regs *regs)
37{
38 unsigned long error = regs->ax;
39#ifdef CONFIG_IA32_EMULATION
40 /*
41 * TS_COMPAT is set for 32-bit syscall entries and then
42 * remains set until we return to user mode.
43 */
44 if (task_thread_info(task)->status & TS_COMPAT)
45 /*
46 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO
47 * and will match correctly in comparisons.
48 */
49 error = (long) (int) error;
50#endif
51 return IS_ERR_VALUE(error) ? error : 0;
52}
53
54static inline long syscall_get_return_value(struct task_struct *task,
55 struct pt_regs *regs)
56{
57 return regs->ax;
58}
59
60static inline void syscall_set_return_value(struct task_struct *task,
61 struct pt_regs *regs,
62 int error, long val)
63{
64 regs->ax = (long) error ?: val;
65}
66
67#ifdef CONFIG_X86_32
68
69static inline void syscall_get_arguments(struct task_struct *task,
70 struct pt_regs *regs,
71 unsigned int i, unsigned int n,
72 unsigned long *args)
73{
74 BUG_ON(i + n > 6);
75 memcpy(args, &regs->bx + i, n * sizeof(args[0]));
76}
77
78static inline void syscall_set_arguments(struct task_struct *task,
79 struct pt_regs *regs,
80 unsigned int i, unsigned int n,
81 const unsigned long *args)
82{
83 BUG_ON(i + n > 6);
84 memcpy(&regs->bx + i, args, n * sizeof(args[0]));
85}
86
87#else /* CONFIG_X86_64 */
88
89static inline void syscall_get_arguments(struct task_struct *task,
90 struct pt_regs *regs,
91 unsigned int i, unsigned int n,
92 unsigned long *args)
93{
94# ifdef CONFIG_IA32_EMULATION
95 if (task_thread_info(task)->status & TS_COMPAT)
96 switch (i) {
97 case 0:
98 if (!n--) break;
99 *args++ = regs->bx;
100 case 1:
101 if (!n--) break;
102 *args++ = regs->cx;
103 case 2:
104 if (!n--) break;
105 *args++ = regs->dx;
106 case 3:
107 if (!n--) break;
108 *args++ = regs->si;
109 case 4:
110 if (!n--) break;
111 *args++ = regs->di;
112 case 5:
113 if (!n--) break;
114 *args++ = regs->bp;
115 case 6:
116 if (!n--) break;
117 default:
118 BUG();
119 break;
120 }
121 else
122# endif
123 switch (i) {
124 case 0:
125 if (!n--) break;
126 *args++ = regs->di;
127 case 1:
128 if (!n--) break;
129 *args++ = regs->si;
130 case 2:
131 if (!n--) break;
132 *args++ = regs->dx;
133 case 3:
134 if (!n--) break;
135 *args++ = regs->r10;
136 case 4:
137 if (!n--) break;
138 *args++ = regs->r8;
139 case 5:
140 if (!n--) break;
141 *args++ = regs->r9;
142 case 6:
143 if (!n--) break;
144 default:
145 BUG();
146 break;
147 }
148}
149
150static inline void syscall_set_arguments(struct task_struct *task,
151 struct pt_regs *regs,
152 unsigned int i, unsigned int n,
153 const unsigned long *args)
154{
155# ifdef CONFIG_IA32_EMULATION
156 if (task_thread_info(task)->status & TS_COMPAT)
157 switch (i) {
158 case 0:
159 if (!n--) break;
160 regs->bx = *args++;
161 case 1:
162 if (!n--) break;
163 regs->cx = *args++;
164 case 2:
165 if (!n--) break;
166 regs->dx = *args++;
167 case 3:
168 if (!n--) break;
169 regs->si = *args++;
170 case 4:
171 if (!n--) break;
172 regs->di = *args++;
173 case 5:
174 if (!n--) break;
175 regs->bp = *args++;
176 case 6:
177 if (!n--) break;
178 default:
179 BUG();
180 break;
181 }
182 else
183# endif
184 switch (i) {
185 case 0:
186 if (!n--) break;
187 regs->di = *args++;
188 case 1:
189 if (!n--) break;
190 regs->si = *args++;
191 case 2:
192 if (!n--) break;
193 regs->dx = *args++;
194 case 3:
195 if (!n--) break;
196 regs->r10 = *args++;
197 case 4:
198 if (!n--) break;
199 regs->r8 = *args++;
200 case 5:
201 if (!n--) break;
202 regs->r9 = *args++;
203 case 6:
204 if (!n--) break;
205 default:
206 BUG();
207 break;
208 }
209}
210
211#endif /* CONFIG_X86_32 */
212
213#endif /* _ASM_X86_SYSCALL_H */
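
The fall-through switches above simply copy 'n' argument registers starting
at index 'i'; for 64-bit native tasks the slots are di, si, dx, r10, r8, r9.
A table-driven restatement of that copy, illustrative only and not the
kernel's code:

static void demo_get_arguments(const struct pt_regs *regs,
			       unsigned int i, unsigned int n,
			       unsigned long *args)
{
	const unsigned long src[6] = {
		regs->di, regs->si, regs->dx, regs->r10, regs->r8, regs->r9
	};
	unsigned int k;

	for (k = 0; k < n && i + k < 6; k++)
		args[k] = src[i + k];
}
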
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
new file mode 100644
index 000000000000..87803da44010
--- /dev/null
+++ b/arch/x86/include/asm/syscalls.h
@@ -0,0 +1,93 @@
1/*
2 * syscalls.h - Linux syscall interfaces (arch-specific)
3 *
4 * Copyright (c) 2008 Jaswinder Singh
5 *
6 * This file is released under the GPLv2.
7 * See the file COPYING for more details.
8 */
9
10#ifndef _ASM_X86_SYSCALLS_H
11#define _ASM_X86_SYSCALLS_H
12
13#include <linux/compiler.h>
14#include <linux/linkage.h>
15#include <linux/types.h>
16#include <linux/signal.h>
17
18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21
22/* X86_32 only */
23#ifdef CONFIG_X86_32
24/* kernel/process_32.c */
25asmlinkage int sys_fork(struct pt_regs);
26asmlinkage int sys_clone(struct pt_regs);
27asmlinkage int sys_vfork(struct pt_regs);
28asmlinkage int sys_execve(struct pt_regs);
29
30/* kernel/signal_32.c */
31asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
32asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
33 struct old_sigaction __user *);
34asmlinkage int sys_sigaltstack(unsigned long);
35asmlinkage unsigned long sys_sigreturn(unsigned long);
36asmlinkage int sys_rt_sigreturn(unsigned long);
37
38/* kernel/ioport.c */
39asmlinkage long sys_iopl(unsigned long);
40
41/* kernel/ldt.c */
42asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
43
44/* kernel/sys_i386_32.c */
45asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
46 unsigned long, unsigned long, unsigned long);
47struct mmap_arg_struct;
48asmlinkage int old_mmap(struct mmap_arg_struct __user *);
49struct sel_arg_struct;
50asmlinkage int old_select(struct sel_arg_struct __user *);
51asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
52struct old_utsname;
53asmlinkage int sys_uname(struct old_utsname __user *);
54struct oldold_utsname;
55asmlinkage int sys_olduname(struct oldold_utsname __user *);
56
57/* kernel/tls.c */
58asmlinkage int sys_set_thread_area(struct user_desc __user *);
59asmlinkage int sys_get_thread_area(struct user_desc __user *);
60
61/* kernel/vm86_32.c */
62asmlinkage int sys_vm86old(struct pt_regs);
63asmlinkage int sys_vm86(struct pt_regs);
64
65#else /* CONFIG_X86_32 */
66
67/* X86_64 only */
68/* kernel/process_64.c */
69asmlinkage long sys_fork(struct pt_regs *);
70asmlinkage long sys_clone(unsigned long, unsigned long,
71 void __user *, void __user *,
72 struct pt_regs *);
73asmlinkage long sys_vfork(struct pt_regs *);
74asmlinkage long sys_execve(char __user *, char __user * __user *,
75 char __user * __user *,
76 struct pt_regs *);
77
78/* kernel/ioport.c */
79asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
80
81/* kernel/signal_64.c */
82asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
83 struct pt_regs *);
84asmlinkage long sys_rt_sigreturn(struct pt_regs *);
85
86/* kernel/sys_x86_64.c */
87asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
88 unsigned long, unsigned long, unsigned long);
89struct new_utsname;
90asmlinkage long sys_uname(struct new_utsname __user *);
91
92#endif /* CONFIG_X86_32 */
93#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
new file mode 100644
index 000000000000..2ed3f0f44ff7
--- /dev/null
+++ b/arch/x86/include/asm/system.h
@@ -0,0 +1,425 @@
1#ifndef _ASM_X86_SYSTEM_H
2#define _ASM_X86_SYSTEM_H
3
4#include <asm/asm.h>
5#include <asm/segment.h>
6#include <asm/cpufeature.h>
7#include <asm/cmpxchg.h>
8#include <asm/nops.h>
9
10#include <linux/kernel.h>
11#include <linux/irqflags.h>
12
13/* entries in ARCH_DLINFO: */
14#ifdef CONFIG_IA32_EMULATION
15# define AT_VECTOR_SIZE_ARCH 2
16#else
17# define AT_VECTOR_SIZE_ARCH 1
18#endif
19
20#ifdef CONFIG_X86_32
21
22struct task_struct; /* one of the stranger aspects of C forward declarations */
23struct task_struct *__switch_to(struct task_struct *prev,
24 struct task_struct *next);
25
26/*
27 * Saving eflags is important. It not only switches IOPL between tasks,
28 * it also protects other tasks from NT leaking through sysenter etc.
29 */
30#define switch_to(prev, next, last) \
31do { \
32 /* \
33 * Context-switching clobbers all registers, so we clobber \
34 * them explicitly, via unused output variables. \
35 * (EAX and EBP are not listed because EBP is saved/restored \
36 * explicitly for wchan access and EAX is the return value of \
37 * __switch_to()) \
38 */ \
39 unsigned long ebx, ecx, edx, esi, edi; \
40 \
41 asm volatile("pushfl\n\t" /* save flags */ \
42 "pushl %%ebp\n\t" /* save EBP */ \
43 "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \
44 "movl %[next_sp],%%esp\n\t" /* restore ESP */ \
45 "movl $1f,%[prev_ip]\n\t" /* save EIP */ \
46 "pushl %[next_ip]\n\t" /* restore EIP */ \
47 "jmp __switch_to\n" /* regparm call */ \
48 "1:\t" \
49 "popl %%ebp\n\t" /* restore EBP */ \
50 "popfl\n" /* restore flags */ \
51 \
52 /* output parameters */ \
53 : [prev_sp] "=m" (prev->thread.sp), \
54 [prev_ip] "=m" (prev->thread.ip), \
55 "=a" (last), \
56 \
57 /* clobbered output registers: */ \
58 "=b" (ebx), "=c" (ecx), "=d" (edx), \
59 "=S" (esi), "=D" (edi) \
60 \
61 /* input parameters: */ \
62 : [next_sp] "m" (next->thread.sp), \
63 [next_ip] "m" (next->thread.ip), \
64 \
65 /* regparm parameters for __switch_to(): */ \
66 [prev] "a" (prev), \
67 [next] "d" (next) \
68 \
69 : /* reloaded segment registers */ \
70 "memory"); \
71} while (0)
72
73/*
74 * disable hlt during certain critical i/o operations
75 */
76#define HAVE_DISABLE_HLT
77#else
78#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
79#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
80
81/* frame pointer must be last for get_wchan */
82#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
83#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t"
84
85#define __EXTRA_CLOBBER \
86 , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \
87 "r12", "r13", "r14", "r15"
88
89/* Save and restore flags to clear/handle a leaking NT flag */
90#define switch_to(prev, next, last) \
91 asm volatile(SAVE_CONTEXT \
92 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
93 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
94 "call __switch_to\n\t" \
95 ".globl thread_return\n" \
96 "thread_return:\n\t" \
97 "movq %%gs:%P[pda_pcurrent],%%rsi\n\t" \
98 "movq %P[thread_info](%%rsi),%%r8\n\t" \
99 LOCK_PREFIX "btr %[tif_fork],%P[ti_flags](%%r8)\n\t" \
100 "movq %%rax,%%rdi\n\t" \
101 "jc ret_from_fork\n\t" \
102 RESTORE_CONTEXT \
103 : "=a" (last) \
104 : [next] "S" (next), [prev] "D" (prev), \
105 [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \
106 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
107 [tif_fork] "i" (TIF_FORK), \
108 [thread_info] "i" (offsetof(struct task_struct, stack)), \
109 [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \
110 : "memory", "cc" __EXTRA_CLOBBER)
111#endif
112
113#ifdef __KERNEL__
114#define _set_base(addr, base) do { unsigned long __pr; \
115__asm__ __volatile__ ("movw %%dx,%1\n\t" \
116 "rorl $16,%%edx\n\t" \
117 "movb %%dl,%2\n\t" \
118 "movb %%dh,%3" \
119 :"=&d" (__pr) \
120 :"m" (*((addr)+2)), \
121 "m" (*((addr)+4)), \
122 "m" (*((addr)+7)), \
123 "0" (base) \
124 ); } while (0)
125
126#define _set_limit(addr, limit) do { unsigned long __lr; \
127__asm__ __volatile__ ("movw %%dx,%1\n\t" \
128 "rorl $16,%%edx\n\t" \
129 "movb %2,%%dh\n\t" \
130 "andb $0xf0,%%dh\n\t" \
131 "orb %%dh,%%dl\n\t" \
132 "movb %%dl,%2" \
133 :"=&d" (__lr) \
134 :"m" (*(addr)), \
135 "m" (*((addr)+6)), \
136 "0" (limit) \
137 ); } while (0)
138
139#define set_base(ldt, base) _set_base(((char *)&(ldt)) , (base))
140#define set_limit(ldt, limit) _set_limit(((char *)&(ldt)) , ((limit)-1))
141
142extern void native_load_gs_index(unsigned);
143
144/*
145 * Load a segment. Fall back on loading the zero
146 * segment if something goes wrong..
147 */
148#define loadsegment(seg, value) \
149 asm volatile("\n" \
150 "1:\t" \
151 "movl %k0,%%" #seg "\n" \
152 "2:\n" \
153 ".section .fixup,\"ax\"\n" \
154 "3:\t" \
155 "movl %k1, %%" #seg "\n\t" \
156 "jmp 2b\n" \
157 ".previous\n" \
158 _ASM_EXTABLE(1b,3b) \
159 : :"r" (value), "r" (0) : "memory")
160
161
162/*
163 * Save a segment register away
164 */
165#define savesegment(seg, value) \
166 asm("mov %%" #seg ",%0":"=r" (value) : : "memory")
167
168static inline unsigned long get_limit(unsigned long segment)
169{
170 unsigned long __limit;
171 asm("lsll %1,%0" : "=r" (__limit) : "r" (segment));
172 return __limit + 1;
173}
174
175static inline void native_clts(void)
176{
177 asm volatile("clts");
178}
179
180/*
181 * Volatile isn't enough to prevent the compiler from reordering the
182 * read/write functions for the control registers and messing everything up.
183 * A memory clobber would solve the problem, but would prevent reordering of
184 * all loads/stores around it, which can hurt performance. The solution is to
185 * use a variable and mimic reads and writes to it to enforce serialization.
186 */
187static unsigned long __force_order;
188
189static inline unsigned long native_read_cr0(void)
190{
191 unsigned long val;
192 asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order));
193 return val;
194}
195
196static inline void native_write_cr0(unsigned long val)
197{
198 asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order));
199}
200
201static inline unsigned long native_read_cr2(void)
202{
203 unsigned long val;
204 asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order));
205 return val;
206}
207
208static inline void native_write_cr2(unsigned long val)
209{
210 asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order));
211}
212
213static inline unsigned long native_read_cr3(void)
214{
215 unsigned long val;
216 asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order));
217 return val;
218}
219
220static inline void native_write_cr3(unsigned long val)
221{
222 asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order));
223}
224
225static inline unsigned long native_read_cr4(void)
226{
227 unsigned long val;
228 asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order));
229 return val;
230}
231
232static inline unsigned long native_read_cr4_safe(void)
233{
234 unsigned long val;
235	/* This could fault if %cr4 does not exist. On x86_64, CR4 always
236	 * exists, so it will never fault. */
237#ifdef CONFIG_X86_32
238 asm volatile("1: mov %%cr4, %0\n"
239 "2:\n"
240 _ASM_EXTABLE(1b, 2b)
241 : "=r" (val), "=m" (__force_order) : "0" (0));
242#else
243 val = native_read_cr4();
244#endif
245 return val;
246}
247
248static inline void native_write_cr4(unsigned long val)
249{
250 asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order));
251}
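A sketch of how the accessors are meant to be combined; the helper is hypothetical, but the ordering argument is the one the comment above makes for __force_order:

/*
 * Illustrative only: read-modify-write of CR4.  The dummy __force_order
 * operand gives the compiler a dependency between the read and the
 * write, so the two asm statements stay in program order even without
 * a full memory clobber.
 */
static inline void example_set_cr4_bits(unsigned long mask)
{
	unsigned long cr4 = native_read_cr4();

	native_write_cr4(cr4 | mask);
}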
252
253#ifdef CONFIG_X86_64
254static inline unsigned long native_read_cr8(void)
255{
256 unsigned long cr8;
257 asm volatile("movq %%cr8,%0" : "=r" (cr8));
258 return cr8;
259}
260
261static inline void native_write_cr8(unsigned long val)
262{
263 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
264}
265#endif
266
267static inline void native_wbinvd(void)
268{
269 asm volatile("wbinvd": : :"memory");
270}
271
272#ifdef CONFIG_PARAVIRT
273#include <asm/paravirt.h>
274#else
275#define read_cr0() (native_read_cr0())
276#define write_cr0(x) (native_write_cr0(x))
277#define read_cr2() (native_read_cr2())
278#define write_cr2(x) (native_write_cr2(x))
279#define read_cr3() (native_read_cr3())
280#define write_cr3(x) (native_write_cr3(x))
281#define read_cr4() (native_read_cr4())
282#define read_cr4_safe() (native_read_cr4_safe())
283#define write_cr4(x) (native_write_cr4(x))
284#define wbinvd() (native_wbinvd())
285#ifdef CONFIG_X86_64
286#define read_cr8() (native_read_cr8())
287#define write_cr8(x) (native_write_cr8(x))
288#define load_gs_index native_load_gs_index
289#endif
290
291/* Clear the 'TS' bit */
292#define clts() (native_clts())
293
294#endif /* CONFIG_PARAVIRT */
295
296#define stts() write_cr0(read_cr0() | X86_CR0_TS)
297
298#endif /* __KERNEL__ */
299
300static inline void clflush(volatile void *__p)
301{
302 asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p));
303}
304
305#define nop() asm volatile ("nop")
306
307void disable_hlt(void);
308void enable_hlt(void);
309
310void cpu_idle_wait(void);
311
312extern unsigned long arch_align_stack(unsigned long sp);
313extern void free_init_pages(char *what, unsigned long begin, unsigned long end);
314
315void default_idle(void);
316
317/*
318 * Force strict CPU ordering.
319 * And yes, this is required on UP too when we're talking
320 * to devices.
321 */
322#ifdef CONFIG_X86_32
323/*
324 * Some non-Intel clones support out of order store. wmb() ceases to be a
325 * nop for these.
326 */
327#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2)
328#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2)
329#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM)
330#else
331#define mb() asm volatile("mfence":::"memory")
332#define rmb() asm volatile("lfence":::"memory")
333#define wmb() asm volatile("sfence" ::: "memory")
334#endif
335
336/**
337 * read_barrier_depends - Flush all pending reads that subsequent reads
338 * depend on.
339 *
340 * No data-dependent reads from memory-like regions are ever reordered
341 * over this barrier. All reads preceding this primitive are guaranteed
342 * to access memory (but not necessarily other CPUs' caches) before any
343 * reads following this primitive that depend on the data returned by
344 * any of the preceding reads. This primitive is much lighter weight than
345 * rmb() on most CPUs, and is never heavier weight than
346 * rmb().
347 *
348 * These ordering constraints are respected by both the local CPU
349 * and the compiler.
350 *
351 * Ordering is not guaranteed by anything other than these primitives,
352 * not even by data dependencies. See the documentation for
353 * memory_barrier() for examples and URLs to more information.
354 *
355 * For example, the following code would force ordering (the initial
356 * value of "a" is zero, "b" is one, and "p" is "&a"):
357 *
358 * <programlisting>
359 * CPU 0 CPU 1
360 *
361 * b = 2;
362 * memory_barrier();
363 * p = &b; q = p;
364 * read_barrier_depends();
365 * d = *q;
366 * </programlisting>
367 *
368 * because the read of "*q" depends on the read of "p" and these
369 * two reads are separated by a read_barrier_depends(). However,
370 * the following code, with the same initial values for "a" and "b":
371 *
372 * <programlisting>
373 * CPU 0 CPU 1
374 *
375 * a = 2;
376 * memory_barrier();
377 * b = 3; y = b;
378 * read_barrier_depends();
379 * x = a;
380 * </programlisting>
381 *
382 * does not enforce ordering, since there is no data dependency between
383 * the read of "a" and the read of "b". Therefore, on some CPUs, such
384 * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb()
385 * in cases like this where there are no data dependencies.
386 **/
387
388#define read_barrier_depends() do { } while (0)
389
390#ifdef CONFIG_SMP
391#define smp_mb() mb()
392#ifdef CONFIG_X86_PPRO_FENCE
393# define smp_rmb() rmb()
394#else
395# define smp_rmb() barrier()
396#endif
397#ifdef CONFIG_X86_OOSTORE
398# define smp_wmb() wmb()
399#else
400# define smp_wmb() barrier()
401#endif
402#define smp_read_barrier_depends() read_barrier_depends()
403#define set_mb(var, value) do { (void)xchg(&var, value); } while (0)
404#else
405#define smp_mb() barrier()
406#define smp_rmb() barrier()
407#define smp_wmb() barrier()
408#define smp_read_barrier_depends() do { } while (0)
409#define set_mb(var, value) do { var = value; barrier(); } while (0)
410#endif
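As a usage sketch of the SMP barriers defined above (hypothetical helpers; real code would also need locking or ACCESS_ONCE()-style annotations):

struct example_mailbox {
	int data;
	int ready;
};

static inline void example_publish(struct example_mailbox *m, int v)
{
	m->data = v;
	smp_wmb();		/* order the data store before the flag store */
	m->ready = 1;
}

static inline int example_consume(struct example_mailbox *m, int *v)
{
	if (!m->ready)
		return 0;
	smp_rmb();		/* order the flag load before the data load */
	*v = m->data;
	return 1;
}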
411
412/*
413 * Stop RDTSC speculation. This is needed when you need to use RDTSC
414 * (or get_cycles or vread that possibly accesses the TSC) in a defined
415 * code region.
416 *
417 * (Could use a three-way alternative for this if there were one.)
418 */
419static inline void rdtsc_barrier(void)
420{
421 alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
422 alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
423}
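A sketch of the intended use (hypothetical helper; get_cycles() is the TSC read from <asm/tsc.h>):

/*
 * Illustrative only: bracket a TSC-based measurement with
 * rdtsc_barrier() so the CPU cannot speculate the TSC reads into or
 * out of the measured region.
 */
static inline unsigned long long example_timed_call(void (*fn)(void))
{
	unsigned long long t0, t1;

	rdtsc_barrier();
	t0 = get_cycles();
	rdtsc_barrier();

	fn();

	rdtsc_barrier();
	t1 = get_cycles();
	rdtsc_barrier();

	return t1 - t0;
}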
424
425#endif /* _ASM_X86_SYSTEM_H */
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
new file mode 100644
index 000000000000..1159e091ad09
--- /dev/null
+++ b/arch/x86/include/asm/system_64.h
@@ -0,0 +1,22 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/tce.h b/arch/x86/include/asm/tce.h
new file mode 100644
index 000000000000..7a6677c1a715
--- /dev/null
+++ b/arch/x86/include/asm/tce.h
@@ -0,0 +1,48 @@
1/*
2 * This file is derived from asm-powerpc/tce.h.
3 *
4 * Copyright (C) IBM Corporation, 2006
5 *
6 * Author: Muli Ben-Yehuda <muli@il.ibm.com>
7 * Author: Jon Mason <jdmason@us.ibm.com>
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
22 */
23
24#ifndef _ASM_X86_TCE_H
25#define _ASM_X86_TCE_H
26
27extern unsigned int specified_table_size;
28struct iommu_table;
29
30#define TCE_ENTRY_SIZE 8 /* in bytes */
31
32#define TCE_READ_SHIFT 0
33#define TCE_WRITE_SHIFT 1
34#define TCE_HUBID_SHIFT 2 /* unused */
35#define TCE_RSVD_SHIFT 8 /* unused */
36#define TCE_RPN_SHIFT 12
37#define TCE_UNUSED_SHIFT 48 /* unused */
38
39#define TCE_RPN_MASK 0x0000fffffffff000ULL
40
41extern void tce_build(struct iommu_table *tbl, unsigned long index,
42 unsigned int npages, unsigned long uaddr, int direction);
43extern void tce_free(struct iommu_table *tbl, long index, unsigned int npages);
44extern void * __init alloc_tce_table(void);
45extern void __init free_tce_table(void *tbl);
46extern int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar);
47
48#endif /* _ASM_X86_TCE_H */
diff --git a/arch/x86/include/asm/termbits.h b/arch/x86/include/asm/termbits.h
new file mode 100644
index 000000000000..af1b70ea440f
--- /dev/null
+++ b/arch/x86/include/asm/termbits.h
@@ -0,0 +1,198 @@
1#ifndef _ASM_X86_TERMBITS_H
2#define _ASM_X86_TERMBITS_H
3
4#include <linux/posix_types.h>
5
6typedef unsigned char cc_t;
7typedef unsigned int speed_t;
8typedef unsigned int tcflag_t;
9
10#define NCCS 19
11struct termios {
12 tcflag_t c_iflag; /* input mode flags */
13 tcflag_t c_oflag; /* output mode flags */
14 tcflag_t c_cflag; /* control mode flags */
15 tcflag_t c_lflag; /* local mode flags */
16 cc_t c_line; /* line discipline */
17 cc_t c_cc[NCCS]; /* control characters */
18};
19
20struct termios2 {
21 tcflag_t c_iflag; /* input mode flags */
22 tcflag_t c_oflag; /* output mode flags */
23 tcflag_t c_cflag; /* control mode flags */
24 tcflag_t c_lflag; /* local mode flags */
25 cc_t c_line; /* line discipline */
26 cc_t c_cc[NCCS]; /* control characters */
27 speed_t c_ispeed; /* input speed */
28 speed_t c_ospeed; /* output speed */
29};
30
31struct ktermios {
32 tcflag_t c_iflag; /* input mode flags */
33 tcflag_t c_oflag; /* output mode flags */
34 tcflag_t c_cflag; /* control mode flags */
35 tcflag_t c_lflag; /* local mode flags */
36 cc_t c_line; /* line discipline */
37 cc_t c_cc[NCCS]; /* control characters */
38 speed_t c_ispeed; /* input speed */
39 speed_t c_ospeed; /* output speed */
40};
41
42/* c_cc characters */
43#define VINTR 0
44#define VQUIT 1
45#define VERASE 2
46#define VKILL 3
47#define VEOF 4
48#define VTIME 5
49#define VMIN 6
50#define VSWTC 7
51#define VSTART 8
52#define VSTOP 9
53#define VSUSP 10
54#define VEOL 11
55#define VREPRINT 12
56#define VDISCARD 13
57#define VWERASE 14
58#define VLNEXT 15
59#define VEOL2 16
60
61/* c_iflag bits */
62#define IGNBRK 0000001
63#define BRKINT 0000002
64#define IGNPAR 0000004
65#define PARMRK 0000010
66#define INPCK 0000020
67#define ISTRIP 0000040
68#define INLCR 0000100
69#define IGNCR 0000200
70#define ICRNL 0000400
71#define IUCLC 0001000
72#define IXON 0002000
73#define IXANY 0004000
74#define IXOFF 0010000
75#define IMAXBEL 0020000
76#define IUTF8 0040000
77
78/* c_oflag bits */
79#define OPOST 0000001
80#define OLCUC 0000002
81#define ONLCR 0000004
82#define OCRNL 0000010
83#define ONOCR 0000020
84#define ONLRET 0000040
85#define OFILL 0000100
86#define OFDEL 0000200
87#define NLDLY 0000400
88#define NL0 0000000
89#define NL1 0000400
90#define CRDLY 0003000
91#define CR0 0000000
92#define CR1 0001000
93#define CR2 0002000
94#define CR3 0003000
95#define TABDLY 0014000
96#define TAB0 0000000
97#define TAB1 0004000
98#define TAB2 0010000
99#define TAB3 0014000
100#define XTABS 0014000
101#define BSDLY 0020000
102#define BS0 0000000
103#define BS1 0020000
104#define VTDLY 0040000
105#define VT0 0000000
106#define VT1 0040000
107#define FFDLY 0100000
108#define FF0 0000000
109#define FF1 0100000
110
111/* c_cflag bit meaning */
112#define CBAUD 0010017
113#define B0 0000000 /* hang up */
114#define B50 0000001
115#define B75 0000002
116#define B110 0000003
117#define B134 0000004
118#define B150 0000005
119#define B200 0000006
120#define B300 0000007
121#define B600 0000010
122#define B1200 0000011
123#define B1800 0000012
124#define B2400 0000013
125#define B4800 0000014
126#define B9600 0000015
127#define B19200 0000016
128#define B38400 0000017
129#define EXTA B19200
130#define EXTB B38400
131#define CSIZE 0000060
132#define CS5 0000000
133#define CS6 0000020
134#define CS7 0000040
135#define CS8 0000060
136#define CSTOPB 0000100
137#define CREAD 0000200
138#define PARENB 0000400
139#define PARODD 0001000
140#define HUPCL 0002000
141#define CLOCAL 0004000
142#define CBAUDEX 0010000
143#define BOTHER 0010000 /* non standard rate */
144#define B57600 0010001
145#define B115200 0010002
146#define B230400 0010003
147#define B460800 0010004
148#define B500000 0010005
149#define B576000 0010006
150#define B921600 0010007
151#define B1000000 0010010
152#define B1152000 0010011
153#define B1500000 0010012
154#define B2000000 0010013
155#define B2500000 0010014
156#define B3000000 0010015
157#define B3500000 0010016
158#define B4000000 0010017
159#define CIBAUD 002003600000 /* input baud rate */
160#define CMSPAR 010000000000 /* mark or space (stick) parity */
161#define CRTSCTS 020000000000 /* flow control */
162
163#define IBSHIFT 16 /* Shift from CBAUD to CIBAUD */
164
165/* c_lflag bits */
166#define ISIG 0000001
167#define ICANON 0000002
168#define XCASE 0000004
169#define ECHO 0000010
170#define ECHOE 0000020
171#define ECHOK 0000040
172#define ECHONL 0000100
173#define NOFLSH 0000200
174#define TOSTOP 0000400
175#define ECHOCTL 0001000
176#define ECHOPRT 0002000
177#define ECHOKE 0004000
178#define FLUSHO 0010000
179#define PENDIN 0040000
180#define IEXTEN 0100000
181
182/* tcflow() and TCXONC use these */
183#define TCOOFF 0
184#define TCOON 1
185#define TCIOFF 2
186#define TCION 3
187
188/* tcflush() and TCFLSH use these */
189#define TCIFLUSH 0
190#define TCOFLUSH 1
191#define TCIOFLUSH 2
192
193/* tcsetattr uses these */
194#define TCSANOW 0
195#define TCSADRAIN 1
196#define TCSAFLUSH 2
197
198#endif /* _ASM_X86_TERMBITS_H */
diff --git a/arch/x86/include/asm/termios.h b/arch/x86/include/asm/termios.h
new file mode 100644
index 000000000000..f72956331c49
--- /dev/null
+++ b/arch/x86/include/asm/termios.h
@@ -0,0 +1,113 @@
1#ifndef _ASM_X86_TERMIOS_H
2#define _ASM_X86_TERMIOS_H
3
4#include <asm/termbits.h>
5#include <asm/ioctls.h>
6
7struct winsize {
8 unsigned short ws_row;
9 unsigned short ws_col;
10 unsigned short ws_xpixel;
11 unsigned short ws_ypixel;
12};
13
14#define NCC 8
15struct termio {
16 unsigned short c_iflag; /* input mode flags */
17 unsigned short c_oflag; /* output mode flags */
18 unsigned short c_cflag; /* control mode flags */
19 unsigned short c_lflag; /* local mode flags */
20 unsigned char c_line; /* line discipline */
21 unsigned char c_cc[NCC]; /* control characters */
22};
23
24/* modem lines */
25#define TIOCM_LE 0x001
26#define TIOCM_DTR 0x002
27#define TIOCM_RTS 0x004
28#define TIOCM_ST 0x008
29#define TIOCM_SR 0x010
30#define TIOCM_CTS 0x020
31#define TIOCM_CAR 0x040
32#define TIOCM_RNG 0x080
33#define TIOCM_DSR 0x100
34#define TIOCM_CD TIOCM_CAR
35#define TIOCM_RI TIOCM_RNG
36#define TIOCM_OUT1 0x2000
37#define TIOCM_OUT2 0x4000
38#define TIOCM_LOOP 0x8000
39
40/* ioctl (fd, TIOCSERGETLSR, &result) where result may be as below */
41
42#ifdef __KERNEL__
43
44#include <asm/uaccess.h>
45
46/* intr=^C quit=^\ erase=del kill=^U
47 eof=^D vtime=\0 vmin=\1 sxtc=\0
48 start=^Q stop=^S susp=^Z eol=\0
49 reprint=^R discard=^U werase=^W lnext=^V
50 eol2=\0
51*/
52#define INIT_C_CC "\003\034\177\025\004\0\1\0\021\023\032\0\022\017\027\026\0"
53
54/*
55 * Translate a "termio" structure into a "termios". Ugh.
56 */
57#define SET_LOW_TERMIOS_BITS(termios, termio, x) { \
58 unsigned short __tmp; \
59 get_user(__tmp,&(termio)->x); \
60 *(unsigned short *) &(termios)->x = __tmp; \
61}
62
63static inline int user_termio_to_kernel_termios(struct ktermios *termios,
64 struct termio __user *termio)
65{
66 SET_LOW_TERMIOS_BITS(termios, termio, c_iflag);
67 SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
68 SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
69 SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
70 return copy_from_user(termios->c_cc, termio->c_cc, NCC);
71}
72
73/*
74 * Translate a "termios" structure into a "termio". Ugh.
75 */
76static inline int kernel_termios_to_user_termio(struct termio __user *termio,
77 struct ktermios *termios)
78{
79 put_user((termios)->c_iflag, &(termio)->c_iflag);
80 put_user((termios)->c_oflag, &(termio)->c_oflag);
81 put_user((termios)->c_cflag, &(termio)->c_cflag);
82 put_user((termios)->c_lflag, &(termio)->c_lflag);
83 put_user((termios)->c_line, &(termio)->c_line);
84 return copy_to_user((termio)->c_cc, (termios)->c_cc, NCC);
85}
86
87static inline int user_termios_to_kernel_termios(struct ktermios *k,
88 struct termios2 __user *u)
89{
90 return copy_from_user(k, u, sizeof(struct termios2));
91}
92
93static inline int kernel_termios_to_user_termios(struct termios2 __user *u,
94 struct ktermios *k)
95{
96 return copy_to_user(u, k, sizeof(struct termios2));
97}
98
99static inline int user_termios_to_kernel_termios_1(struct ktermios *k,
100 struct termios __user *u)
101{
102 return copy_from_user(k, u, sizeof(struct termios));
103}
104
105static inline int kernel_termios_to_user_termios_1(struct termios __user *u,
106 struct ktermios *k)
107{
108 return copy_to_user(u, k, sizeof(struct termios));
109}
110
111#endif /* __KERNEL__ */
112
113#endif /* _ASM_X86_TERMIOS_H */
diff --git a/arch/x86/include/asm/therm_throt.h b/arch/x86/include/asm/therm_throt.h
new file mode 100644
index 000000000000..c62349ee7860
--- /dev/null
+++ b/arch/x86/include/asm/therm_throt.h
@@ -0,0 +1,9 @@
1#ifndef _ASM_X86_THERM_THROT_H
2#define _ASM_X86_THERM_THROT_H
3
4#include <asm/atomic.h>
5
6extern atomic_t therm_throt_en;
7int therm_throt_process(int curr);
8
9#endif /* _ASM_X86_THERM_THROT_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
new file mode 100644
index 000000000000..e44d379faad2
--- /dev/null
+++ b/arch/x86/include/asm/thread_info.h
@@ -0,0 +1,264 @@
1/* thread_info.h: low-level thread information
2 *
3 * Copyright (C) 2002 David Howells (dhowells@redhat.com)
4 * - Incorporating suggestions made by Linus Torvalds and Dave Miller
5 */
6
7#ifndef _ASM_X86_THREAD_INFO_H
8#define _ASM_X86_THREAD_INFO_H
9
10#include <linux/compiler.h>
11#include <asm/page.h>
12#include <asm/types.h>
13
14/*
15 * low level task data that entry.S needs immediate access to
16 * - this struct should fit entirely inside of one cache line
17 * - this struct shares the supervisor stack pages
18 */
19#ifndef __ASSEMBLY__
20struct task_struct;
21struct exec_domain;
22#include <asm/processor.h>
23
24struct thread_info {
25 struct task_struct *task; /* main task structure */
26 struct exec_domain *exec_domain; /* execution domain */
27 unsigned long flags; /* low level flags */
28 __u32 status; /* thread synchronous flags */
29 __u32 cpu; /* current CPU */
30 int preempt_count; /* 0 => preemptable,
31 <0 => BUG */
32 mm_segment_t addr_limit;
33 struct restart_block restart_block;
34 void __user *sysenter_return;
35#ifdef CONFIG_X86_32
36 unsigned long previous_esp; /* ESP of the previous stack in
37 case of nested (IRQ) stacks
38 */
39 __u8 supervisor_stack[0];
40#endif
41};
42
43#define INIT_THREAD_INFO(tsk) \
44{ \
45 .task = &tsk, \
46 .exec_domain = &default_exec_domain, \
47 .flags = 0, \
48 .cpu = 0, \
49 .preempt_count = 1, \
50 .addr_limit = KERNEL_DS, \
51 .restart_block = { \
52 .fn = do_no_restart_syscall, \
53 }, \
54}
55
56#define init_thread_info (init_thread_union.thread_info)
57#define init_stack (init_thread_union.stack)
58
59#else /* !__ASSEMBLY__ */
60
61#include <asm/asm-offsets.h>
62
63#endif
64
65/*
66 * thread information flags
67 * - these are process state flags that various assembly files
68 * may need to access
69 * - pending work-to-be-done flags are in LSW
70 * - other flags in MSW
71 * Warning: layout of LSW is hardcoded in entry.S
72 */
73#define TIF_SYSCALL_TRACE 0 /* syscall trace active */
74#define TIF_NOTIFY_RESUME 1 /* callback before returning to user */
75#define TIF_SIGPENDING 2 /* signal pending */
76#define TIF_NEED_RESCHED 3 /* rescheduling necessary */
77#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/
78#define TIF_IRET 5 /* force IRET */
79#define TIF_SYSCALL_EMU 6 /* syscall emulation active */
80#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
81#define TIF_SECCOMP 8 /* secure computing */
82#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
83#define TIF_NOTSC 16 /* TSC is not accessible in userland */
84#define TIF_IA32 17 /* 32bit process */
85#define TIF_FORK 18 /* ret_from_fork */
86#define TIF_ABI_PENDING 19
87#define TIF_MEMDIE 20
88#define TIF_DEBUG 21 /* uses debug registers */
89#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
90#define TIF_FREEZE 23 /* is freezing for suspend */
91#define TIF_FORCED_TF 24 /* true if TF in eflags artificially */
92#define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */
93#define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */
94#define TIF_BTS_TRACE_TS 27 /* record scheduling event timestamps */
95
96#define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE)
97#define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME)
98#define _TIF_SIGPENDING (1 << TIF_SIGPENDING)
99#define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP)
100#define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED)
101#define _TIF_IRET (1 << TIF_IRET)
102#define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU)
103#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
104#define _TIF_SECCOMP (1 << TIF_SECCOMP)
105#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
106#define _TIF_NOTSC (1 << TIF_NOTSC)
107#define _TIF_IA32 (1 << TIF_IA32)
108#define _TIF_FORK (1 << TIF_FORK)
109#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
110#define _TIF_DEBUG (1 << TIF_DEBUG)
111#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
112#define _TIF_FREEZE (1 << TIF_FREEZE)
113#define _TIF_FORCED_TF (1 << TIF_FORCED_TF)
114#define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR)
115#define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR)
116#define _TIF_BTS_TRACE_TS (1 << TIF_BTS_TRACE_TS)
117
118/* work to do in syscall_trace_enter() */
119#define _TIF_WORK_SYSCALL_ENTRY \
120 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | \
121 _TIF_SYSCALL_AUDIT | _TIF_SECCOMP | _TIF_SINGLESTEP)
122
123/* work to do in syscall_trace_leave() */
124#define _TIF_WORK_SYSCALL_EXIT \
125 (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)
126
127/* work to do on interrupt/exception return */
128#define _TIF_WORK_MASK \
129 (0x0000FFFF & \
130 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \
131 _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
132
133/* work to do on any return to user space */
134#define _TIF_ALLWORK_MASK (0x0000FFFF & ~_TIF_SECCOMP)
135
136/* Only used for 64 bit */
137#define _TIF_DO_NOTIFY_MASK \
138 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
139
140/* flags to check in __switch_to() */
141#define _TIF_WORK_CTXSW \
142 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_BTS_TRACE_TS| \
143 _TIF_NOTSC)
144
145#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW
146#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
147
148#define PREEMPT_ACTIVE 0x10000000
149
150/* thread information allocation */
151#ifdef CONFIG_DEBUG_STACK_USAGE
152#define THREAD_FLAGS (GFP_KERNEL | __GFP_ZERO)
153#else
154#define THREAD_FLAGS GFP_KERNEL
155#endif
156
157#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
158
159#define alloc_thread_info(tsk) \
160 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER))
161
162#ifdef CONFIG_X86_32
163
164#define STACK_WARN (THREAD_SIZE/8)
165/*
166 * macros/functions for gaining access to the thread information structure
167 *
168 * preempt_count needs to be 1 initially, until the scheduler is functional.
169 */
170#ifndef __ASSEMBLY__
171
172
173/* how to get the current stack pointer from C */
174register unsigned long current_stack_pointer asm("esp") __used;
175
176/* how to get the thread information struct from C */
177static inline struct thread_info *current_thread_info(void)
178{
179 return (struct thread_info *)
180 (current_stack_pointer & ~(THREAD_SIZE - 1));
181}
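A worked example of the masking above (illustrative numbers, assuming the default THREAD_SIZE of 8192):

/*
 * With THREAD_SIZE = 8192, ~(THREAD_SIZE - 1) = 0xffffe000.  If
 * %esp = 0xc2345a78, the mask yields 0xc2344000: the base of the
 * current kernel stack, where this CPU's thread_info lives.
 */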
182
183#else /* !__ASSEMBLY__ */
184
185/* how to get the thread information struct from ASM */
186#define GET_THREAD_INFO(reg) \
187 movl $-THREAD_SIZE, reg; \
188 andl %esp, reg
189
190/* use this one if reg already contains %esp */
191#define GET_THREAD_INFO_WITH_ESP(reg) \
192 andl $-THREAD_SIZE, reg
193
194#endif
195
196#else /* X86_32 */
197
198#include <asm/pda.h>
199
200/*
201 * macros/functions for gaining access to the thread information structure
202 * preempt_count needs to be 1 initially, until the scheduler is functional.
203 */
204#ifndef __ASSEMBLY__
205static inline struct thread_info *current_thread_info(void)
206{
207 struct thread_info *ti;
208 ti = (void *)(read_pda(kernelstack) + PDA_STACKOFFSET - THREAD_SIZE);
209 return ti;
210}
211
212/* do not use in interrupt context */
213static inline struct thread_info *stack_thread_info(void)
214{
215 struct thread_info *ti;
216 asm("andq %%rsp,%0; " : "=r" (ti) : "0" (~(THREAD_SIZE - 1)));
217 return ti;
218}
219
220#else /* !__ASSEMBLY__ */
221
222/* how to get the thread information struct from ASM */
223#define GET_THREAD_INFO(reg) \
224 movq %gs:pda_kernelstack,reg ; \
225 subq $(THREAD_SIZE-PDA_STACKOFFSET),reg
226
227#endif
228
229#endif /* !X86_32 */
230
231/*
232 * Thread-synchronous status.
233 *
234 * This is different from the flags in that nobody else
235 * ever touches our thread-synchronous status, so we don't
236 * have to worry about atomic accesses.
237 */
238#define TS_USEDFPU 0x0001 /* FPU was used by this task
239 this quantum (SMP) */
240#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/
241#define TS_POLLING 0x0004 /* true if in idle loop
242 and not sleeping */
243#define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */
244#define TS_XSAVE 0x0010 /* Use xsave/xrstor */
245
246#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING)
247
248#ifndef __ASSEMBLY__
249#define HAVE_SET_RESTORE_SIGMASK 1
250static inline void set_restore_sigmask(void)
251{
252 struct thread_info *ti = current_thread_info();
253 ti->status |= TS_RESTORE_SIGMASK;
254 set_bit(TIF_SIGPENDING, (unsigned long *)&ti->flags);
255}
256#endif /* !__ASSEMBLY__ */
257
258#ifndef __ASSEMBLY__
259extern void arch_task_cache_init(void);
260extern void free_thread_info(struct thread_info *ti);
261extern int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src);
262#define arch_task_cache_init arch_task_cache_init
263#endif
264#endif /* _ASM_X86_THREAD_INFO_H */
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
new file mode 100644
index 000000000000..50c733aac421
--- /dev/null
+++ b/arch/x86/include/asm/time.h
@@ -0,0 +1,63 @@
1#ifndef _ASM_X86_TIME_H
2#define _ASM_X86_TIME_H
3
4extern void hpet_time_init(void);
5
6#include <asm/mc146818rtc.h>
7#ifdef CONFIG_X86_32
8#include <linux/efi.h>
9
10static inline unsigned long native_get_wallclock(void)
11{
12 unsigned long retval;
13
14 if (efi_enabled)
15 retval = efi_get_time();
16 else
17 retval = mach_get_cmos_time();
18
19 return retval;
20}
21
22static inline int native_set_wallclock(unsigned long nowtime)
23{
24 int retval;
25
26 if (efi_enabled)
27 retval = efi_set_rtc_mmss(nowtime);
28 else
29 retval = mach_set_rtc_mmss(nowtime);
30
31 return retval;
32}
33
34#else
35extern void native_time_init_hook(void);
36
37static inline unsigned long native_get_wallclock(void)
38{
39 return mach_get_cmos_time();
40}
41
42static inline int native_set_wallclock(unsigned long nowtime)
43{
44 return mach_set_rtc_mmss(nowtime);
45}
46
47#endif
48
49extern void time_init(void);
50
51#ifdef CONFIG_PARAVIRT
52#include <asm/paravirt.h>
53#else /* !CONFIG_PARAVIRT */
54
55#define get_wallclock() native_get_wallclock()
56#define set_wallclock(x) native_set_wallclock(x)
57#define choose_time_init() hpet_time_init
58
59#endif /* CONFIG_PARAVIRT */
60
61extern unsigned long __init calibrate_cpu(void);
62
63#endif /* _ASM_X86_TIME_H */
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
new file mode 100644
index 000000000000..2bb6a835c453
--- /dev/null
+++ b/arch/x86/include/asm/timer.h
@@ -0,0 +1,66 @@
1#ifndef _ASM_X86_TIMER_H
2#define _ASM_X86_TIMER_H
3#include <linux/init.h>
4#include <linux/pm.h>
5#include <linux/percpu.h>
6
7#define TICK_SIZE (tick_nsec / 1000)
8
9unsigned long long native_sched_clock(void);
10unsigned long native_calibrate_tsc(void);
11
12#ifdef CONFIG_X86_32
13extern int timer_ack;
14extern int recalibrate_cpu_khz(void);
15#endif /* CONFIG_X86_32 */
16
17extern int no_timer_check;
18
19#ifndef CONFIG_PARAVIRT
20#define calibrate_tsc() native_calibrate_tsc()
21#endif
22
23/* Accelerators for sched_clock()
24 * convert from cycles(64bits) => nanoseconds (64bits)
25 * basic equation:
26 * ns = cycles / (freq / ns_per_sec)
27 * ns = cycles * (ns_per_sec / freq)
28 * ns = cycles * (10^9 / (cpu_khz * 10^3))
29 * ns = cycles * (10^6 / cpu_khz)
30 *
31 * Then we use scaling math (suggested by george@mvista.com) to get:
32 * ns = cycles * (10^6 * SC / cpu_khz) / SC
33 * ns = cycles * cyc2ns_scale / SC
34 *
35 * And since SC is a constant power of two, we can convert the div
36 * into a shift.
37 *
38 * We can use khz divisor instead of mhz to keep a better precision, since
39 * cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
40 * (mathieu.desnoyers@polymtl.ca)
41 *
42 * -johnstul@us.ibm.com "math is hard, lets go shopping!"
43 */
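A worked instance of the scaling above (illustrative numbers only):

/*
 * Example: cpu_khz = 2,000,000 (a 2 GHz CPU) and SC = 2^10 give
 *
 *	cyc2ns_scale = 10^6 * 1024 / 2,000,000 = 512
 *	ns           = cycles * 512 >> 10      = cycles / 2
 *
 * i.e. 0.5 ns per cycle, as expected for 2 GHz.
 */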
44
45DECLARE_PER_CPU(unsigned long, cyc2ns);
46
47#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
48
49static inline unsigned long long __cycles_2_ns(unsigned long long cyc)
50{
51 return cyc * per_cpu(cyc2ns, smp_processor_id()) >> CYC2NS_SCALE_FACTOR;
52}
53
54static inline unsigned long long cycles_2_ns(unsigned long long cyc)
55{
56 unsigned long long ns;
57 unsigned long flags;
58
59 local_irq_save(flags);
60 ns = __cycles_2_ns(cyc);
61 local_irq_restore(flags);
62
63 return ns;
64}
65
66#endif /* _ASM_X86_TIMER_H */
diff --git a/arch/x86/include/asm/timex.h b/arch/x86/include/asm/timex.h
new file mode 100644
index 000000000000..1287dc1347d6
--- /dev/null
+++ b/arch/x86/include/asm/timex.h
@@ -0,0 +1,19 @@
1/* x86 architecture timex specifications */
2#ifndef _ASM_X86_TIMEX_H
3#define _ASM_X86_TIMEX_H
4
5#include <asm/processor.h>
6#include <asm/tsc.h>
7
8#ifdef CONFIG_X86_ELAN
9# define PIT_TICK_RATE 1189200 /* AMD Elan has different frequency! */
10#elif defined(CONFIG_X86_RDC321X)
11# define PIT_TICK_RATE 1041667 /* Underlying HZ for R8610 */
12#else
13# define PIT_TICK_RATE 1193182 /* Underlying HZ */
14#endif
15#define CLOCK_TICK_RATE PIT_TICK_RATE
16
17#define ARCH_HAS_READ_CURRENT_TIMER
18
19#endif /* _ASM_X86_TIMEX_H */
diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h
new file mode 100644
index 000000000000..829215fef9ee
--- /dev/null
+++ b/arch/x86/include/asm/tlb.h
@@ -0,0 +1,11 @@
1#ifndef _ASM_X86_TLB_H
2#define _ASM_X86_TLB_H
3
4#define tlb_start_vma(tlb, vma) do { } while (0)
5#define tlb_end_vma(tlb, vma) do { } while (0)
6#define __tlb_remove_tlb_entry(tlb, ptep, address) do { } while (0)
7#define tlb_flush(tlb) flush_tlb_mm((tlb)->mm)
8
9#include <asm-generic/tlb.h>
10
11#endif /* _ASM_X86_TLB_H */
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
new file mode 100644
index 000000000000..0e7bbb549116
--- /dev/null
+++ b/arch/x86/include/asm/tlbflush.h
@@ -0,0 +1,178 @@
1#ifndef _ASM_X86_TLBFLUSH_H
2#define _ASM_X86_TLBFLUSH_H
3
4#include <linux/mm.h>
5#include <linux/sched.h>
6
7#include <asm/processor.h>
8#include <asm/system.h>
9
10#ifdef CONFIG_PARAVIRT
11#include <asm/paravirt.h>
12#else
13#define __flush_tlb() __native_flush_tlb()
14#define __flush_tlb_global() __native_flush_tlb_global()
15#define __flush_tlb_single(addr) __native_flush_tlb_single(addr)
16#endif
17
18static inline void __native_flush_tlb(void)
19{
20 write_cr3(read_cr3());
21}
22
23static inline void __native_flush_tlb_global(void)
24{
25 unsigned long flags;
26 unsigned long cr4;
27
28 /*
29 * Read-modify-write to CR4 - protect it from preemption and
30 * from interrupts. (Use the raw variant because this code can
31 * be called from deep inside debugging code.)
32 */
33 raw_local_irq_save(flags);
34
35 cr4 = read_cr4();
36 /* clear PGE */
37 write_cr4(cr4 & ~X86_CR4_PGE);
38 /* write old PGE again and flush TLBs */
39 write_cr4(cr4);
40
41 raw_local_irq_restore(flags);
42}
43
44static inline void __native_flush_tlb_single(unsigned long addr)
45{
46 asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
47}
48
49static inline void __flush_tlb_all(void)
50{
51 if (cpu_has_pge)
52 __flush_tlb_global();
53 else
54 __flush_tlb();
55}
56
57static inline void __flush_tlb_one(unsigned long addr)
58{
59 if (cpu_has_invlpg)
60 __flush_tlb_single(addr);
61 else
62 __flush_tlb();
63}
64
65#ifdef CONFIG_X86_32
66# define TLB_FLUSH_ALL 0xffffffff
67#else
68# define TLB_FLUSH_ALL -1ULL
69#endif
70
71/*
72 * TLB flushing:
73 *
74 * - flush_tlb() flushes the current mm struct TLBs
75 * - flush_tlb_all() flushes all processes TLBs
76 * - flush_tlb_mm(mm) flushes the specified mm context TLB's
77 * - flush_tlb_page(vma, vmaddr) flushes one page
78 * - flush_tlb_range(vma, start, end) flushes a range of pages
79 * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages
80 * - flush_tlb_others(cpumask, mm, va) flushes TLBs on other cpus
81 *
82 * ..but the i386 has somewhat limited tlb flushing capabilities,
83 * and page-granular flushes are available only on i486 and up.
84 *
85 * x86-64 can only flush individual pages or full VMs. For a range flush
 86 * we always do the full VM. It might be worth checking whether, for a
 87 * small range, a few INVLPGs in a row are a win.
88 */
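A small usage sketch of the interface described above (hypothetical helper; the PTE update itself is elided):

/*
 * Illustrative only: after changing a single PTE in a user mapping,
 * flush just that page rather than the whole address space.
 */
static inline void example_update_pte(struct vm_area_struct *vma,
				      unsigned long addr)
{
	/* ... set_pte_at(vma->vm_mm, addr, ptep, new_pte) ... */
	flush_tlb_page(vma, addr);
}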
89
90#ifndef CONFIG_SMP
91
92#define flush_tlb() __flush_tlb()
93#define flush_tlb_all() __flush_tlb_all()
94#define local_flush_tlb() __flush_tlb()
95
96static inline void flush_tlb_mm(struct mm_struct *mm)
97{
98 if (mm == current->active_mm)
99 __flush_tlb();
100}
101
102static inline void flush_tlb_page(struct vm_area_struct *vma,
103 unsigned long addr)
104{
105 if (vma->vm_mm == current->active_mm)
106 __flush_tlb_one(addr);
107}
108
109static inline void flush_tlb_range(struct vm_area_struct *vma,
110 unsigned long start, unsigned long end)
111{
112 if (vma->vm_mm == current->active_mm)
113 __flush_tlb();
114}
115
116static inline void native_flush_tlb_others(const cpumask_t *cpumask,
117 struct mm_struct *mm,
118 unsigned long va)
119{
120}
121
122static inline void reset_lazy_tlbstate(void)
123{
124}
125
126#else /* SMP */
127
128#include <asm/smp.h>
129
130#define local_flush_tlb() __flush_tlb()
131
132extern void flush_tlb_all(void);
133extern void flush_tlb_current_task(void);
134extern void flush_tlb_mm(struct mm_struct *);
135extern void flush_tlb_page(struct vm_area_struct *, unsigned long);
136
137#define flush_tlb() flush_tlb_current_task()
138
139static inline void flush_tlb_range(struct vm_area_struct *vma,
140 unsigned long start, unsigned long end)
141{
142 flush_tlb_mm(vma->vm_mm);
143}
144
145void native_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm,
146 unsigned long va);
147
148#define TLBSTATE_OK 1
149#define TLBSTATE_LAZY 2
150
151#ifdef CONFIG_X86_32
152struct tlb_state {
153 struct mm_struct *active_mm;
154 int state;
155 char __cacheline_padding[L1_CACHE_BYTES-8];
156};
157DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate);
158
159void reset_lazy_tlbstate(void);
160#else
161static inline void reset_lazy_tlbstate(void)
162{
163}
164#endif
165
166#endif /* SMP */
167
168#ifndef CONFIG_PARAVIRT
169#define flush_tlb_others(mask, mm, va) native_flush_tlb_others(&mask, mm, va)
170#endif
171
172static inline void flush_tlb_kernel_range(unsigned long start,
173 unsigned long end)
174{
175 flush_tlb_all();
176}
177
178#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
new file mode 100644
index 000000000000..90ac7718469a
--- /dev/null
+++ b/arch/x86/include/asm/topology.h
@@ -0,0 +1,258 @@
1/*
2 * Written by: Matthew Dobson, IBM Corporation
3 *
4 * Copyright (C) 2002, IBM Corp.
5 *
6 * All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
16 * NON INFRINGEMENT. See the GNU General Public License for more
17 * details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with this program; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * Send feedback to <colpatch@us.ibm.com>
24 */
25#ifndef _ASM_X86_TOPOLOGY_H
26#define _ASM_X86_TOPOLOGY_H
27
28#ifdef CONFIG_X86_32
29# ifdef CONFIG_X86_HT
30# define ENABLE_TOPO_DEFINES
31# endif
32#else
33# ifdef CONFIG_SMP
34# define ENABLE_TOPO_DEFINES
35# endif
36#endif
37
38/* Node not present */
39#define NUMA_NO_NODE (-1)
40
41#ifdef CONFIG_NUMA
42#include <linux/cpumask.h>
43#include <asm/mpspec.h>
44
45#ifdef CONFIG_X86_32
46
47/* Mappings between node number and cpus on that node. */
48extern cpumask_t node_to_cpumask_map[];
49
50/* Mappings between logical cpu number and node number */
51extern int cpu_to_node_map[];
52
53/* Returns the number of the node containing CPU 'cpu' */
54static inline int cpu_to_node(int cpu)
55{
56 return cpu_to_node_map[cpu];
57}
58#define early_cpu_to_node(cpu) cpu_to_node(cpu)
59
60/* Returns a bitmask of CPUs on Node 'node'.
61 *
62 * Side note: this function creates the returned cpumask on the stack
63 * so with a high NR_CPUS count, excessive stack space is used. The
64 * node_to_cpumask_ptr function should be used whenever possible.
65 */
66static inline cpumask_t node_to_cpumask(int node)
67{
68 return node_to_cpumask_map[node];
69}
70
71#else /* CONFIG_X86_64 */
72
73/* Mappings between node number and cpus on that node. */
74extern cpumask_t *node_to_cpumask_map;
75
76/* Mappings between logical cpu number and node number */
77DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
78
79/* Returns the number of the current Node. */
80#define numa_node_id() read_pda(nodenumber)
81
82#ifdef CONFIG_DEBUG_PER_CPU_MAPS
83extern int cpu_to_node(int cpu);
84extern int early_cpu_to_node(int cpu);
85extern const cpumask_t *_node_to_cpumask_ptr(int node);
86extern cpumask_t node_to_cpumask(int node);
87
88#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
89
90/* Returns the number of the node containing CPU 'cpu' */
91static inline int cpu_to_node(int cpu)
92{
93 return per_cpu(x86_cpu_to_node_map, cpu);
94}
95
96/* Same function but used if called before per_cpu areas are setup */
97static inline int early_cpu_to_node(int cpu)
98{
99 if (early_per_cpu_ptr(x86_cpu_to_node_map))
100 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
101
102 return per_cpu(x86_cpu_to_node_map, cpu);
103}
104
105/* Returns a pointer to the cpumask of CPUs on Node 'node'. */
106static inline const cpumask_t *_node_to_cpumask_ptr(int node)
107{
108 return &node_to_cpumask_map[node];
109}
110
111/* Returns a bitmask of CPUs on Node 'node'. */
112static inline cpumask_t node_to_cpumask(int node)
113{
114 return node_to_cpumask_map[node];
115}
116
117#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
118
119/* Replace default node_to_cpumask_ptr with optimized version */
120#define node_to_cpumask_ptr(v, node) \
121 const cpumask_t *v = _node_to_cpumask_ptr(node)
122
123#define node_to_cpumask_ptr_next(v, node) \
124 v = _node_to_cpumask_ptr(node)
125
126#endif /* CONFIG_X86_64 */
127
128/*
129 * Returns the number of the node containing Node 'node'. This
130 * architecture is flat, so it is a pretty simple function!
131 */
132#define parent_node(node) (node)
133
134#define pcibus_to_node(bus) __pcibus_to_node(bus)
135#define pcibus_to_cpumask(bus) __pcibus_to_cpumask(bus)
136
137#ifdef CONFIG_X86_32
138extern unsigned long node_start_pfn[];
139extern unsigned long node_end_pfn[];
140extern unsigned long node_remap_size[];
141#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
142
143# define SD_CACHE_NICE_TRIES 1
144# define SD_IDLE_IDX 1
145# define SD_NEWIDLE_IDX 2
146# define SD_FORKEXEC_IDX 0
147
148#else
149
150# define SD_CACHE_NICE_TRIES 2
151# define SD_IDLE_IDX 2
152# define SD_NEWIDLE_IDX 2
153# define SD_FORKEXEC_IDX 1
154
155#endif
156
157/* sched_domains SD_NODE_INIT for NUMAQ machines */
158#define SD_NODE_INIT (struct sched_domain) { \
159 .min_interval = 8, \
160 .max_interval = 32, \
161 .busy_factor = 32, \
162 .imbalance_pct = 125, \
163 .cache_nice_tries = SD_CACHE_NICE_TRIES, \
164 .busy_idx = 3, \
165 .idle_idx = SD_IDLE_IDX, \
166 .newidle_idx = SD_NEWIDLE_IDX, \
167 .wake_idx = 1, \
168 .forkexec_idx = SD_FORKEXEC_IDX, \
169 .flags = SD_LOAD_BALANCE \
170 | SD_BALANCE_EXEC \
171 | SD_BALANCE_FORK \
172 | SD_SERIALIZE \
173 | SD_WAKE_BALANCE, \
174 .last_balance = jiffies, \
175 .balance_interval = 1, \
176}
177
178#ifdef CONFIG_X86_64_ACPI_NUMA
179extern int __node_distance(int, int);
180#define node_distance(a, b) __node_distance(a, b)
181#endif
182
183#else /* !CONFIG_NUMA */
184
185#define numa_node_id() 0
186#define cpu_to_node(cpu) 0
187#define early_cpu_to_node(cpu) 0
188
189static inline const cpumask_t *_node_to_cpumask_ptr(int node)
190{
191 return &cpu_online_map;
192}
193static inline cpumask_t node_to_cpumask(int node)
194{
195 return cpu_online_map;
196}
197static inline int node_to_first_cpu(int node)
198{
199 return first_cpu(cpu_online_map);
200}
201
202/* Replace default node_to_cpumask_ptr with optimized version */
203#define node_to_cpumask_ptr(v, node) \
204 const cpumask_t *v = _node_to_cpumask_ptr(node)
205
206#define node_to_cpumask_ptr_next(v, node) \
207 v = _node_to_cpumask_ptr(node)
208#endif
209
210#include <asm-generic/topology.h>
211
212#ifdef CONFIG_NUMA
213/* Returns the number of the first CPU on Node 'node'. */
214static inline int node_to_first_cpu(int node)
215{
216 node_to_cpumask_ptr(mask, node);
217 return first_cpu(*mask);
218}
219#endif
220
221extern cpumask_t cpu_coregroup_map(int cpu);
222
223#ifdef ENABLE_TOPO_DEFINES
224#define topology_physical_package_id(cpu) (cpu_data(cpu).phys_proc_id)
225#define topology_core_id(cpu) (cpu_data(cpu).cpu_core_id)
226#define topology_core_siblings(cpu) (per_cpu(cpu_core_map, cpu))
227#define topology_thread_siblings(cpu) (per_cpu(cpu_sibling_map, cpu))
228
229/* indicates that pointers to the topology cpumask_t maps are valid */
230#define arch_provides_topology_pointers yes
231#endif
232
233static inline void arch_fix_phys_package_id(int num, u32 slot)
234{
235}
236
237struct pci_bus;
238void set_pci_bus_resources_arch_default(struct pci_bus *b);
239
240#ifdef CONFIG_SMP
241#define mc_capable() (boot_cpu_data.x86_max_cores > 1)
242#define smt_capable() (smp_num_siblings > 1)
243#endif
244
245#ifdef CONFIG_NUMA
246extern int get_mp_bus_to_node(int busnum);
247extern void set_mp_bus_to_node(int busnum, int node);
248#else
249static inline int get_mp_bus_to_node(int busnum)
250{
251 return 0;
252}
253static inline void set_mp_bus_to_node(int busnum, int node)
254{
255}
256#endif
257
258#endif /* _ASM_X86_TOPOLOGY_H */
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
new file mode 100644
index 000000000000..fa0d79facdbc
--- /dev/null
+++ b/arch/x86/include/asm/trampoline.h
@@ -0,0 +1,21 @@
1#ifndef _ASM_X86_TRAMPOLINE_H
2#define _ASM_X86_TRAMPOLINE_H
3
4#ifndef __ASSEMBLY__
5
6/*
7 * Trampoline 80x86 program as an array.
8 */
9extern const unsigned char trampoline_data [];
10extern const unsigned char trampoline_end [];
11extern unsigned char *trampoline_base;
12
13extern unsigned long init_rsp;
14extern unsigned long initial_code;
15
16#define TRAMPOLINE_BASE 0x6000
17extern unsigned long setup_trampoline(void);
18
19#endif /* __ASSEMBLY__ */
20
21#endif /* _ASM_X86_TRAMPOLINE_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
new file mode 100644
index 000000000000..45dee286e45c
--- /dev/null
+++ b/arch/x86/include/asm/traps.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_TRAPS_H
2#define _ASM_X86_TRAPS_H
3
4#include <asm/debugreg.h>
5
6#ifdef CONFIG_X86_32
7#define dotraplinkage
8#else
9#define dotraplinkage asmlinkage
10#endif
11
12asmlinkage void divide_error(void);
13asmlinkage void debug(void);
14asmlinkage void nmi(void);
15asmlinkage void int3(void);
16asmlinkage void overflow(void);
17asmlinkage void bounds(void);
18asmlinkage void invalid_op(void);
19asmlinkage void device_not_available(void);
20#ifdef CONFIG_X86_64
21asmlinkage void double_fault(void);
22#endif
23asmlinkage void coprocessor_segment_overrun(void);
24asmlinkage void invalid_TSS(void);
25asmlinkage void segment_not_present(void);
26asmlinkage void stack_segment(void);
27asmlinkage void general_protection(void);
28asmlinkage void page_fault(void);
29asmlinkage void spurious_interrupt_bug(void);
30asmlinkage void coprocessor_error(void);
31asmlinkage void alignment_check(void);
32#ifdef CONFIG_X86_MCE
33asmlinkage void machine_check(void);
34#endif /* CONFIG_X86_MCE */
35asmlinkage void simd_coprocessor_error(void);
36
37dotraplinkage void do_divide_error(struct pt_regs *, long);
38dotraplinkage void do_debug(struct pt_regs *, long);
39dotraplinkage void do_nmi(struct pt_regs *, long);
40dotraplinkage void do_int3(struct pt_regs *, long);
41dotraplinkage void do_overflow(struct pt_regs *, long);
42dotraplinkage void do_bounds(struct pt_regs *, long);
43dotraplinkage void do_invalid_op(struct pt_regs *, long);
44dotraplinkage void do_device_not_available(struct pt_regs *, long);
45dotraplinkage void do_coprocessor_segment_overrun(struct pt_regs *, long);
46dotraplinkage void do_invalid_TSS(struct pt_regs *, long);
47dotraplinkage void do_segment_not_present(struct pt_regs *, long);
48dotraplinkage void do_stack_segment(struct pt_regs *, long);
49dotraplinkage void do_general_protection(struct pt_regs *, long);
50dotraplinkage void do_page_fault(struct pt_regs *, unsigned long);
51dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long);
52dotraplinkage void do_coprocessor_error(struct pt_regs *, long);
53dotraplinkage void do_alignment_check(struct pt_regs *, long);
54#ifdef CONFIG_X86_MCE
55dotraplinkage void do_machine_check(struct pt_regs *, long);
56#endif
57dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long);
58#ifdef CONFIG_X86_32
59dotraplinkage void do_iret_error(struct pt_regs *, long);
60#endif
61
62static inline int get_si_code(unsigned long condition)
63{
64 if (condition & DR_STEP)
65 return TRAP_TRACE;
66 else if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3))
67 return TRAP_HWBKPT;
68 else
69 return TRAP_BRKPT;
70}
71
72extern int panic_on_unrecovered_nmi;
73extern int kstack_depth_to_print;
74
75#ifdef CONFIG_X86_32
76void math_error(void __user *);
77unsigned long patch_espfix_desc(unsigned long, unsigned long);
78asmlinkage void math_emulate(long);
79#endif
80
81#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
new file mode 100644
index 000000000000..38ae163cc91b
--- /dev/null
+++ b/arch/x86/include/asm/tsc.h
@@ -0,0 +1,62 @@
1/*
2 * x86 TSC related functions
3 */
4#ifndef _ASM_X86_TSC_H
5#define _ASM_X86_TSC_H
6
7#include <asm/processor.h>
8
9#define NS_SCALE 10 /* 2^10, carefully chosen */
 10#define US_SCALE 32 /* 2^32, arbitrarily chosen */
11
12/*
13 * Standard way to access the cycle counter.
14 */
15typedef unsigned long long cycles_t;
16
17extern unsigned int cpu_khz;
18extern unsigned int tsc_khz;
19
20extern void disable_TSC(void);
21
22static inline cycles_t get_cycles(void)
23{
24 unsigned long long ret = 0;
25
26#ifndef CONFIG_X86_TSC
27 if (!cpu_has_tsc)
28 return 0;
29#endif
30 rdtscll(ret);
31
32 return ret;
33}
34
35static __always_inline cycles_t vget_cycles(void)
36{
37 /*
 38 * We only do VDSOs on TSC-capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe):
40 */
41#ifndef CONFIG_X86_TSC
42 if (!cpu_has_tsc)
43 return 0;
44#endif
45 return (cycles_t)__native_read_tsc();
46}
47
48extern void tsc_init(void);
49extern void mark_tsc_unstable(char *reason);
50extern int unsynchronized_tsc(void);
51int check_tsc_unstable(void);
52
53/*
54 * Boot-time check whether the TSCs are synchronized across
55 * all CPUs/cores:
56 */
57extern void check_tsc_sync_source(int cpu);
58extern void check_tsc_sync_target(void);
59
60extern int notsc_setup(char *);
61
62#endif /* _ASM_X86_TSC_H */
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
new file mode 100644
index 000000000000..e6f736320077
--- /dev/null
+++ b/arch/x86/include/asm/types.h
@@ -0,0 +1,36 @@
1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H
3
4#include <asm-generic/int-ll64.h>
5
6#ifndef __ASSEMBLY__
7
8typedef unsigned short umode_t;
9
10#endif /* __ASSEMBLY__ */
11
12/*
13 * These aren't exported outside the kernel to avoid name space clashes
14 */
15#ifdef __KERNEL__
16
17#ifdef CONFIG_X86_32
18# define BITS_PER_LONG 32
19#else
20# define BITS_PER_LONG 64
21#endif
22
23#ifndef __ASSEMBLY__
24
25typedef u64 dma64_addr_t;
26#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
27/* DMA addresses come in 32-bit and 64-bit flavours. */
28typedef u64 dma_addr_t;
29#else
30typedef u32 dma_addr_t;
31#endif
32
33#endif /* __ASSEMBLY__ */
34#endif /* __KERNEL__ */
35
36#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
new file mode 100644
index 000000000000..35c54921b2e4
--- /dev/null
+++ b/arch/x86/include/asm/uaccess.h
@@ -0,0 +1,454 @@
1#ifndef _ASM_X86_UACCESS_H
2#define _ASM_X86_UACCESS_H
3/*
4 * User space memory access functions
5 */
6#include <linux/errno.h>
7#include <linux/compiler.h>
8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h>
11#include <asm/asm.h>
12#include <asm/page.h>
13
14#define VERIFY_READ 0
15#define VERIFY_WRITE 1
16
17/*
18 * The fs value determines whether argument validity checking should be
19 * performed or not. If get_fs() == USER_DS, checking is performed, with
20 * get_fs() == KERNEL_DS, checking is bypassed.
21 *
22 * For historical reasons, these macros are grossly misnamed.
23 */
24
25#define MAKE_MM_SEG(s) ((mm_segment_t) { (s) })
26
27#define KERNEL_DS MAKE_MM_SEG(-1UL)
28#define USER_DS MAKE_MM_SEG(PAGE_OFFSET)
29
30#define get_ds() (KERNEL_DS)
31#define get_fs() (current_thread_info()->addr_limit)
32#define set_fs(x) (current_thread_info()->addr_limit = (x))
33
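A sketch of the classic get_fs()/set_fs() pattern (hypothetical helper; assumes <linux/fs.h> for struct file and vfs_read()):

/*
 * Illustrative only: temporarily widen the address limit so a
 * user-pointer API can be used on a kernel buffer.  The old limit
 * must always be restored.
 */
static inline ssize_t example_kernel_read(struct file *file, void *buf,
					  size_t count, loff_t *pos)
{
	mm_segment_t old_fs = get_fs();
	ssize_t ret;

	set_fs(KERNEL_DS);	/* access_ok() now accepts kernel addresses */
	ret = vfs_read(file, (char __user *)buf, count, pos);
	set_fs(old_fs);		/* restore the original limit */

	return ret;
}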
34#define segment_eq(a, b) ((a).seg == (b).seg)
35
36#define __addr_ok(addr) \
37 ((unsigned long __force)(addr) < \
38 (current_thread_info()->addr_limit.seg))
39
40/*
41 * Test whether a block of memory is a valid user space address.
42 * Returns 0 if the range is valid, nonzero otherwise.
43 *
44 * This is equivalent to the following test:
45 * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64)
46 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */
49
50#define __range_not_ok(addr, size) \
51({ \
52 unsigned long flag, roksum; \
53 __chk_user_ptr(addr); \
54 asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
55 : "=&r" (flag), "=r" (roksum) \
56 : "1" (addr), "g" ((long)(size)), \
57 "rm" (current_thread_info()->addr_limit.seg)); \
58 flag; \
59})
60
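Roughly what the asm above computes, modelled with wider arithmetic (an editorial gloss, not a drop-in replacement):

/*
 *	u64 sum = (u64)addr + (u64)size;	   keeps the carry bit
 *	flag    = (sum > limit) ? nonzero : 0;
 *
 * "add" produces the sum and the carry, the first "sbb %0,%0" folds
 * the carry into flag, and the trailing "cmp ; sbb $0,%0" folds in
 * whether the sum exceeds the current address limit.  flag == 0 means
 * the whole range lies within the limit.
 */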
61/**
62 * access_ok: - Checks if a user space pointer is valid
63 * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
64 * %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
65 * to write to a block, it is always safe to read from it.
66 * @addr: User space pointer to start of block to check
67 * @size: Size of block to check
68 *
69 * Context: User context only. This function may sleep.
70 *
71 * Checks if a pointer to a block of memory in user space is valid.
72 *
73 * Returns true (nonzero) if the memory block may be valid, false (zero)
74 * if it is definitely invalid.
75 *
76 * Note that, depending on architecture, this function probably just
77 * checks that the pointer is in the user space range - after calling
78 * this function, memory access functions may still return -EFAULT.
79 */
80#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
81
82/*
83 * The exception table consists of pairs of addresses: the first is the
84 * address of an instruction that is allowed to fault, and the second is
85 * the address at which the program should continue. No registers are
86 * modified, so it is entirely up to the continuation code to figure out
87 * what to do.
88 *
89 * All the routines below use bits of fixup code that are out of line
90 * with the main instruction path. This means when everything is well,
91 * we don't even have to jump over them. Further, they do not intrude
92 * on our cache or tlb entries.
93 */
94
95struct exception_table_entry {
96 unsigned long insn, fixup;
97};
98
99extern int fixup_exception(struct pt_regs *regs);
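An illustrative sketch of the lookup fixup_exception() performs; search_exception_tables() is the generic kernel helper, and this body is only an approximation of the real routine:

static inline int example_fixup_exception(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

	fixup = search_exception_tables(regs->ip);	/* match the faulting insn */
	if (!fixup)
		return 0;				/* a genuine fault */

	regs->ip = fixup->fixup;			/* resume at the fixup stub */
	return 1;
}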
100
101/*
102 * These are the main single-value transfer routines. They automatically
103 * use the right size if we just have the right pointer type.
104 *
105 * This gets kind of ugly. We want to return _two_ values in "get_user()"
106 * and yet we don't want to do any pointers, because that is too much
107 * of a performance impact. Thus we have a few rather ugly macros here,
108 * and hide all the ugliness from the user.
109 *
110 * The "__xxx" versions of the user access functions are versions that
111 * do not verify the address space, that must have been done previously
112 * with a separate "access_ok()" call (this is used when we do multiple
113 * accesses to the same area of user memory).
114 */
115
116extern int __get_user_1(void);
117extern int __get_user_2(void);
118extern int __get_user_4(void);
119extern int __get_user_8(void);
120extern int __get_user_bad(void);
121
122#define __get_user_x(size, ret, x, ptr) \
123 asm volatile("call __get_user_" #size \
124 : "=a" (ret),"=d" (x) \
125 : "0" (ptr)) \
126
127/* Careful: we have to cast the result to the type of the pointer
128 * for sign reasons */
129
130/**
131 * get_user: - Get a simple variable from user space.
132 * @x: Variable to store result.
133 * @ptr: Source address, in user space.
134 *
135 * Context: User context only. This function may sleep.
136 *
137 * This macro copies a single simple variable from user space to kernel
138 * space. It supports simple types like char and int, but not larger
139 * data types like structures or arrays.
140 *
141 * @ptr must have pointer-to-simple-variable type, and the result of
142 * dereferencing @ptr must be assignable to @x without a cast.
143 *
144 * Returns zero on success, or -EFAULT on error.
145 * On error, the variable @x is set to zero.
146 */
147#ifdef CONFIG_X86_32
148#define __get_user_8(__ret_gu, __val_gu, ptr) \
149 __get_user_x(X, __ret_gu, __val_gu, ptr)
150#else
151#define __get_user_8(__ret_gu, __val_gu, ptr) \
152 __get_user_x(8, __ret_gu, __val_gu, ptr)
153#endif
154
155#define get_user(x, ptr) \
156({ \
157 int __ret_gu; \
158 unsigned long __val_gu; \
159 __chk_user_ptr(ptr); \
160 switch (sizeof(*(ptr))) { \
161 case 1: \
162 __get_user_x(1, __ret_gu, __val_gu, ptr); \
163 break; \
164 case 2: \
165 __get_user_x(2, __ret_gu, __val_gu, ptr); \
166 break; \
167 case 4: \
168 __get_user_x(4, __ret_gu, __val_gu, ptr); \
169 break; \
170 case 8: \
171 __get_user_8(__ret_gu, __val_gu, ptr); \
172 break; \
173 default: \
174 __get_user_x(X, __ret_gu, __val_gu, ptr); \
175 break; \
176 } \
177 (x) = (__typeof__(*(ptr)))__val_gu; \
178 __ret_gu; \
179})
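Typical use of the macro above (hypothetical helper, e.g. from an ioctl handler):

static inline int example_read_flag(int __user *uptr, int *flag)
{
	if (get_user(*flag, uptr))	/* -EFAULT on a bad user pointer */
		return -EFAULT;
	return 0;
}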
180
181#define __put_user_x(size, x, ptr, __ret_pu) \
182 asm volatile("call __put_user_" #size : "=a" (__ret_pu) \
183 :"0" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
184
185
186
187#ifdef CONFIG_X86_32
188#define __put_user_u64(x, addr, err) \
189 asm volatile("1: movl %%eax,0(%2)\n" \
190 "2: movl %%edx,4(%2)\n" \
191 "3:\n" \
192 ".section .fixup,\"ax\"\n" \
193 "4: movl %3,%0\n" \
194 " jmp 3b\n" \
195 ".previous\n" \
196 _ASM_EXTABLE(1b, 4b) \
197 _ASM_EXTABLE(2b, 4b) \
198 : "=r" (err) \
199 : "A" (x), "r" (addr), "i" (-EFAULT), "0" (err))
200
201#define __put_user_x8(x, ptr, __ret_pu) \
202 asm volatile("call __put_user_8" : "=a" (__ret_pu) \
203 : "A" ((typeof(*(ptr)))(x)), "c" (ptr) : "ebx")
204#else
205#define __put_user_u64(x, ptr, retval) \
206 __put_user_asm(x, ptr, retval, "q", "", "Zr", -EFAULT)
207#define __put_user_x8(x, ptr, __ret_pu) __put_user_x(8, x, ptr, __ret_pu)
208#endif
209
210extern void __put_user_bad(void);
211
212/*
213 * Strange magic calling convention: pointer in %ecx,
214 * value in %eax(:%edx), return value in %eax. clobbers %rbx
215 */
216extern void __put_user_1(void);
217extern void __put_user_2(void);
218extern void __put_user_4(void);
219extern void __put_user_8(void);
220
221#ifdef CONFIG_X86_WP_WORKS_OK
222
223/**
224 * put_user: - Write a simple value into user space.
225 * @x: Value to copy to user space.
226 * @ptr: Destination address, in user space.
227 *
228 * Context: User context only. This function may sleep.
229 *
230 * This macro copies a single simple value from kernel space to user
231 * space. It supports simple types like char and int, but not larger
232 * data types like structures or arrays.
233 *
234 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
235 * to the result of dereferencing @ptr.
236 *
237 * Returns zero on success, or -EFAULT on error.
238 */
239#define put_user(x, ptr) \
240({ \
241 int __ret_pu; \
242 __typeof__(*(ptr)) __pu_val; \
243 __chk_user_ptr(ptr); \
244 __pu_val = x; \
245 switch (sizeof(*(ptr))) { \
246 case 1: \
247 __put_user_x(1, __pu_val, ptr, __ret_pu); \
248 break; \
249 case 2: \
250 __put_user_x(2, __pu_val, ptr, __ret_pu); \
251 break; \
252 case 4: \
253 __put_user_x(4, __pu_val, ptr, __ret_pu); \
254 break; \
255 case 8: \
256 __put_user_x8(__pu_val, ptr, __ret_pu); \
257 break; \
258 default: \
259 __put_user_x(X, __pu_val, ptr, __ret_pu); \
260 break; \
261 } \
262 __ret_pu; \
263})
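
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * Typical put_user() usage (hypothetical names); as with get_user(), the
 * access size comes from the pointer type and the result is 0 or -EFAULT,
 * so it can be returned directly:
 *
 *	long example_report_count(unsigned long count,
 *				  unsigned long __user *uptr)
 *	{
 *		return put_user(count, uptr);
 *	}
 */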
264
265#define __put_user_size(x, ptr, size, retval, errret) \
266do { \
267 retval = 0; \
268 __chk_user_ptr(ptr); \
269 switch (size) { \
270 case 1: \
271 __put_user_asm(x, ptr, retval, "b", "b", "iq", errret); \
272 break; \
273 case 2: \
274 __put_user_asm(x, ptr, retval, "w", "w", "ir", errret); \
275 break; \
276 case 4: \
277 __put_user_asm(x, ptr, retval, "l", "k", "ir", errret);\
278 break; \
279 case 8: \
280 __put_user_u64((__typeof__(*ptr))(x), ptr, retval); \
281 break; \
282 default: \
283 __put_user_bad(); \
284 } \
285} while (0)
286
287#else
288
289#define __put_user_size(x, ptr, size, retval, errret) \
290do { \
291 __typeof__(*(ptr))__pus_tmp = x; \
292 retval = 0; \
293 \
294 if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, size) != 0)) \
295 retval = errret; \
296} while (0)
297
298#define put_user(x, ptr) \
299({ \
300 int __ret_pu; \
301 __typeof__(*(ptr))__pus_tmp = x; \
302 __ret_pu = 0; \
303 if (unlikely(__copy_to_user_ll(ptr, &__pus_tmp, \
304 sizeof(*(ptr))) != 0)) \
305 __ret_pu = -EFAULT; \
306 __ret_pu; \
307})
308#endif
309
310#ifdef CONFIG_X86_32
311#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad()
312#else
313#define __get_user_asm_u64(x, ptr, retval, errret) \
314 __get_user_asm(x, ptr, retval, "q", "", "=r", errret)
315#endif
316
317#define __get_user_size(x, ptr, size, retval, errret) \
318do { \
319 retval = 0; \
320 __chk_user_ptr(ptr); \
321 switch (size) { \
322 case 1: \
323 __get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
324 break; \
325 case 2: \
326 __get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
327 break; \
328 case 4: \
329 __get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
330 break; \
331 case 8: \
332 __get_user_asm_u64(x, ptr, retval, errret); \
333 break; \
334 default: \
335 (x) = __get_user_bad(); \
336 } \
337} while (0)
338
339#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
340 asm volatile("1: mov"itype" %2,%"rtype"1\n" \
341 "2:\n" \
342 ".section .fixup,\"ax\"\n" \
343 "3: mov %3,%0\n" \
344 " xor"itype" %"rtype"1,%"rtype"1\n" \
345 " jmp 2b\n" \
346 ".previous\n" \
347 _ASM_EXTABLE(1b, 3b) \
348 : "=r" (err), ltype(x) \
349 : "m" (__m(addr)), "i" (errret), "0" (err))
350
351#define __put_user_nocheck(x, ptr, size) \
352({ \
353 long __pu_err; \
354 __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \
355 __pu_err; \
356})
357
358#define __get_user_nocheck(x, ptr, size) \
359({ \
360 long __gu_err; \
361 unsigned long __gu_val; \
362 __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \
363 (x) = (__force __typeof__(*(ptr)))__gu_val; \
364 __gu_err; \
365})
366
367/* FIXME: this hack is definitely wrong -AK */
368struct __large_struct { unsigned long buf[100]; };
369#define __m(x) (*(struct __large_struct __user *)(x))
370
371/*
372 * Tell gcc we read from memory instead of writing: this is because
373 * we do not write to any memory gcc knows about, so there are no
374 * aliasing issues.
375 */
376#define __put_user_asm(x, addr, err, itype, rtype, ltype, errret) \
377 asm volatile("1: mov"itype" %"rtype"1,%2\n" \
378 "2:\n" \
379 ".section .fixup,\"ax\"\n" \
380 "3: mov %3,%0\n" \
381 " jmp 2b\n" \
382 ".previous\n" \
383 _ASM_EXTABLE(1b, 3b) \
384 : "=r"(err) \
385 : ltype(x), "m" (__m(addr)), "i" (errret), "0" (err))
386/**
387 * __get_user: - Get a simple variable from user space, with less checking.
388 * @x: Variable to store result.
389 * @ptr: Source address, in user space.
390 *
391 * Context: User context only. This function may sleep.
392 *
393 * This macro copies a single simple variable from user space to kernel
394 * space. It supports simple types like char and int, but not larger
395 * data types like structures or arrays.
396 *
397 * @ptr must have pointer-to-simple-variable type, and the result of
398 * dereferencing @ptr must be assignable to @x without a cast.
399 *
400 * Caller must check the pointer with access_ok() before calling this
401 * function.
402 *
403 * Returns zero on success, or -EFAULT on error.
404 * On error, the variable @x is set to zero.
405 */
406
407#define __get_user(x, ptr) \
408 __get_user_nocheck((x), (ptr), sizeof(*(ptr)))
409/**
410 * __put_user: - Write a simple value into user space, with less checking.
411 * @x: Value to copy to user space.
412 * @ptr: Destination address, in user space.
413 *
414 * Context: User context only. This function may sleep.
415 *
416 * This macro copies a single simple value from kernel space to user
417 * space. It supports simple types like char and int, but not larger
418 * data types like structures or arrays.
419 *
420 * @ptr must have pointer-to-simple-variable type, and @x must be assignable
421 * to the result of dereferencing @ptr.
422 *
423 * Caller must check the pointer with access_ok() before calling this
424 * function.
425 *
426 * Returns zero on success, or -EFAULT on error.
427 */
428
429#define __put_user(x, ptr) \
430 __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)))
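
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * The "__" variants skip the address-space check, so a caller making several
 * accesses to the same user area checks once up front with access_ok()
 * (hypothetical names, era-appropriate VERIFY_READ form assumed):
 *
 *	int example_read_pair(int __user *uptr, int *a, int *b)
 *	{
 *		if (!access_ok(VERIFY_READ, uptr, 2 * sizeof(int)))
 *			return -EFAULT;
 *		if (__get_user(*a, uptr) || __get_user(*b, uptr + 1))
 *			return -EFAULT;
 *		return 0;
 *	}
 */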
431
432#define __get_user_unaligned __get_user
433#define __put_user_unaligned __put_user
434
435/*
436 * movsl can be slow when source and dest are not both 8-byte aligned
437 */
438#ifdef CONFIG_X86_INTEL_USERCOPY
439extern struct movsl_mask {
440 int mask;
441} ____cacheline_aligned_in_smp movsl_mask;
442#endif
443
444#define ARCH_HAS_NOCACHE_UACCESS 1
445
446#ifdef CONFIG_X86_32
447# include "uaccess_32.h"
448#else
449# define ARCH_HAS_SEARCH_EXTABLE
450# include "uaccess_64.h"
451#endif
452
453#endif /* _ASM_X86_UACCESS_H */
454
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
new file mode 100644
index 000000000000..d095a3aeea1b
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -0,0 +1,218 @@
1#ifndef _ASM_X86_UACCESS_32_H
2#define _ASM_X86_UACCESS_32_H
3
4/*
5 * User space memory access functions
6 */
7#include <linux/errno.h>
8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h>
11#include <asm/asm.h>
12#include <asm/page.h>
13
14unsigned long __must_check __copy_to_user_ll
15 (void __user *to, const void *from, unsigned long n);
16unsigned long __must_check __copy_from_user_ll
17 (void *to, const void __user *from, unsigned long n);
18unsigned long __must_check __copy_from_user_ll_nozero
19 (void *to, const void __user *from, unsigned long n);
20unsigned long __must_check __copy_from_user_ll_nocache
21 (void *to, const void __user *from, unsigned long n);
22unsigned long __must_check __copy_from_user_ll_nocache_nozero
23 (void *to, const void __user *from, unsigned long n);
24
25/**
26 * __copy_to_user_inatomic: - Copy a block of data into user space, with less checking.
27 * @to: Destination address, in user space.
28 * @from: Source address, in kernel space.
29 * @n: Number of bytes to copy.
30 *
31 * Context: User context only.
32 *
33 * Copy data from kernel space to user space. Caller must check
34 * the specified block with access_ok() before calling this function.
 35 * The caller should also make sure the user space address is pinned
 36 * so that we don't take a page fault and sleep.
37 *
38 * Here we special-case 1, 2 and 4-byte copy_*_user invocations. On a fault
39 * we return the initial request size (1, 2 or 4), as copy_*_user should do.
40 * If a store crosses a page boundary and gets a fault, the x86 will not write
41 * anything, so this is accurate.
42 */
43
44static __always_inline unsigned long __must_check
45__copy_to_user_inatomic(void __user *to, const void *from, unsigned long n)
46{
47 if (__builtin_constant_p(n)) {
48 unsigned long ret;
49
50 switch (n) {
51 case 1:
52 __put_user_size(*(u8 *)from, (u8 __user *)to,
53 1, ret, 1);
54 return ret;
55 case 2:
56 __put_user_size(*(u16 *)from, (u16 __user *)to,
57 2, ret, 2);
58 return ret;
59 case 4:
60 __put_user_size(*(u32 *)from, (u32 __user *)to,
61 4, ret, 4);
62 return ret;
63 }
64 }
65 return __copy_to_user_ll(to, from, n);
66}
67
68/**
69 * __copy_to_user: - Copy a block of data into user space, with less checking.
70 * @to: Destination address, in user space.
71 * @from: Source address, in kernel space.
72 * @n: Number of bytes to copy.
73 *
74 * Context: User context only. This function may sleep.
75 *
76 * Copy data from kernel space to user space. Caller must check
77 * the specified block with access_ok() before calling this function.
78 *
79 * Returns number of bytes that could not be copied.
80 * On success, this will be zero.
81 */
82static __always_inline unsigned long __must_check
83__copy_to_user(void __user *to, const void *from, unsigned long n)
84{
85 might_sleep();
86 return __copy_to_user_inatomic(to, from, n);
87}
88
89static __always_inline unsigned long
90__copy_from_user_inatomic(void *to, const void __user *from, unsigned long n)
91{
 92 /* Avoid zeroing the tail if the copy fails.
93 * If 'n' is constant and 1, 2, or 4, we do still zero on a failure,
94 * but as the zeroing behaviour is only significant when n is not
95 * constant, that shouldn't be a problem.
96 */
97 if (__builtin_constant_p(n)) {
98 unsigned long ret;
99
100 switch (n) {
101 case 1:
102 __get_user_size(*(u8 *)to, from, 1, ret, 1);
103 return ret;
104 case 2:
105 __get_user_size(*(u16 *)to, from, 2, ret, 2);
106 return ret;
107 case 4:
108 __get_user_size(*(u32 *)to, from, 4, ret, 4);
109 return ret;
110 }
111 }
112 return __copy_from_user_ll_nozero(to, from, n);
113}
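
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * A typical atomic-context caller disables page faults around the
 * "inatomic" copy and falls back to a sleepable path if anything was left
 * uncopied (hypothetical names; pagefault_disable()/pagefault_enable()
 * come from <linux/uaccess.h>):
 *
 *	pagefault_disable();
 *	left = __copy_from_user_inatomic(kbuf, ubuf, len);
 *	pagefault_enable();
 *	if (left)
 *		... retry with __copy_from_user() where sleeping is allowed ...
 */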
114
115/**
116 * __copy_from_user: - Copy a block of data from user space, with less checking.
117 * @to: Destination address, in kernel space.
118 * @from: Source address, in user space.
119 * @n: Number of bytes to copy.
120 *
121 * Context: User context only. This function may sleep.
122 *
123 * Copy data from user space to kernel space. Caller must check
124 * the specified block with access_ok() before calling this function.
125 *
126 * Returns number of bytes that could not be copied.
127 * On success, this will be zero.
128 *
129 * If some data could not be copied, this function will pad the copied
130 * data to the requested size using zero bytes.
131 *
132 * An alternate version - __copy_from_user_inatomic() - may be called from
133 * atomic context and will fail rather than sleep. In this case the
134 * uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
135 * for explanation of why this is needed.
136 */
137static __always_inline unsigned long
138__copy_from_user(void *to, const void __user *from, unsigned long n)
139{
140 might_sleep();
141 if (__builtin_constant_p(n)) {
142 unsigned long ret;
143
144 switch (n) {
145 case 1:
146 __get_user_size(*(u8 *)to, from, 1, ret, 1);
147 return ret;
148 case 2:
149 __get_user_size(*(u16 *)to, from, 2, ret, 2);
150 return ret;
151 case 4:
152 __get_user_size(*(u32 *)to, from, 4, ret, 4);
153 return ret;
154 }
155 }
156 return __copy_from_user_ll(to, from, n);
157}
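
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * The return value is the number of bytes NOT copied, so callers normally
 * turn any non-zero result into -EFAULT (hypothetical names):
 *
 *	struct example_req req;
 *
 *	if (copy_from_user(&req, ubuf, sizeof(req)))
 *		return -EFAULT;
 */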
158
159static __always_inline unsigned long __copy_from_user_nocache(void *to,
160 const void __user *from, unsigned long n)
161{
162 might_sleep();
163 if (__builtin_constant_p(n)) {
164 unsigned long ret;
165
166 switch (n) {
167 case 1:
168 __get_user_size(*(u8 *)to, from, 1, ret, 1);
169 return ret;
170 case 2:
171 __get_user_size(*(u16 *)to, from, 2, ret, 2);
172 return ret;
173 case 4:
174 __get_user_size(*(u32 *)to, from, 4, ret, 4);
175 return ret;
176 }
177 }
178 return __copy_from_user_ll_nocache(to, from, n);
179}
180
181static __always_inline unsigned long
182__copy_from_user_inatomic_nocache(void *to, const void __user *from,
183 unsigned long n)
184{
185 return __copy_from_user_ll_nocache_nozero(to, from, n);
186}
187
188unsigned long __must_check copy_to_user(void __user *to,
189 const void *from, unsigned long n);
190unsigned long __must_check copy_from_user(void *to,
191 const void __user *from,
192 unsigned long n);
193long __must_check strncpy_from_user(char *dst, const char __user *src,
194 long count);
195long __must_check __strncpy_from_user(char *dst,
196 const char __user *src, long count);
197
198/**
199 * strlen_user: - Get the size of a string in user space.
200 * @str: The string to measure.
201 *
202 * Context: User context only. This function may sleep.
203 *
204 * Get the size of a NUL-terminated string in user space.
205 *
206 * Returns the size of the string INCLUDING the terminating NUL.
207 * On exception, returns 0.
208 *
209 * If there is a limit on the length of a valid string, you may wish to
210 * consider using strnlen_user() instead.
211 */
212#define strlen_user(str) strnlen_user(str, LONG_MAX)
213
214long strnlen_user(const char __user *str, long n);
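
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * Copying a NUL-terminated string from user space with the helpers above
 * (hypothetical names; strncpy_from_user() returns the copied length,
 * excluding the NUL, or -EFAULT on a fault, and does not NUL-terminate
 * when the source is longer than the buffer):
 *
 *	char name[32];
 *	long len;
 *
 *	len = strncpy_from_user(name, uname, sizeof(name));
 *	if (len < 0)
 *		return -EFAULT;
 *	name[sizeof(name) - 1] = '\0';
 */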
215unsigned long __must_check clear_user(void __user *mem, unsigned long len);
216unsigned long __must_check __clear_user(void __user *mem, unsigned long len);
217
218#endif /* _ASM_X86_UACCESS_32_H */
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
new file mode 100644
index 000000000000..664f15280f14
--- /dev/null
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -0,0 +1,202 @@
1#ifndef _ASM_X86_UACCESS_64_H
2#define _ASM_X86_UACCESS_64_H
3
4/*
5 * User space memory access functions
6 */
7#include <linux/compiler.h>
8#include <linux/errno.h>
9#include <linux/prefetch.h>
10#include <linux/lockdep.h>
11#include <asm/page.h>
12
13/*
14 * Copy To/From Userspace
15 */
16
17/* Handles exceptions in both to and from, but doesn't do access_ok */
18__must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len);
20
21__must_check unsigned long
22copy_to_user(void __user *to, const void *from, unsigned len);
23__must_check unsigned long
24copy_from_user(void *to, const void __user *from, unsigned len);
25__must_check unsigned long
26copy_in_user(void __user *to, const void __user *from, unsigned len);
27
28static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{
31 int ret = 0;
32 if (!__builtin_constant_p(size))
33 return copy_user_generic(dst, (__force void *)src, size);
34 switch (size) {
35 case 1:__get_user_asm(*(u8 *)dst, (u8 __user *)src,
36 ret, "b", "b", "=q", 1);
37 return ret;
38 case 2:__get_user_asm(*(u16 *)dst, (u16 __user *)src,
39 ret, "w", "w", "=r", 2);
40 return ret;
41 case 4:__get_user_asm(*(u32 *)dst, (u32 __user *)src,
42 ret, "l", "k", "=r", 4);
43 return ret;
44 case 8:__get_user_asm(*(u64 *)dst, (u64 __user *)src,
45 ret, "q", "", "=r", 8);
46 return ret;
47 case 10:
48 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
49 ret, "q", "", "=r", 16);
50 if (unlikely(ret))
51 return ret;
52 __get_user_asm(*(u16 *)(8 + (char *)dst),
53 (u16 __user *)(8 + (char __user *)src),
54 ret, "w", "w", "=r", 2);
55 return ret;
56 case 16:
57 __get_user_asm(*(u64 *)dst, (u64 __user *)src,
58 ret, "q", "", "=r", 16);
59 if (unlikely(ret))
60 return ret;
61 __get_user_asm(*(u64 *)(8 + (char *)dst),
62 (u64 __user *)(8 + (char __user *)src),
63 ret, "q", "", "=r", 8);
64 return ret;
65 default:
66 return copy_user_generic(dst, (__force void *)src, size);
67 }
68}
69
70static __always_inline __must_check
71int __copy_to_user(void __user *dst, const void *src, unsigned size)
72{
73 int ret = 0;
74 if (!__builtin_constant_p(size))
75 return copy_user_generic((__force void *)dst, src, size);
76 switch (size) {
77 case 1:__put_user_asm(*(u8 *)src, (u8 __user *)dst,
78 ret, "b", "b", "iq", 1);
79 return ret;
80 case 2:__put_user_asm(*(u16 *)src, (u16 __user *)dst,
81 ret, "w", "w", "ir", 2);
82 return ret;
83 case 4:__put_user_asm(*(u32 *)src, (u32 __user *)dst,
84 ret, "l", "k", "ir", 4);
85 return ret;
86 case 8:__put_user_asm(*(u64 *)src, (u64 __user *)dst,
87 ret, "q", "", "ir", 8);
88 return ret;
89 case 10:
90 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
91 ret, "q", "", "ir", 10);
92 if (unlikely(ret))
93 return ret;
94 asm("":::"memory");
95 __put_user_asm(4[(u16 *)src], 4 + (u16 __user *)dst,
96 ret, "w", "w", "ir", 2);
97 return ret;
98 case 16:
99 __put_user_asm(*(u64 *)src, (u64 __user *)dst,
100 ret, "q", "", "ir", 16);
101 if (unlikely(ret))
102 return ret;
103 asm("":::"memory");
104 __put_user_asm(1[(u64 *)src], 1 + (u64 __user *)dst,
105 ret, "q", "", "ir", 8);
106 return ret;
107 default:
108 return copy_user_generic((__force void *)dst, src, size);
109 }
110}
111
112static __always_inline __must_check
113int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
114{
115 int ret = 0;
116 if (!__builtin_constant_p(size))
117 return copy_user_generic((__force void *)dst,
118 (__force void *)src, size);
119 switch (size) {
120 case 1: {
121 u8 tmp;
122 __get_user_asm(tmp, (u8 __user *)src,
123 ret, "b", "b", "=q", 1);
124 if (likely(!ret))
125 __put_user_asm(tmp, (u8 __user *)dst,
126 ret, "b", "b", "iq", 1);
127 return ret;
128 }
129 case 2: {
130 u16 tmp;
131 __get_user_asm(tmp, (u16 __user *)src,
132 ret, "w", "w", "=r", 2);
133 if (likely(!ret))
134 __put_user_asm(tmp, (u16 __user *)dst,
135 ret, "w", "w", "ir", 2);
136 return ret;
137 }
138
139 case 4: {
140 u32 tmp;
141 __get_user_asm(tmp, (u32 __user *)src,
142 ret, "l", "k", "=r", 4);
143 if (likely(!ret))
144 __put_user_asm(tmp, (u32 __user *)dst,
145 ret, "l", "k", "ir", 4);
146 return ret;
147 }
148 case 8: {
149 u64 tmp;
150 __get_user_asm(tmp, (u64 __user *)src,
151 ret, "q", "", "=r", 8);
152 if (likely(!ret))
153 __put_user_asm(tmp, (u64 __user *)dst,
154 ret, "q", "", "ir", 8);
155 return ret;
156 }
157 default:
158 return copy_user_generic((__force void *)dst,
159 (__force void *)src, size);
160 }
161}
162
163__must_check long
164strncpy_from_user(char *dst, const char __user *src, long count);
165__must_check long
166__strncpy_from_user(char *dst, const char __user *src, long count);
167__must_check long strnlen_user(const char __user *str, long n);
168__must_check long __strnlen_user(const char __user *str, long n);
169__must_check long strlen_user(const char __user *str);
170__must_check unsigned long clear_user(void __user *mem, unsigned long len);
171__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
172
173__must_check long __copy_from_user_inatomic(void *dst, const void __user *src,
174 unsigned size);
175
176static __must_check __always_inline int
177__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
178{
179 return copy_user_generic((__force void *)dst, src, size);
180}
181
182extern long __copy_user_nocache(void *dst, const void __user *src,
183 unsigned size, int zerorest);
184
185static inline int __copy_from_user_nocache(void *dst, const void __user *src,
186 unsigned size)
187{
188 might_sleep();
189 return __copy_user_nocache(dst, src, size, 1);
190}
191
192static inline int __copy_from_user_inatomic_nocache(void *dst,
193 const void __user *src,
194 unsigned size)
195{
196 return __copy_user_nocache(dst, src, size, 0);
197}
198
199unsigned long
200copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest);
201
202#endif /* _ASM_X86_UACCESS_64_H */
diff --git a/arch/x86/include/asm/ucontext.h b/arch/x86/include/asm/ucontext.h
new file mode 100644
index 000000000000..87324cf439d9
--- /dev/null
+++ b/arch/x86/include/asm/ucontext.h
@@ -0,0 +1,18 @@
1#ifndef _ASM_X86_UCONTEXT_H
2#define _ASM_X86_UCONTEXT_H
3
4#define UC_FP_XSTATE 0x1 /* indicates the presence of extended state
  5 * information in the memory layout pointed
  6 * to by the fpstate pointer in the ucontext's
7 * sigcontext struct (uc_mcontext).
8 */
9
10struct ucontext {
11 unsigned long uc_flags;
12 struct ucontext *uc_link;
13 stack_t uc_stack;
14 struct sigcontext uc_mcontext;
15 sigset_t uc_sigmask; /* mask last for extensibility */
16};
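
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * A user-space signal handler installed with SA_SIGINFO can test the flag
 * to see whether the fpstate area carries the extended (xsave) state layout
 * (hypothetical handler name):
 *
 *	void handler(int sig, siginfo_t *info, void *ctx)
 *	{
 *		struct ucontext *uc = ctx;
 *
 *		if (uc->uc_flags & UC_FP_XSTATE)
 *			... fpstate includes extended state information ...
 *	}
 */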
17
18#endif /* _ASM_X86_UCONTEXT_H */
diff --git a/arch/x86/include/asm/unaligned.h b/arch/x86/include/asm/unaligned.h
new file mode 100644
index 000000000000..a7bd416b4763
--- /dev/null
+++ b/arch/x86/include/asm/unaligned.h
@@ -0,0 +1,14 @@
1#ifndef _ASM_X86_UNALIGNED_H
2#define _ASM_X86_UNALIGNED_H
3
4/*
5 * The x86 can do unaligned accesses itself.
6 */
7
8#include <linux/unaligned/access_ok.h>
9#include <linux/unaligned/generic.h>
10
11#define get_unaligned __get_unaligned_le
12#define put_unaligned __put_unaligned_le
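
/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * Because x86 tolerates unaligned accesses, get_unaligned()/put_unaligned()
 * reduce to plain little-endian loads and stores; a typical use is pulling a
 * field out of a byte-packed buffer at an odd offset (hypothetical names):
 *
 *	u32 seq = get_unaligned((u32 *)(pkt + 5));
 *	put_unaligned(seq + 1, (u32 *)(pkt + 5));
 */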
13
14#endif /* _ASM_X86_UNALIGNED_H */
diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h
new file mode 100644
index 000000000000..2a58ed3e51d8
--- /dev/null
+++ b/arch/x86/include/asm/unistd.h
@@ -0,0 +1,13 @@
1#ifdef __KERNEL__
2# ifdef CONFIG_X86_32
3# include "unistd_32.h"
4# else
5# include "unistd_64.h"
6# endif
7#else
8# ifdef __i386__
9# include "unistd_32.h"
10# else
11# include "unistd_64.h"
12# endif
13#endif
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
new file mode 100644
index 000000000000..f2bba78430a4
--- /dev/null
+++ b/arch/x86/include/asm/unistd_32.h
@@ -0,0 +1,379 @@
1#ifndef _ASM_X86_UNISTD_32_H
2#define _ASM_X86_UNISTD_32_H
3
4/*
5 * This file contains the system call numbers.
6 */
7
8#define __NR_restart_syscall 0
9#define __NR_exit 1
10#define __NR_fork 2
11#define __NR_read 3
12#define __NR_write 4
13#define __NR_open 5
14#define __NR_close 6
15#define __NR_waitpid 7
16#define __NR_creat 8
17#define __NR_link 9
18#define __NR_unlink 10
19#define __NR_execve 11
20#define __NR_chdir 12
21#define __NR_time 13
22#define __NR_mknod 14
23#define __NR_chmod 15
24#define __NR_lchown 16
25#define __NR_break 17
26#define __NR_oldstat 18
27#define __NR_lseek 19
28#define __NR_getpid 20
29#define __NR_mount 21
30#define __NR_umount 22
31#define __NR_setuid 23
32#define __NR_getuid 24
33#define __NR_stime 25
34#define __NR_ptrace 26
35#define __NR_alarm 27
36#define __NR_oldfstat 28
37#define __NR_pause 29
38#define __NR_utime 30
39#define __NR_stty 31
40#define __NR_gtty 32
41#define __NR_access 33
42#define __NR_nice 34
43#define __NR_ftime 35
44#define __NR_sync 36
45#define __NR_kill 37
46#define __NR_rename 38
47#define __NR_mkdir 39
48#define __NR_rmdir 40
49#define __NR_dup 41
50#define __NR_pipe 42
51#define __NR_times 43
52#define __NR_prof 44
53#define __NR_brk 45
54#define __NR_setgid 46
55#define __NR_getgid 47
56#define __NR_signal 48
57#define __NR_geteuid 49
58#define __NR_getegid 50
59#define __NR_acct 51
60#define __NR_umount2 52
61#define __NR_lock 53
62#define __NR_ioctl 54
63#define __NR_fcntl 55
64#define __NR_mpx 56
65#define __NR_setpgid 57
66#define __NR_ulimit 58
67#define __NR_oldolduname 59
68#define __NR_umask 60
69#define __NR_chroot 61
70#define __NR_ustat 62
71#define __NR_dup2 63
72#define __NR_getppid 64
73#define __NR_getpgrp 65
74#define __NR_setsid 66
75#define __NR_sigaction 67
76#define __NR_sgetmask 68
77#define __NR_ssetmask 69
78#define __NR_setreuid 70
79#define __NR_setregid 71
80#define __NR_sigsuspend 72
81#define __NR_sigpending 73
82#define __NR_sethostname 74
83#define __NR_setrlimit 75
84#define __NR_getrlimit 76 /* Back compatible 2Gig limited rlimit */
85#define __NR_getrusage 77
86#define __NR_gettimeofday 78
87#define __NR_settimeofday 79
88#define __NR_getgroups 80
89#define __NR_setgroups 81
90#define __NR_select 82
91#define __NR_symlink 83
92#define __NR_oldlstat 84
93#define __NR_readlink 85
94#define __NR_uselib 86
95#define __NR_swapon 87
96#define __NR_reboot 88
97#define __NR_readdir 89
98#define __NR_mmap 90
99#define __NR_munmap 91
100#define __NR_truncate 92
101#define __NR_ftruncate 93
102#define __NR_fchmod 94
103#define __NR_fchown 95
104#define __NR_getpriority 96
105#define __NR_setpriority 97
106#define __NR_profil 98
107#define __NR_statfs 99
108#define __NR_fstatfs 100
109#define __NR_ioperm 101
110#define __NR_socketcall 102
111#define __NR_syslog 103
112#define __NR_setitimer 104
113#define __NR_getitimer 105
114#define __NR_stat 106
115#define __NR_lstat 107
116#define __NR_fstat 108
117#define __NR_olduname 109
118#define __NR_iopl 110
119#define __NR_vhangup 111
120#define __NR_idle 112
121#define __NR_vm86old 113
122#define __NR_wait4 114
123#define __NR_swapoff 115
124#define __NR_sysinfo 116
125#define __NR_ipc 117
126#define __NR_fsync 118
127#define __NR_sigreturn 119
128#define __NR_clone 120
129#define __NR_setdomainname 121
130#define __NR_uname 122
131#define __NR_modify_ldt 123
132#define __NR_adjtimex 124
133#define __NR_mprotect 125
134#define __NR_sigprocmask 126
135#define __NR_create_module 127
136#define __NR_init_module 128
137#define __NR_delete_module 129
138#define __NR_get_kernel_syms 130
139#define __NR_quotactl 131
140#define __NR_getpgid 132
141#define __NR_fchdir 133
142#define __NR_bdflush 134
143#define __NR_sysfs 135
144#define __NR_personality 136
145#define __NR_afs_syscall 137 /* Syscall for Andrew File System */
146#define __NR_setfsuid 138
147#define __NR_setfsgid 139
148#define __NR__llseek 140
149#define __NR_getdents 141
150#define __NR__newselect 142
151#define __NR_flock 143
152#define __NR_msync 144
153#define __NR_readv 145
154#define __NR_writev 146
155#define __NR_getsid 147
156#define __NR_fdatasync 148
157#define __NR__sysctl 149
158#define __NR_mlock 150
159#define __NR_munlock 151
160#define __NR_mlockall 152
161#define __NR_munlockall 153
162#define __NR_sched_setparam 154
163#define __NR_sched_getparam 155
164#define __NR_sched_setscheduler 156
165#define __NR_sched_getscheduler 157
166#define __NR_sched_yield 158
167#define __NR_sched_get_priority_max 159
168#define __NR_sched_get_priority_min 160
169#define __NR_sched_rr_get_interval 161
170#define __NR_nanosleep 162
171#define __NR_mremap 163
172#define __NR_setresuid 164
173#define __NR_getresuid 165
174#define __NR_vm86 166
175#define __NR_query_module 167
176#define __NR_poll 168
177#define __NR_nfsservctl 169
178#define __NR_setresgid 170
179#define __NR_getresgid 171
180#define __NR_prctl 172
181#define __NR_rt_sigreturn 173
182#define __NR_rt_sigaction 174
183#define __NR_rt_sigprocmask 175
184#define __NR_rt_sigpending 176
185#define __NR_rt_sigtimedwait 177
186#define __NR_rt_sigqueueinfo 178
187#define __NR_rt_sigsuspend 179
188#define __NR_pread64 180
189#define __NR_pwrite64 181
190#define __NR_chown 182
191#define __NR_getcwd 183
192#define __NR_capget 184
193#define __NR_capset 185
194#define __NR_sigaltstack 186
195#define __NR_sendfile 187
196#define __NR_getpmsg 188 /* some people actually want streams */
197#define __NR_putpmsg 189 /* some people actually want streams */
198#define __NR_vfork 190
199#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */
200#define __NR_mmap2 192
201#define __NR_truncate64 193
202#define __NR_ftruncate64 194
203#define __NR_stat64 195
204#define __NR_lstat64 196
205#define __NR_fstat64 197
206#define __NR_lchown32 198
207#define __NR_getuid32 199
208#define __NR_getgid32 200
209#define __NR_geteuid32 201
210#define __NR_getegid32 202
211#define __NR_setreuid32 203
212#define __NR_setregid32 204
213#define __NR_getgroups32 205
214#define __NR_setgroups32 206
215#define __NR_fchown32 207
216#define __NR_setresuid32 208
217#define __NR_getresuid32 209
218#define __NR_setresgid32 210
219#define __NR_getresgid32 211
220#define __NR_chown32 212
221#define __NR_setuid32 213
222#define __NR_setgid32 214
223#define __NR_setfsuid32 215
224#define __NR_setfsgid32 216
225#define __NR_pivot_root 217
226#define __NR_mincore 218
227#define __NR_madvise 219
228#define __NR_madvise1 219 /* delete when C lib stub is removed */
229#define __NR_getdents64 220
230#define __NR_fcntl64 221
231/* 223 is unused */
232#define __NR_gettid 224
233#define __NR_readahead 225
234#define __NR_setxattr 226
235#define __NR_lsetxattr 227
236#define __NR_fsetxattr 228
237#define __NR_getxattr 229
238#define __NR_lgetxattr 230
239#define __NR_fgetxattr 231
240#define __NR_listxattr 232
241#define __NR_llistxattr 233
242#define __NR_flistxattr 234
243#define __NR_removexattr 235
244#define __NR_lremovexattr 236
245#define __NR_fremovexattr 237
246#define __NR_tkill 238
247#define __NR_sendfile64 239
248#define __NR_futex 240
249#define __NR_sched_setaffinity 241
250#define __NR_sched_getaffinity 242
251#define __NR_set_thread_area 243
252#define __NR_get_thread_area 244
253#define __NR_io_setup 245
254#define __NR_io_destroy 246
255#define __NR_io_getevents 247
256#define __NR_io_submit 248
257#define __NR_io_cancel 249
258#define __NR_fadvise64 250
259/* 251 is available for reuse (was briefly sys_set_zone_reclaim) */
260#define __NR_exit_group 252
261#define __NR_lookup_dcookie 253
262#define __NR_epoll_create 254
263#define __NR_epoll_ctl 255
264#define __NR_epoll_wait 256
265#define __NR_remap_file_pages 257
266#define __NR_set_tid_address 258
267#define __NR_timer_create 259
268#define __NR_timer_settime (__NR_timer_create+1)
269#define __NR_timer_gettime (__NR_timer_create+2)
270#define __NR_timer_getoverrun (__NR_timer_create+3)
271#define __NR_timer_delete (__NR_timer_create+4)
272#define __NR_clock_settime (__NR_timer_create+5)
273#define __NR_clock_gettime (__NR_timer_create+6)
274#define __NR_clock_getres (__NR_timer_create+7)
275#define __NR_clock_nanosleep (__NR_timer_create+8)
276#define __NR_statfs64 268
277#define __NR_fstatfs64 269
278#define __NR_tgkill 270
279#define __NR_utimes 271
280#define __NR_fadvise64_64 272
281#define __NR_vserver 273
282#define __NR_mbind 274
283#define __NR_get_mempolicy 275
284#define __NR_set_mempolicy 276
285#define __NR_mq_open 277
286#define __NR_mq_unlink (__NR_mq_open+1)
287#define __NR_mq_timedsend (__NR_mq_open+2)
288#define __NR_mq_timedreceive (__NR_mq_open+3)
289#define __NR_mq_notify (__NR_mq_open+4)
290#define __NR_mq_getsetattr (__NR_mq_open+5)
291#define __NR_kexec_load 283
292#define __NR_waitid 284
293/* #define __NR_sys_setaltroot 285 */
294#define __NR_add_key 286
295#define __NR_request_key 287
296#define __NR_keyctl 288
297#define __NR_ioprio_set 289
298#define __NR_ioprio_get 290
299#define __NR_inotify_init 291
300#define __NR_inotify_add_watch 292
301#define __NR_inotify_rm_watch 293
302#define __NR_migrate_pages 294
303#define __NR_openat 295
304#define __NR_mkdirat 296
305#define __NR_mknodat 297
306#define __NR_fchownat 298
307#define __NR_futimesat 299
308#define __NR_fstatat64 300
309#define __NR_unlinkat 301
310#define __NR_renameat 302
311#define __NR_linkat 303
312#define __NR_symlinkat 304
313#define __NR_readlinkat 305
314#define __NR_fchmodat 306
315#define __NR_faccessat 307
316#define __NR_pselect6 308
317#define __NR_ppoll 309
318#define __NR_unshare 310
319#define __NR_set_robust_list 311
320#define __NR_get_robust_list 312
321#define __NR_splice 313
322#define __NR_sync_file_range 314
323#define __NR_tee 315
324#define __NR_vmsplice 316
325#define __NR_move_pages 317
326#define __NR_getcpu 318
327#define __NR_epoll_pwait 319
328#define __NR_utimensat 320
329#define __NR_signalfd 321
330#define __NR_timerfd_create 322
331#define __NR_eventfd 323
332#define __NR_fallocate 324
333#define __NR_timerfd_settime 325
334#define __NR_timerfd_gettime 326
335#define __NR_signalfd4 327
336#define __NR_eventfd2 328
337#define __NR_epoll_create1 329
338#define __NR_dup3 330
339#define __NR_pipe2 331
340#define __NR_inotify_init1 332
341
342#ifdef __KERNEL__
343
344#define __ARCH_WANT_IPC_PARSE_VERSION
345#define __ARCH_WANT_OLD_READDIR
346#define __ARCH_WANT_OLD_STAT
347#define __ARCH_WANT_STAT64
348#define __ARCH_WANT_SYS_ALARM
349#define __ARCH_WANT_SYS_GETHOSTNAME
350#define __ARCH_WANT_SYS_PAUSE
351#define __ARCH_WANT_SYS_SGETMASK
352#define __ARCH_WANT_SYS_SIGNAL
353#define __ARCH_WANT_SYS_TIME
354#define __ARCH_WANT_SYS_UTIME
355#define __ARCH_WANT_SYS_WAITPID
356#define __ARCH_WANT_SYS_SOCKETCALL
357#define __ARCH_WANT_SYS_FADVISE64
358#define __ARCH_WANT_SYS_GETPGRP
359#define __ARCH_WANT_SYS_LLSEEK
360#define __ARCH_WANT_SYS_NICE
361#define __ARCH_WANT_SYS_OLD_GETRLIMIT
362#define __ARCH_WANT_SYS_OLDUMOUNT
363#define __ARCH_WANT_SYS_SIGPENDING
364#define __ARCH_WANT_SYS_SIGPROCMASK
365#define __ARCH_WANT_SYS_RT_SIGACTION
366#define __ARCH_WANT_SYS_RT_SIGSUSPEND
367
368/*
369 * "Conditional" syscalls
370 *
371 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
372 * but it doesn't work on all toolchains, so we just do it by hand
373 */
374#ifndef cond_syscall
375#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
376#endif
377
378#endif /* __KERNEL__ */
379#endif /* _ASM_X86_UNISTD_32_H */
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
new file mode 100644
index 000000000000..834b2c1d89fb
--- /dev/null
+++ b/arch/x86/include/asm/unistd_64.h
@@ -0,0 +1,693 @@
1#ifndef _ASM_X86_UNISTD_64_H
2#define _ASM_X86_UNISTD_64_H
3
4#ifndef __SYSCALL
5#define __SYSCALL(a, b)
6#endif
7
8/*
9 * This file contains the system call numbers.
10 *
11 * Note: holes are not allowed.
12 */
13
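/*
 * Editor's note -- illustrative sketch, not part of the original header.
 * The empty __SYSCALL() default above lets other files generate tables by
 * redefining the macro and re-including this header; the 64-bit syscall
 * table is built roughly like this (cf. arch/x86/kernel/syscall_64.c):
 *
 *	#define __SYSCALL(nr, sym) extern asmlinkage void sym(void);
 *	#include <asm/unistd_64.h>
 *	#undef __SYSCALL
 *
 *	#define __SYSCALL(nr, sym) [nr] = sym,
 *	const sys_call_ptr_t sys_call_table[__NR_syscall_max + 1] = {
 *		[0 ... __NR_syscall_max] = &sys_ni_syscall,
 *	#include <asm/unistd_64.h>
 *	};
 */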
 14/* at least 8 syscalls per cacheline */
15#define __NR_read 0
16__SYSCALL(__NR_read, sys_read)
17#define __NR_write 1
18__SYSCALL(__NR_write, sys_write)
19#define __NR_open 2
20__SYSCALL(__NR_open, sys_open)
21#define __NR_close 3
22__SYSCALL(__NR_close, sys_close)
23#define __NR_stat 4
24__SYSCALL(__NR_stat, sys_newstat)
25#define __NR_fstat 5
26__SYSCALL(__NR_fstat, sys_newfstat)
27#define __NR_lstat 6
28__SYSCALL(__NR_lstat, sys_newlstat)
29#define __NR_poll 7
30__SYSCALL(__NR_poll, sys_poll)
31
32#define __NR_lseek 8
33__SYSCALL(__NR_lseek, sys_lseek)
34#define __NR_mmap 9
35__SYSCALL(__NR_mmap, sys_mmap)
36#define __NR_mprotect 10
37__SYSCALL(__NR_mprotect, sys_mprotect)
38#define __NR_munmap 11
39__SYSCALL(__NR_munmap, sys_munmap)
40#define __NR_brk 12
41__SYSCALL(__NR_brk, sys_brk)
42#define __NR_rt_sigaction 13
43__SYSCALL(__NR_rt_sigaction, sys_rt_sigaction)
44#define __NR_rt_sigprocmask 14
45__SYSCALL(__NR_rt_sigprocmask, sys_rt_sigprocmask)
46#define __NR_rt_sigreturn 15
47__SYSCALL(__NR_rt_sigreturn, stub_rt_sigreturn)
48
49#define __NR_ioctl 16
50__SYSCALL(__NR_ioctl, sys_ioctl)
51#define __NR_pread64 17
52__SYSCALL(__NR_pread64, sys_pread64)
53#define __NR_pwrite64 18
54__SYSCALL(__NR_pwrite64, sys_pwrite64)
55#define __NR_readv 19
56__SYSCALL(__NR_readv, sys_readv)
57#define __NR_writev 20
58__SYSCALL(__NR_writev, sys_writev)
59#define __NR_access 21
60__SYSCALL(__NR_access, sys_access)
61#define __NR_pipe 22
62__SYSCALL(__NR_pipe, sys_pipe)
63#define __NR_select 23
64__SYSCALL(__NR_select, sys_select)
65
66#define __NR_sched_yield 24
67__SYSCALL(__NR_sched_yield, sys_sched_yield)
68#define __NR_mremap 25
69__SYSCALL(__NR_mremap, sys_mremap)
70#define __NR_msync 26
71__SYSCALL(__NR_msync, sys_msync)
72#define __NR_mincore 27
73__SYSCALL(__NR_mincore, sys_mincore)
74#define __NR_madvise 28
75__SYSCALL(__NR_madvise, sys_madvise)
76#define __NR_shmget 29
77__SYSCALL(__NR_shmget, sys_shmget)
78#define __NR_shmat 30
79__SYSCALL(__NR_shmat, sys_shmat)
80#define __NR_shmctl 31
81__SYSCALL(__NR_shmctl, sys_shmctl)
82
83#define __NR_dup 32
84__SYSCALL(__NR_dup, sys_dup)
85#define __NR_dup2 33
86__SYSCALL(__NR_dup2, sys_dup2)
87#define __NR_pause 34
88__SYSCALL(__NR_pause, sys_pause)
89#define __NR_nanosleep 35
90__SYSCALL(__NR_nanosleep, sys_nanosleep)
91#define __NR_getitimer 36
92__SYSCALL(__NR_getitimer, sys_getitimer)
93#define __NR_alarm 37
94__SYSCALL(__NR_alarm, sys_alarm)
95#define __NR_setitimer 38
96__SYSCALL(__NR_setitimer, sys_setitimer)
97#define __NR_getpid 39
98__SYSCALL(__NR_getpid, sys_getpid)
99
100#define __NR_sendfile 40
101__SYSCALL(__NR_sendfile, sys_sendfile64)
102#define __NR_socket 41
103__SYSCALL(__NR_socket, sys_socket)
104#define __NR_connect 42
105__SYSCALL(__NR_connect, sys_connect)
106#define __NR_accept 43
107__SYSCALL(__NR_accept, sys_accept)
108#define __NR_sendto 44
109__SYSCALL(__NR_sendto, sys_sendto)
110#define __NR_recvfrom 45
111__SYSCALL(__NR_recvfrom, sys_recvfrom)
112#define __NR_sendmsg 46
113__SYSCALL(__NR_sendmsg, sys_sendmsg)
114#define __NR_recvmsg 47
115__SYSCALL(__NR_recvmsg, sys_recvmsg)
116
117#define __NR_shutdown 48
118__SYSCALL(__NR_shutdown, sys_shutdown)
119#define __NR_bind 49
120__SYSCALL(__NR_bind, sys_bind)
121#define __NR_listen 50
122__SYSCALL(__NR_listen, sys_listen)
123#define __NR_getsockname 51
124__SYSCALL(__NR_getsockname, sys_getsockname)
125#define __NR_getpeername 52
126__SYSCALL(__NR_getpeername, sys_getpeername)
127#define __NR_socketpair 53
128__SYSCALL(__NR_socketpair, sys_socketpair)
129#define __NR_setsockopt 54
130__SYSCALL(__NR_setsockopt, sys_setsockopt)
131#define __NR_getsockopt 55
132__SYSCALL(__NR_getsockopt, sys_getsockopt)
133
134#define __NR_clone 56
135__SYSCALL(__NR_clone, stub_clone)
136#define __NR_fork 57
137__SYSCALL(__NR_fork, stub_fork)
138#define __NR_vfork 58
139__SYSCALL(__NR_vfork, stub_vfork)
140#define __NR_execve 59
141__SYSCALL(__NR_execve, stub_execve)
142#define __NR_exit 60
143__SYSCALL(__NR_exit, sys_exit)
144#define __NR_wait4 61
145__SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_uname)
150
151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget)
153#define __NR_semop 65
154__SYSCALL(__NR_semop, sys_semop)
155#define __NR_semctl 66
156__SYSCALL(__NR_semctl, sys_semctl)
157#define __NR_shmdt 67
158__SYSCALL(__NR_shmdt, sys_shmdt)
159#define __NR_msgget 68
160__SYSCALL(__NR_msgget, sys_msgget)
161#define __NR_msgsnd 69
162__SYSCALL(__NR_msgsnd, sys_msgsnd)
163#define __NR_msgrcv 70
164__SYSCALL(__NR_msgrcv, sys_msgrcv)
165#define __NR_msgctl 71
166__SYSCALL(__NR_msgctl, sys_msgctl)
167
168#define __NR_fcntl 72
169__SYSCALL(__NR_fcntl, sys_fcntl)
170#define __NR_flock 73
171__SYSCALL(__NR_flock, sys_flock)
172#define __NR_fsync 74
173__SYSCALL(__NR_fsync, sys_fsync)
174#define __NR_fdatasync 75
175__SYSCALL(__NR_fdatasync, sys_fdatasync)
176#define __NR_truncate 76
177__SYSCALL(__NR_truncate, sys_truncate)
178#define __NR_ftruncate 77
179__SYSCALL(__NR_ftruncate, sys_ftruncate)
180#define __NR_getdents 78
181__SYSCALL(__NR_getdents, sys_getdents)
182#define __NR_getcwd 79
183__SYSCALL(__NR_getcwd, sys_getcwd)
184
185#define __NR_chdir 80
186__SYSCALL(__NR_chdir, sys_chdir)
187#define __NR_fchdir 81
188__SYSCALL(__NR_fchdir, sys_fchdir)
189#define __NR_rename 82
190__SYSCALL(__NR_rename, sys_rename)
191#define __NR_mkdir 83
192__SYSCALL(__NR_mkdir, sys_mkdir)
193#define __NR_rmdir 84
194__SYSCALL(__NR_rmdir, sys_rmdir)
195#define __NR_creat 85
196__SYSCALL(__NR_creat, sys_creat)
197#define __NR_link 86
198__SYSCALL(__NR_link, sys_link)
199#define __NR_unlink 87
200__SYSCALL(__NR_unlink, sys_unlink)
201
202#define __NR_symlink 88
203__SYSCALL(__NR_symlink, sys_symlink)
204#define __NR_readlink 89
205__SYSCALL(__NR_readlink, sys_readlink)
206#define __NR_chmod 90
207__SYSCALL(__NR_chmod, sys_chmod)
208#define __NR_fchmod 91
209__SYSCALL(__NR_fchmod, sys_fchmod)
210#define __NR_chown 92
211__SYSCALL(__NR_chown, sys_chown)
212#define __NR_fchown 93
213__SYSCALL(__NR_fchown, sys_fchown)
214#define __NR_lchown 94
215__SYSCALL(__NR_lchown, sys_lchown)
216#define __NR_umask 95
217__SYSCALL(__NR_umask, sys_umask)
218
219#define __NR_gettimeofday 96
220__SYSCALL(__NR_gettimeofday, sys_gettimeofday)
221#define __NR_getrlimit 97
222__SYSCALL(__NR_getrlimit, sys_getrlimit)
223#define __NR_getrusage 98
224__SYSCALL(__NR_getrusage, sys_getrusage)
225#define __NR_sysinfo 99
226__SYSCALL(__NR_sysinfo, sys_sysinfo)
227#define __NR_times 100
228__SYSCALL(__NR_times, sys_times)
229#define __NR_ptrace 101
230__SYSCALL(__NR_ptrace, sys_ptrace)
231#define __NR_getuid 102
232__SYSCALL(__NR_getuid, sys_getuid)
233#define __NR_syslog 103
234__SYSCALL(__NR_syslog, sys_syslog)
235
236/* at the very end the stuff that never runs during the benchmarks */
237#define __NR_getgid 104
238__SYSCALL(__NR_getgid, sys_getgid)
239#define __NR_setuid 105
240__SYSCALL(__NR_setuid, sys_setuid)
241#define __NR_setgid 106
242__SYSCALL(__NR_setgid, sys_setgid)
243#define __NR_geteuid 107
244__SYSCALL(__NR_geteuid, sys_geteuid)
245#define __NR_getegid 108
246__SYSCALL(__NR_getegid, sys_getegid)
247#define __NR_setpgid 109
248__SYSCALL(__NR_setpgid, sys_setpgid)
249#define __NR_getppid 110
250__SYSCALL(__NR_getppid, sys_getppid)
251#define __NR_getpgrp 111
252__SYSCALL(__NR_getpgrp, sys_getpgrp)
253
254#define __NR_setsid 112
255__SYSCALL(__NR_setsid, sys_setsid)
256#define __NR_setreuid 113
257__SYSCALL(__NR_setreuid, sys_setreuid)
258#define __NR_setregid 114
259__SYSCALL(__NR_setregid, sys_setregid)
260#define __NR_getgroups 115
261__SYSCALL(__NR_getgroups, sys_getgroups)
262#define __NR_setgroups 116
263__SYSCALL(__NR_setgroups, sys_setgroups)
264#define __NR_setresuid 117
265__SYSCALL(__NR_setresuid, sys_setresuid)
266#define __NR_getresuid 118
267__SYSCALL(__NR_getresuid, sys_getresuid)
268#define __NR_setresgid 119
269__SYSCALL(__NR_setresgid, sys_setresgid)
270
271#define __NR_getresgid 120
272__SYSCALL(__NR_getresgid, sys_getresgid)
273#define __NR_getpgid 121
274__SYSCALL(__NR_getpgid, sys_getpgid)
275#define __NR_setfsuid 122
276__SYSCALL(__NR_setfsuid, sys_setfsuid)
277#define __NR_setfsgid 123
278__SYSCALL(__NR_setfsgid, sys_setfsgid)
279#define __NR_getsid 124
280__SYSCALL(__NR_getsid, sys_getsid)
281#define __NR_capget 125
282__SYSCALL(__NR_capget, sys_capget)
283#define __NR_capset 126
284__SYSCALL(__NR_capset, sys_capset)
285
286#define __NR_rt_sigpending 127
287__SYSCALL(__NR_rt_sigpending, sys_rt_sigpending)
288#define __NR_rt_sigtimedwait 128
289__SYSCALL(__NR_rt_sigtimedwait, sys_rt_sigtimedwait)
290#define __NR_rt_sigqueueinfo 129
291__SYSCALL(__NR_rt_sigqueueinfo, sys_rt_sigqueueinfo)
292#define __NR_rt_sigsuspend 130
293__SYSCALL(__NR_rt_sigsuspend, sys_rt_sigsuspend)
294#define __NR_sigaltstack 131
295__SYSCALL(__NR_sigaltstack, stub_sigaltstack)
296#define __NR_utime 132
297__SYSCALL(__NR_utime, sys_utime)
298#define __NR_mknod 133
299__SYSCALL(__NR_mknod, sys_mknod)
300
301/* Only needed for a.out */
302#define __NR_uselib 134
303__SYSCALL(__NR_uselib, sys_ni_syscall)
304#define __NR_personality 135
305__SYSCALL(__NR_personality, sys_personality)
306
307#define __NR_ustat 136
308__SYSCALL(__NR_ustat, sys_ustat)
309#define __NR_statfs 137
310__SYSCALL(__NR_statfs, sys_statfs)
311#define __NR_fstatfs 138
312__SYSCALL(__NR_fstatfs, sys_fstatfs)
313#define __NR_sysfs 139
314__SYSCALL(__NR_sysfs, sys_sysfs)
315
316#define __NR_getpriority 140
317__SYSCALL(__NR_getpriority, sys_getpriority)
318#define __NR_setpriority 141
319__SYSCALL(__NR_setpriority, sys_setpriority)
320#define __NR_sched_setparam 142
321__SYSCALL(__NR_sched_setparam, sys_sched_setparam)
322#define __NR_sched_getparam 143
323__SYSCALL(__NR_sched_getparam, sys_sched_getparam)
324#define __NR_sched_setscheduler 144
325__SYSCALL(__NR_sched_setscheduler, sys_sched_setscheduler)
326#define __NR_sched_getscheduler 145
327__SYSCALL(__NR_sched_getscheduler, sys_sched_getscheduler)
328#define __NR_sched_get_priority_max 146
329__SYSCALL(__NR_sched_get_priority_max, sys_sched_get_priority_max)
330#define __NR_sched_get_priority_min 147
331__SYSCALL(__NR_sched_get_priority_min, sys_sched_get_priority_min)
332#define __NR_sched_rr_get_interval 148
333__SYSCALL(__NR_sched_rr_get_interval, sys_sched_rr_get_interval)
334
335#define __NR_mlock 149
336__SYSCALL(__NR_mlock, sys_mlock)
337#define __NR_munlock 150
338__SYSCALL(__NR_munlock, sys_munlock)
339#define __NR_mlockall 151
340__SYSCALL(__NR_mlockall, sys_mlockall)
341#define __NR_munlockall 152
342__SYSCALL(__NR_munlockall, sys_munlockall)
343
344#define __NR_vhangup 153
345__SYSCALL(__NR_vhangup, sys_vhangup)
346
347#define __NR_modify_ldt 154
348__SYSCALL(__NR_modify_ldt, sys_modify_ldt)
349
350#define __NR_pivot_root 155
351__SYSCALL(__NR_pivot_root, sys_pivot_root)
352
353#define __NR__sysctl 156
354__SYSCALL(__NR__sysctl, sys_sysctl)
355
356#define __NR_prctl 157
357__SYSCALL(__NR_prctl, sys_prctl)
358#define __NR_arch_prctl 158
359__SYSCALL(__NR_arch_prctl, sys_arch_prctl)
360
361#define __NR_adjtimex 159
362__SYSCALL(__NR_adjtimex, sys_adjtimex)
363
364#define __NR_setrlimit 160
365__SYSCALL(__NR_setrlimit, sys_setrlimit)
366
367#define __NR_chroot 161
368__SYSCALL(__NR_chroot, sys_chroot)
369
370#define __NR_sync 162
371__SYSCALL(__NR_sync, sys_sync)
372
373#define __NR_acct 163
374__SYSCALL(__NR_acct, sys_acct)
375
376#define __NR_settimeofday 164
377__SYSCALL(__NR_settimeofday, sys_settimeofday)
378
379#define __NR_mount 165
380__SYSCALL(__NR_mount, sys_mount)
381#define __NR_umount2 166
382__SYSCALL(__NR_umount2, sys_umount)
383
384#define __NR_swapon 167
385__SYSCALL(__NR_swapon, sys_swapon)
386#define __NR_swapoff 168
387__SYSCALL(__NR_swapoff, sys_swapoff)
388
389#define __NR_reboot 169
390__SYSCALL(__NR_reboot, sys_reboot)
391
392#define __NR_sethostname 170
393__SYSCALL(__NR_sethostname, sys_sethostname)
394#define __NR_setdomainname 171
395__SYSCALL(__NR_setdomainname, sys_setdomainname)
396
397#define __NR_iopl 172
398__SYSCALL(__NR_iopl, stub_iopl)
399#define __NR_ioperm 173
400__SYSCALL(__NR_ioperm, sys_ioperm)
401
402#define __NR_create_module 174
403__SYSCALL(__NR_create_module, sys_ni_syscall)
404#define __NR_init_module 175
405__SYSCALL(__NR_init_module, sys_init_module)
406#define __NR_delete_module 176
407__SYSCALL(__NR_delete_module, sys_delete_module)
408#define __NR_get_kernel_syms 177
409__SYSCALL(__NR_get_kernel_syms, sys_ni_syscall)
410#define __NR_query_module 178
411__SYSCALL(__NR_query_module, sys_ni_syscall)
412
413#define __NR_quotactl 179
414__SYSCALL(__NR_quotactl, sys_quotactl)
415
416#define __NR_nfsservctl 180
417__SYSCALL(__NR_nfsservctl, sys_nfsservctl)
418
419/* reserved for LiS/STREAMS */
420#define __NR_getpmsg 181
421__SYSCALL(__NR_getpmsg, sys_ni_syscall)
422#define __NR_putpmsg 182
423__SYSCALL(__NR_putpmsg, sys_ni_syscall)
424
425/* reserved for AFS */
426#define __NR_afs_syscall 183
427__SYSCALL(__NR_afs_syscall, sys_ni_syscall)
428
429/* reserved for tux */
430#define __NR_tuxcall 184
431__SYSCALL(__NR_tuxcall, sys_ni_syscall)
432
433#define __NR_security 185
434__SYSCALL(__NR_security, sys_ni_syscall)
435
436#define __NR_gettid 186
437__SYSCALL(__NR_gettid, sys_gettid)
438
439#define __NR_readahead 187
440__SYSCALL(__NR_readahead, sys_readahead)
441#define __NR_setxattr 188
442__SYSCALL(__NR_setxattr, sys_setxattr)
443#define __NR_lsetxattr 189
444__SYSCALL(__NR_lsetxattr, sys_lsetxattr)
445#define __NR_fsetxattr 190
446__SYSCALL(__NR_fsetxattr, sys_fsetxattr)
447#define __NR_getxattr 191
448__SYSCALL(__NR_getxattr, sys_getxattr)
449#define __NR_lgetxattr 192
450__SYSCALL(__NR_lgetxattr, sys_lgetxattr)
451#define __NR_fgetxattr 193
452__SYSCALL(__NR_fgetxattr, sys_fgetxattr)
453#define __NR_listxattr 194
454__SYSCALL(__NR_listxattr, sys_listxattr)
455#define __NR_llistxattr 195
456__SYSCALL(__NR_llistxattr, sys_llistxattr)
457#define __NR_flistxattr 196
458__SYSCALL(__NR_flistxattr, sys_flistxattr)
459#define __NR_removexattr 197
460__SYSCALL(__NR_removexattr, sys_removexattr)
461#define __NR_lremovexattr 198
462__SYSCALL(__NR_lremovexattr, sys_lremovexattr)
463#define __NR_fremovexattr 199
464__SYSCALL(__NR_fremovexattr, sys_fremovexattr)
465#define __NR_tkill 200
466__SYSCALL(__NR_tkill, sys_tkill)
467#define __NR_time 201
468__SYSCALL(__NR_time, sys_time)
469#define __NR_futex 202
470__SYSCALL(__NR_futex, sys_futex)
471#define __NR_sched_setaffinity 203
472__SYSCALL(__NR_sched_setaffinity, sys_sched_setaffinity)
473#define __NR_sched_getaffinity 204
474__SYSCALL(__NR_sched_getaffinity, sys_sched_getaffinity)
475#define __NR_set_thread_area 205
476__SYSCALL(__NR_set_thread_area, sys_ni_syscall) /* use arch_prctl */
477#define __NR_io_setup 206
478__SYSCALL(__NR_io_setup, sys_io_setup)
479#define __NR_io_destroy 207
480__SYSCALL(__NR_io_destroy, sys_io_destroy)
481#define __NR_io_getevents 208
482__SYSCALL(__NR_io_getevents, sys_io_getevents)
483#define __NR_io_submit 209
484__SYSCALL(__NR_io_submit, sys_io_submit)
485#define __NR_io_cancel 210
486__SYSCALL(__NR_io_cancel, sys_io_cancel)
487#define __NR_get_thread_area 211
488__SYSCALL(__NR_get_thread_area, sys_ni_syscall) /* use arch_prctl */
489#define __NR_lookup_dcookie 212
490__SYSCALL(__NR_lookup_dcookie, sys_lookup_dcookie)
491#define __NR_epoll_create 213
492__SYSCALL(__NR_epoll_create, sys_epoll_create)
493#define __NR_epoll_ctl_old 214
494__SYSCALL(__NR_epoll_ctl_old, sys_ni_syscall)
495#define __NR_epoll_wait_old 215
496__SYSCALL(__NR_epoll_wait_old, sys_ni_syscall)
497#define __NR_remap_file_pages 216
498__SYSCALL(__NR_remap_file_pages, sys_remap_file_pages)
499#define __NR_getdents64 217
500__SYSCALL(__NR_getdents64, sys_getdents64)
501#define __NR_set_tid_address 218
502__SYSCALL(__NR_set_tid_address, sys_set_tid_address)
503#define __NR_restart_syscall 219
504__SYSCALL(__NR_restart_syscall, sys_restart_syscall)
505#define __NR_semtimedop 220
506__SYSCALL(__NR_semtimedop, sys_semtimedop)
507#define __NR_fadvise64 221
508__SYSCALL(__NR_fadvise64, sys_fadvise64)
509#define __NR_timer_create 222
510__SYSCALL(__NR_timer_create, sys_timer_create)
511#define __NR_timer_settime 223
512__SYSCALL(__NR_timer_settime, sys_timer_settime)
513#define __NR_timer_gettime 224
514__SYSCALL(__NR_timer_gettime, sys_timer_gettime)
515#define __NR_timer_getoverrun 225
516__SYSCALL(__NR_timer_getoverrun, sys_timer_getoverrun)
517#define __NR_timer_delete 226
518__SYSCALL(__NR_timer_delete, sys_timer_delete)
519#define __NR_clock_settime 227
520__SYSCALL(__NR_clock_settime, sys_clock_settime)
521#define __NR_clock_gettime 228
522__SYSCALL(__NR_clock_gettime, sys_clock_gettime)
523#define __NR_clock_getres 229
524__SYSCALL(__NR_clock_getres, sys_clock_getres)
525#define __NR_clock_nanosleep 230
526__SYSCALL(__NR_clock_nanosleep, sys_clock_nanosleep)
527#define __NR_exit_group 231
528__SYSCALL(__NR_exit_group, sys_exit_group)
529#define __NR_epoll_wait 232
530__SYSCALL(__NR_epoll_wait, sys_epoll_wait)
531#define __NR_epoll_ctl 233
532__SYSCALL(__NR_epoll_ctl, sys_epoll_ctl)
533#define __NR_tgkill 234
534__SYSCALL(__NR_tgkill, sys_tgkill)
535#define __NR_utimes 235
536__SYSCALL(__NR_utimes, sys_utimes)
537#define __NR_vserver 236
538__SYSCALL(__NR_vserver, sys_ni_syscall)
539#define __NR_mbind 237
540__SYSCALL(__NR_mbind, sys_mbind)
541#define __NR_set_mempolicy 238
542__SYSCALL(__NR_set_mempolicy, sys_set_mempolicy)
543#define __NR_get_mempolicy 239
544__SYSCALL(__NR_get_mempolicy, sys_get_mempolicy)
545#define __NR_mq_open 240
546__SYSCALL(__NR_mq_open, sys_mq_open)
547#define __NR_mq_unlink 241
548__SYSCALL(__NR_mq_unlink, sys_mq_unlink)
549#define __NR_mq_timedsend 242
550__SYSCALL(__NR_mq_timedsend, sys_mq_timedsend)
551#define __NR_mq_timedreceive 243
552__SYSCALL(__NR_mq_timedreceive, sys_mq_timedreceive)
553#define __NR_mq_notify 244
554__SYSCALL(__NR_mq_notify, sys_mq_notify)
555#define __NR_mq_getsetattr 245
556__SYSCALL(__NR_mq_getsetattr, sys_mq_getsetattr)
557#define __NR_kexec_load 246
558__SYSCALL(__NR_kexec_load, sys_kexec_load)
559#define __NR_waitid 247
560__SYSCALL(__NR_waitid, sys_waitid)
561#define __NR_add_key 248
562__SYSCALL(__NR_add_key, sys_add_key)
563#define __NR_request_key 249
564__SYSCALL(__NR_request_key, sys_request_key)
565#define __NR_keyctl 250
566__SYSCALL(__NR_keyctl, sys_keyctl)
567#define __NR_ioprio_set 251
568__SYSCALL(__NR_ioprio_set, sys_ioprio_set)
569#define __NR_ioprio_get 252
570__SYSCALL(__NR_ioprio_get, sys_ioprio_get)
571#define __NR_inotify_init 253
572__SYSCALL(__NR_inotify_init, sys_inotify_init)
573#define __NR_inotify_add_watch 254
574__SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
575#define __NR_inotify_rm_watch 255
576__SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
577#define __NR_migrate_pages 256
578__SYSCALL(__NR_migrate_pages, sys_migrate_pages)
579#define __NR_openat 257
580__SYSCALL(__NR_openat, sys_openat)
581#define __NR_mkdirat 258
582__SYSCALL(__NR_mkdirat, sys_mkdirat)
583#define __NR_mknodat 259
584__SYSCALL(__NR_mknodat, sys_mknodat)
585#define __NR_fchownat 260
586__SYSCALL(__NR_fchownat, sys_fchownat)
587#define __NR_futimesat 261
588__SYSCALL(__NR_futimesat, sys_futimesat)
589#define __NR_newfstatat 262
590__SYSCALL(__NR_newfstatat, sys_newfstatat)
591#define __NR_unlinkat 263
592__SYSCALL(__NR_unlinkat, sys_unlinkat)
593#define __NR_renameat 264
594__SYSCALL(__NR_renameat, sys_renameat)
595#define __NR_linkat 265
596__SYSCALL(__NR_linkat, sys_linkat)
597#define __NR_symlinkat 266
598__SYSCALL(__NR_symlinkat, sys_symlinkat)
599#define __NR_readlinkat 267
600__SYSCALL(__NR_readlinkat, sys_readlinkat)
601#define __NR_fchmodat 268
602__SYSCALL(__NR_fchmodat, sys_fchmodat)
603#define __NR_faccessat 269
604__SYSCALL(__NR_faccessat, sys_faccessat)
605#define __NR_pselect6 270
606__SYSCALL(__NR_pselect6, sys_pselect6)
607#define __NR_ppoll 271
608__SYSCALL(__NR_ppoll, sys_ppoll)
609#define __NR_unshare 272
610__SYSCALL(__NR_unshare, sys_unshare)
611#define __NR_set_robust_list 273
612__SYSCALL(__NR_set_robust_list, sys_set_robust_list)
613#define __NR_get_robust_list 274
614__SYSCALL(__NR_get_robust_list, sys_get_robust_list)
615#define __NR_splice 275
616__SYSCALL(__NR_splice, sys_splice)
617#define __NR_tee 276
618__SYSCALL(__NR_tee, sys_tee)
619#define __NR_sync_file_range 277
620__SYSCALL(__NR_sync_file_range, sys_sync_file_range)
621#define __NR_vmsplice 278
622__SYSCALL(__NR_vmsplice, sys_vmsplice)
623#define __NR_move_pages 279
624__SYSCALL(__NR_move_pages, sys_move_pages)
625#define __NR_utimensat 280
626__SYSCALL(__NR_utimensat, sys_utimensat)
627#define __IGNORE_getcpu /* implemented as a vsyscall */
628#define __NR_epoll_pwait 281
629__SYSCALL(__NR_epoll_pwait, sys_epoll_pwait)
630#define __NR_signalfd 282
631__SYSCALL(__NR_signalfd, sys_signalfd)
632#define __NR_timerfd_create 283
633__SYSCALL(__NR_timerfd_create, sys_timerfd_create)
634#define __NR_eventfd 284
635__SYSCALL(__NR_eventfd, sys_eventfd)
636#define __NR_fallocate 285
637__SYSCALL(__NR_fallocate, sys_fallocate)
638#define __NR_timerfd_settime 286
639__SYSCALL(__NR_timerfd_settime, sys_timerfd_settime)
640#define __NR_timerfd_gettime 287
641__SYSCALL(__NR_timerfd_gettime, sys_timerfd_gettime)
642#define __NR_paccept 288
643__SYSCALL(__NR_paccept, sys_paccept)
644#define __NR_signalfd4 289
645__SYSCALL(__NR_signalfd4, sys_signalfd4)
646#define __NR_eventfd2 290
647__SYSCALL(__NR_eventfd2, sys_eventfd2)
648#define __NR_epoll_create1 291
649__SYSCALL(__NR_epoll_create1, sys_epoll_create1)
650#define __NR_dup3 292
651__SYSCALL(__NR_dup3, sys_dup3)
652#define __NR_pipe2 293
653__SYSCALL(__NR_pipe2, sys_pipe2)
654#define __NR_inotify_init1 294
655__SYSCALL(__NR_inotify_init1, sys_inotify_init1)
656
657
658#ifndef __NO_STUBS
659#define __ARCH_WANT_OLD_READDIR
660#define __ARCH_WANT_OLD_STAT
661#define __ARCH_WANT_SYS_ALARM
662#define __ARCH_WANT_SYS_GETHOSTNAME
663#define __ARCH_WANT_SYS_PAUSE
664#define __ARCH_WANT_SYS_SGETMASK
665#define __ARCH_WANT_SYS_SIGNAL
666#define __ARCH_WANT_SYS_UTIME
667#define __ARCH_WANT_SYS_WAITPID
668#define __ARCH_WANT_SYS_SOCKETCALL
669#define __ARCH_WANT_SYS_FADVISE64
670#define __ARCH_WANT_SYS_GETPGRP
671#define __ARCH_WANT_SYS_LLSEEK
672#define __ARCH_WANT_SYS_NICE
673#define __ARCH_WANT_SYS_OLD_GETRLIMIT
674#define __ARCH_WANT_SYS_OLDUMOUNT
675#define __ARCH_WANT_SYS_SIGPENDING
676#define __ARCH_WANT_SYS_SIGPROCMASK
677#define __ARCH_WANT_SYS_RT_SIGACTION
678#define __ARCH_WANT_SYS_RT_SIGSUSPEND
679#define __ARCH_WANT_SYS_TIME
680#define __ARCH_WANT_COMPAT_SYS_TIME
681#endif /* __NO_STUBS */
682
683#ifdef __KERNEL__
684/*
685 * "Conditional" syscalls
686 *
687 * What we want is __attribute__((weak,alias("sys_ni_syscall"))),
688 * but it doesn't work on all toolchains, so we just do it by hand
689 */
690#define cond_syscall(x) asm(".weak\t" #x "\n\t.set\t" #x ",sys_ni_syscall")
691#endif /* __KERNEL__ */
692
693#endif /* _ASM_X86_UNISTD_64_H */
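The cond_syscall() macro above emits a weak assembler alias so that any syscall left unimplemented by the kernel configuration resolves to sys_ni_syscall at link time. A minimal user-space sketch of the attribute-based form the comment mentions (the spelling that not every toolchain accepts), using a hypothetical sys_example(), behaves the same way:

#include <stdio.h>

/* Stand-in for the kernel's fallback: returns -ENOSYS for anything compiled out. */
long sys_ni_syscall(void)
{
	return -38;	/* -ENOSYS on x86 */
}

/* The attribute form the comment refers to: with no strong definition of
 * sys_example() linked in, every call resolves to sys_ni_syscall().  The
 * cond_syscall() macro emits the equivalent weak alias via inline asm instead.
 */
long sys_example(void) __attribute__((weak, alias("sys_ni_syscall")));

int main(void)
{
	printf("sys_example() -> %ld\n", sys_example());	/* prints -38 */
	return 0;
}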
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
new file mode 100644
index 000000000000..8b064bd9c553
--- /dev/null
+++ b/arch/x86/include/asm/unwind.h
@@ -0,0 +1,13 @@
1#ifndef _ASM_X86_UNWIND_H
2#define _ASM_X86_UNWIND_H
3
4#define UNW_PC(frame) ((void)(frame), 0UL)
5#define UNW_SP(frame) ((void)(frame), 0UL)
6#define UNW_FP(frame) ((void)(frame), 0UL)
7
8static inline int arch_unw_user_mode(const void *info)
9{
10 return 0;
11}
12
13#endif /* _ASM_X86_UNWIND_H */
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
new file mode 100644
index 000000000000..999873b22e7f
--- /dev/null
+++ b/arch/x86/include/asm/user.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "user_32.h"
3#else
4# include "user_64.h"
5#endif
diff --git a/arch/x86/include/asm/user32.h b/arch/x86/include/asm/user32.h
new file mode 100644
index 000000000000..14cbb73ebcba
--- /dev/null
+++ b/arch/x86/include/asm/user32.h
@@ -0,0 +1,70 @@
1#ifndef _ASM_X86_USER32_H
2#define _ASM_X86_USER32_H
3
4/* IA32 compatible user structures for ptrace.
5 * These should be used for 32bit coredumps too. */
6
7struct user_i387_ia32_struct {
8 u32 cwd;
9 u32 swd;
10 u32 twd;
11 u32 fip;
12 u32 fcs;
13 u32 foo;
14 u32 fos;
15 u32 st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
16};
17
18/* FSAVE frame with extensions */
19struct user32_fxsr_struct {
20 unsigned short cwd;
21 unsigned short swd;
22 unsigned short twd; /* not compatible with 64bit twd */
23 unsigned short fop;
24 int fip;
25 int fcs;
26 int foo;
27 int fos;
28 int mxcsr;
29 int reserved;
30 int st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
31 int xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
32 int padding[56];
33};
34
35struct user_regs_struct32 {
36 __u32 ebx, ecx, edx, esi, edi, ebp, eax;
37 unsigned short ds, __ds, es, __es;
38 unsigned short fs, __fs, gs, __gs;
39 __u32 orig_eax, eip;
40 unsigned short cs, __cs;
41 __u32 eflags, esp;
42 unsigned short ss, __ss;
43};
44
45struct user32 {
46 struct user_regs_struct32 regs; /* Where the registers are actually stored */
47 int u_fpvalid; /* True if math co-processor being used. */
48 /* for this mess. Not yet used. */
49 struct user_i387_ia32_struct i387; /* Math Co-processor registers. */
50/* The rest of this junk is to help gdb figure out what goes where */
51 __u32 u_tsize; /* Text segment size (pages). */
52 __u32 u_dsize; /* Data segment size (pages). */
53 __u32 u_ssize; /* Stack segment size (pages). */
54 __u32 start_code; /* Starting virtual address of text. */
55 __u32 start_stack; /* Starting virtual address of stack area.
56 This is actually the bottom of the stack,
57 the top of the stack is always found in the
58 esp register. */
59 __u32 signal; /* Signal that caused the core dump. */
60 int reserved; /* No longer used */
61 __u32 u_ar0; /* Used by gdb to help find the values for */
62 /* the registers. */
63 __u32 u_fpstate; /* Math Co-processor pointer. */
64 __u32 magic; /* To uniquely identify a core file */
65 char u_comm[32]; /* User command that was responsible */
66 int u_debugreg[8];
67};
68
69
70#endif /* _ASM_X86_USER32_H */
diff --git a/arch/x86/include/asm/user_32.h b/arch/x86/include/asm/user_32.h
new file mode 100644
index 000000000000..bebfd8644016
--- /dev/null
+++ b/arch/x86/include/asm/user_32.h
@@ -0,0 +1,131 @@
1#ifndef _ASM_X86_USER_32_H
2#define _ASM_X86_USER_32_H
3
4#include <asm/page.h>
5/* Core file format: The core file is written in such a way that gdb
6 can understand it and provide useful information to the user (under
7 linux we use the 'trad-core' bfd). There are quite a number of
8 obstacles to being able to view the contents of the floating point
9 registers, and until these are solved you will not be able to view the
10 contents of them. Actually, you can read in the core file and look at
11 the contents of the user struct to find out what the floating point
12 registers contain.
13 The actual file contents are as follows:
14 UPAGE: 1 page consisting of a user struct that tells gdb what is present
15 in the file. Directly after this is a copy of the task_struct, which
16 is currently not used by gdb, but it may come in useful at some point.
17 All of the registers are stored as part of the upage. The upage should
18 always be only one page.
19 DATA: The data area is stored. We use current->end_text to
20 current->brk to pick up all of the user variables, plus any memory
21 that may have been malloced. No attempt is made to determine if a page
22 is demand-zero or if a page is totally unused, we just cover the entire
23 range. All of the addresses are rounded in such a way that an integral
24 number of pages is written.
25 STACK: We need the stack information in order to get a meaningful
26 backtrace. We need to write the data from (esp) to
27 current->start_stack, so we round each of these off in order to be able
28 to write an integer number of pages.
29 The minimum core file size is 3 pages, or 12288 bytes.
30*/
31
32/*
33 * Pentium III FXSR, SSE support
34 * Gareth Hughes <gareth@valinux.com>, May 2000
35 *
36 * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
37 * interacting with the FXSR-format floating point environment. Floating
38 * point data can be accessed in the regular format in the usual manner,
39 * and both the standard and SIMD floating point data can be accessed via
40 * the new ptrace requests. In either case, changes to the FPU environment
41 * will be reflected in the task's state as expected.
42 */
43
44struct user_i387_struct {
45 long cwd;
46 long swd;
47 long twd;
48 long fip;
49 long fcs;
50 long foo;
51 long fos;
52 long st_space[20]; /* 8*10 bytes for each FP-reg = 80 bytes */
53};
54
55struct user_fxsr_struct {
56 unsigned short cwd;
57 unsigned short swd;
58 unsigned short twd;
59 unsigned short fop;
60 long fip;
61 long fcs;
62 long foo;
63 long fos;
64 long mxcsr;
65 long reserved;
66 long st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
67 long xmm_space[32]; /* 8*16 bytes for each XMM-reg = 128 bytes */
68 long padding[56];
69};
70
71/*
72 * This is the old layout of "struct pt_regs", and
73 * is still the layout used by user mode (the new
74 * pt_regs doesn't have all registers as the kernel
75 * doesn't use the extra segment registers)
76 */
77struct user_regs_struct {
78 unsigned long bx;
79 unsigned long cx;
80 unsigned long dx;
81 unsigned long si;
82 unsigned long di;
83 unsigned long bp;
84 unsigned long ax;
85 unsigned long ds;
86 unsigned long es;
87 unsigned long fs;
88 unsigned long gs;
89 unsigned long orig_ax;
90 unsigned long ip;
91 unsigned long cs;
92 unsigned long flags;
93 unsigned long sp;
94 unsigned long ss;
95};
96
97/* When the kernel dumps core, it starts by dumping the user struct -
98 this will be used by gdb to figure out where the data and stack segments
99 are within the file, and what virtual addresses to use. */
100struct user{
101/* We start with the registers, to mimic the way that "memory" is returned
102 from the ptrace(3,...) function. */
103 struct user_regs_struct regs; /* Where the registers are actually stored */
104/* ptrace does not yet supply these. Someday.... */
105 int u_fpvalid; /* True if math co-processor being used. */
106 /* for this mess. Not yet used. */
107 struct user_i387_struct i387; /* Math Co-processor registers. */
108/* The rest of this junk is to help gdb figure out what goes where */
109 unsigned long int u_tsize; /* Text segment size (pages). */
110 unsigned long int u_dsize; /* Data segment size (pages). */
111 unsigned long int u_ssize; /* Stack segment size (pages). */
112 unsigned long start_code; /* Starting virtual address of text. */
113 unsigned long start_stack; /* Starting virtual address of stack area.
114 This is actually the bottom of the stack,
115 the top of the stack is always found in the
116 esp register. */
117 long int signal; /* Signal that caused the core dump. */
118 int reserved; /* No longer used */
119 unsigned long u_ar0; /* Used by gdb to help find the values for */
120 /* the registers. */
121 struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
122 unsigned long magic; /* To uniquely identify a core file */
123 char u_comm[32]; /* User command that was responsible */
124 int u_debugreg[8];
125};
126#define NBPG PAGE_SIZE
127#define UPAGES 1
128#define HOST_TEXT_START_ADDR (u.start_code)
129#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
130
131#endif /* _ASM_X86_USER_32_H */
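The core-file comment above fixes the layout gdb relies on: the dump opens with one upage holding struct user, and the HOST_* macros derive the text start and the stack end from fields of that struct. A self-contained sketch of that arithmetic, assuming 4096-byte pages and hypothetical addresses:

#include <stdio.h>

#define NBPG 4096UL				/* PAGE_SIZE assumed to be 4096 */

struct user_sketch {				/* only the fields the macros use */
	unsigned long u_ssize;			/* stack size, in pages */
	unsigned long start_code;		/* start of the text segment */
	unsigned long start_stack;		/* bottom of the stack */
};

int main(void)
{
	struct user_sketch u = {
		.u_ssize     = 2,		/* hypothetical values */
		.start_code  = 0x08048000UL,
		.start_stack = 0xbffde000UL,
	};
	unsigned long text_start = u.start_code;			/* HOST_TEXT_START_ADDR */
	unsigned long stack_end  = u.start_stack + u.u_ssize * NBPG;	/* HOST_STACK_END_ADDR */

	printf("text starts at 0x%lx, stack ends at 0x%lx\n",
	       text_start, stack_end);
	return 0;
}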
diff --git a/arch/x86/include/asm/user_64.h b/arch/x86/include/asm/user_64.h
new file mode 100644
index 000000000000..faf2cd3e0d76
--- /dev/null
+++ b/arch/x86/include/asm/user_64.h
@@ -0,0 +1,137 @@
1#ifndef _ASM_X86_USER_64_H
2#define _ASM_X86_USER_64_H
3
4#include <asm/types.h>
5#include <asm/page.h>
6/* Core file format: The core file is written in such a way that gdb
7 can understand it and provide useful information to the user.
8 There are quite a number of obstacles to being able to view the
9 contents of the floating point registers, and until these are
10 solved you will not be able to view the contents of them.
11 Actually, you can read in the core file and look at the contents of
12 the user struct to find out what the floating point registers
13 contain.
14
15 The actual file contents are as follows:
16 UPAGE: 1 page consisting of a user struct that tells gdb what is present
17 in the file. Directly after this is a copy of the task_struct, which
18 is currently not used by gdb, but it may come in useful at some point.
19 All of the registers are stored as part of the upage. The upage should
20 always be only one page.
21 DATA: The data area is stored. We use current->end_text to
22 current->brk to pick up all of the user variables, plus any memory
23 that may have been malloced. No attempt is made to determine if a page
24 is demand-zero or if a page is totally unused, we just cover the entire
25 range. All of the addresses are rounded in such a way that an integral
26 number of pages is written.
27 STACK: We need the stack information in order to get a meaningful
28 backtrace. We need to write the data from (esp) to
29 current->start_stack, so we round each of these off in order to be able
30 to write an integer number of pages.
31 The minimum core file size is 3 pages, or 12288 bytes. */
32
33/*
34 * Pentium III FXSR, SSE support
35 * Gareth Hughes <gareth@valinux.com>, May 2000
36 *
37 * Provide support for the GDB 5.0+ PTRACE_{GET|SET}FPXREGS requests for
38 * interacting with the FXSR-format floating point environment. Floating
39 * point data can be accessed in the regular format in the usual manner,
40 * and both the standard and SIMD floating point data can be accessed via
41 * the new ptrace requests. In either case, changes to the FPU environment
42 * will be reflected in the task's state as expected.
43 *
44 * x86-64 support by Andi Kleen.
45 */
46
47/* This matches the 64bit FXSAVE format as defined by AMD. It is the same
48 as the 32bit format defined by Intel, except that the selector:offset pairs
49 for data and eip are replaced with flat 64bit pointers. */
50struct user_i387_struct {
51 unsigned short cwd;
52 unsigned short swd;
53 unsigned short twd; /* Note this is not the same as
54 the 32bit/x87/FSAVE twd */
55 unsigned short fop;
56 __u64 rip;
57 __u64 rdp;
58 __u32 mxcsr;
59 __u32 mxcsr_mask;
60 __u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */
61 __u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */
62 __u32 padding[24];
63};
64
65/*
66 * Segment register layout in coredumps.
67 */
68struct user_regs_struct {
69 unsigned long r15;
70 unsigned long r14;
71 unsigned long r13;
72 unsigned long r12;
73 unsigned long bp;
74 unsigned long bx;
75 unsigned long r11;
76 unsigned long r10;
77 unsigned long r9;
78 unsigned long r8;
79 unsigned long ax;
80 unsigned long cx;
81 unsigned long dx;
82 unsigned long si;
83 unsigned long di;
84 unsigned long orig_ax;
85 unsigned long ip;
86 unsigned long cs;
87 unsigned long flags;
88 unsigned long sp;
89 unsigned long ss;
90 unsigned long fs_base;
91 unsigned long gs_base;
92 unsigned long ds;
93 unsigned long es;
94 unsigned long fs;
95 unsigned long gs;
96};
97
98/* When the kernel dumps core, it starts by dumping the user struct -
99 this will be used by gdb to figure out where the data and stack segments
100 are within the file, and what virtual addresses to use. */
101
102struct user {
103/* We start with the registers, to mimic the way that "memory" is returned
104 from the ptrace(3,...) function. */
105 struct user_regs_struct regs; /* Where the registers are actually stored */
106/* ptrace does not yet supply these. Someday.... */
107 int u_fpvalid; /* True if math co-processor being used. */
108 /* for this mess. Not yet used. */
109 int pad0;
110 struct user_i387_struct i387; /* Math Co-processor registers. */
111/* The rest of this junk is to help gdb figure out what goes where */
112 unsigned long int u_tsize; /* Text segment size (pages). */
113 unsigned long int u_dsize; /* Data segment size (pages). */
114 unsigned long int u_ssize; /* Stack segment size (pages). */
115 unsigned long start_code; /* Starting virtual address of text. */
116 unsigned long start_stack; /* Starting virtual address of stack area.
117 This is actually the bottom of the stack,
118 the top of the stack is always found in the
119 esp register. */
120 long int signal; /* Signal that caused the core dump. */
121 int reserved; /* No longer used */
122 int pad1;
123 unsigned long u_ar0; /* Used by gdb to help find the values for */
124 /* the registers. */
125 struct user_i387_struct *u_fpstate; /* Math Co-processor pointer. */
126 unsigned long magic; /* To uniquely identify a core file */
127 char u_comm[32]; /* User command that was responsible */
128 unsigned long u_debugreg[8];
129 unsigned long error_code; /* CPU error code or 0 */
130 unsigned long fault_address; /* CR2 or 0 */
131};
132#define NBPG PAGE_SIZE
133#define UPAGES 1
134#define HOST_TEXT_START_ADDR (u.start_code)
135#define HOST_STACK_END_ADDR (u.start_stack + u.u_ssize * NBPG)
136
137#endif /* _ASM_X86_USER_64_H */
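The register layout above is also what a tracer receives: PTRACE_GETREGS copies the tracee's registers out in exactly this order. A minimal user-space sketch (glibc's copy of the structure names the fields rip/rsp rather than ip/sp; error handling kept minimal):

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);	/* let the parent trace us */
		execlp("true", "true", (char *)NULL);	/* child stops on exec */
		_exit(1);
	}

	waitpid(child, NULL, 0);			/* child is stopped at exec */

	struct user_regs_struct regs;
	if (ptrace(PTRACE_GETREGS, child, NULL, &regs) == 0)
		printf("child rip=%llx rsp=%llx\n",
		       (unsigned long long)regs.rip,
		       (unsigned long long)regs.rsp);

	ptrace(PTRACE_DETACH, child, NULL, NULL);
	return 0;
}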
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
new file mode 100644
index 000000000000..d931d3b7e6f7
--- /dev/null
+++ b/arch/x86/include/asm/uv/bios.h
@@ -0,0 +1,94 @@
1#ifndef _ASM_X86_UV_BIOS_H
2#define _ASM_X86_UV_BIOS_H
3
4/*
5 * UV BIOS layer definitions.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson
23 */
24
25#include <linux/rtc.h>
26
27/*
28 * Values for the BIOS calls. The value is passed as the first argument in the
29 * BIOS call. Passing any other value in the first argument will result
30 * in a BIOS_STATUS_UNIMPLEMENTED return status.
31 */
32enum uv_bios_cmd {
33 UV_BIOS_COMMON,
34 UV_BIOS_GET_SN_INFO,
35 UV_BIOS_FREQ_BASE
36};
37
38/*
39 * Status values returned from a BIOS call.
40 */
41enum {
42 BIOS_STATUS_SUCCESS = 0,
43 BIOS_STATUS_UNIMPLEMENTED = -ENOSYS,
44 BIOS_STATUS_EINVAL = -EINVAL,
45 BIOS_STATUS_UNAVAIL = -EBUSY
46};
47
48/*
49 * The UV system table describes specific firmware
50 * capabilities available to the Linux kernel at runtime.
51 */
52struct uv_systab {
53 char signature[4]; /* must be "UVST" */
54 u32 revision; /* distinguish different firmware revs */
55 u64 function; /* BIOS runtime callback function ptr */
56};
57
58enum {
59 BIOS_FREQ_BASE_PLATFORM = 0,
60 BIOS_FREQ_BASE_INTERVAL_TIMER = 1,
61 BIOS_FREQ_BASE_REALTIME_CLOCK = 2
62};
63
64union partition_info_u {
65 u64 val;
66 struct {
67 u64 hub_version : 8,
68 partition_id : 16,
69 coherence_id : 16,
70 region_size : 24;
71 };
72};
73
74/*
75 * bios calls have 6 parameters
76 */
77extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
78extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
79extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
80
81extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *);
82extern s64 uv_bios_freq_base(u64, u64 *);
83
84extern void uv_bios_init(void);
85
86extern int uv_type;
87extern long sn_partition_id;
88extern long uv_coherency_id;
89extern long uv_region_size;
90#define partition_coherence_id() (uv_coherency_id)
91
92extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
93
94#endif /* _ASM_X86_UV_BIOS_H */
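As a usage sketch (kernel-side; the helper name is hypothetical): querying the real-time clock's base frequency through the interface declared above. The enum argument selects which clock the firmware reports on, and anything other than BIOS_STATUS_SUCCESS is treated as failure.

static unsigned long example_uv_rtc_hz(void)
{
	u64 ticks_per_sec = 0;
	s64 status;

	status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
				   &ticks_per_sec);
	if (status != BIOS_STATUS_SUCCESS)
		return 0;		/* firmware call failed or unimplemented */

	return (unsigned long)ticks_per_sec;
}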
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
new file mode 100644
index 000000000000..e2363253bbbf
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -0,0 +1,332 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV Broadcast Assist Unit definitions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_BAU_H
12#define _ASM_X86_UV_UV_BAU_H
13
14#include <linux/bitmap.h>
15#define BITSPERBYTE 8
16
17/*
18 * Broadcast Assist Unit messaging structures
19 *
20 * Selective Broadcast activations are induced by software action
21 * specifying a particular 8-descriptor "set" via a 6-bit index written
22 * to an MMR.
23 * Thus there are 64 unique 512-byte sets of SB descriptors - one set for
24 * each 6-bit index value. These descriptor sets are mapped in sequence
25 * starting with set 0 located at the address specified in the
26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
28 *
29 * We will use 31 sets, one for sending BAU messages from each of the 32
30 * cpu's on the node.
31 *
32 * TLB shootdown will use the first of the 8 descriptors of each set.
33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
34 */
35
36#define UV_ITEMS_PER_DESCRIPTOR 8
37#define UV_CPUS_PER_ACT_STATUS 32
38#define UV_ACT_STATUS_MASK 0x3
39#define UV_ACT_STATUS_SIZE 2
40#define UV_ACTIVATION_DESCRIPTOR_SIZE 32
41#define UV_DISTRIBUTION_SIZE 256
42#define UV_SW_ACK_NPENDING 8
43#define UV_NET_ENDPOINT_INTD 0x38
44#define UV_DESC_BASE_PNODE_SHIFT 49
45#define UV_PAYLOADQ_PNODE_SHIFT 49
46#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
47#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
48
49/*
50 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
51 */
52#define DESC_STATUS_IDLE 0
53#define DESC_STATUS_ACTIVE 1
54#define DESC_STATUS_DESTINATION_TIMEOUT 2
55#define DESC_STATUS_SOURCE_TIMEOUT 3
56
57/*
58 * source side thresholds at which message retries print a warning
59 */
60#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20
62
63/*
64 * number of entries in the destination side payload queue
65 */
66#define DEST_Q_SIZE 17
67/*
68 * number of destination side software ack resources
69 */
70#define DEST_NUM_RESOURCES 8
71#define MAX_CPUS_PER_NODE 32
72/*
73 * completion statuses for sending a TLB flush message
74 */
75#define FLUSH_RETRY 1
76#define FLUSH_GIVEUP 2
77#define FLUSH_COMPLETE 3
78
79/*
80 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
81 * If the 'multilevel' flag in the header portion of the descriptor
82 * has been set to 0, then endpoint multi-unicast mode is selected.
83 * The distribution specification (32 bytes) is interpreted as a 256-bit
84 * distribution vector. Adjacent bits correspond to consecutive even numbered
85 * nodeIDs. The result of adding the index of a given bit to the 15-bit
86 * 'base_dest_nodeid' field of the header corresponds to the
87 * destination nodeID associated with that specified bit.
88 */
89struct bau_target_nodemask {
90 unsigned long bits[BITS_TO_LONGS(256)];
91};
92
93/*
94 * mask of cpu's on a node
95 * (during initialization we need to check that unsigned long has
96 * enough bits for max. cpu's per node)
97 */
98struct bau_local_cpumask {
99 unsigned long bits;
100};
101
102/*
103 * Payload: 16 bytes (128 bits) (bytes 0x20-0x2f of descriptor)
104 * only 12 bytes (96 bits) of the payload area are usable.
105 * An additional 3 bytes (bits 27:4) of the header address are carried
106 * to the next bytes of the destination payload queue.
107 * And an additional 2 bytes of the header Suppl_A field are also
108 * carried to the destination payload queue.
109 * But the first byte of the Suppl_A becomes bits 127:120 (the 16th byte)
110 * of the destination payload queue, which is written by the hardware
111 * with the s/w ack resource bit vector.
112 * [ effective message contents (16 bytes (128 bits) maximum), not counting
113 * the s/w ack bit vector ]
114 */
115
116/*
117 * The payload is software-defined for INTD transactions
118 */
119struct bau_msg_payload {
120 unsigned long address; /* signifies a page or all TLB's
121 of the cpu */
122 /* 64 bits */
123 unsigned short sending_cpu; /* filled in by sender */
124 /* 16 bits */
125 unsigned short acknowledge_count;/* filled in by destination */
126 /* 16 bits */
127 unsigned int reserved1:32; /* not usable */
128};
129
130
131/*
132 * Message header: 16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
133 * see table 4.2.3.0.1 in broadcast_assist spec.
134 */
135struct bau_msg_header {
136 int dest_subnodeid:6; /* must be zero */
137 /* bits 5:0 */
138 int base_dest_nodeid:15; /* nasid>>1 (pnode) of first bit in node_map */
139 /* bits 20:6 */
140 int command:8; /* message type */
141 /* bits 28:21 */
142 /* 0x38: SN3net EndPoint Message */
143 int rsvd_1:3; /* must be zero */
144 /* bits 31:29 */
145 /* int will align on 32 bits */
146 int rsvd_2:9; /* must be zero */
147 /* bits 40:32 */
148 /* Suppl_A is 56-41 */
149 int payload_2a:8; /* becomes byte 16 of msg */
150 /* bits 48:41 */ /* not currently using */
151 int payload_2b:8; /* becomes byte 17 of msg */
152 /* bits 56:49 */ /* not currently using */
153 /* Address field (96:57) is never used as an
154 address (these are address bits 42:3) */
155 int rsvd_3:1; /* must be zero */
156 /* bit 57 */
157 /* address bits 27:4 are payload */
158 /* these 24 bits become bytes 12-14 of msg */
159 int replied_to:1; /* sent as 0 by the source to byte 12 */
160 /* bit 58 */
161
162 int payload_1a:5; /* not currently used */
163 /* bits 63:59 */
164 int payload_1b:8; /* not currently used */
165 /* bits 71:64 */
166 int payload_1c:8; /* not currently used */
167 /* bits 79:72 */
168 int payload_1d:2; /* not currently used */
169 /* bits 81:80 */
170
171 int rsvd_4:7; /* must be zero */
172 /* bits 88:82 */
173 int sw_ack_flag:1; /* software acknowledge flag */
174 /* bit 89 */
175 /* INTD transactions at destination are to
176 wait for software acknowledge */
177 int rsvd_5:6; /* must be zero */
178 /* bits 95:90 */
179 int rsvd_6:5; /* must be zero */
180 /* bits 100:96 */
181 int int_both:1; /* if 1, interrupt both sockets on the blade */
182 /* bit 101*/
183 int fairness:3; /* usually zero */
184 /* bits 104:102 */
185 int multilevel:1; /* multi-level multicast format */
186 /* bit 105 */
187 /* 0 for TLB: endpoint multi-unicast messages */
188 int chaining:1; /* next descriptor is part of this activation*/
189 /* bit 106 */
190 int rsvd_7:21; /* must be zero */
191 /* bits 127:107 */
192};
193
194/*
195 * The activation descriptor:
196 * The format of the message to send, plus all accompanying control
197 * Should be 64 bytes
198 */
199struct bau_desc {
200 struct bau_target_nodemask distribution;
201 /*
202 * message template, consisting of header and payload:
203 */
204 struct bau_msg_header header;
205 struct bau_msg_payload payload;
206};
207/*
208 * -payload-- ---------header------
209 * bytes 0-11 bits 41-56 bits 58-81
210 * A B (2) C (3)
211 *
212 * A/B/C are moved to:
213 * A C B
214 * bytes 0-11 bytes 12-14 bytes 16-17 (byte 15 filled in by hw as vector)
215 * ------------payload queue-----------
216 */
217
218/*
219 * The payload queue on the destination side is an array of these.
220 * With BAU_MISC_CONTROL set for software acknowledge mode, the messages
221 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
222 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
223 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
224 * sw_ack_vector and payload_2)
225 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
226 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
227 * operation."
228 */
229struct bau_payload_queue_entry {
230 unsigned long address; /* signifies a page or all TLB's
231 of the cpu */
232 /* 64 bits, bytes 0-7 */
233
234 unsigned short sending_cpu; /* cpu that sent the message */
235 /* 16 bits, bytes 8-9 */
236
237 unsigned short acknowledge_count; /* filled in by destination */
238 /* 16 bits, bytes 10-11 */
239
240 unsigned short replied_to:1; /* sent as 0 by the source */
241 /* 1 bit */
242 unsigned short unused1:7; /* not currently using */
243 /* 7 bits: byte 12) */
244
245 unsigned char unused2[2]; /* not currently using */
246 /* bytes 13-14 */
247
248 unsigned char sw_ack_vector; /* filled in by the hardware */
249 /* byte 15 (bits 127:120) */
250
251 unsigned char unused4[3]; /* not currently using bytes 17-19 */
252 /* bytes 17-19 */
253
254 int number_of_cpus; /* filled in at destination */
255 /* 32 bits, bytes 20-23 (aligned) */
256
257 unsigned char unused5[8]; /* not using */
258 /* bytes 24-31 */
259};
260
261/*
262 * one for every slot in the destination payload queue
263 */
264struct bau_msg_status {
265 struct bau_local_cpumask seen_by; /* map of cpu's */
266};
267
268/*
269 * one for every slot in the destination software ack resources
270 */
271struct bau_sw_ack_status {
272 struct bau_payload_queue_entry *msg; /* associated message */
273 int watcher; /* cpu monitoring, or -1 */
274};
275
276/*
277 * one on every node and per-cpu; to locate the software tables
278 */
279struct bau_control {
280 struct bau_desc *descriptor_base;
281 struct bau_payload_queue_entry *bau_msg_head;
282 struct bau_payload_queue_entry *va_queue_first;
283 struct bau_payload_queue_entry *va_queue_last;
284 struct bau_msg_status *msg_statuses;
285 int *watching; /* pointer to array */
286};
287
288/*
289 * This structure is allocated per_cpu for UV TLB shootdown statistics.
290 */
291struct ptc_stats {
292 unsigned long ptc_i; /* number of IPI-style flushes */
293 unsigned long requestor; /* number of nodes this cpu sent to */
294 unsigned long requestee; /* times cpu was remotely requested */
295 unsigned long alltlb; /* times all tlb's on this cpu were flushed */
296 unsigned long onetlb; /* times just one tlb on this cpu was flushed */
297 unsigned long s_retry; /* retries on source side timeouts */
298 unsigned long d_retry; /* retries on destination side timeouts */
299 unsigned long sflush; /* cycles spent in uv_flush_tlb_others */
300 unsigned long dflush; /* cycles spent on destination side */
301 unsigned long retriesok; /* successes on retries */
302 unsigned long nomsg; /* interrupts with no message */
303 unsigned long multmsg; /* interrupts with multiple messages */
304 unsigned long ntargeted;/* nodes targeted */
305};
306
307static inline int bau_node_isset(int node, struct bau_target_nodemask *dstp)
308{
309 return constant_test_bit(node, &dstp->bits[0]);
310}
311static inline void bau_node_set(int node, struct bau_target_nodemask *dstp)
312{
313 __set_bit(node, &dstp->bits[0]);
314}
315static inline void bau_nodes_clear(struct bau_target_nodemask *dstp, int nbits)
316{
317 bitmap_zero(&dstp->bits[0], nbits);
318}
319
320static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
321{
322 bitmap_zero(&dstp->bits, nbits);
323}
324
325#define cpubit_isset(cpu, bau_local_cpumask) \
326 test_bit((cpu), (bau_local_cpumask).bits)
327
328extern int uv_flush_tlb_others(cpumask_t *, struct mm_struct *, unsigned long);
329extern void uv_bau_message_intr1(void);
330extern void uv_bau_timeout_intr1(void);
331
332#endif /* _ASM_X86_UV_UV_BAU_H */
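The descriptor-set layout described at the top of this header reduces to simple arithmetic: 8 descriptors of 64 bytes make a 512-byte set, so set s starts at BASE + s*512 and descriptor d within it sits at BASE + s*512 + d*64. A self-contained sketch with a hypothetical base address:

#include <stdio.h>

#define DESC_SIZE	64UL			/* bytes per activation descriptor */
#define DESC_PER_SET	8UL			/* UV_ITEMS_PER_DESCRIPTOR */
#define SET_SIZE	(DESC_SIZE * DESC_PER_SET)	/* 512 bytes per set */

static unsigned long sb_desc_addr(unsigned long base, unsigned int set,
				  unsigned int desc)
{
	return base + (unsigned long)set * SET_SIZE +
	       (unsigned long)desc * DESC_SIZE;
}

int main(void)
{
	unsigned long base = 0x100000UL;	/* hypothetical BAU_SB_DESCRIPTOR_BASE */

	/* TLB shootdown uses the first descriptor of each cpu's set. */
	printf("cpu 3's first descriptor: 0x%lx\n", sb_desc_addr(base, 3, 0));
	return 0;
}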
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
new file mode 100644
index 000000000000..c6ad93e315c8
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -0,0 +1,354 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV architectural definitions
7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_HUB_H
12#define _ASM_X86_UV_UV_HUB_H
13
14#include <linux/numa.h>
15#include <linux/percpu.h>
16#include <asm/types.h>
17#include <asm/percpu.h>
18
19
20/*
21 * Addressing Terminology
22 *
23 * M - The low M bits of a physical address represent the offset
24 * into the blade local memory. RAM memory on a blade is physically
25 * contiguous (although various IO spaces may punch holes in
26 * it).
27 *
28 * N - Number of bits in the node portion of a socket physical
29 * address.
30 *
31 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
32 * routers always have low bit of 1, C/MBricks have low bit
33 * equal to 0. Most addressing macros that target UV hub chips
34 * right shift the NASID by 1 to exclude the always-zero bit.
35 * NASIDs contain up to 15 bits.
36 *
37 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
38 * of nasids.
39 *
40 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
41 * of the nasid for socket usage.
42 *
43 *
44 * NumaLink Global Physical Address Format:
45 * +--------------------------------+---------------------+
46 * |00..000| GNODE | NodeOffset |
47 * +--------------------------------+---------------------+
48 * |<-------53 - M bits --->|<--------M bits ----->
49 *
50 * M - number of node offset bits (35 .. 40)
51 *
52 *
53 * Memory/UV-HUB Processor Socket Address Format:
54 * +----------------+---------------+---------------------+
55 * |00..000000000000| PNODE | NodeOffset |
56 * +----------------+---------------+---------------------+
57 * <--- N bits --->|<--------M bits ----->
58 *
59 * M - number of node offset bits (35 .. 40)
60 * N - number of PNODE bits (0 .. 10)
61 *
62 * Note: M + N cannot currently exceed 44 (x86_64) or 46 (IA64).
63 * The actual values are configuration dependent and are set at
64 * boot time. M & N values are set by the hardware/BIOS at boot.
65 *
66 *
67 * APICID format
68 * NOTE!!!!!! This is the current format of the APICID. However, code
69 * should assume that this will change in the future. Use functions
70 * in this file for all APICID bit manipulations and conversion.
71 *
72 * 1111110000000000
73 * 5432109876543210
74 * pppppppppplc0cch
75 * sssssssssss
76 *
77 * p = pnode bits
78 * l = socket number on board
79 * c = core
80 * h = hyperthread
81 * s = bits that are in the SOCKET_ID CSR
82 *
83 * Note: Processor only supports 12 bits in the APICID register. The ACPI
84 * tables hold all 16 bits. Software needs to be aware of this.
85 *
86 * Unless otherwise specified, all references to APICID refer to
87 * the FULL value contained in ACPI tables, not the subset in the
88 * processor APICID register.
89 */
90
91
92/*
93 * Maximum number of bricks in all partitions and in all coherency domains.
94 * This is the total number of bricks accessible in the numalink fabric. It
95 * includes all C & M bricks. Routers are NOT included.
96 *
97 * This value is also the value of the maximum number of non-router NASIDs
98 * in the numalink fabric.
99 *
100 * NOTE: a brick may contain 1 or 2 OS nodes. Don't get these confused.
101 */
102#define UV_MAX_NUMALINK_BLADES 16384
103
104/*
105 * Maximum number of C/Mbricks within a software SSI (hardware may support
106 * more).
107 */
108#define UV_MAX_SSI_BLADES 256
109
110/*
111 * The largest possible NASID of a C or M brick (+ 2)
112 */
113#define UV_MAX_NASID_VALUE (UV_MAX_NUMALINK_NODES * 2)
114
115/*
116 * The following defines attributes of the HUB chip. These attributes are
117 * frequently referenced and are kept in the per-cpu data areas of each cpu.
118 * They are kept together in a struct to minimize cache misses.
119 */
120struct uv_hub_info_s {
121 unsigned long global_mmr_base;
122 unsigned long gpa_mask;
123 unsigned long gnode_upper;
124 unsigned long lowmem_remap_top;
125 unsigned long lowmem_remap_base;
126 unsigned short pnode;
127 unsigned short pnode_mask;
128 unsigned short coherency_domain_number;
129 unsigned short numa_blade_id;
130 unsigned char blade_processor_id;
131 unsigned char m_val;
132 unsigned char n_val;
133};
134DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
135#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
136#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
137
138/*
139 * Local & Global MMR space macros.
140 * Note: macros are intended to be used ONLY by inline functions
141 * in this file - not by other kernel code.
142 * n - NASID (full 15-bit global nasid)
143 * g - GNODE (full 15-bit global nasid, right shifted 1)
144 * p - PNODE (local part of nsids, right shifted 1)
145 */
146#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
147#define UV_PNODE_TO_NASID(p) (((p) << 1) | uv_hub_info->gnode_upper)
148
149#define UV_LOCAL_MMR_BASE 0xf4000000UL
150#define UV_GLOBAL_MMR32_BASE 0xf8000000UL
151#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
152#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
153#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
154
155#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
156#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
157
158#define UV_GLOBAL_MMR32_PNODE_BITS(p) ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
159
160#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
161 ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
162
163#define UV_APIC_PNODE_SHIFT 6
164
165/*
166 * Macros for converting between kernel virtual addresses, socket local physical
167 * addresses, and UV global physical addresses.
168 * Note: use the standard __pa() & __va() macros for converting
169 * between socket virtual and socket physical addresses.
170 */
171
172/* socket phys RAM --> UV global physical address */
173static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr)
174{
175 if (paddr < uv_hub_info->lowmem_remap_top)
176 paddr += uv_hub_info->lowmem_remap_base;
177 return paddr | uv_hub_info->gnode_upper;
178}
179
180
181/* socket virtual --> UV global physical address */
182static inline unsigned long uv_gpa(void *v)
183{
184 return __pa(v) | uv_hub_info->gnode_upper;
185}
186
187/* socket virtual --> UV global physical address */
188static inline void *uv_vgpa(void *v)
189{
190 return (void *)uv_gpa(v);
191}
192
193/* UV global physical address --> socket virtual */
194static inline void *uv_va(unsigned long gpa)
195{
196 return __va(gpa & uv_hub_info->gpa_mask);
197}
198
199/* pnode, offset --> socket virtual */
200static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
201{
202 return __va(((unsigned long)pnode << uv_hub_info->m_val) | offset);
203}
204
205
206/*
207 * Extract a PNODE from an APICID (full apicid, not processor subset)
208 */
209static inline int uv_apicid_to_pnode(int apicid)
210{
211 return (apicid >> UV_APIC_PNODE_SHIFT);
212}
213
214/*
215 * Access global MMRs using the low memory MMR32 space. This region supports
216 * faster MMR access but not all MMRs are accessible in this space.
217 */
218static inline unsigned long *uv_global_mmr32_address(int pnode,
219 unsigned long offset)
220{
221 return __va(UV_GLOBAL_MMR32_BASE |
222 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
223}
224
225static inline void uv_write_global_mmr32(int pnode, unsigned long offset,
226 unsigned long val)
227{
228 *uv_global_mmr32_address(pnode, offset) = val;
229}
230
231static inline unsigned long uv_read_global_mmr32(int pnode,
232 unsigned long offset)
233{
234 return *uv_global_mmr32_address(pnode, offset);
235}
236
237/*
238 * Access Global MMR space using the MMR space located at the top of physical
239 * memory.
240 */
241static inline unsigned long *uv_global_mmr64_address(int pnode,
242 unsigned long offset)
243{
244 return __va(UV_GLOBAL_MMR64_BASE |
245 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
246}
247
248static inline void uv_write_global_mmr64(int pnode, unsigned long offset,
249 unsigned long val)
250{
251 *uv_global_mmr64_address(pnode, offset) = val;
252}
253
254static inline unsigned long uv_read_global_mmr64(int pnode,
255 unsigned long offset)
256{
257 return *uv_global_mmr64_address(pnode, offset);
258}
259
260/*
261 * Access hub local MMRs. Faster than using global space but only local MMRs
262 * are accessible.
263 */
264static inline unsigned long *uv_local_mmr_address(unsigned long offset)
265{
266 return __va(UV_LOCAL_MMR_BASE | offset);
267}
268
269static inline unsigned long uv_read_local_mmr(unsigned long offset)
270{
271 return *uv_local_mmr_address(offset);
272}
273
274static inline void uv_write_local_mmr(unsigned long offset, unsigned long val)
275{
276 *uv_local_mmr_address(offset) = val;
277}
278
279/*
280 * Structures and definitions for converting between cpu, node, pnode, and blade
281 * numbers.
282 */
283struct uv_blade_info {
284 unsigned short nr_possible_cpus;
285 unsigned short nr_online_cpus;
286 unsigned short pnode;
287};
288extern struct uv_blade_info *uv_blade_info;
289extern short *uv_node_to_blade;
290extern short *uv_cpu_to_blade;
291extern short uv_possible_blades;
292
293/* Blade-local cpu number of current cpu. Numbered 0 .. <# cpus on the blade> */
294static inline int uv_blade_processor_id(void)
295{
296 return uv_hub_info->blade_processor_id;
297}
298
299/* Blade number of current cpu. Numbered 0 .. <#blades -1> */
300static inline int uv_numa_blade_id(void)
301{
302 return uv_hub_info->numa_blade_id;
303}
304
305/* Convert a cpu number to the UV blade number */
306static inline int uv_cpu_to_blade_id(int cpu)
307{
308 return uv_cpu_to_blade[cpu];
309}
310
311/* Convert linux node number to the UV blade number */
312static inline int uv_node_to_blade_id(int nid)
313{
314 return uv_node_to_blade[nid];
315}
316
317/* Convert a blade id to the PNODE of the blade */
318static inline int uv_blade_to_pnode(int bid)
319{
320 return uv_blade_info[bid].pnode;
321}
322
323/* Determine the number of possible cpus on a blade */
324static inline int uv_blade_nr_possible_cpus(int bid)
325{
326 return uv_blade_info[bid].nr_possible_cpus;
327}
328
329/* Determine the number of online cpus on a blade */
330static inline int uv_blade_nr_online_cpus(int bid)
331{
332 return uv_blade_info[bid].nr_online_cpus;
333}
334
335/* Convert a cpu id to the PNODE of the blade containing the cpu */
336static inline int uv_cpu_to_pnode(int cpu)
337{
338 return uv_blade_info[uv_cpu_to_blade_id(cpu)].pnode;
339}
340
341/* Convert a linux node number to the PNODE of the blade */
342static inline int uv_node_to_pnode(int nid)
343{
344 return uv_blade_info[uv_node_to_blade_id(nid)].pnode;
345}
346
347/* Maximum possible number of blades */
348static inline int uv_num_possible_blades(void)
349{
350 return uv_possible_blades;
351}
352
353#endif /* _ASM_X86_UV_UV_HUB_H */
354
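The addressing terminology above boils down to two small conversions used throughout this header: a pnode is the nasid shifted right by one (the low bit is always zero for C/M bricks) and masked, and the pnode also occupies the upper bits of the APICID (UV_APIC_PNODE_SHIFT is 6). A self-contained sketch with hypothetical values:

#include <stdio.h>

#define APIC_PNODE_SHIFT 6			/* UV_APIC_PNODE_SHIFT */

static int nasid_to_pnode(int nasid, int pnode_mask)
{
	return (nasid >> 1) & pnode_mask;	/* UV_NASID_TO_PNODE() */
}

static int apicid_to_pnode(int apicid)
{
	return apicid >> APIC_PNODE_SHIFT;	/* uv_apicid_to_pnode() */
}

int main(void)
{
	int pnode_mask = 0x3ff;			/* hypothetical 10-bit pnode mask */

	printf("nasid 6     -> pnode %d\n", nasid_to_pnode(6, pnode_mask));
	printf("apicid 0xc5 -> pnode %d\n", apicid_to_pnode(0xc5));
	return 0;
}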
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
new file mode 100644
index 000000000000..9613c8c0b647
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -0,0 +1,36 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ definitions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_IRQ_H
12#define _ASM_X86_UV_UV_IRQ_H
13
14/* If a generic version of this structure gets defined, eliminate this one. */
15struct uv_IO_APIC_route_entry {
16 __u64 vector : 8,
17 delivery_mode : 3,
18 dest_mode : 1,
19 delivery_status : 1,
20 polarity : 1,
21 __reserved_1 : 1,
22 trigger : 1,
23 mask : 1,
24 __reserved_2 : 15,
25 dest : 32;
26};
27
28extern struct irq_chip uv_irq_chip;
29
30extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long);
31extern void arch_disable_uv_irq(int, unsigned long);
32
33extern int uv_setup_irq(char *, int, int, unsigned long);
34extern void uv_teardown_irq(unsigned int, int, unsigned long);
35
36#endif /* _ASM_X86_UV_UV_IRQ_H */
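The route entry above mirrors the fields of a standard IO-APIC redirection table entry. As an illustrative sketch (the vector, destination and mode encodings shown are assumptions), an edge-triggered, unmasked, physically-addressed entry could be filled in with designated initializers:

struct uv_IO_APIC_route_entry entry = {
	.vector        = 0xec,	/* hypothetical interrupt vector */
	.delivery_mode = 0,	/* fixed delivery (assumed encoding) */
	.dest_mode     = 0,	/* physical destination mode */
	.polarity      = 0,	/* active high */
	.trigger       = 0,	/* edge triggered */
	.mask          = 0,	/* not masked */
	.dest          = 0x10,	/* hypothetical destination APIC ID */
};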
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
new file mode 100644
index 000000000000..dd627793a234
--- /dev/null
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -0,0 +1,1295 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV MMR definitions
7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#ifndef _ASM_X86_UV_UV_MMRS_H
12#define _ASM_X86_UV_UV_MMRS_H
13
14#define UV_MMR_ENABLE (1UL << 63)
15
16/* ========================================================================= */
17/* UVH_BAU_DATA_CONFIG */
18/* ========================================================================= */
19#define UVH_BAU_DATA_CONFIG 0x61680UL
20#define UVH_BAU_DATA_CONFIG_32 0x0438
21
22#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
23#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
24#define UVH_BAU_DATA_CONFIG_DM_SHFT 8
25#define UVH_BAU_DATA_CONFIG_DM_MASK 0x0000000000000700UL
26#define UVH_BAU_DATA_CONFIG_DESTMODE_SHFT 11
27#define UVH_BAU_DATA_CONFIG_DESTMODE_MASK 0x0000000000000800UL
28#define UVH_BAU_DATA_CONFIG_STATUS_SHFT 12
29#define UVH_BAU_DATA_CONFIG_STATUS_MASK 0x0000000000001000UL
30#define UVH_BAU_DATA_CONFIG_P_SHFT 13
31#define UVH_BAU_DATA_CONFIG_P_MASK 0x0000000000002000UL
32#define UVH_BAU_DATA_CONFIG_T_SHFT 15
33#define UVH_BAU_DATA_CONFIG_T_MASK 0x0000000000008000UL
34#define UVH_BAU_DATA_CONFIG_M_SHFT 16
35#define UVH_BAU_DATA_CONFIG_M_MASK 0x0000000000010000UL
36#define UVH_BAU_DATA_CONFIG_APIC_ID_SHFT 32
37#define UVH_BAU_DATA_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
38
39union uvh_bau_data_config_u {
40 unsigned long v;
41 struct uvh_bau_data_config_s {
42 unsigned long vector_ : 8; /* RW */
43 unsigned long dm : 3; /* RW */
44 unsigned long destmode : 1; /* RW */
45 unsigned long status : 1; /* RO */
46 unsigned long p : 1; /* RO */
47 unsigned long rsvd_14 : 1; /* */
48 unsigned long t : 1; /* RO */
49 unsigned long m : 1; /* RW */
50 unsigned long rsvd_17_31: 15; /* */
51 unsigned long apic_id : 32; /* RW */
52 } s;
53};
54
55/* ========================================================================= */
56/* UVH_EVENT_OCCURRED0 */
57/* ========================================================================= */
58#define UVH_EVENT_OCCURRED0 0x70000UL
59#define UVH_EVENT_OCCURRED0_32 0x005e8
60
61#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0
62#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
63#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
64#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
65#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
66#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
67#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3
68#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
69#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4
70#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
71#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5
72#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
73#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6
74#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
75#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
76#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
77#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
78#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
79#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
80#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
81#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
82#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
83#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
84#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
85#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
86#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
87#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
88#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
89#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
90#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
91#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
92#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
93#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
94#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
95#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
96#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
97#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
98#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
99#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
100#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
101#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
102#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
103#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
104#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
105#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
106#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
107#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
108#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
109#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
110#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
111#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
112#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
113#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
114#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
115#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
116#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
117#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
118#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
119#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
120#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
121#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
122#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
123#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
124#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
125#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
126#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
127#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
128#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
129#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
130#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
131#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
132#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
133#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
134#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
135#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
136#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
137#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
138#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
139#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
140#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
141#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
142#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
143#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
144#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
145#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
146#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
147#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43
148#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
149#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
150#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
151#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45
152#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
153#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
154#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
155#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
156#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
157#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
158#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
159#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
160#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
161#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
162#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
163#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51
164#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
165#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52
166#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
167#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53
168#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
169#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54
170#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
171#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55
172#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
173#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
174#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
175union uvh_event_occurred0_u {
176 unsigned long v;
177 struct uvh_event_occurred0_s {
178 unsigned long lb_hcerr : 1; /* RW, W1C */
179 unsigned long gr0_hcerr : 1; /* RW, W1C */
180 unsigned long gr1_hcerr : 1; /* RW, W1C */
181 unsigned long lh_hcerr : 1; /* RW, W1C */
182 unsigned long rh_hcerr : 1; /* RW, W1C */
183 unsigned long xn_hcerr : 1; /* RW, W1C */
184 unsigned long si_hcerr : 1; /* RW, W1C */
185 unsigned long lb_aoerr0 : 1; /* RW, W1C */
186 unsigned long gr0_aoerr0 : 1; /* RW, W1C */
187 unsigned long gr1_aoerr0 : 1; /* RW, W1C */
188 unsigned long lh_aoerr0 : 1; /* RW, W1C */
189 unsigned long rh_aoerr0 : 1; /* RW, W1C */
190 unsigned long xn_aoerr0 : 1; /* RW, W1C */
191 unsigned long si_aoerr0 : 1; /* RW, W1C */
192 unsigned long lb_aoerr1 : 1; /* RW, W1C */
193 unsigned long gr0_aoerr1 : 1; /* RW, W1C */
194 unsigned long gr1_aoerr1 : 1; /* RW, W1C */
195 unsigned long lh_aoerr1 : 1; /* RW, W1C */
196 unsigned long rh_aoerr1 : 1; /* RW, W1C */
197 unsigned long xn_aoerr1 : 1; /* RW, W1C */
198 unsigned long si_aoerr1 : 1; /* RW, W1C */
199 unsigned long rh_vpi_int : 1; /* RW, W1C */
200 unsigned long system_shutdown_int : 1; /* RW, W1C */
201 unsigned long lb_irq_int_0 : 1; /* RW, W1C */
202 unsigned long lb_irq_int_1 : 1; /* RW, W1C */
203 unsigned long lb_irq_int_2 : 1; /* RW, W1C */
204 unsigned long lb_irq_int_3 : 1; /* RW, W1C */
205 unsigned long lb_irq_int_4 : 1; /* RW, W1C */
206 unsigned long lb_irq_int_5 : 1; /* RW, W1C */
207 unsigned long lb_irq_int_6 : 1; /* RW, W1C */
208 unsigned long lb_irq_int_7 : 1; /* RW, W1C */
209 unsigned long lb_irq_int_8 : 1; /* RW, W1C */
210 unsigned long lb_irq_int_9 : 1; /* RW, W1C */
211 unsigned long lb_irq_int_10 : 1; /* RW, W1C */
212 unsigned long lb_irq_int_11 : 1; /* RW, W1C */
213 unsigned long lb_irq_int_12 : 1; /* RW, W1C */
214 unsigned long lb_irq_int_13 : 1; /* RW, W1C */
215 unsigned long lb_irq_int_14 : 1; /* RW, W1C */
216 unsigned long lb_irq_int_15 : 1; /* RW, W1C */
217 unsigned long l1_nmi_int : 1; /* RW, W1C */
218 unsigned long stop_clock : 1; /* RW, W1C */
219 unsigned long asic_to_l1 : 1; /* RW, W1C */
220 unsigned long l1_to_asic : 1; /* RW, W1C */
221 unsigned long ltc_int : 1; /* RW, W1C */
222 unsigned long la_seq_trigger : 1; /* RW, W1C */
223 unsigned long ipi_int : 1; /* RW, W1C */
224 unsigned long extio_int0 : 1; /* RW, W1C */
225 unsigned long extio_int1 : 1; /* RW, W1C */
226 unsigned long extio_int2 : 1; /* RW, W1C */
227 unsigned long extio_int3 : 1; /* RW, W1C */
228 unsigned long profile_int : 1; /* RW, W1C */
229 unsigned long rtc0 : 1; /* RW, W1C */
230 unsigned long rtc1 : 1; /* RW, W1C */
231 unsigned long rtc2 : 1; /* RW, W1C */
232 unsigned long rtc3 : 1; /* RW, W1C */
233 unsigned long bau_data : 1; /* RW, W1C */
234 unsigned long power_management_req : 1; /* RW, W1C */
235 unsigned long rsvd_57_63 : 7; /* */
236 } s;
237};
238
239/* ========================================================================= */
240/* UVH_EVENT_OCCURRED0_ALIAS */
241/* ========================================================================= */
242#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
243#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0
244
245/* ========================================================================= */
246/* UVH_INT_CMPB */
247/* ========================================================================= */
248#define UVH_INT_CMPB 0x22080UL
249
250#define UVH_INT_CMPB_REAL_TIME_CMPB_SHFT 0
251#define UVH_INT_CMPB_REAL_TIME_CMPB_MASK 0x00ffffffffffffffUL
252
253union uvh_int_cmpb_u {
254 unsigned long v;
255 struct uvh_int_cmpb_s {
256 unsigned long real_time_cmpb : 56; /* RW */
257 unsigned long rsvd_56_63 : 8; /* */
258 } s;
259};
260
261/* ========================================================================= */
262/* UVH_INT_CMPC */
263/* ========================================================================= */
264#define UVH_INT_CMPC 0x22100UL
265
266#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0
267#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL
268
269union uvh_int_cmpc_u {
270 unsigned long v;
271 struct uvh_int_cmpc_s {
272 unsigned long real_time_cmpc : 56; /* RW */
273 unsigned long rsvd_56_63 : 8; /* */
274 } s;
275};
276
277/* ========================================================================= */
278/* UVH_INT_CMPD */
279/* ========================================================================= */
280#define UVH_INT_CMPD 0x22180UL
281
282#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0
283#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL
284
285union uvh_int_cmpd_u {
286 unsigned long v;
287 struct uvh_int_cmpd_s {
288 unsigned long real_time_cmpd : 56; /* RW */
289 unsigned long rsvd_56_63 : 8; /* */
290 } s;
291};
292
293/* ========================================================================= */
294/* UVH_IPI_INT */
295/* ========================================================================= */
296#define UVH_IPI_INT 0x60500UL
297#define UVH_IPI_INT_32 0x0348
298
299#define UVH_IPI_INT_VECTOR_SHFT 0
300#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
301#define UVH_IPI_INT_DELIVERY_MODE_SHFT 8
302#define UVH_IPI_INT_DELIVERY_MODE_MASK 0x0000000000000700UL
303#define UVH_IPI_INT_DESTMODE_SHFT 11
304#define UVH_IPI_INT_DESTMODE_MASK 0x0000000000000800UL
305#define UVH_IPI_INT_APIC_ID_SHFT 16
306#define UVH_IPI_INT_APIC_ID_MASK 0x0000ffffffff0000UL
307#define UVH_IPI_INT_SEND_SHFT 63
308#define UVH_IPI_INT_SEND_MASK 0x8000000000000000UL
309
310union uvh_ipi_int_u {
311 unsigned long v;
312 struct uvh_ipi_int_s {
313 unsigned long vector_ : 8; /* RW */
314 unsigned long delivery_mode : 3; /* RW */
315 unsigned long destmode : 1; /* RW */
316 unsigned long rsvd_12_15 : 4; /* */
317 unsigned long apic_id : 32; /* RW */
318 unsigned long rsvd_48_62 : 15; /* */
319 unsigned long send : 1; /* WP */
320 } s;
321};
322
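/*
 * Illustrative sketch (not part of the generated MMR definitions): the usual
 * pattern for these register unions is to fill in the bit-fields and then
 * write the 64-bit value with the accessors from uv_hub.h.  The helper name
 * and the delivery-mode encoding here are assumptions.
 */
static inline void example_uv_send_ipi(int pnode, int apicid, int vector)
{
	union uvh_ipi_int_u ipi;

	ipi.v = 0;
	ipi.s.vector_       = vector;	/* interrupt vector to deliver */
	ipi.s.delivery_mode = 0;	/* fixed delivery (assumed encoding) */
	ipi.s.destmode      = 0;	/* physical destination mode */
	ipi.s.apic_id       = apicid;	/* target cpu's APIC ID */
	ipi.s.send          = 1;	/* writing 1 triggers the send */

	uv_write_global_mmr64(pnode, UVH_IPI_INT, ipi.v);
}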
323/* ========================================================================= */
324/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
325/* ========================================================================= */
326#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
327#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0
328
329#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
330#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
331#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_SHFT 49
332#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_NODE_ID_MASK 0x7ffe000000000000UL
333
334union uvh_lb_bau_intd_payload_queue_first_u {
335 unsigned long v;
336 struct uvh_lb_bau_intd_payload_queue_first_s {
337 unsigned long rsvd_0_3: 4; /* */
338 unsigned long address : 39; /* RW */
339 unsigned long rsvd_43_48: 6; /* */
340 unsigned long node_id : 14; /* RW */
341 unsigned long rsvd_63 : 1; /* */
342 } s;
343};
344
345/* ========================================================================= */
346/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
347/* ========================================================================= */
348#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
349#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8
350
351#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
352#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
353
354union uvh_lb_bau_intd_payload_queue_last_u {
355 unsigned long v;
356 struct uvh_lb_bau_intd_payload_queue_last_s {
357 unsigned long rsvd_0_3: 4; /* */
358 unsigned long address : 39; /* RW */
359 unsigned long rsvd_43_63: 21; /* */
360 } s;
361};
362
363/* ========================================================================= */
364/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
365/* ========================================================================= */
366#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
367#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0
368
369#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
370#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
371
372union uvh_lb_bau_intd_payload_queue_tail_u {
373 unsigned long v;
374 struct uvh_lb_bau_intd_payload_queue_tail_s {
375 unsigned long rsvd_0_3: 4; /* */
376 unsigned long address : 39; /* RW */
377 unsigned long rsvd_43_63: 21; /* */
378 } s;
379};
380
381/* ========================================================================= */
382/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
383/* ========================================================================= */
384#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
385#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68
386
387#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
388#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
389#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_SHFT 1
390#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_1_MASK 0x0000000000000002UL
391#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_SHFT 2
392#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_2_MASK 0x0000000000000004UL
393#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_SHFT 3
394#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_3_MASK 0x0000000000000008UL
395#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_SHFT 4
396#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_4_MASK 0x0000000000000010UL
397#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_SHFT 5
398#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_5_MASK 0x0000000000000020UL
399#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_SHFT 6
400#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_6_MASK 0x0000000000000040UL
401#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_SHFT 7
402#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_7_MASK 0x0000000000000080UL
403#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_SHFT 8
404#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_0_MASK 0x0000000000000100UL
405#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_SHFT 9
406#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_1_MASK 0x0000000000000200UL
407#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_SHFT 10
408#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_2_MASK 0x0000000000000400UL
409#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_SHFT 11
410#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_3_MASK 0x0000000000000800UL
411#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_SHFT 12
412#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_4_MASK 0x0000000000001000UL
413#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_SHFT 13
414#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_5_MASK 0x0000000000002000UL
415#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_SHFT 14
416#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
417#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
418#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
419union uvh_lb_bau_intd_software_acknowledge_u {
420 unsigned long v;
421 struct uvh_lb_bau_intd_software_acknowledge_s {
422 unsigned long pending_0 : 1; /* RW, W1C */
423 unsigned long pending_1 : 1; /* RW, W1C */
424 unsigned long pending_2 : 1; /* RW, W1C */
425 unsigned long pending_3 : 1; /* RW, W1C */
426 unsigned long pending_4 : 1; /* RW, W1C */
427 unsigned long pending_5 : 1; /* RW, W1C */
428 unsigned long pending_6 : 1; /* RW, W1C */
429 unsigned long pending_7 : 1; /* RW, W1C */
430 unsigned long timeout_0 : 1; /* RW, W1C */
431 unsigned long timeout_1 : 1; /* RW, W1C */
432 unsigned long timeout_2 : 1; /* RW, W1C */
433 unsigned long timeout_3 : 1; /* RW, W1C */
434 unsigned long timeout_4 : 1; /* RW, W1C */
435 unsigned long timeout_5 : 1; /* RW, W1C */
436 unsigned long timeout_6 : 1; /* RW, W1C */
437 unsigned long timeout_7 : 1; /* RW, W1C */
438 unsigned long rsvd_16_63: 48; /* */
439 } s;
440};
441
442/* ========================================================================= */
443/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
444/* ========================================================================= */
445#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
446#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70
447
448/* ========================================================================= */
449/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
450/* ========================================================================= */
451#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
452#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8
453
454#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
455#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
456#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT 62
457#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK 0x4000000000000000UL
458#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_SHFT 63
459#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INIT_MASK 0x8000000000000000UL
460
461union uvh_lb_bau_sb_activation_control_u {
462 unsigned long v;
463 struct uvh_lb_bau_sb_activation_control_s {
464 unsigned long index : 6; /* RW */
465 unsigned long rsvd_6_61: 56; /* */
466 unsigned long push : 1; /* WP */
467 unsigned long init : 1; /* WP */
468 } s;
469};
470
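As a hedged sketch only (not taken from this diff), software would kick off the broadcast-assist descriptor selected by "index" by writing that index together with the write-pulse PUSH bit, again assuming a uv_write_global_mmr64()-style accessor:

	uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
			      (unsigned long)index |
			      UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_MASK);
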
471/* ========================================================================= */
472/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
473/* ========================================================================= */
474#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
475#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0
476
477#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
478#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
479
480union uvh_lb_bau_sb_activation_status_0_u {
481 unsigned long v;
482 struct uvh_lb_bau_sb_activation_status_0_s {
483 unsigned long status : 64; /* RW */
484 } s;
485};
486
487/* ========================================================================= */
488/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
489/* ========================================================================= */
490#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
491#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8
492
493#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
494#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
495
496union uvh_lb_bau_sb_activation_status_1_u {
497 unsigned long v;
498 struct uvh_lb_bau_sb_activation_status_1_s {
499 unsigned long status : 64; /* RW */
500 } s;
501};
502
503/* ========================================================================= */
504/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
505/* ========================================================================= */
506#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
507#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0
508
509#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
510#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
511#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_SHFT 49
512#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_NODE_ID_MASK 0x7ffe000000000000UL
513
514union uvh_lb_bau_sb_descriptor_base_u {
515 unsigned long v;
516 struct uvh_lb_bau_sb_descriptor_base_s {
517 unsigned long rsvd_0_11 : 12; /* */
518 unsigned long page_address : 31; /* RW */
519 unsigned long rsvd_43_48 : 6; /* */
520 unsigned long node_id : 14; /* RW */
521 unsigned long rsvd_63 : 1; /* */
522 } s;
523};
524
525/* ========================================================================= */
526/* UVH_LB_MCAST_AOERR0_RPT_ENABLE */
527/* ========================================================================= */
528#define UVH_LB_MCAST_AOERR0_RPT_ENABLE 0x50b20UL
529
530#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_SHFT 0
531#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_OBESE_MSG_MASK 0x0000000000000001UL
532#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_SHFT 1
533#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_DATA_SB_ERR_MASK 0x0000000000000002UL
534#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_SHFT 2
535#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_NACK_BUFF_PARITY_MASK 0x0000000000000004UL
536#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_SHFT 3
537#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_TIMEOUT_MASK 0x0000000000000008UL
538#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_SHFT 4
539#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_INACTIVE_REPLY_MASK 0x0000000000000010UL
540#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_SHFT 5
541#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_UPGRADE_ERROR_MASK 0x0000000000000020UL
542#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_SHFT 6
543#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REG_COUNT_UNDERFLOW_MASK 0x0000000000000040UL
544#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_SHFT 7
545#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MCAST_REP_OBESE_MSG_MASK 0x0000000000000080UL
546#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_SHFT 8
547#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_RUNT_MSG_MASK 0x0000000000000100UL
548#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_SHFT 9
549#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_OBESE_MSG_MASK 0x0000000000000200UL
550#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_SHFT 10
551#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REQ_DATA_SB_ERR_MASK 0x0000000000000400UL
552#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_SHFT 11
553#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_RUNT_MSG_MASK 0x0000000000000800UL
554#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_SHFT 12
555#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_OBESE_MSG_MASK 0x0000000000001000UL
556#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_SHFT 13
557#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_DATA_SB_ERR_MASK 0x0000000000002000UL
558#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_SHFT 14
559#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_REP_COMMAND_ERR_MASK 0x0000000000004000UL
560#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_SHFT 15
561#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_UCACHE_PEND_TIMEOUT_MASK 0x0000000000008000UL
562#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_SHFT 16
563#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_RUNT_MSG_MASK 0x0000000000010000UL
564#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_SHFT 17
565#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_OBESE_MSG_MASK 0x0000000000020000UL
566#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_SHFT 18
567#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REQ_DATA_SB_ERR_MASK 0x0000000000040000UL
568#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_SHFT 19
569#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_RUNT_MSG_MASK 0x0000000000080000UL
570#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_SHFT 20
571#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_OBESE_MSG_MASK 0x0000000000100000UL
572#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_SHFT 21
573#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_REP_DATA_SB_ERR_MASK 0x0000000000200000UL
574#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_SHFT 22
575#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_AMO_TIMEOUT_MASK 0x0000000000400000UL
576#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_SHFT 23
577#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_PUT_TIMEOUT_MASK 0x0000000000800000UL
578#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_SHFT 24
579#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_MACC_SPURIOUS_EVENT_MASK 0x0000000001000000UL
580#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_SHFT 25
581#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IOH_DESTINATION_TABLE_PARITY_MASK 0x0000000002000000UL
582#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_SHFT 26
583#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_HAD_ERROR_REPLY_MASK 0x0000000004000000UL
584#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_SHFT 27
585#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_GET_TIMEOUT_MASK 0x0000000008000000UL
586#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_SHFT 28
587#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_LOCK_MANAGER_HAD_ERROR_REPLY_MASK 0x0000000010000000UL
588#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_SHFT 29
589#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_HAD_ERROR_REPLY_MASK 0x0000000020000000UL
590#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_SHFT 30
591#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_PUT_TIMEOUT_MASK 0x0000000040000000UL
592#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_SHFT 31
593#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SB_ACTIVATION_OVERRUN_MASK 0x0000000080000000UL
594#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_SHFT 32
595#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_HAD_ERROR_REPLY_MASK 0x0000000100000000UL
596#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_SHFT 33
597#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_COMPLETED_GB_ACTIVATION_TIMEOUT_MASK 0x0000000200000000UL
598#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_SHFT 34
599#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_0_PARITY_MASK 0x0000000400000000UL
600#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_SHFT 35
601#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_DESCRIPTOR_BUFFER_1_PARITY_MASK 0x0000000800000000UL
602#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_SHFT 36
603#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_SOCKET_DESTINATION_TABLE_PARITY_MASK 0x0000001000000000UL
604#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_SHFT 37
605#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_BAU_REPLY_PAYLOAD_CORRUPTION_MASK 0x0000002000000000UL
606#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_SHFT 38
607#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_IO_PORT_DESTINATION_TABLE_PARITY_MASK 0x0000004000000000UL
608#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_SHFT 39
609#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INTD_SOFT_ACK_TIMEOUT_MASK 0x0000008000000000UL
610#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_SHFT 40
611#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_OBESE_MSG_MASK 0x0000010000000000UL
612#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_SHFT 41
613#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_REP_COMMAND_ERR_MASK 0x0000020000000000UL
614#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_SHFT 42
615#define UVH_LB_MCAST_AOERR0_RPT_ENABLE_INT_TIMEOUT_MASK 0x0000040000000000UL
616
617union uvh_lb_mcast_aoerr0_rpt_enable_u {
618 unsigned long v;
619 struct uvh_lb_mcast_aoerr0_rpt_enable_s {
620 unsigned long mcast_obese_msg : 1; /* RW */
621 unsigned long mcast_data_sb_err : 1; /* RW */
622 unsigned long mcast_nack_buff_parity : 1; /* RW */
623 unsigned long mcast_timeout : 1; /* RW */
624 unsigned long mcast_inactive_reply : 1; /* RW */
625 unsigned long mcast_upgrade_error : 1; /* RW */
626 unsigned long mcast_reg_count_underflow : 1; /* RW */
627 unsigned long mcast_rep_obese_msg : 1; /* RW */
628 unsigned long ucache_req_runt_msg : 1; /* RW */
629 unsigned long ucache_req_obese_msg : 1; /* RW */
630 unsigned long ucache_req_data_sb_err : 1; /* RW */
631 unsigned long ucache_rep_runt_msg : 1; /* RW */
632 unsigned long ucache_rep_obese_msg : 1; /* RW */
633 unsigned long ucache_rep_data_sb_err : 1; /* RW */
634 unsigned long ucache_rep_command_err : 1; /* RW */
635 unsigned long ucache_pend_timeout : 1; /* RW */
636 unsigned long macc_req_runt_msg : 1; /* RW */
637 unsigned long macc_req_obese_msg : 1; /* RW */
638 unsigned long macc_req_data_sb_err : 1; /* RW */
639 unsigned long macc_rep_runt_msg : 1; /* RW */
640 unsigned long macc_rep_obese_msg : 1; /* RW */
641 unsigned long macc_rep_data_sb_err : 1; /* RW */
642 unsigned long macc_amo_timeout : 1; /* RW */
643 unsigned long macc_put_timeout : 1; /* RW */
644 unsigned long macc_spurious_event : 1; /* RW */
645 unsigned long ioh_destination_table_parity : 1; /* RW */
646 unsigned long get_had_error_reply : 1; /* RW */
647 unsigned long get_timeout : 1; /* RW */
648 unsigned long lock_manager_had_error_reply : 1; /* RW */
649 unsigned long put_had_error_reply : 1; /* RW */
650 unsigned long put_timeout : 1; /* RW */
651 unsigned long sb_activation_overrun : 1; /* RW */
652 unsigned long completed_gb_activation_had_error_reply : 1; /* RW */
653 unsigned long completed_gb_activation_timeout : 1; /* RW */
654 unsigned long descriptor_buffer_0_parity : 1; /* RW */
655 unsigned long descriptor_buffer_1_parity : 1; /* RW */
656 unsigned long socket_destination_table_parity : 1; /* RW */
657 unsigned long bau_reply_payload_corruption : 1; /* RW */
658 unsigned long io_port_destination_table_parity : 1; /* RW */
659 unsigned long intd_soft_ack_timeout : 1; /* RW */
660 unsigned long int_rep_obese_msg : 1; /* RW */
661 unsigned long int_rep_command_err : 1; /* RW */
662 unsigned long int_timeout : 1; /* RW */
663 unsigned long rsvd_43_63 : 21; /* */
664 } s;
665};
666
667/* ========================================================================= */
668/* UVH_LOCAL_INT0_CONFIG */
669/* ========================================================================= */
670#define UVH_LOCAL_INT0_CONFIG 0x61000UL
671
672#define UVH_LOCAL_INT0_CONFIG_VECTOR_SHFT 0
673#define UVH_LOCAL_INT0_CONFIG_VECTOR_MASK 0x00000000000000ffUL
674#define UVH_LOCAL_INT0_CONFIG_DM_SHFT 8
675#define UVH_LOCAL_INT0_CONFIG_DM_MASK 0x0000000000000700UL
676#define UVH_LOCAL_INT0_CONFIG_DESTMODE_SHFT 11
677#define UVH_LOCAL_INT0_CONFIG_DESTMODE_MASK 0x0000000000000800UL
678#define UVH_LOCAL_INT0_CONFIG_STATUS_SHFT 12
679#define UVH_LOCAL_INT0_CONFIG_STATUS_MASK 0x0000000000001000UL
680#define UVH_LOCAL_INT0_CONFIG_P_SHFT 13
681#define UVH_LOCAL_INT0_CONFIG_P_MASK 0x0000000000002000UL
682#define UVH_LOCAL_INT0_CONFIG_T_SHFT 15
683#define UVH_LOCAL_INT0_CONFIG_T_MASK 0x0000000000008000UL
684#define UVH_LOCAL_INT0_CONFIG_M_SHFT 16
685#define UVH_LOCAL_INT0_CONFIG_M_MASK 0x0000000000010000UL
686#define UVH_LOCAL_INT0_CONFIG_APIC_ID_SHFT 32
687#define UVH_LOCAL_INT0_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
688
689union uvh_local_int0_config_u {
690 unsigned long v;
691 struct uvh_local_int0_config_s {
692 unsigned long vector_ : 8; /* RW */
693 unsigned long dm : 3; /* RW */
694 unsigned long destmode : 1; /* RW */
695 unsigned long status : 1; /* RO */
696 unsigned long p : 1; /* RO */
697 unsigned long rsvd_14 : 1; /* */
698 unsigned long t : 1; /* RO */
699 unsigned long m : 1; /* RW */
700 unsigned long rsvd_17_31: 15; /* */
701 unsigned long apic_id : 32; /* RW */
702 } s;
703};
704
705/* ========================================================================= */
706/* UVH_LOCAL_INT0_ENABLE */
707/* ========================================================================= */
708#define UVH_LOCAL_INT0_ENABLE 0x65000UL
709
710#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_SHFT 0
711#define UVH_LOCAL_INT0_ENABLE_LB_HCERR_MASK 0x0000000000000001UL
712#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_SHFT 1
713#define UVH_LOCAL_INT0_ENABLE_GR0_HCERR_MASK 0x0000000000000002UL
714#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_SHFT 2
715#define UVH_LOCAL_INT0_ENABLE_GR1_HCERR_MASK 0x0000000000000004UL
716#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_SHFT 3
717#define UVH_LOCAL_INT0_ENABLE_LH_HCERR_MASK 0x0000000000000008UL
718#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_SHFT 4
719#define UVH_LOCAL_INT0_ENABLE_RH_HCERR_MASK 0x0000000000000010UL
720#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_SHFT 5
721#define UVH_LOCAL_INT0_ENABLE_XN_HCERR_MASK 0x0000000000000020UL
722#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_SHFT 6
723#define UVH_LOCAL_INT0_ENABLE_SI_HCERR_MASK 0x0000000000000040UL
724#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_SHFT 7
725#define UVH_LOCAL_INT0_ENABLE_LB_AOERR0_MASK 0x0000000000000080UL
726#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_SHFT 8
727#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR0_MASK 0x0000000000000100UL
728#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_SHFT 9
729#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR0_MASK 0x0000000000000200UL
730#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_SHFT 10
731#define UVH_LOCAL_INT0_ENABLE_LH_AOERR0_MASK 0x0000000000000400UL
732#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_SHFT 11
733#define UVH_LOCAL_INT0_ENABLE_RH_AOERR0_MASK 0x0000000000000800UL
734#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_SHFT 12
735#define UVH_LOCAL_INT0_ENABLE_XN_AOERR0_MASK 0x0000000000001000UL
736#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_SHFT 13
737#define UVH_LOCAL_INT0_ENABLE_SI_AOERR0_MASK 0x0000000000002000UL
738#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_SHFT 14
739#define UVH_LOCAL_INT0_ENABLE_LB_AOERR1_MASK 0x0000000000004000UL
740#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_SHFT 15
741#define UVH_LOCAL_INT0_ENABLE_GR0_AOERR1_MASK 0x0000000000008000UL
742#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_SHFT 16
743#define UVH_LOCAL_INT0_ENABLE_GR1_AOERR1_MASK 0x0000000000010000UL
744#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_SHFT 17
745#define UVH_LOCAL_INT0_ENABLE_LH_AOERR1_MASK 0x0000000000020000UL
746#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_SHFT 18
747#define UVH_LOCAL_INT0_ENABLE_RH_AOERR1_MASK 0x0000000000040000UL
748#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_SHFT 19
749#define UVH_LOCAL_INT0_ENABLE_XN_AOERR1_MASK 0x0000000000080000UL
750#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_SHFT 20
751#define UVH_LOCAL_INT0_ENABLE_SI_AOERR1_MASK 0x0000000000100000UL
752#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_SHFT 21
753#define UVH_LOCAL_INT0_ENABLE_RH_VPI_INT_MASK 0x0000000000200000UL
754#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_SHFT 22
755#define UVH_LOCAL_INT0_ENABLE_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
756#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_SHFT 23
757#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_0_MASK 0x0000000000800000UL
758#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_SHFT 24
759#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_1_MASK 0x0000000001000000UL
760#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_SHFT 25
761#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_2_MASK 0x0000000002000000UL
762#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_SHFT 26
763#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_3_MASK 0x0000000004000000UL
764#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_SHFT 27
765#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_4_MASK 0x0000000008000000UL
766#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_SHFT 28
767#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_5_MASK 0x0000000010000000UL
768#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_SHFT 29
769#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_6_MASK 0x0000000020000000UL
770#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_SHFT 30
771#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_7_MASK 0x0000000040000000UL
772#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_SHFT 31
773#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_8_MASK 0x0000000080000000UL
774#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_SHFT 32
775#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_9_MASK 0x0000000100000000UL
776#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_SHFT 33
777#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_10_MASK 0x0000000200000000UL
778#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_SHFT 34
779#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_11_MASK 0x0000000400000000UL
780#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_SHFT 35
781#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_12_MASK 0x0000000800000000UL
782#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_SHFT 36
783#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_13_MASK 0x0000001000000000UL
784#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_SHFT 37
785#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_14_MASK 0x0000002000000000UL
786#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_SHFT 38
787#define UVH_LOCAL_INT0_ENABLE_LB_IRQ_INT_15_MASK 0x0000004000000000UL
788#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_SHFT 39
789#define UVH_LOCAL_INT0_ENABLE_L1_NMI_INT_MASK 0x0000008000000000UL
790#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_SHFT 40
791#define UVH_LOCAL_INT0_ENABLE_STOP_CLOCK_MASK 0x0000010000000000UL
792#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_SHFT 41
793#define UVH_LOCAL_INT0_ENABLE_ASIC_TO_L1_MASK 0x0000020000000000UL
794#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_SHFT 42
795#define UVH_LOCAL_INT0_ENABLE_L1_TO_ASIC_MASK 0x0000040000000000UL
796#define UVH_LOCAL_INT0_ENABLE_LTC_INT_SHFT 43
797#define UVH_LOCAL_INT0_ENABLE_LTC_INT_MASK 0x0000080000000000UL
798#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_SHFT 44
799#define UVH_LOCAL_INT0_ENABLE_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
800
801union uvh_local_int0_enable_u {
802 unsigned long v;
803 struct uvh_local_int0_enable_s {
804 unsigned long lb_hcerr : 1; /* RW */
805 unsigned long gr0_hcerr : 1; /* RW */
806 unsigned long gr1_hcerr : 1; /* RW */
807 unsigned long lh_hcerr : 1; /* RW */
808 unsigned long rh_hcerr : 1; /* RW */
809 unsigned long xn_hcerr : 1; /* RW */
810 unsigned long si_hcerr : 1; /* RW */
811 unsigned long lb_aoerr0 : 1; /* RW */
812 unsigned long gr0_aoerr0 : 1; /* RW */
813 unsigned long gr1_aoerr0 : 1; /* RW */
814 unsigned long lh_aoerr0 : 1; /* RW */
815 unsigned long rh_aoerr0 : 1; /* RW */
816 unsigned long xn_aoerr0 : 1; /* RW */
817 unsigned long si_aoerr0 : 1; /* RW */
818 unsigned long lb_aoerr1 : 1; /* RW */
819 unsigned long gr0_aoerr1 : 1; /* RW */
820 unsigned long gr1_aoerr1 : 1; /* RW */
821 unsigned long lh_aoerr1 : 1; /* RW */
822 unsigned long rh_aoerr1 : 1; /* RW */
823 unsigned long xn_aoerr1 : 1; /* RW */
824 unsigned long si_aoerr1 : 1; /* RW */
825 unsigned long rh_vpi_int : 1; /* RW */
826 unsigned long system_shutdown_int : 1; /* RW */
827 unsigned long lb_irq_int_0 : 1; /* RW */
828 unsigned long lb_irq_int_1 : 1; /* RW */
829 unsigned long lb_irq_int_2 : 1; /* RW */
830 unsigned long lb_irq_int_3 : 1; /* RW */
831 unsigned long lb_irq_int_4 : 1; /* RW */
832 unsigned long lb_irq_int_5 : 1; /* RW */
833 unsigned long lb_irq_int_6 : 1; /* RW */
834 unsigned long lb_irq_int_7 : 1; /* RW */
835 unsigned long lb_irq_int_8 : 1; /* RW */
836 unsigned long lb_irq_int_9 : 1; /* RW */
837 unsigned long lb_irq_int_10 : 1; /* RW */
838 unsigned long lb_irq_int_11 : 1; /* RW */
839 unsigned long lb_irq_int_12 : 1; /* RW */
840 unsigned long lb_irq_int_13 : 1; /* RW */
841 unsigned long lb_irq_int_14 : 1; /* RW */
842 unsigned long lb_irq_int_15 : 1; /* RW */
843 unsigned long l1_nmi_int : 1; /* RW */
844 unsigned long stop_clock : 1; /* RW */
845 unsigned long asic_to_l1 : 1; /* RW */
846 unsigned long l1_to_asic : 1; /* RW */
847 unsigned long ltc_int : 1; /* RW */
848 unsigned long la_seq_trigger : 1; /* RW */
849 unsigned long rsvd_45_63 : 19; /* */
850 } s;
851};
852
853/* ========================================================================= */
854/* UVH_NODE_ID */
855/* ========================================================================= */
856#define UVH_NODE_ID 0x0UL
857
858#define UVH_NODE_ID_FORCE1_SHFT 0
859#define UVH_NODE_ID_FORCE1_MASK 0x0000000000000001UL
860#define UVH_NODE_ID_MANUFACTURER_SHFT 1
861#define UVH_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
862#define UVH_NODE_ID_PART_NUMBER_SHFT 12
863#define UVH_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
864#define UVH_NODE_ID_REVISION_SHFT 28
865#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
866#define UVH_NODE_ID_NODE_ID_SHFT 32
867#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
868#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48
869#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
870#define UVH_NODE_ID_NI_PORT_SHFT 56
871#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
872
873union uvh_node_id_u {
874 unsigned long v;
875 struct uvh_node_id_s {
876 unsigned long force1 : 1; /* RO */
877 unsigned long manufacturer : 11; /* RO */
878 unsigned long part_number : 16; /* RO */
879 unsigned long revision : 4; /* RO */
880 unsigned long node_id : 15; /* RW */
881 unsigned long rsvd_47 : 1; /* */
882 unsigned long nodes_per_bit : 7; /* RW */
883 unsigned long rsvd_55 : 1; /* */
884 unsigned long ni_port : 4; /* RO */
885 unsigned long rsvd_60_63 : 4; /* */
886 } s;
887};
888
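These unions are typically consumed by reading the raw 64-bit register into .v and then picking fields out of the .s bitfield view. A minimal sketch, assuming the uv_read_local_mmr() accessor from <asm/uv/uv_hub.h>:

	union uvh_node_id_u node_id;

	node_id.v = uv_read_local_mmr(UVH_NODE_ID);
	printk(KERN_INFO "UV: node %lu, revision %lu, %lu nodes per bit\n",
	       (unsigned long)node_id.s.node_id,
	       (unsigned long)node_id.s.revision,
	       (unsigned long)node_id.s.nodes_per_bit);
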
889/* ========================================================================= */
890/* UVH_NODE_PRESENT_TABLE */
891/* ========================================================================= */
892#define UVH_NODE_PRESENT_TABLE 0x1400UL
893#define UVH_NODE_PRESENT_TABLE_DEPTH 16
894
895#define UVH_NODE_PRESENT_TABLE_NODES_SHFT 0
896#define UVH_NODE_PRESENT_TABLE_NODES_MASK 0xffffffffffffffffUL
897
898union uvh_node_present_table_u {
899 unsigned long v;
900 struct uvh_node_present_table_s {
901 unsigned long nodes : 64; /* RW */
902 } s;
903};
904
905/* ========================================================================= */
906/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
907/* ========================================================================= */
908#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
909
910#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 24
911#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_MASK 0x00003fffff000000UL
912
913union uvh_rh_gam_alias210_redirect_config_0_mmr_u {
914 unsigned long v;
915 struct uvh_rh_gam_alias210_redirect_config_0_mmr_s {
916 unsigned long rsvd_0_23 : 24; /* */
917 unsigned long dest_base : 22; /* RW */
918 unsigned long rsvd_46_63: 18; /* */
919 } s;
920};
921
922/* ========================================================================= */
923/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR */
924/* ========================================================================= */
925#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR 0x16000e0UL
926
927#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_SHFT 24
928#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR_DEST_BASE_MASK 0x00003fffff000000UL
929
930union uvh_rh_gam_alias210_redirect_config_1_mmr_u {
931 unsigned long v;
932 struct uvh_rh_gam_alias210_redirect_config_1_mmr_s {
933 unsigned long rsvd_0_23 : 24; /* */
934 unsigned long dest_base : 22; /* RW */
935 unsigned long rsvd_46_63: 18; /* */
936 } s;
937};
938
939/* ========================================================================= */
940/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR */
941/* ========================================================================= */
942#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR 0x16000f0UL
943
944#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_SHFT 24
945#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR_DEST_BASE_MASK 0x00003fffff000000UL
946
947union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
948 unsigned long v;
949 struct uvh_rh_gam_alias210_redirect_config_2_mmr_s {
950 unsigned long rsvd_0_23 : 24; /* */
951 unsigned long dest_base : 22; /* RW */
952 unsigned long rsvd_46_63: 18; /* */
953 } s;
954};
955
956/* ========================================================================= */
957/* UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR */
958/* ========================================================================= */
959#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR 0x1600020UL
960
961#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_SHFT 26
962#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
963#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
964#define UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
965
966union uvh_rh_gam_cfg_overlay_config_mmr_u {
967 unsigned long v;
968 struct uvh_rh_gam_cfg_overlay_config_mmr_s {
969 unsigned long rsvd_0_25: 26; /* */
970 unsigned long base : 20; /* RW */
971 unsigned long rsvd_46_62: 17; /* */
972 unsigned long enable : 1; /* RW */
973 } s;
974};
975
976/* ========================================================================= */
977/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
978/* ========================================================================= */
979#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
980
981#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
982#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
983#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
984#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
985#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
986#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
987#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
988#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
989
990union uvh_rh_gam_gru_overlay_config_mmr_u {
991 unsigned long v;
992 struct uvh_rh_gam_gru_overlay_config_mmr_s {
993 unsigned long rsvd_0_27: 28; /* */
994 unsigned long base : 18; /* RW */
995 unsigned long rsvd_46_47: 2; /* */
996 unsigned long gr4 : 1; /* RW */
997 unsigned long rsvd_49_51: 3; /* */
998 unsigned long n_gru : 4; /* RW */
999 unsigned long rsvd_56_62: 7; /* */
1000 unsigned long enable : 1; /* RW */
1001 } s;
1002};
1003
1004/* ========================================================================= */
1005/* UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR */
1006/* ========================================================================= */
1007#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
1008
1009#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
1010#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
1011#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1012#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1013#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1014#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1015#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1016#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1017
1018union uvh_rh_gam_mmioh_overlay_config_mmr_u {
1019 unsigned long v;
1020 struct uvh_rh_gam_mmioh_overlay_config_mmr_s {
1021 unsigned long rsvd_0_29: 30; /* */
1022 unsigned long base : 16; /* RW */
1023 unsigned long m_io : 6; /* RW */
1024 unsigned long n_io : 4; /* RW */
1025 unsigned long rsvd_56_62: 7; /* */
1026 unsigned long enable : 1; /* RW */
1027 } s;
1028};
1029
1030/* ========================================================================= */
1031/* UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR */
1032/* ========================================================================= */
1033#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR 0x1600028UL
1034
1035#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1036#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1037#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
1038#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
1039#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1040#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1041
1042union uvh_rh_gam_mmr_overlay_config_mmr_u {
1043 unsigned long v;
1044 struct uvh_rh_gam_mmr_overlay_config_mmr_s {
1045 unsigned long rsvd_0_25: 26; /* */
1046 unsigned long base : 20; /* RW */
1047 unsigned long dual_hub : 1; /* RW */
1048 unsigned long rsvd_47_62: 16; /* */
1049 unsigned long enable : 1; /* RW */
1050 } s;
1051};
1052
1053/* ========================================================================= */
1054/* UVH_RTC */
1055/* ========================================================================= */
1056#define UVH_RTC 0x340000UL
1057
1058#define UVH_RTC_REAL_TIME_CLOCK_SHFT 0
1059#define UVH_RTC_REAL_TIME_CLOCK_MASK 0x00ffffffffffffffUL
1060
1061union uvh_rtc_u {
1062 unsigned long v;
1063 struct uvh_rtc_s {
1064 unsigned long real_time_clock : 56; /* RW */
1065 unsigned long rsvd_56_63 : 8; /* */
1066 } s;
1067};
1068
1069/* ========================================================================= */
1070/* UVH_RTC1_INT_CONFIG */
1071/* ========================================================================= */
1072#define UVH_RTC1_INT_CONFIG 0x615c0UL
1073
1074#define UVH_RTC1_INT_CONFIG_VECTOR_SHFT 0
1075#define UVH_RTC1_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1076#define UVH_RTC1_INT_CONFIG_DM_SHFT 8
1077#define UVH_RTC1_INT_CONFIG_DM_MASK 0x0000000000000700UL
1078#define UVH_RTC1_INT_CONFIG_DESTMODE_SHFT 11
1079#define UVH_RTC1_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1080#define UVH_RTC1_INT_CONFIG_STATUS_SHFT 12
1081#define UVH_RTC1_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1082#define UVH_RTC1_INT_CONFIG_P_SHFT 13
1083#define UVH_RTC1_INT_CONFIG_P_MASK 0x0000000000002000UL
1084#define UVH_RTC1_INT_CONFIG_T_SHFT 15
1085#define UVH_RTC1_INT_CONFIG_T_MASK 0x0000000000008000UL
1086#define UVH_RTC1_INT_CONFIG_M_SHFT 16
1087#define UVH_RTC1_INT_CONFIG_M_MASK 0x0000000000010000UL
1088#define UVH_RTC1_INT_CONFIG_APIC_ID_SHFT 32
1089#define UVH_RTC1_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1090
1091union uvh_rtc1_int_config_u {
1092 unsigned long v;
1093 struct uvh_rtc1_int_config_s {
1094 unsigned long vector_ : 8; /* RW */
1095 unsigned long dm : 3; /* RW */
1096 unsigned long destmode : 1; /* RW */
1097 unsigned long status : 1; /* RO */
1098 unsigned long p : 1; /* RO */
1099 unsigned long rsvd_14 : 1; /* */
1100 unsigned long t : 1; /* RO */
1101 unsigned long m : 1; /* RW */
1102 unsigned long rsvd_17_31: 15; /* */
1103 unsigned long apic_id : 32; /* RW */
1104 } s;
1105};
1106
1107/* ========================================================================= */
1108/* UVH_RTC2_INT_CONFIG */
1109/* ========================================================================= */
1110#define UVH_RTC2_INT_CONFIG 0x61600UL
1111
1112#define UVH_RTC2_INT_CONFIG_VECTOR_SHFT 0
1113#define UVH_RTC2_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1114#define UVH_RTC2_INT_CONFIG_DM_SHFT 8
1115#define UVH_RTC2_INT_CONFIG_DM_MASK 0x0000000000000700UL
1116#define UVH_RTC2_INT_CONFIG_DESTMODE_SHFT 11
1117#define UVH_RTC2_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1118#define UVH_RTC2_INT_CONFIG_STATUS_SHFT 12
1119#define UVH_RTC2_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1120#define UVH_RTC2_INT_CONFIG_P_SHFT 13
1121#define UVH_RTC2_INT_CONFIG_P_MASK 0x0000000000002000UL
1122#define UVH_RTC2_INT_CONFIG_T_SHFT 15
1123#define UVH_RTC2_INT_CONFIG_T_MASK 0x0000000000008000UL
1124#define UVH_RTC2_INT_CONFIG_M_SHFT 16
1125#define UVH_RTC2_INT_CONFIG_M_MASK 0x0000000000010000UL
1126#define UVH_RTC2_INT_CONFIG_APIC_ID_SHFT 32
1127#define UVH_RTC2_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1128
1129union uvh_rtc2_int_config_u {
1130 unsigned long v;
1131 struct uvh_rtc2_int_config_s {
1132 unsigned long vector_ : 8; /* RW */
1133 unsigned long dm : 3; /* RW */
1134 unsigned long destmode : 1; /* RW */
1135 unsigned long status : 1; /* RO */
1136 unsigned long p : 1; /* RO */
1137 unsigned long rsvd_14 : 1; /* */
1138 unsigned long t : 1; /* RO */
1139 unsigned long m : 1; /* RW */
1140 unsigned long rsvd_17_31: 15; /* */
1141 unsigned long apic_id : 32; /* RW */
1142 } s;
1143};
1144
1145/* ========================================================================= */
1146/* UVH_RTC3_INT_CONFIG */
1147/* ========================================================================= */
1148#define UVH_RTC3_INT_CONFIG 0x61640UL
1149
1150#define UVH_RTC3_INT_CONFIG_VECTOR_SHFT 0
1151#define UVH_RTC3_INT_CONFIG_VECTOR_MASK 0x00000000000000ffUL
1152#define UVH_RTC3_INT_CONFIG_DM_SHFT 8
1153#define UVH_RTC3_INT_CONFIG_DM_MASK 0x0000000000000700UL
1154#define UVH_RTC3_INT_CONFIG_DESTMODE_SHFT 11
1155#define UVH_RTC3_INT_CONFIG_DESTMODE_MASK 0x0000000000000800UL
1156#define UVH_RTC3_INT_CONFIG_STATUS_SHFT 12
1157#define UVH_RTC3_INT_CONFIG_STATUS_MASK 0x0000000000001000UL
1158#define UVH_RTC3_INT_CONFIG_P_SHFT 13
1159#define UVH_RTC3_INT_CONFIG_P_MASK 0x0000000000002000UL
1160#define UVH_RTC3_INT_CONFIG_T_SHFT 15
1161#define UVH_RTC3_INT_CONFIG_T_MASK 0x0000000000008000UL
1162#define UVH_RTC3_INT_CONFIG_M_SHFT 16
1163#define UVH_RTC3_INT_CONFIG_M_MASK 0x0000000000010000UL
1164#define UVH_RTC3_INT_CONFIG_APIC_ID_SHFT 32
1165#define UVH_RTC3_INT_CONFIG_APIC_ID_MASK 0xffffffff00000000UL
1166
1167union uvh_rtc3_int_config_u {
1168 unsigned long v;
1169 struct uvh_rtc3_int_config_s {
1170 unsigned long vector_ : 8; /* RW */
1171 unsigned long dm : 3; /* RW */
1172 unsigned long destmode : 1; /* RW */
1173 unsigned long status : 1; /* RO */
1174 unsigned long p : 1; /* RO */
1175 unsigned long rsvd_14 : 1; /* */
1176 unsigned long t : 1; /* RO */
1177 unsigned long m : 1; /* RW */
1178 unsigned long rsvd_17_31: 15; /* */
1179 unsigned long apic_id : 32; /* RW */
1180 } s;
1181};
1182
1183/* ========================================================================= */
1184/* UVH_RTC_INC_RATIO */
1185/* ========================================================================= */
1186#define UVH_RTC_INC_RATIO 0x350000UL
1187
1188#define UVH_RTC_INC_RATIO_FRACTION_SHFT 0
1189#define UVH_RTC_INC_RATIO_FRACTION_MASK 0x00000000000fffffUL
1190#define UVH_RTC_INC_RATIO_RATIO_SHFT 20
1191#define UVH_RTC_INC_RATIO_RATIO_MASK 0x0000000000700000UL
1192
1193union uvh_rtc_inc_ratio_u {
1194 unsigned long v;
1195 struct uvh_rtc_inc_ratio_s {
1196 unsigned long fraction : 20; /* RW */
1197 unsigned long ratio : 3; /* RW */
1198 unsigned long rsvd_23_63: 41; /* */
1199 } s;
1200};
1201
1202/* ========================================================================= */
1203/* UVH_SI_ADDR_MAP_CONFIG */
1204/* ========================================================================= */
1205#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL
1206
1207#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0
1208#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL
1209#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
1210#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
1211
1212union uvh_si_addr_map_config_u {
1213 unsigned long v;
1214 struct uvh_si_addr_map_config_s {
1215 unsigned long m_skt : 6; /* RW */
1216 unsigned long rsvd_6_7: 2; /* */
1217 unsigned long n_skt : 4; /* RW */
1218 unsigned long rsvd_12_63: 52; /* */
1219 } s;
1220};
1221
1222/* ========================================================================= */
1223/* UVH_SI_ALIAS0_OVERLAY_CONFIG */
1224/* ========================================================================= */
1225#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL
1226
1227#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24
1228#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1229#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1230#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1231#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63
1232#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1233
1234union uvh_si_alias0_overlay_config_u {
1235 unsigned long v;
1236 struct uvh_si_alias0_overlay_config_s {
1237 unsigned long rsvd_0_23: 24; /* */
1238 unsigned long base : 8; /* RW */
1239 unsigned long rsvd_32_47: 16; /* */
1240 unsigned long m_alias : 5; /* RW */
1241 unsigned long rsvd_53_62: 10; /* */
1242 unsigned long enable : 1; /* RW */
1243 } s;
1244};
1245
1246/* ========================================================================= */
1247/* UVH_SI_ALIAS1_OVERLAY_CONFIG */
1248/* ========================================================================= */
1249#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL
1250
1251#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24
1252#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1253#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1254#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1255#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63
1256#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1257
1258union uvh_si_alias1_overlay_config_u {
1259 unsigned long v;
1260 struct uvh_si_alias1_overlay_config_s {
1261 unsigned long rsvd_0_23: 24; /* */
1262 unsigned long base : 8; /* RW */
1263 unsigned long rsvd_32_47: 16; /* */
1264 unsigned long m_alias : 5; /* RW */
1265 unsigned long rsvd_53_62: 10; /* */
1266 unsigned long enable : 1; /* RW */
1267 } s;
1268};
1269
1270/* ========================================================================= */
1271/* UVH_SI_ALIAS2_OVERLAY_CONFIG */
1272/* ========================================================================= */
1273#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL
1274
1275#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24
1276#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL
1277#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1278#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1279#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
1280#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1281
1282union uvh_si_alias2_overlay_config_u {
1283 unsigned long v;
1284 struct uvh_si_alias2_overlay_config_s {
1285 unsigned long rsvd_0_23: 24; /* */
1286 unsigned long base : 8; /* RW */
1287 unsigned long rsvd_32_47: 16; /* */
1288 unsigned long m_alias : 5; /* RW */
1289 unsigned long rsvd_53_62: 10; /* */
1290 unsigned long enable : 1; /* RW */
1291 } s;
1292};
1293
1294
1295#endif /* _ASM_X86_UV_UV_MMRS_H */
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
new file mode 100644
index 000000000000..9064052b73de
--- /dev/null
+++ b/arch/x86/include/asm/vdso.h
@@ -0,0 +1,47 @@
1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H
3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) \
12({ \
13 extern const char VDSO64_##name[]; \
14 (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
15})
16#endif
17
18#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
19extern const char VDSO32_PRELINK[];
20
21/*
22 * Given a pointer to the vDSO image, find the pointer to VDSO32_name
23 * as that symbol is defined in the vDSO sources or linker script.
24 */
25#define VDSO32_SYMBOL(base, name) \
26({ \
27 extern const char VDSO32_##name[]; \
28 (void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \
29})
30#endif
31
32/*
33 * These symbols are defined with the addresses in the vsyscall page.
34 * See vsyscall-sigreturn.S.
35 */
36extern void __user __kernel_sigreturn;
37extern void __user __kernel_rt_sigreturn;
38
39/*
40 * These symbols are defined by vdso32.S to mark the bounds
41 * of the ELF DSO images included therein.
42 */
43extern const char vdso32_int80_start, vdso32_int80_end;
44extern const char vdso32_syscall_start, vdso32_syscall_end;
45extern const char vdso32_sysenter_start, vdso32_sysenter_end;
46
47#endif /* _ASM_X86_VDSO_H */
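A hedged usage sketch (not part of this header): signal-frame setup code can resolve a trampoline inside the mapped image with these macros, roughly

	void *restorer = VDSO32_SYMBOL(current->mm->context.vdso, sigreturn);

where "sigreturn" stands for whichever VDSO32_<name> symbol the vDSO sources or linker script actually export; the exact call site is an assumption here.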
diff --git a/arch/x86/include/asm/vga.h b/arch/x86/include/asm/vga.h
new file mode 100644
index 000000000000..c4b9dc2f67c5
--- /dev/null
+++ b/arch/x86/include/asm/vga.h
@@ -0,0 +1,20 @@
1/*
2 * Access to VGA videoram
3 *
4 * (c) 1998 Martin Mares <mj@ucw.cz>
5 */
6
7#ifndef _ASM_X86_VGA_H
8#define _ASM_X86_VGA_H
9
10/*
11 * On the PC, we can just recalculate addresses and then
12 * access the videoram directly without any black magic.
13 */
14
15#define VGA_MAP_MEM(x, s) (unsigned long)phys_to_virt(x)
16
17#define vga_readb(x) (*(x))
18#define vga_writeb(x, y) (*(y) = (x))
19
20#endif /* _ASM_X86_VGA_H */
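Note the argument order of vga_writeb(): the value comes first, the destination pointer second. A short illustrative sketch (the 0xb8000 text-mode base and the local variable names are not taken from this header):

	char *vmem = (char *)VGA_MAP_MEM(0xb8000, 0x8000);
	unsigned char c;

	vga_writeb(' ', vmem);		/* value, then destination pointer */
	c = vga_readb(vmem);		/* reads the byte back */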
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
new file mode 100644
index 000000000000..dc27a69e5d2a
--- /dev/null
+++ b/arch/x86/include/asm/vgtod.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_VGTOD_H
2#define _ASM_X86_VGTOD_H
3
4#include <asm/vsyscall.h>
5#include <linux/clocksource.h>
6
7struct vsyscall_gtod_data {
8 seqlock_t lock;
9
10 /* open coded 'struct timespec' */
11 time_t wall_time_sec;
12 u32 wall_time_nsec;
13
14 int sysctl_enabled;
15 struct timezone sys_tz;
16 struct { /* extract of a clocksource struct */
17 cycle_t (*vread)(void);
18 cycle_t cycle_last;
19 cycle_t mask;
20 u32 mult;
21 u32 shift;
22 } clock;
23 struct timespec wall_to_monotonic;
24};
25extern struct vsyscall_gtod_data __vsyscall_gtod_data
26__section_vsyscall_gtod_data;
27extern struct vsyscall_gtod_data vsyscall_gtod_data;
28
29#endif /* _ASM_X86_VGTOD_H */
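Context sketch (an assumption, not part of this header): the vsyscall reader side takes a consistent snapshot of this structure under the seqlock, along the lines of

	unsigned seq;
	time_t sec;
	u32 nsec;

	do {
		seq  = read_seqbegin(&__vsyscall_gtod_data.lock);
		sec  = __vsyscall_gtod_data.wall_time_sec;
		nsec = __vsyscall_gtod_data.wall_time_nsec;
	} while (read_seqretry(&__vsyscall_gtod_data.lock, seq));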
diff --git a/arch/x86/include/asm/vic.h b/arch/x86/include/asm/vic.h
new file mode 100644
index 000000000000..53100f353612
--- /dev/null
+++ b/arch/x86/include/asm/vic.h
@@ -0,0 +1,61 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager Interrupt Controller */
6
7/* The eight CPI vectors. To activate a CPI, you write a bit mask
8 * corresponding to the processor set to be interrupted into the
9 * relevant register. That set of CPUs will then be interrupted with
10 * the CPI */
11static const int VIC_CPI_Registers[] =
12 {0xFC00, 0xFC01, 0xFC08, 0xFC09,
13 0xFC10, 0xFC11, 0xFC18, 0xFC19 };
14
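A hedged sketch of the activation the comment above describes (the helper name and the CPU bit mask are illustrative, not taken from this header):

	/* interrupt CPUs 0 and 2 with cross-processor interrupt "cpi" (0..7) */
	static inline void example_send_cpi(int cpi)
	{
		outb(0x05, VIC_CPI_Registers[cpi]);
	}
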
15#define VIC_PROC_WHO_AM_I 0xfc29
16# define QUAD_IDENTIFIER 0xC0
17# define EIGHT_SLOT_IDENTIFIER 0xE0
18#define QIC_EXTENDED_PROCESSOR_SELECT 0xFC72
19#define VIC_CPI_BASE_REGISTER 0xFC41
20#define VIC_PROCESSOR_ID 0xFC21
21# define VIC_CPU_MASQUERADE_ENABLE 0x8
22
23#define VIC_CLAIM_REGISTER_0 0xFC38
24#define VIC_CLAIM_REGISTER_1 0xFC39
25#define VIC_REDIRECT_REGISTER_0 0xFC60
26#define VIC_REDIRECT_REGISTER_1 0xFC61
27#define VIC_PRIORITY_REGISTER 0xFC20
28
29#define VIC_PRIMARY_MC_BASE 0xFC48
30#define VIC_SECONDARY_MC_BASE 0xFC49
31
32#define QIC_PROCESSOR_ID 0xFC71
33# define QIC_CPUID_ENABLE 0x08
34
35#define QIC_VIC_CPI_BASE_REGISTER 0xFC79
36#define QIC_CPI_BASE_REGISTER 0xFC7A
37
38#define QIC_MASK_REGISTER0 0xFC80
39/* NOTE: these are masked high, enabled low */
40# define QIC_PERF_TIMER 0x01
41# define QIC_LPE 0x02
42# define QIC_SYS_INT 0x04
43# define QIC_CMN_INT 0x08
44/* at the moment, just enable CMN_INT, disable SYS_INT */
45# define QIC_DEFAULT_MASK0 (~(QIC_CMN_INT /* | VIC_SYS_INT */))
46#define QIC_MASK_REGISTER1 0xFC81
47# define QIC_BOOT_CPI_MASK 0xFE
48/* Enable CPI's 1-6 inclusive */
49# define QIC_CPI_ENABLE 0x81
50
51#define QIC_INTERRUPT_CLEAR0 0xFC8A
52#define QIC_INTERRUPT_CLEAR1 0xFC8B
53
54/* this is where we place the CPI vectors */
55#define VIC_DEFAULT_CPI_BASE 0xC0
56/* this is where we place the QIC CPI vectors */
57#define QIC_DEFAULT_CPI_BASE 0xD0
58
59#define VIC_BOOT_INTERRUPT_MASK 0xfe
60
61extern void smp_vic_timer_interrupt(void);
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
new file mode 100644
index 000000000000..166adf61e770
--- /dev/null
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -0,0 +1,125 @@
1#ifndef _ASM_X86_VISWS_COBALT_H
2#define _ASM_X86_VISWS_COBALT_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Cobalt SGI Visual Workstation system ASIC
8 */
9
10#define CO_CPU_NUM_PHYS 0x1e00
11#define CO_CPU_TAB_PHYS (CO_CPU_NUM_PHYS + 2)
12
13#define CO_CPU_MAX 4
14
15#define CO_CPU_PHYS 0xc2000000
16#define CO_APIC_PHYS 0xc4000000
17
18/* see set_fixmap() and asm/fixmap.h */
19#define CO_CPU_VADDR (fix_to_virt(FIX_CO_CPU))
20#define CO_APIC_VADDR (fix_to_virt(FIX_CO_APIC))
21
22/* Cobalt CPU registers -- relative to CO_CPU_VADDR, use co_cpu_*() */
23#define CO_CPU_REV 0x08
24#define CO_CPU_CTRL 0x10
25#define CO_CPU_STAT 0x20
26#define CO_CPU_TIMEVAL 0x30
27
28/* CO_CPU_CTRL bits */
29#define CO_CTRL_TIMERUN 0x04 /* 0 == disabled */
30#define CO_CTRL_TIMEMASK 0x08 /* 0 == unmasked */
31
32/* CO_CPU_STATUS bits */
33#define CO_STAT_TIMEINTR 0x02 /* (r) 1 == int pend, (w) 0 == clear */
34
35/* CO_CPU_TIMEVAL value */
36#define CO_TIME_HZ 100000000 /* Cobalt core rate */
37
38/* Cobalt APIC registers -- relative to CO_APIC_VADDR, use co_apic_*() */
39#define CO_APIC_HI(n) (((n) * 0x10) + 4)
40#define CO_APIC_LO(n) ((n) * 0x10)
41#define CO_APIC_ID 0x0ffc
42
43/* CO_APIC_ID bits */
44#define CO_APIC_ENABLE 0x00000100
45
46/* CO_APIC_LO bits */
47#define CO_APIC_MASK 0x00010000 /* 0 = enabled */
48#define CO_APIC_LEVEL 0x00008000 /* 0 = edge */
49
50/*
51 * Where things are physically wired to Cobalt
52 * #defines with no board _<type>_<rev>_ are common to all (thus far)
53 */
54#define CO_APIC_IDE0 4
55#define CO_APIC_IDE1 2 /* Only on 320 */
56
57#define CO_APIC_8259 12 /* serial, floppy, par-l-l */
58
59/* Lithium PCI Bridge A -- "the one with 82557 Ethernet" */
60#define CO_APIC_PCIA_BASE0 0 /* and 1 */ /* slot 0, line 0 */
61#define CO_APIC_PCIA_BASE123 5 /* and 6 */ /* slot 0, line 1 */
62
63#define CO_APIC_PIIX4_USB 7 /* this one is weird */
64
65/* Lithium PCI Bridge B -- "the one with PIIX4" */
66#define CO_APIC_PCIB_BASE0 8 /* and 9-12 *//* slot 0, line 0 */
67#define CO_APIC_PCIB_BASE123 13 /* 14.15 */ /* slot 0, line 1 */
68
69#define CO_APIC_VIDOUT0 16
70#define CO_APIC_VIDOUT1 17
71#define CO_APIC_VIDIN0 18
72#define CO_APIC_VIDIN1 19
73
74#define CO_APIC_LI_AUDIO 22
75
76#define CO_APIC_AS 24
77#define CO_APIC_RE 25
78
79#define CO_APIC_CPU 28 /* Timer and Cache interrupt */
80#define CO_APIC_NMI 29
81#define CO_APIC_LAST CO_APIC_NMI
82
83/*
84 * This is how irqs are assigned on the Visual Workstation.
85 * Legacy devices get irq's 1-15 (system clock is 0 and is CO_APIC_CPU).
86 * All other devices (including PCI) go to Cobalt and are irq's 16 on up.
87 */
88#define CO_IRQ_APIC0 16 /* irq of apic entry 0 */
89#define IS_CO_APIC(irq) ((irq) >= CO_IRQ_APIC0)
90#define CO_IRQ(apic) (CO_IRQ_APIC0 + (apic)) /* apic ent to irq */
91#define CO_APIC(irq) ((irq) - CO_IRQ_APIC0) /* irq to apic ent */
92#define CO_IRQ_IDE0 14 /* knowledge of... */
93#define CO_IRQ_IDE1 15 /* ... ide driver defaults! */
94#define CO_IRQ_8259 CO_IRQ(CO_APIC_8259)
95
96#ifdef CONFIG_X86_VISWS_APIC
97static inline void co_cpu_write(unsigned long reg, unsigned long v)
98{
99 *((volatile unsigned long *)(CO_CPU_VADDR+reg))=v;
100}
101
102static inline unsigned long co_cpu_read(unsigned long reg)
103{
104 return *((volatile unsigned long *)(CO_CPU_VADDR+reg));
105}
106
107static inline void co_apic_write(unsigned long reg, unsigned long v)
108{
109 *((volatile unsigned long *)(CO_APIC_VADDR+reg))=v;
110}
111
112static inline unsigned long co_apic_read(unsigned long reg)
113{
114 return *((volatile unsigned long *)(CO_APIC_VADDR+reg));
115}
116#endif
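A hedged usage sketch of the accessors above, following the CO_CPU_STAT comment earlier in this header ("(r) 1 == int pend, (w) 0 == clear"); the exact acknowledge sequence is an assumption for illustration:

	/* acknowledge a pending Cobalt timer interrupt */
	co_cpu_write(CO_CPU_STAT, co_cpu_read(CO_CPU_STAT) & ~CO_STAT_TIMEINTR);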
117
118extern char visws_board_type;
119
120#define VISWS_320 0
121#define VISWS_540 1
122
123extern char visws_board_rev;
124
125#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/visws/lithium.h b/arch/x86/include/asm/visws/lithium.h
new file mode 100644
index 000000000000..a10d89bc1270
--- /dev/null
+++ b/arch/x86/include/asm/visws/lithium.h
@@ -0,0 +1,53 @@
1#ifndef _ASM_X86_VISWS_LITHIUM_H
2#define _ASM_X86_VISWS_LITHIUM_H
3
4#include <asm/fixmap.h>
5
6/*
7 * Lithium is the SGI Visual Workstation I/O ASIC
8 */
9
10#define LI_PCI_A_PHYS 0xfc000000 /* Enet is dev 3 */
11#define LI_PCI_B_PHYS 0xfd000000 /* PIIX4 is here */
12
13/* see set_fixmap() and asm/fixmap.h */
14#define LI_PCIA_VADDR (fix_to_virt(FIX_LI_PCIA))
15#define LI_PCIB_VADDR (fix_to_virt(FIX_LI_PCIB))
16
17/* Not a standard PCI? (not in linux/pci.h) */
18#define LI_PCI_BUSNUM 0x44 /* lo8: primary, hi8: sub */
19#define LI_PCI_INTEN 0x46
20
21/* LI_PCI_INTEN bits */
22#define LI_INTA_0 0x0001
23#define LI_INTA_1 0x0002
24#define LI_INTA_2 0x0004
25#define LI_INTA_3 0x0008
26#define LI_INTA_4 0x0010
27#define LI_INTB 0x0020
28#define LI_INTC 0x0040
29#define LI_INTD 0x0080
30
31/* More special purpose macros... */
32static inline void li_pcia_write16(unsigned long reg, unsigned short v)
33{
34 *((volatile unsigned short *)(LI_PCIA_VADDR+reg))=v;
35}
36
37static inline unsigned short li_pcia_read16(unsigned long reg)
38{
39 return *((volatile unsigned short *)(LI_PCIA_VADDR+reg));
40}
41
42static inline void li_pcib_write16(unsigned long reg, unsigned short v)
43{
44 *((volatile unsigned short *)(LI_PCIB_VADDR+reg))=v;
45}
46
47static inline unsigned short li_pcib_read16(unsigned long reg)
48{
49 return *((volatile unsigned short *)(LI_PCIB_VADDR+reg));
50}
51
52#endif /* _ASM_X86_VISWS_LITHIUM_H */
53
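LI_PCI_BUSNUM above packs the primary and subordinate bus numbers into one 16-bit register ("lo8: primary, hi8: sub"); a small sketch of pulling them apart with the bridge-A accessor (the helper names are invented for illustration, not part of lithium.h):

	#include <asm/visws/lithium.h>

	/* Primary bus number lives in the low byte of LI_PCI_BUSNUM. */
	static unsigned char li_pcia_primary_bus(void)
	{
		return li_pcia_read16(LI_PCI_BUSNUM) & 0xff;
	}

	/* Subordinate bus number lives in the high byte. */
	static unsigned char li_pcia_subordinate_bus(void)
	{
		return li_pcia_read16(LI_PCI_BUSNUM) >> 8;
	}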
diff --git a/arch/x86/include/asm/visws/piix4.h b/arch/x86/include/asm/visws/piix4.h
new file mode 100644
index 000000000000..d0af4d338e7f
--- /dev/null
+++ b/arch/x86/include/asm/visws/piix4.h
@@ -0,0 +1,107 @@
1#ifndef _ASM_X86_VISWS_PIIX4_H
2#define _ASM_X86_VISWS_PIIX4_H
3
4/*
5 * PIIX4 as used on SGI Visual Workstations
6 */
7
8#define PIIX_PM_START 0x0F80
9
10#define SIO_GPIO_START 0x0FC0
11
12#define SIO_PM_START 0x0FC8
13
14#define PMBASE PIIX_PM_START
15#define GPIREG0 (PMBASE+0x30)
16#define GPIREG(x) (GPIREG0+((x)/8))
17#define GPIBIT(x) (1 << ((x)%8))
18
19#define PIIX_GPI_BD_ID1 18
20#define PIIX_GPI_BD_ID2 19
21#define PIIX_GPI_BD_ID3 20
22#define PIIX_GPI_BD_ID4 21
23#define PIIX_GPI_BD_REG GPIREG(PIIX_GPI_BD_ID1)
24#define PIIX_GPI_BD_MASK (GPIBIT(PIIX_GPI_BD_ID1) | \
25 GPIBIT(PIIX_GPI_BD_ID2) | \
26 GPIBIT(PIIX_GPI_BD_ID3) | \
27 GPIBIT(PIIX_GPI_BD_ID4) )
28
29#define PIIX_GPI_BD_SHIFT (PIIX_GPI_BD_ID1 % 8)
30
31#define SIO_INDEX 0x2e
32#define SIO_DATA 0x2f
33
34#define SIO_DEV_SEL 0x7
35#define SIO_DEV_ENB 0x30
36#define SIO_DEV_MSB 0x60
37#define SIO_DEV_LSB 0x61
38
39#define SIO_GP_DEV 0x7
40
41#define SIO_GP_BASE SIO_GPIO_START
42#define SIO_GP_MSB (SIO_GP_BASE>>8)
43#define SIO_GP_LSB (SIO_GP_BASE&0xff)
44
45#define SIO_GP_DATA1 (SIO_GP_BASE+0)
46
47#define SIO_PM_DEV 0x8
48
49#define SIO_PM_BASE SIO_PM_START
50#define SIO_PM_MSB (SIO_PM_BASE>>8)
51#define SIO_PM_LSB (SIO_PM_BASE&0xff)
52#define SIO_PM_INDEX (SIO_PM_BASE+0)
53#define SIO_PM_DATA (SIO_PM_BASE+1)
54
55#define SIO_PM_FER2 0x1
56
57#define SIO_PM_GP_EN 0x80
58
59
60
61/*
62 * This is the dev/reg where generating a config cycle will
63 * result in a PCI special cycle.
64 */
65#define SPECIAL_DEV 0xff
66#define SPECIAL_REG 0x00
67
68/*
69 * PIIX4 needs to see a special cycle with the following data
70 * to be convinced the processor has gone into the stop grant
71 * state. PIIX4 insists on seeing this before it will power
72 * down a system.
73 */
74#define PIIX_SPECIAL_STOP 0x00120002
75
76#define PIIX4_RESET_PORT 0xcf9
77#define PIIX4_RESET_VAL 0x6
78
79#define PMSTS_PORT 0xf80 // 2 bytes PM Status
80#define PMEN_PORT 0xf82 // 2 bytes PM Enable
81#define PMCNTRL_PORT 0xf84 // 2 bytes PM Control
82
83#define PM_SUSPEND_ENABLE 0x2000 // start sequence to suspend state
84
85/*
86 * PMSTS and PMEN I/O bit definitions.
87 * (Bits are the same in both registers)
88 */
89#define PM_STS_RSM (1<<15) // Resume Status
90#define PM_STS_PWRBTNOR (1<<11) // Power Button Override
91#define PM_STS_RTC (1<<10) // RTC status
92#define PM_STS_PWRBTN (1<<8) // Power Button Pressed?
93#define PM_STS_GBL (1<<5) // Global Status
94#define PM_STS_BM (1<<4) // Bus Master Status
95#define PM_STS_TMROF (1<<0) // Timer Overflow Status.
96
97/*
98 * Stop clock GPI register
99 */
100#define PIIX_GPIREG0 (0xf80 + 0x30)
101
102/*
103 * Stop clock GPI bit in GPIREG0
104 */
105#define PIIX_GPI_STPCLK 0x4 // STPCLK signal routed back in
106
107#endif /* _ASM_X86_VISWS_PIIX4_H */
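The GPIREG()/GPIBIT() helpers in piix4.h locate the general-purpose input that carries each board-ID bit, and PIIX_GPI_BD_MASK/PIIX_GPI_BD_SHIFT collect GPIs 18-21 into a 4-bit value. A hedged sketch of reading the board identifier back (inb() comes from <asm/io.h>; the function name is invented for illustration):

	#include <asm/io.h>
	#include <asm/visws/piix4.h>

	/* Read the 4-bit board id wired to PIIX4 GPIs 18-21. */
	static unsigned char visws_read_board_id(void)
	{
		unsigned char gpi = inb(PIIX_GPI_BD_REG);	/* GPIREG(18) == PMBASE + 0x32 */

		return (gpi & PIIX_GPI_BD_MASK) >> PIIX_GPI_BD_SHIFT;
	}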
diff --git a/arch/x86/include/asm/visws/sgivw.h b/arch/x86/include/asm/visws/sgivw.h
new file mode 100644
index 000000000000..5fbf63e1003c
--- /dev/null
+++ b/arch/x86/include/asm/visws/sgivw.h
@@ -0,0 +1,5 @@
1/*
2 * Frame buffer position and size:
3 */
4extern unsigned long sgivwfb_mem_phys;
5extern unsigned long sgivwfb_mem_size;
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
new file mode 100644
index 000000000000..f9303602fbc0
--- /dev/null
+++ b/arch/x86/include/asm/vm86.h
@@ -0,0 +1,208 @@
1#ifndef _ASM_X86_VM86_H
2#define _ASM_X86_VM86_H
3
4/*
5 * I'm guessing at the VIF/VIP flag usage, but hope that this is how
6 * the Pentium uses them. Linux will return from vm86 mode when both
7 * VIF and VIP are set.
8 *
9 * On a Pentium, we could probably optimize the virtual flags directly
10 * in the eflags register instead of doing it "by hand" in vflags...
11 *
12 * Linus
13 */
14
15#include <asm/processor-flags.h>
16
17#define BIOSSEG 0x0f000
18
19#define CPU_086 0
20#define CPU_186 1
21#define CPU_286 2
22#define CPU_386 3
23#define CPU_486 4
24#define CPU_586 5
25
26/*
27 * Return values for the 'vm86()' system call
28 */
29#define VM86_TYPE(retval) ((retval) & 0xff)
30#define VM86_ARG(retval) ((retval) >> 8)
31
32#define VM86_SIGNAL 0 /* return due to signal */
33#define VM86_UNKNOWN 1 /* unhandled GP fault
34 - IO-instruction or similar */
35#define VM86_INTx 2 /* int3/int x instruction (ARG = x) */
36#define VM86_STI 3 /* sti/popf/iret instruction enabled
37 virtual interrupts */
38
39/*
40 * Additional return values when invoking new vm86()
41 */
42#define VM86_PICRETURN 4 /* return due to pending PIC request */
43#define VM86_TRAP 6 /* return due to DOS-debugger request */
44
45/*
46 * function codes when invoking new vm86()
47 */
48#define VM86_PLUS_INSTALL_CHECK 0
49#define VM86_ENTER 1
50#define VM86_ENTER_NO_BYPASS 2
51#define VM86_REQUEST_IRQ 3
52#define VM86_FREE_IRQ 4
53#define VM86_GET_IRQ_BITS 5
54#define VM86_GET_AND_RESET_IRQ 6
55
56/*
57 * This is the stack-layout seen by the user space program when we have
58 * done a translation of "SAVE_ALL" from vm86 mode. The real kernel layout
59 * is 'kernel_vm86_regs' (see below).
60 */
61
62struct vm86_regs {
63/*
64 * normal regs, with special meaning for the segment descriptors..
65 */
66 long ebx;
67 long ecx;
68 long edx;
69 long esi;
70 long edi;
71 long ebp;
72 long eax;
73 long __null_ds;
74 long __null_es;
75 long __null_fs;
76 long __null_gs;
77 long orig_eax;
78 long eip;
79 unsigned short cs, __csh;
80 long eflags;
81 long esp;
82 unsigned short ss, __ssh;
83/*
84 * these are specific to v86 mode:
85 */
86 unsigned short es, __esh;
87 unsigned short ds, __dsh;
88 unsigned short fs, __fsh;
89 unsigned short gs, __gsh;
90};
91
92struct revectored_struct {
93 unsigned long __map[8]; /* 256 bits */
94};
95
96struct vm86_struct {
97 struct vm86_regs regs;
98 unsigned long flags;
99 unsigned long screen_bitmap;
100 unsigned long cpu_type;
101 struct revectored_struct int_revectored;
102 struct revectored_struct int21_revectored;
103};
104
105/*
106 * flags masks
107 */
108#define VM86_SCREEN_BITMAP 0x0001
109
110struct vm86plus_info_struct {
111 unsigned long force_return_for_pic:1;
112 unsigned long vm86dbg_active:1; /* for debugger */
113 unsigned long vm86dbg_TFpendig:1; /* for debugger */
114 unsigned long unused:28;
115 unsigned long is_vm86pus:1; /* for vm86 internal use */
116 unsigned char vm86dbg_intxxtab[32]; /* for debugger */
117};
118struct vm86plus_struct {
119 struct vm86_regs regs;
120 unsigned long flags;
121 unsigned long screen_bitmap;
122 unsigned long cpu_type;
123 struct revectored_struct int_revectored;
124 struct revectored_struct int21_revectored;
125 struct vm86plus_info_struct vm86plus;
126};
127
128#ifdef __KERNEL__
129
130#include <asm/ptrace.h>
131
132/*
133 * This is the (kernel) stack-layout when we have done a "SAVE_ALL" from vm86
134 * mode - the main change is that the old segment descriptors aren't
135 * useful any more and are forced to be zero by the kernel (and the
136 * hardware when a trap occurs), and the real segment descriptors are
137 * at the end of the structure. Look at ptrace.h to see the "normal"
138 * setup. For user space layout see 'struct vm86_regs' above.
139 */
140
141struct kernel_vm86_regs {
142/*
143 * normal regs, with special meaning for the segment descriptors..
144 */
145 struct pt_regs pt;
146/*
147 * these are specific to v86 mode:
148 */
149 unsigned short es, __esh;
150 unsigned short ds, __dsh;
151 unsigned short fs, __fsh;
152 unsigned short gs, __gsh;
153};
154
155struct kernel_vm86_struct {
156 struct kernel_vm86_regs regs;
157/*
158 * the below part remains on the kernel stack while we are in VM86 mode.
159 * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
160 * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
161 * 'struct kernel_vm86_regs' with the then actual values.
162 * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
163 * in kernelspace, hence we need not fetch the data from userspace again.
164 */
165#define VM86_TSS_ESP0 flags
166 unsigned long flags;
167 unsigned long screen_bitmap;
168 unsigned long cpu_type;
169 struct revectored_struct int_revectored;
170 struct revectored_struct int21_revectored;
171 struct vm86plus_info_struct vm86plus;
172 struct pt_regs *regs32; /* here we save the pointer to the old regs */
173/*
174 * The below is not part of the structure, but the stack layout continues
175 * this way. In front of 'return-eip' may be some data, depending on
176 * compilation, so we don't rely on this and save the pointer to 'oldregs'
177 * in 'regs32' above.
178 * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
179
180 long return-eip; from call to vm86()
181 struct pt_regs oldregs; user space registers as saved by syscall
182 */
183};
184
185#ifdef CONFIG_VM86
186
187void handle_vm86_fault(struct kernel_vm86_regs *, long);
188int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
189struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
190
191struct task_struct;
192void release_vm86_irqs(struct task_struct *);
193
194#else
195
196#define handle_vm86_fault(a, b)
197#define release_vm86_irqs(a)
198
199static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
200{
201 return 0;
202}
203
204#endif /* CONFIG_VM86 */
205
206#endif /* __KERNEL__ */
207
208#endif /* _ASM_X86_VM86_H */
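The return-value macros in vm86.h pack a reason code into the low byte and its argument into the remaining bits; a small sketch of decoding an exit from vm86 mode (illustrative only, not taken from existing kernel code):

	#include <asm/vm86.h>

	/* For a VM86_INTx exit the argument byte carries the interrupt vector. */
	static int vm86_int_vector(int retval)
	{
		if (VM86_TYPE(retval) == VM86_INTx)
			return VM86_ARG(retval);
		return -1;	/* some other exit reason: signal, STI, trap, ... */
	}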
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
new file mode 100644
index 000000000000..b7c0dea119fe
--- /dev/null
+++ b/arch/x86/include/asm/vmi.h
@@ -0,0 +1,263 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_IODelay 65
101#define VMI_CALL_SetLazyMode 73
102
103/*
104 *---------------------------------------------------------------------
105 *
106 * MMU operation flags
107 *
108 *---------------------------------------------------------------------
109 */
110
111/* Flags used by VMI_{Allocate|Release}Page call */
112#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
113#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
114#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
115
116
117/* Flags shared by Allocate|Release Page and PTE updates */
118#define VMI_PAGE_PT 0x01
119#define VMI_PAGE_PD 0x02
120#define VMI_PAGE_PDP 0x04
121#define VMI_PAGE_PML4 0x08
122
123#define VMI_PAGE_NORMAL 0x00 /* for debugging */
124
125/* Flags used by PTE updates */
126#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
127#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
128#define VMI_PAGE_VA_MASK 0xfffff000
129
130#ifdef CONFIG_X86_PAE
131#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
133#else
134#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
135#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
136#endif
137
138/* Flags used by VMI_FlushTLB call */
139#define VMI_FLUSH_TLB 0x01
140#define VMI_FLUSH_GLOBAL 0x02
141
142/*
143 *---------------------------------------------------------------------
144 *
145 * VMI relocation definitions for ROM call get_reloc
146 *
147 *---------------------------------------------------------------------
148 */
149
150/* VMI Relocation types */
151#define VMI_RELOCATION_NONE 0
152#define VMI_RELOCATION_CALL_REL 1
153#define VMI_RELOCATION_JUMP_REL 2
154#define VMI_RELOCATION_NOP 3
155
156#ifndef __ASSEMBLY__
157struct vmi_relocation_info {
158 unsigned char *eip;
159 unsigned char type;
160 unsigned char reserved[3];
161};
162#endif
163
164
165/*
166 *---------------------------------------------------------------------
167 *
168 * Generic ROM structures and definitions
169 *
170 *---------------------------------------------------------------------
171 */
172
173#ifndef __ASSEMBLY__
174
175struct vrom_header {
176 u16 rom_signature; /* option ROM signature */
177 u8 rom_length; /* ROM length in 512 byte chunks */
178 u8 rom_entry[4]; /* 16-bit code entry point */
179 u8 rom_pad0; /* 4-byte align pad */
180 u32 vrom_signature; /* VROM identification signature */
181 u8 api_version_min;/* Minor version of API */
182 u8 api_version_maj;/* Major version of API */
183 u8 jump_slots; /* Number of jump slots */
184 u8 reserved1; /* Reserved for expansion */
185 u32 virtual_top; /* Hypervisor virtual address start */
186 u16 reserved2; /* Reserved for expansion */
187 u16 license_offs; /* Offset to License string */
188 u16 pci_header_offs;/* Offset to PCI OPROM header */
189 u16 pnp_header_offs;/* Offset to PnP OPROM header */
190 u32 rom_pad3; /* PnP reserved / VMI reserved */
191 u8 reserved[96]; /* Reserved for headers */
192 char vmi_init[8]; /* VMI_Init jump point */
193 char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
194} __attribute__((packed));
195
196struct pnp_header {
197 char sig[4];
198 char rev;
199 char size;
200 short next;
201 short res;
202 long devID;
203 unsigned short manufacturer_offset;
204 unsigned short product_offset;
205} __attribute__((packed));
206
207struct pci_header {
208 char sig[4];
209 short vendorID;
210 short deviceID;
211 short vpdData;
212 short size;
213 char rev;
214 char class;
215 char subclass;
216 char interface;
217 short chunks;
218 char rom_version_min;
219 char rom_version_maj;
220 char codetype;
221 char lastRom;
222 short reserved;
223} __attribute__((packed));
224
225/* Function prototypes for bootstrapping */
226extern void vmi_init(void);
227extern void vmi_bringup(void);
228extern void vmi_apply_boot_page_allocations(void);
229
230/* State needed to start an application processor in an SMP system. */
231struct vmi_ap_state {
232 u32 cr0;
233 u32 cr2;
234 u32 cr3;
235 u32 cr4;
236
237 u64 efer;
238
239 u32 eip;
240 u32 eflags;
241 u32 eax;
242 u32 ebx;
243 u32 ecx;
244 u32 edx;
245 u32 esp;
246 u32 ebp;
247 u32 esi;
248 u32 edi;
249 u16 cs;
250 u16 ss;
251 u16 ds;
252 u16 es;
253 u16 fs;
254 u16 gs;
255 u16 ldtr;
256
257 u16 gdtr_limit;
258 u32 gdtr_base;
259 u32 idtr_base;
260 u16 idtr_limit;
261};
262
263#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
new file mode 100644
index 000000000000..c6e0bee93e3c
--- /dev/null
+++ b/arch/x86/include/asm/vmi_time.h
@@ -0,0 +1,98 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef _ASM_X86_VMI_TIME_H
26#define _ASM_X86_VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_tsc_khz(void);
54
55#ifdef CONFIG_X86_LOCAL_APIC
56extern void __devinit vmi_time_bsp_init(void);
57extern void __devinit vmi_time_ap_init(void);
58#endif
59
60/*
61 * When run under a hypervisor, a vcpu is always in one of three states:
62 * running, halted, or ready. The vcpu is in the 'running' state if it
63 * is executing. When the vcpu executes the halt interface, the vcpu
64 * enters the 'halted' state and remains halted until there is some work
65 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
66 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
67 * state (waiting for the hypervisor to reschedule it). Finally, at any
68 * time when the vcpu is not in the 'running' state nor the 'halted'
69 * state, it is in the 'ready' state.
70 *
71 * Real time advances while the vcpu is 'running', 'ready', or
72 * 'halted'. Stolen time is the time in which the vcpu is in the
73 * 'ready' state. Available time is the remaining time -- the vcpu is
74 * either 'running' or 'halted'.
75 *
76 * All three views of time are accessible through the VMI cycle
77 * counters.
78 */
79
80/* The cycle counters. */
81#define VMI_CYCLES_REAL 0
82#define VMI_CYCLES_AVAILABLE 1
83#define VMI_CYCLES_STOLEN 2
84
85/* The alarm interface 'flags' bits */
86#define VMI_ALARM_COUNTERS 2
87
88#define VMI_ALARM_COUNTER_MASK 0x000000ff
89
90#define VMI_ALARM_WIRED_IRQ0 0x00000000
91#define VMI_ALARM_WIRED_LVTT 0x00010000
92
93#define VMI_ALARM_IS_ONESHOT 0x00000000
94#define VMI_ALARM_IS_PERIODIC 0x00000100
95
96#define CONFIG_VMI_ALARM_HZ 100
97
98#endif /* _ASM_X86_VMI_TIME_H */
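The running/ready/halted accounting described in the comment above is exposed through the three cycle counters; a hedged sketch of turning stolen cycles into nanoseconds (assumes vmi_time_init() has already filled in vmi_timer_ops, and a real 32-bit kernel would use do_div()-style helpers rather than a plain 64-bit divide):

	#include <asm/vmi_time.h>

	/* Cycles spent in the 'ready' state, scaled by the reported cycle rate. */
	static u64 vmi_stolen_ns(void)
	{
		u64 cycles = vmi_timer_ops.get_cycle_counter(VMI_CYCLES_STOLEN);
		u64 hz = vmi_timer_ops.get_cycle_frequency();

		if (!hz)
			return 0;
		return cycles * 1000000000ULL / hz;
	}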
diff --git a/arch/x86/include/asm/voyager.h b/arch/x86/include/asm/voyager.h
new file mode 100644
index 000000000000..9c811d2e6f91
--- /dev/null
+++ b/arch/x86/include/asm/voyager.h
@@ -0,0 +1,528 @@
1/* Copyright (C) 1999,2001
2 *
3 * Author: J.E.J.Bottomley@HansenPartnership.com
4 *
5 * Standard include definitions for the NCR Voyager system */
6
7#undef VOYAGER_DEBUG
8#undef VOYAGER_CAT_DEBUG
9
10#ifdef VOYAGER_DEBUG
11#define VDEBUG(x) printk x
12#else
13#define VDEBUG(x)
14#endif
15
16/* There are three levels of voyager machine: 3, 4 and 5. The rule is
17 * that anything below 3435 is a Level 3, except for the 3360 which is
18 * a Level 4. A 3435 or above is a Level 5 */
19#define VOYAGER_LEVEL5_AND_ABOVE 0x3435
20#define VOYAGER_LEVEL4 0x3360
21
22/* The L4 DINO ASIC */
23#define VOYAGER_DINO 0x43
24
25/* voyager ports in standard I/O space */
26#define VOYAGER_MC_SETUP 0x96
27
28
29#define VOYAGER_CAT_CONFIG_PORT 0x97
30# define VOYAGER_CAT_DESELECT 0xff
31#define VOYAGER_SSPB_RELOCATION_PORT 0x98
32
33/* Valid CAT controller commands */
34/* start instruction register cycle */
35#define VOYAGER_CAT_IRCYC 0x01
36/* start data register cycle */
37#define VOYAGER_CAT_DRCYC 0x02
38/* move to execute state */
39#define VOYAGER_CAT_RUN 0x0F
40/* end operation */
41#define VOYAGER_CAT_END 0x80
42/* hold in idle state */
43#define VOYAGER_CAT_HOLD 0x90
44/* single step an "intest" vector */
45#define VOYAGER_CAT_STEP 0xE0
46/* return cat controller to CLEMSON mode */
47#define VOYAGER_CAT_CLEMSON 0xFF
48
49/* the default cat command header */
50#define VOYAGER_CAT_HEADER 0x7F
51
52/* the range of possible CAT module ids in the system */
53#define VOYAGER_MIN_MODULE 0x10
54#define VOYAGER_MAX_MODULE 0x1f
55
56/* The voyager registers per asic */
57#define VOYAGER_ASIC_ID_REG 0x00
58#define VOYAGER_ASIC_TYPE_REG 0x01
59/* the sub address registers can be made auto incrementing on reads */
60#define VOYAGER_AUTO_INC_REG 0x02
61# define VOYAGER_AUTO_INC 0x04
62# define VOYAGER_NO_AUTO_INC 0xfb
63#define VOYAGER_SUBADDRDATA 0x03
64#define VOYAGER_SCANPATH 0x05
65# define VOYAGER_CONNECT_ASIC 0x01
66# define VOYAGER_DISCONNECT_ASIC 0xfe
67#define VOYAGER_SUBADDRLO 0x06
68#define VOYAGER_SUBADDRHI 0x07
69#define VOYAGER_SUBMODSELECT 0x08
70#define VOYAGER_SUBMODPRESENT 0x09
71
72#define VOYAGER_SUBADDR_LO 0xff
73#define VOYAGER_SUBADDR_HI 0xffff
74
75/* the maximum size of a scan path -- used to form instructions */
76#define VOYAGER_MAX_SCAN_PATH 0x100
77/* the biggest possible register size (in bytes) */
78#define VOYAGER_MAX_REG_SIZE 4
79
80/* Total number of possible modules (including submodules) */
81#define VOYAGER_MAX_MODULES 16
82/* Largest number of asics per module */
83#define VOYAGER_MAX_ASICS_PER_MODULE 7
84
85/* the CAT asic of each module is always the first one */
86#define VOYAGER_CAT_ID 0
87#define VOYAGER_PSI 0x1a
88
89/* voyager instruction operations and registers */
90#define VOYAGER_READ_CONFIG 0x1
91#define VOYAGER_WRITE_CONFIG 0x2
92#define VOYAGER_BYPASS 0xff
93
94typedef struct voyager_asic {
95 __u8 asic_addr; /* ASIC address; Level 4 */
96 __u8 asic_type; /* ASIC type */
97 __u8 asic_id; /* ASIC id */
98 __u8 jtag_id[4]; /* JTAG id */
99 __u8 asic_location; /* Location within scan path; start w/ 0 */
100 __u8 bit_location; /* Location within bit stream; start w/ 0 */
101 __u8 ireg_length; /* Instruction register length */
102 __u16 subaddr; /* Amount of sub address space */
103 struct voyager_asic *next; /* Next asic in linked list */
104} voyager_asic_t;
105
106typedef struct voyager_module {
107 __u8 module_addr; /* Module address */
108 __u8 scan_path_connected; /* Scan path connected */
109 __u16 ee_size; /* Size of the EEPROM */
110 __u16 num_asics; /* Number of Asics */
111 __u16 inst_bits; /* Instruction bits in the scan path */
112 __u16 largest_reg; /* Largest register in the scan path */
113 __u16 smallest_reg; /* Smallest register in the scan path */
114 voyager_asic_t *asic; /* First ASIC in scan path (CAT_I) */
115 struct voyager_module *submodule; /* Submodule pointer */
116 struct voyager_module *next; /* Next module in linked list */
117} voyager_module_t;
118
119typedef struct voyager_eeprom_hdr {
120 __u8 module_id[4];
121 __u8 version_id;
122 __u8 config_id;
123 __u16 boundry_id; /* boundary scan id */
124 __u16 ee_size; /* size of EEPROM */
125 __u8 assembly[11]; /* assembly # */
126 __u8 assembly_rev; /* assembly rev */
127 __u8 tracer[4]; /* tracer number */
128 __u16 assembly_cksum; /* asm checksum */
129 __u16 power_consump; /* pwr requirements */
130 __u16 num_asics; /* number of asics */
131 __u16 bist_time; /* min. bist time */
132 __u16 err_log_offset; /* error log offset */
133 __u16 scan_path_offset;/* scan path offset */
134 __u16 cct_offset;
135 __u16 log_length; /* length of err log */
136 __u16 xsum_end; /* offset to end of
137 checksum */
138 __u8 reserved[4];
139 __u8 sflag; /* starting sentinel */
140 __u8 part_number[13]; /* prom part number */
141 __u8 version[10]; /* version number */
142 __u8 signature[8];
143 __u16 eeprom_chksum;
144 __u32 data_stamp_offset;
145 __u8 eflag; /* ending sentinel */
146} __attribute__((packed)) voyager_eprom_hdr_t;
147
148
149
150#define VOYAGER_EPROM_SIZE_OFFSET \
151 ((__u16)(&(((voyager_eprom_hdr_t *)0)->ee_size)))
152#define VOYAGER_XSUM_END_OFFSET 0x2a
153
154/* the following three definitions are for internal table layouts
155 * in the module EPROMs. We really only care about the IDs and
156 * offsets */
157typedef struct voyager_sp_table {
158 __u8 asic_id;
159 __u8 bypass_flag;
160 __u16 asic_data_offset;
161 __u16 config_data_offset;
162} __attribute__((packed)) voyager_sp_table_t;
163
164typedef struct voyager_jtag_table {
165 __u8 icode[4];
166 __u8 runbist[4];
167 __u8 intest[4];
168 __u8 samp_preld[4];
169 __u8 ireg_len;
170} __attribute__((packed)) voyager_jtt_t;
171
172typedef struct voyager_asic_data_table {
173 __u8 jtag_id[4];
174 __u16 length_bsr;
175 __u16 length_bist_reg;
176 __u32 bist_clk;
177 __u16 subaddr_bits;
178 __u16 seed_bits;
179 __u16 sig_bits;
180 __u16 jtag_offset;
181} __attribute__((packed)) voyager_at_t;
182
183/* Voyager Interrupt Controller (VIC) registers */
184
185/* Base to add to Cross Processor Interrupts (CPIs) when triggering
186 * the CPU IRQ line */
187/* register defines for the WCBICs (one per processor) */
188#define VOYAGER_WCBIC0 0x41 /* bus A node P1 processor 0 */
189#define VOYAGER_WCBIC1 0x49 /* bus A node P1 processor 1 */
190#define VOYAGER_WCBIC2 0x51 /* bus A node P2 processor 0 */
191#define VOYAGER_WCBIC3 0x59 /* bus A node P2 processor 1 */
192#define VOYAGER_WCBIC4 0x61 /* bus B node P1 processor 0 */
193#define VOYAGER_WCBIC5 0x69 /* bus B node P1 processor 1 */
194#define VOYAGER_WCBIC6 0x71 /* bus B node P2 processor 0 */
195#define VOYAGER_WCBIC7 0x79 /* bus B node P2 processor 1 */
196
197
198/* top of memory registers */
199#define VOYAGER_WCBIC_TOM_L 0x4
200#define VOYAGER_WCBIC_TOM_H 0x5
201
202/* register defines for Voyager Memory Control (VMC)
203 * these are present on L4 machines only */
204#define VOYAGER_VMC1 0x81
205#define VOYAGER_VMC2 0x91
206#define VOYAGER_VMC3 0xa1
207#define VOYAGER_VMC4 0xb1
208
209/* VMC Ports */
210#define VOYAGER_VMC_MEMORY_SETUP 0x9
211# define VMC_Interleaving 0x01
212# define VMC_4Way 0x02
213# define VMC_EvenCacheLines 0x04
214# define VMC_HighLine 0x08
215# define VMC_Start0_Enable 0x20
216# define VMC_Start1_Enable 0x40
217# define VMC_Vremap 0x80
218#define VOYAGER_VMC_BANK_DENSITY 0xa
219# define VMC_BANK_EMPTY 0
220# define VMC_BANK_4MB 1
221# define VMC_BANK_16MB 2
222# define VMC_BANK_64MB 3
223# define VMC_BANK0_MASK 0x03
224# define VMC_BANK1_MASK 0x0C
225# define VMC_BANK2_MASK 0x30
226# define VMC_BANK3_MASK 0xC0
227
228/* Magellan Memory Controller (MMC) defines - present on L5 */
229#define VOYAGER_MMC_ASIC_ID 1
230/* the two memory modules corresponding to memory cards in the system */
231#define VOYAGER_MMC_MEMORY0_MODULE 0x14
232#define VOYAGER_MMC_MEMORY1_MODULE 0x15
233/* the Magellan Memory Address (MMA) defines */
234#define VOYAGER_MMA_ASIC_ID 2
235
236/* Submodule number for the Quad Baseboard */
237#define VOYAGER_QUAD_BASEBOARD 1
238
239/* ASIC defines for the Quad Baseboard */
240#define VOYAGER_QUAD_QDATA0 1
241#define VOYAGER_QUAD_QDATA1 2
242#define VOYAGER_QUAD_QABC 3
243
244/* Useful areas in extended CMOS */
245#define VOYAGER_PROCESSOR_PRESENT_MASK 0x88a
246#define VOYAGER_MEMORY_CLICKMAP 0xa23
247#define VOYAGER_DUMP_LOCATION 0xb1a
248
249/* SUS In Control bit - used to tell SUS that we don't need to be
250 * babysat anymore */
251#define VOYAGER_SUS_IN_CONTROL_PORT 0x3ff
252# define VOYAGER_IN_CONTROL_FLAG 0x80
253
254/* Voyager PSI defines */
255#define VOYAGER_PSI_STATUS_REG 0x08
256# define PSI_DC_FAIL 0x01
257# define PSI_MON 0x02
258# define PSI_FAULT 0x04
259# define PSI_ALARM 0x08
260# define PSI_CURRENT 0x10
261# define PSI_DVM 0x20
262# define PSI_PSCFAULT 0x40
263# define PSI_STAT_CHG 0x80
264
265#define VOYAGER_PSI_SUPPLY_REG 0x8000
266 /* read */
267# define PSI_FAIL_DC 0x01
268# define PSI_FAIL_AC 0x02
269# define PSI_MON_INT 0x04
270# define PSI_SWITCH_OFF 0x08
271# define PSI_HX_OFF 0x10
272# define PSI_SECURITY 0x20
273# define PSI_CMOS_BATT_LOW 0x40
274# define PSI_CMOS_BATT_FAIL 0x80
275 /* write */
276# define PSI_CLR_SWITCH_OFF 0x13
277# define PSI_CLR_HX_OFF 0x14
278# define PSI_CLR_CMOS_BATT_FAIL 0x17
279
280#define VOYAGER_PSI_MASK 0x8001
281# define PSI_MASK_MASK 0x10
282
283#define VOYAGER_PSI_AC_FAIL_REG 0x8004
284#define AC_FAIL_STAT_CHANGE 0x80
285
286#define VOYAGER_PSI_GENERAL_REG 0x8007
287 /* read */
288# define PSI_SWITCH_ON 0x01
289# define PSI_SWITCH_ENABLED 0x02
290# define PSI_ALARM_ENABLED 0x08
291# define PSI_SECURE_ENABLED 0x10
292# define PSI_COLD_RESET 0x20
293# define PSI_COLD_START 0x80
294 /* write */
295# define PSI_POWER_DOWN 0x10
296# define PSI_SWITCH_DISABLE 0x01
297# define PSI_SWITCH_ENABLE 0x11
298# define PSI_CLEAR 0x12
299# define PSI_ALARM_DISABLE 0x03
300# define PSI_ALARM_ENABLE 0x13
301# define PSI_CLEAR_COLD_RESET 0x05
302# define PSI_SET_COLD_RESET 0x15
303# define PSI_CLEAR_COLD_START 0x07
304# define PSI_SET_COLD_START 0x17
305
306
307
308struct voyager_bios_info {
309 __u8 len;
310 __u8 major;
311 __u8 minor;
312 __u8 debug;
313 __u8 num_classes;
314 __u8 class_1;
315 __u8 class_2;
316};
317
318/* The following structures and definitions are for the Kernel/SUS
319 * interface; these are needed to find out how SUS initialised any Quad
320 * boards in the system */
321
322#define NUMBER_OF_MC_BUSSES 2
323#define SLOTS_PER_MC_BUS 8
324#define MAX_CPUS 16 /* 16 way CPU system */
325#define MAX_PROCESSOR_BOARDS 4 /* 4 processor slot system */
326#define MAX_CACHE_LEVELS 4 /* # of cache levels supported */
327#define MAX_SHARED_CPUS 4 /* # of CPUs that can share a LARC */
328#define NUMBER_OF_POS_REGS 8
329
330typedef struct {
331 __u8 MC_Slot;
332 __u8 POS_Values[NUMBER_OF_POS_REGS];
333} __attribute__((packed)) MC_SlotInformation_t;
334
335struct QuadDescription {
336 __u8 Type; /* for type 0 (DYADIC or MONADIC) all fields
337 * will be zero except for slot */
338 __u8 StructureVersion;
339 __u32 CPI_BaseAddress;
340 __u32 LARC_BankSize;
341 __u32 LocalMemoryStateBits;
342 __u8 Slot; /* Processor slots 1 - 4 */
343} __attribute__((packed));
344
345struct ProcBoardInfo {
346 __u8 Type;
347 __u8 StructureVersion;
348 __u8 NumberOfBoards;
349 struct QuadDescription QuadData[MAX_PROCESSOR_BOARDS];
350} __attribute__((packed));
351
352struct CacheDescription {
353 __u8 Level;
354 __u32 TotalSize;
355 __u16 LineSize;
356 __u8 Associativity;
357 __u8 CacheType;
358 __u8 WriteType;
359 __u8 Number_CPUs_SharedBy;
360 __u8 Shared_CPUs_Hardware_IDs[MAX_SHARED_CPUS];
361
362} __attribute__((packed));
363
364struct CPU_Description {
365 __u8 CPU_HardwareId;
366 char *FRU_String;
367 __u8 NumberOfCacheLevels;
368 struct CacheDescription CacheLevelData[MAX_CACHE_LEVELS];
369} __attribute__((packed));
370
371struct CPU_Info {
372 __u8 Type;
373 __u8 StructureVersion;
374 __u8 NumberOf_CPUs;
375 struct CPU_Description CPU_Data[MAX_CPUS];
376} __attribute__((packed));
377
378
379/*
380 * This structure will be used by SUS and the OS.
381 * The assumption about this structure is that no blank space is
382 * packed in it by our friend the compiler.
383 */
384typedef struct {
385 __u8 Mailbox_SUS; /* Written to by SUS to give
386 commands/response to the OS */
387 __u8 Mailbox_OS; /* Written to by the OS to give
388 commands/response to SUS */
389 __u8 SUS_MailboxVersion; /* Tells the OS which iteration of the
390 interface SUS supports */
391 __u8 OS_MailboxVersion; /* Tells SUS which iteration of the
392 interface the OS supports */
393 __u32 OS_Flags; /* Flags set by the OS as info for
394 SUS */
395 __u32 SUS_Flags; /* Flags set by SUS as info
396 for the OS */
397 __u32 WatchDogPeriod; /* Watchdog period (in seconds) which
398 the DP uses to see if the OS
399 is dead */
400 __u32 WatchDogCount; /* Updated by the OS on every tick. */
401 __u32 MemoryFor_SUS_ErrorLog; /* Flat 32 bit address which tells SUS
402 where to stuff the SUS error log
403 on a dump */
404 MC_SlotInformation_t MC_SlotInfo[NUMBER_OF_MC_BUSSES*SLOTS_PER_MC_BUS];
405 /* Storage for MCA POS data */
406 /* All new SECOND_PASS_INTERFACE fields added from this point */
407 struct ProcBoardInfo *BoardData;
408 struct CPU_Info *CPU_Data;
409 /* All new fields must be added from this point */
410} Voyager_KernelSUS_Mbox_t;
411
412/* structure for finding the right memory address to send a QIC CPI to */
413struct voyager_qic_cpi {
414 /* Each cache line (32 bytes) can trigger a cpi. The cpi
415 * read/write may occur anywhere in the cache line---pick the
416 * middle to be safe */
417 struct {
418 __u32 pad1[3];
419 __u32 cpi;
420 __u32 pad2[4];
421 } qic_cpi[8];
422};
423
424struct voyager_status {
425 __u32 power_fail:1;
426 __u32 switch_off:1;
427 __u32 request_from_kernel:1;
428};
429
430struct voyager_psi_regs {
431 __u8 cat_id;
432 __u8 cat_dev;
433 __u8 cat_control;
434 __u8 subaddr;
435 __u8 dummy4;
436 __u8 checkbit;
437 __u8 subaddr_low;
438 __u8 subaddr_high;
439 __u8 intstatus;
440 __u8 stat1;
441 __u8 stat3;
442 __u8 fault;
443 __u8 tms;
444 __u8 gen;
445 __u8 sysconf;
446 __u8 dummy15;
447};
448
449struct voyager_psi_subregs {
450 __u8 supply;
451 __u8 mask;
452 __u8 present;
453 __u8 DCfail;
454 __u8 ACfail;
455 __u8 fail;
456 __u8 UPSfail;
457 __u8 genstatus;
458};
459
460struct voyager_psi {
461 struct voyager_psi_regs regs;
462 struct voyager_psi_subregs subregs;
463};
464
465struct voyager_SUS {
466#define VOYAGER_DUMP_BUTTON_NMI 0x1
467#define VOYAGER_SUS_VALID 0x2
468#define VOYAGER_SYSINT_COMPLETE 0x3
469 __u8 SUS_mbox;
470#define VOYAGER_NO_COMMAND 0x0
471#define VOYAGER_IGNORE_DUMP 0x1
472#define VOYAGER_DO_DUMP 0x2
473#define VOYAGER_SYSINT_HANDSHAKE 0x3
474#define VOYAGER_DO_MEM_DUMP 0x4
475#define VOYAGER_SYSINT_WAS_RECOVERED 0x5
476 __u8 kernel_mbox;
477#define VOYAGER_MAILBOX_VERSION 0x10
478 __u8 SUS_version;
479 __u8 kernel_version;
480#define VOYAGER_OS_HAS_SYSINT 0x1
481#define VOYAGER_OS_IN_PROGRESS 0x2
482#define VOYAGER_UPDATING_WDPERIOD 0x4
483 __u32 kernel_flags;
484#define VOYAGER_SUS_BOOTING 0x1
485#define VOYAGER_SUS_IN_PROGRESS 0x2
486 __u32 SUS_flags;
487 __u32 watchdog_period;
488 __u32 watchdog_count;
489 __u32 SUS_errorlog;
490 /* lots of system configuration stuff under here */
491};
492
493/* Variables exported by voyager_smp */
494extern __u32 voyager_extended_vic_processors;
495extern __u32 voyager_allowed_boot_processors;
496extern __u32 voyager_quad_processors;
497extern struct voyager_qic_cpi *voyager_quad_cpi_addr[NR_CPUS];
498extern struct voyager_SUS *voyager_SUS;
499
500/* variables exported always */
501extern struct task_struct *voyager_thread;
502extern int voyager_level;
503extern struct voyager_status voyager_status;
504
505/* functions exported by the voyager and voyager_smp modules */
506extern int voyager_cat_readb(__u8 module, __u8 asic, int reg);
507extern void voyager_cat_init(void);
508extern void voyager_detect(struct voyager_bios_info *);
509extern void voyager_trap_init(void);
510extern void voyager_setup_irqs(void);
511extern int voyager_memory_detect(int region, __u32 *addr, __u32 *length);
512extern void voyager_smp_intr_init(void);
513extern __u8 voyager_extended_cmos_read(__u16 cmos_address);
514extern void voyager_smp_dump(void);
515extern void voyager_timer_interrupt(void);
516extern void smp_local_timer_interrupt(void);
517extern void voyager_power_off(void);
518extern void smp_voyager_power_off(void *dummy);
519extern void voyager_restart(void);
520extern void voyager_cat_power_off(void);
521extern void voyager_cat_do_common_interrupt(void);
522extern void voyager_handle_nmi(void);
523/* Command codes for voyager_cat_psi() below */
524#define VOYAGER_PSI_READ 0
525#define VOYAGER_PSI_WRITE 1
526#define VOYAGER_PSI_SUBREAD 2
527#define VOYAGER_PSI_SUBWRITE 3
528extern void voyager_cat_psi(__u8, __u16, __u8 *);
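Among the exports above, voyager_extended_cmos_read() is how the kernel pulls SUS-provided configuration, such as the processor-present bitmap, out of extended CMOS. A hedged sketch (the helper is invented for illustration; the real SMP code reads several consecutive bytes, this only grabs the first):

	#include <asm/voyager.h>

	/* First byte of the populated-processor bitmap kept in extended CMOS. */
	static __u8 voyager_present_cpus_lo(void)
	{
		return voyager_extended_cmos_read(VOYAGER_PROCESSOR_PRESENT_MASK);
	}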
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
new file mode 100644
index 000000000000..d0983d255fbd
--- /dev/null
+++ b/arch/x86/include/asm/vsyscall.h
@@ -0,0 +1,44 @@
1#ifndef _ASM_X86_VSYSCALL_H
2#define _ASM_X86_VSYSCALL_H
3
4enum vsyscall_num {
5 __NR_vgettimeofday,
6 __NR_vtime,
7 __NR_vgetcpu,
8};
9
10#define VSYSCALL_START (-10UL << 20)
11#define VSYSCALL_SIZE 1024
12#define VSYSCALL_END (-2UL << 20)
13#define VSYSCALL_MAPPED_PAGES 1
14#define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
15
16#ifdef __KERNEL__
17#include <linux/seqlock.h>
18
19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
21
22/* Definitions for CONFIG_GENERIC_TIME */
23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
29
30#define VGETCPU_RDTSCP 1
31#define VGETCPU_LSL 2
32
33extern int __vgetcpu_mode;
34extern volatile unsigned long __jiffies;
35
36/* kernel space (writeable) */
37extern int vgetcpu_mode;
38extern struct timezone sys_tz;
39
40extern void map_vsyscall(void);
41
42#endif /* __KERNEL__ */
43
44#endif /* _ASM_X86_VSYSCALL_H */
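VSYSCALL_ADDR() simply lays the three vsyscalls out in 1024-byte slots starting at VSYSCALL_START (-10UL << 20, i.e. 0xffffffffff600000 on x86-64). Expanding the macros gives, for illustration:

	/* VSYSCALL_ADDR(__NR_vgettimeofday) == 0xffffffffff600000
	 * VSYSCALL_ADDR(__NR_vtime)         == 0xffffffffff600400
	 * VSYSCALL_ADDR(__NR_vgetcpu)       == 0xffffffffff600800
	 * VSYSCALL_END                      == 0xffffffffffe00000  (-2UL << 20)
	 */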
diff --git a/arch/x86/include/asm/xcr.h b/arch/x86/include/asm/xcr.h
new file mode 100644
index 000000000000..f2cba4e79a23
--- /dev/null
+++ b/arch/x86/include/asm/xcr.h
@@ -0,0 +1,49 @@
1/* -*- linux-c -*- ------------------------------------------------------- *
2 *
3 * Copyright 2008 rPath, Inc. - All Rights Reserved
4 *
5 * This file is part of the Linux kernel, and is made available under
6 * the terms of the GNU General Public License version 2 or (at your
7 * option) any later version; incorporated herein by reference.
8 *
9 * ----------------------------------------------------------------------- */
10
11/*
12 * asm-x86/xcr.h
13 *
14 * Definitions for the eXtended Control Register instructions
15 */
16
17#ifndef _ASM_X86_XCR_H
18#define _ASM_X86_XCR_H
19
20#define XCR_XFEATURE_ENABLED_MASK 0x00000000
21
22#ifdef __KERNEL__
23# ifndef __ASSEMBLY__
24
25#include <linux/types.h>
26
27static inline u64 xgetbv(u32 index)
28{
29 u32 eax, edx;
30
31 asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */
32 : "=a" (eax), "=d" (edx)
33 : "c" (index));
34 return eax + ((u64)edx << 32);
35}
36
37static inline void xsetbv(u32 index, u64 value)
38{
39 u32 eax = value;
40 u32 edx = value >> 32;
41
42 asm volatile(".byte 0x0f,0x01,0xd1" /* xsetbv */
43 : : "a" (eax), "d" (edx), "c" (index));
44}
45
46# endif /* __ASSEMBLY__ */
47#endif /* __KERNEL__ */
48
49#endif /* _ASM_X86_XCR_H */
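xgetbv()/xsetbv() above wrap the raw XGETBV/XSETBV opcodes; a minimal sketch of querying XCR0 (index XCR_XFEATURE_ENABLED_MASK) to check whether x87 and SSE state are enabled for XSAVE. This assumes the CPU supports XSAVE and that OSXSAVE has been enabled; real code would check CPUID first:

	#include <asm/xcr.h>

	/* XCR0 bit 0 = x87 state, bit 1 = SSE state (architectural layout). */
	static int xsave_x87_sse_enabled(void)
	{
		u64 xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);

		return (xcr0 & 0x3) == 0x3;
	}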
diff --git a/arch/x86/include/asm/xen/events.h b/arch/x86/include/asm/xen/events.h
new file mode 100644
index 000000000000..19144184983a
--- /dev/null
+++ b/arch/x86/include/asm/xen/events.h
@@ -0,0 +1,24 @@
1#ifndef _ASM_X86_XEN_EVENTS_H
2#define _ASM_X86_XEN_EVENTS_H
3
4enum ipi_vector {
5 XEN_RESCHEDULE_VECTOR,
6 XEN_CALL_FUNCTION_VECTOR,
7 XEN_CALL_FUNCTION_SINGLE_VECTOR,
8 XEN_SPIN_UNLOCK_VECTOR,
9
10 XEN_NR_IPIS,
11};
12
13static inline int xen_irqs_disabled(struct pt_regs *regs)
14{
15 return raw_irqs_disabled_flags(regs->flags);
16}
17
18static inline void xen_do_IRQ(int irq, struct pt_regs *regs)
19{
20 regs->orig_ax = ~irq;
21 do_IRQ(regs);
22}
23
24#endif /* _ASM_X86_XEN_EVENTS_H */
diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h
new file mode 100644
index 000000000000..fdbbb45767a6
--- /dev/null
+++ b/arch/x86/include/asm/xen/grant_table.h
@@ -0,0 +1,7 @@
1#ifndef _ASM_X86_XEN_GRANT_TABLE_H
2#define _ASM_X86_XEN_GRANT_TABLE_H
3
4#define xen_alloc_vm_area(size) alloc_vm_area(size)
5#define xen_free_vm_area(area) free_vm_area(area)
6
7#endif /* _ASM_X86_XEN_GRANT_TABLE_H */
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
new file mode 100644
index 000000000000..3f6000d95fe2
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -0,0 +1,527 @@
1/******************************************************************************
2 * hypercall.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef _ASM_X86_XEN_HYPERCALL_H
34#define _ASM_X86_XEN_HYPERCALL_H
35
36#include <linux/errno.h>
37#include <linux/string.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/sched.h>
41#include <xen/interface/physdev.h>
42
43/*
44 * The hypercall asms have to meet several constraints:
45 * - Work on 32- and 64-bit.
46 * The two architectures put their arguments in different sets of
47 * registers.
48 *
49 * - Work around asm syntax quirks
50 * It isn't possible to specify one of the rNN registers in a
51 * constraint, so we use explicit register variables to get the
52 * args into the right place.
53 *
54 * - Mark all registers as potentially clobbered
55 * Even unused parameters can be clobbered by the hypervisor, so we
56 * need to make sure gcc knows it.
57 *
58 * - Avoid compiler bugs.
59 * This is the tricky part. Because x86_32 has such a constrained
60 * register set, gcc versions below 4.3 have trouble generating
61 * code when all the arg registers and memory are trashed by the
62 * asm. There are syntactically simpler ways of achieving the
63 * semantics below, but they cause the compiler to crash.
64 *
65 * The only combination I found which works is:
66 * - assign the __argX variables first
67 * - list all actually used parameters as "+r" (__argX)
68 * - clobber the rest
69 *
70 * The result certainly isn't pretty, and it really shows up cpp's
71 * weakness as a macro language. Sorry. (But let's just give thanks
72 * there aren't more than 5 arguments...)
73 */
74
75extern struct { char _entry[32]; } hypercall_page[];
76
77#define __HYPERCALL "call hypercall_page+%c[offset]"
78#define __HYPERCALL_ENTRY(x) \
79 [offset] "i" (__HYPERVISOR_##x * sizeof(hypercall_page[0]))
80
81#ifdef CONFIG_X86_32
82#define __HYPERCALL_RETREG "eax"
83#define __HYPERCALL_ARG1REG "ebx"
84#define __HYPERCALL_ARG2REG "ecx"
85#define __HYPERCALL_ARG3REG "edx"
86#define __HYPERCALL_ARG4REG "esi"
87#define __HYPERCALL_ARG5REG "edi"
88#else
89#define __HYPERCALL_RETREG "rax"
90#define __HYPERCALL_ARG1REG "rdi"
91#define __HYPERCALL_ARG2REG "rsi"
92#define __HYPERCALL_ARG3REG "rdx"
93#define __HYPERCALL_ARG4REG "r10"
94#define __HYPERCALL_ARG5REG "r8"
95#endif
96
97#define __HYPERCALL_DECLS \
98 register unsigned long __res asm(__HYPERCALL_RETREG); \
99 register unsigned long __arg1 asm(__HYPERCALL_ARG1REG) = __arg1; \
100 register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \
101 register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \
102 register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \
103 register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5;
104
105#define __HYPERCALL_0PARAM "=r" (__res)
106#define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1)
107#define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2)
108#define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3)
109#define __HYPERCALL_4PARAM __HYPERCALL_3PARAM, "+r" (__arg4)
110#define __HYPERCALL_5PARAM __HYPERCALL_4PARAM, "+r" (__arg5)
111
112#define __HYPERCALL_0ARG()
113#define __HYPERCALL_1ARG(a1) \
114 __HYPERCALL_0ARG() __arg1 = (unsigned long)(a1);
115#define __HYPERCALL_2ARG(a1,a2) \
116 __HYPERCALL_1ARG(a1) __arg2 = (unsigned long)(a2);
117#define __HYPERCALL_3ARG(a1,a2,a3) \
118 __HYPERCALL_2ARG(a1,a2) __arg3 = (unsigned long)(a3);
119#define __HYPERCALL_4ARG(a1,a2,a3,a4) \
120 __HYPERCALL_3ARG(a1,a2,a3) __arg4 = (unsigned long)(a4);
121#define __HYPERCALL_5ARG(a1,a2,a3,a4,a5) \
122 __HYPERCALL_4ARG(a1,a2,a3,a4) __arg5 = (unsigned long)(a5);
123
124#define __HYPERCALL_CLOBBER5 "memory"
125#define __HYPERCALL_CLOBBER4 __HYPERCALL_CLOBBER5, __HYPERCALL_ARG5REG
126#define __HYPERCALL_CLOBBER3 __HYPERCALL_CLOBBER4, __HYPERCALL_ARG4REG
127#define __HYPERCALL_CLOBBER2 __HYPERCALL_CLOBBER3, __HYPERCALL_ARG3REG
128#define __HYPERCALL_CLOBBER1 __HYPERCALL_CLOBBER2, __HYPERCALL_ARG2REG
129#define __HYPERCALL_CLOBBER0 __HYPERCALL_CLOBBER1, __HYPERCALL_ARG1REG
130
131#define _hypercall0(type, name) \
132({ \
133 __HYPERCALL_DECLS; \
134 __HYPERCALL_0ARG(); \
135 asm volatile (__HYPERCALL \
136 : __HYPERCALL_0PARAM \
137 : __HYPERCALL_ENTRY(name) \
138 : __HYPERCALL_CLOBBER0); \
139 (type)__res; \
140})
141
142#define _hypercall1(type, name, a1) \
143({ \
144 __HYPERCALL_DECLS; \
145 __HYPERCALL_1ARG(a1); \
146 asm volatile (__HYPERCALL \
147 : __HYPERCALL_1PARAM \
148 : __HYPERCALL_ENTRY(name) \
149 : __HYPERCALL_CLOBBER1); \
150 (type)__res; \
151})
152
153#define _hypercall2(type, name, a1, a2) \
154({ \
155 __HYPERCALL_DECLS; \
156 __HYPERCALL_2ARG(a1, a2); \
157 asm volatile (__HYPERCALL \
158 : __HYPERCALL_2PARAM \
159 : __HYPERCALL_ENTRY(name) \
160 : __HYPERCALL_CLOBBER2); \
161 (type)__res; \
162})
163
164#define _hypercall3(type, name, a1, a2, a3) \
165({ \
166 __HYPERCALL_DECLS; \
167 __HYPERCALL_3ARG(a1, a2, a3); \
168 asm volatile (__HYPERCALL \
169 : __HYPERCALL_3PARAM \
170 : __HYPERCALL_ENTRY(name) \
171 : __HYPERCALL_CLOBBER3); \
172 (type)__res; \
173})
174
175#define _hypercall4(type, name, a1, a2, a3, a4) \
176({ \
177 __HYPERCALL_DECLS; \
178 __HYPERCALL_4ARG(a1, a2, a3, a4); \
179 asm volatile (__HYPERCALL \
180 : __HYPERCALL_4PARAM \
181 : __HYPERCALL_ENTRY(name) \
182 : __HYPERCALL_CLOBBER4); \
183 (type)__res; \
184})
185
186#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
187({ \
188 __HYPERCALL_DECLS; \
189 __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
190 asm volatile (__HYPERCALL \
191 : __HYPERCALL_5PARAM \
192 : __HYPERCALL_ENTRY(name) \
193 : __HYPERCALL_CLOBBER5); \
194 (type)__res; \
195})
196
197static inline int
198HYPERVISOR_set_trap_table(struct trap_info *table)
199{
200 return _hypercall1(int, set_trap_table, table);
201}
202
203static inline int
204HYPERVISOR_mmu_update(struct mmu_update *req, int count,
205 int *success_count, domid_t domid)
206{
207 return _hypercall4(int, mmu_update, req, count, success_count, domid);
208}
209
210static inline int
211HYPERVISOR_mmuext_op(struct mmuext_op *op, int count,
212 int *success_count, domid_t domid)
213{
214 return _hypercall4(int, mmuext_op, op, count, success_count, domid);
215}
216
217static inline int
218HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
219{
220 return _hypercall2(int, set_gdt, frame_list, entries);
221}
222
223static inline int
224HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
225{
226 return _hypercall2(int, stack_switch, ss, esp);
227}
228
229#ifdef CONFIG_X86_32
230static inline int
231HYPERVISOR_set_callbacks(unsigned long event_selector,
232 unsigned long event_address,
233 unsigned long failsafe_selector,
234 unsigned long failsafe_address)
235{
236 return _hypercall4(int, set_callbacks,
237 event_selector, event_address,
238 failsafe_selector, failsafe_address);
239}
240#else /* CONFIG_X86_64 */
241static inline int
242HYPERVISOR_set_callbacks(unsigned long event_address,
243 unsigned long failsafe_address,
244 unsigned long syscall_address)
245{
246 return _hypercall3(int, set_callbacks,
247 event_address, failsafe_address,
248 syscall_address);
249}
250#endif /* CONFIG_X86_{32,64} */
251
252static inline int
253HYPERVISOR_callback_op(int cmd, void *arg)
254{
255 return _hypercall2(int, callback_op, cmd, arg);
256}
257
258static inline int
259HYPERVISOR_fpu_taskswitch(int set)
260{
261 return _hypercall1(int, fpu_taskswitch, set);
262}
263
264static inline int
265HYPERVISOR_sched_op(int cmd, void *arg)
266{
267 return _hypercall2(int, sched_op_new, cmd, arg);
268}
269
270static inline long
271HYPERVISOR_set_timer_op(u64 timeout)
272{
273 unsigned long timeout_hi = (unsigned long)(timeout>>32);
274 unsigned long timeout_lo = (unsigned long)timeout;
275 return _hypercall2(long, set_timer_op, timeout_lo, timeout_hi);
276}
277
278static inline int
279HYPERVISOR_set_debugreg(int reg, unsigned long value)
280{
281 return _hypercall2(int, set_debugreg, reg, value);
282}
283
284static inline unsigned long
285HYPERVISOR_get_debugreg(int reg)
286{
287 return _hypercall1(unsigned long, get_debugreg, reg);
288}
289
290static inline int
291HYPERVISOR_update_descriptor(u64 ma, u64 desc)
292{
293 return _hypercall4(int, update_descriptor, ma, ma>>32, desc, desc>>32);
294}
295
296static inline int
297HYPERVISOR_memory_op(unsigned int cmd, void *arg)
298{
299 return _hypercall2(int, memory_op, cmd, arg);
300}
301
302static inline int
303HYPERVISOR_multicall(void *call_list, int nr_calls)
304{
305 return _hypercall2(int, multicall, call_list, nr_calls);
306}
307
308static inline int
309HYPERVISOR_update_va_mapping(unsigned long va, pte_t new_val,
310 unsigned long flags)
311{
312 if (sizeof(new_val) == sizeof(long))
313 return _hypercall3(int, update_va_mapping, va,
314 new_val.pte, flags);
315 else
316 return _hypercall4(int, update_va_mapping, va,
317 new_val.pte, new_val.pte >> 32, flags);
318}
319
320static inline int
321HYPERVISOR_event_channel_op(int cmd, void *arg)
322{
323 int rc = _hypercall2(int, event_channel_op, cmd, arg);
324 if (unlikely(rc == -ENOSYS)) {
325 struct evtchn_op op;
326 op.cmd = cmd;
327 memcpy(&op.u, arg, sizeof(op.u));
328 rc = _hypercall1(int, event_channel_op_compat, &op);
329 memcpy(arg, &op.u, sizeof(op.u));
330 }
331 return rc;
332}
333
334static inline int
335HYPERVISOR_xen_version(int cmd, void *arg)
336{
337 return _hypercall2(int, xen_version, cmd, arg);
338}
339
340static inline int
341HYPERVISOR_console_io(int cmd, int count, char *str)
342{
343 return _hypercall3(int, console_io, cmd, count, str);
344}
345
346static inline int
347HYPERVISOR_physdev_op(int cmd, void *arg)
348{
349 int rc = _hypercall2(int, physdev_op, cmd, arg);
350 if (unlikely(rc == -ENOSYS)) {
351 struct physdev_op op;
352 op.cmd = cmd;
353 memcpy(&op.u, arg, sizeof(op.u));
354 rc = _hypercall1(int, physdev_op_compat, &op);
355 memcpy(arg, &op.u, sizeof(op.u));
356 }
357 return rc;
358}
359
360static inline int
361HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
362{
363 return _hypercall3(int, grant_table_op, cmd, uop, count);
364}
365
366static inline int
367HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
368 unsigned long flags, domid_t domid)
369{
370 if (sizeof(new_val) == sizeof(long))
371 return _hypercall4(int, update_va_mapping_otherdomain, va,
372 new_val.pte, flags, domid);
373 else
374 return _hypercall5(int, update_va_mapping_otherdomain, va,
375 new_val.pte, new_val.pte >> 32,
376 flags, domid);
377}
378
379static inline int
380HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
381{
382 return _hypercall2(int, vm_assist, cmd, type);
383}
384
385static inline int
386HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args)
387{
388 return _hypercall3(int, vcpu_op, cmd, vcpuid, extra_args);
389}
390
391#ifdef CONFIG_X86_64
392static inline int
393HYPERVISOR_set_segment_base(int reg, unsigned long value)
394{
395 return _hypercall2(int, set_segment_base, reg, value);
396}
397#endif
398
399static inline int
400HYPERVISOR_suspend(unsigned long srec)
401{
402 return _hypercall3(int, sched_op, SCHEDOP_shutdown,
403 SHUTDOWN_suspend, srec);
404}
405
406static inline int
407HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
408{
409 return _hypercall2(int, nmi_op, op, arg);
410}
411
412static inline void
413MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
414{
415 mcl->op = __HYPERVISOR_fpu_taskswitch;
416 mcl->args[0] = set;
417}
418
419static inline void
420MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
421 pte_t new_val, unsigned long flags)
422{
423 mcl->op = __HYPERVISOR_update_va_mapping;
424 mcl->args[0] = va;
425 if (sizeof(new_val) == sizeof(long)) {
426 mcl->args[1] = new_val.pte;
427 mcl->args[2] = flags;
428 } else {
429 mcl->args[1] = new_val.pte;
430 mcl->args[2] = new_val.pte >> 32;
431 mcl->args[3] = flags;
432 }
433}
434
435static inline void
436MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
437 void *uop, unsigned int count)
438{
439 mcl->op = __HYPERVISOR_grant_table_op;
440 mcl->args[0] = cmd;
441 mcl->args[1] = (unsigned long)uop;
442 mcl->args[2] = count;
443}
444
445static inline void
446MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
447 pte_t new_val, unsigned long flags,
448 domid_t domid)
449{
450 mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
451 mcl->args[0] = va;
452 if (sizeof(new_val) == sizeof(long)) {
453 mcl->args[1] = new_val.pte;
454 mcl->args[2] = flags;
455 mcl->args[3] = domid;
456 } else {
457 mcl->args[1] = new_val.pte;
458 mcl->args[2] = new_val.pte >> 32;
459 mcl->args[3] = flags;
460 mcl->args[4] = domid;
461 }
462}
463
464static inline void
465MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
466 struct desc_struct desc)
467{
468 mcl->op = __HYPERVISOR_update_descriptor;
469 if (sizeof(maddr) == sizeof(long)) {
470 mcl->args[0] = maddr;
471 mcl->args[1] = *(unsigned long *)&desc;
472 } else {
473 mcl->args[0] = maddr;
474 mcl->args[1] = maddr >> 32;
475 mcl->args[2] = desc.a;
476 mcl->args[3] = desc.b;
477 }
478}
479
480static inline void
481MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
482{
483 mcl->op = __HYPERVISOR_memory_op;
484 mcl->args[0] = cmd;
485 mcl->args[1] = (unsigned long)arg;
486}
487
488static inline void
489MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
490 int count, int *success_count, domid_t domid)
491{
492 mcl->op = __HYPERVISOR_mmu_update;
493 mcl->args[0] = (unsigned long)req;
494 mcl->args[1] = count;
495 mcl->args[2] = (unsigned long)success_count;
496 mcl->args[3] = domid;
497}
498
499static inline void
500MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
501 int *success_count, domid_t domid)
502{
503 mcl->op = __HYPERVISOR_mmuext_op;
504 mcl->args[0] = (unsigned long)op;
505 mcl->args[1] = count;
506 mcl->args[2] = (unsigned long)success_count;
507 mcl->args[3] = domid;
508}
509
510static inline void
511MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
512{
513 mcl->op = __HYPERVISOR_set_gdt;
514 mcl->args[0] = (unsigned long)frames;
515 mcl->args[1] = entries;
516}
517
518static inline void
519MULTI_stack_switch(struct multicall_entry *mcl,
520 unsigned long ss, unsigned long esp)
521{
522 mcl->op = __HYPERVISOR_stack_switch;
523 mcl->args[0] = ss;
524 mcl->args[1] = esp;
525}
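/*
 * Minimal sketch (hypothetical example) of how the MULTI_* helpers above
 * are meant to be used: each helper only fills in one multicall_entry,
 * and the whole batch is then issued with a single trap through
 * HYPERVISOR_multicall().  UVMF_INVLPG comes from xen/interface/xen.h.
 */
static inline void example_batched_hypercalls(unsigned long va, pte_t pte,
					      unsigned long ss, unsigned long sp)
{
	struct multicall_entry mc[2];

	MULTI_update_va_mapping(&mc[0], va, pte, UVMF_INVLPG);
	MULTI_stack_switch(&mc[1], ss, sp);

	if (HYPERVISOR_multicall(mc, 2) != 0)
		BUG();
}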
526
527#endif /* _ASM_X86_XEN_HYPERCALL_H */
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
new file mode 100644
index 000000000000..a38d25ac87d2
--- /dev/null
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -0,0 +1,82 @@
1/******************************************************************************
2 * hypervisor.h
3 *
4 * Linux-specific hypervisor handling.
5 *
6 * Copyright (c) 2002-2004, K A Fraser
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License version 2
10 * as published by the Free Software Foundation; or, when distributed
11 * separately from the Linux kernel or incorporated into other
12 * software packages, subject to the following license:
13 *
14 * Permission is hereby granted, free of charge, to any person obtaining a copy
15 * of this source file (the "Software"), to deal in the Software without
16 * restriction, including without limitation the rights to use, copy, modify,
17 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
18 * and to permit persons to whom the Software is furnished to do so, subject to
19 * the following conditions:
20 *
21 * The above copyright notice and this permission notice shall be included in
22 * all copies or substantial portions of the Software.
23 *
24 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
25 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
26 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
27 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
28 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
29 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
30 * IN THE SOFTWARE.
31 */
32
33#ifndef _ASM_X86_XEN_HYPERVISOR_H
34#define _ASM_X86_XEN_HYPERVISOR_H
35
36#include <linux/types.h>
37#include <linux/kernel.h>
38
39#include <xen/interface/xen.h>
40#include <xen/interface/version.h>
41
42#include <asm/ptrace.h>
43#include <asm/page.h>
44#include <asm/desc.h>
45#if defined(__i386__)
46# ifdef CONFIG_X86_PAE
47# include <asm-generic/pgtable-nopud.h>
48# else
49# include <asm-generic/pgtable-nopmd.h>
50# endif
51#endif
52#include <asm/xen/hypercall.h>
53
54/* arch/i386/kernel/setup.c */
55extern struct shared_info *HYPERVISOR_shared_info;
56extern struct start_info *xen_start_info;
57
58/* arch/i386/mach-xen/evtchn.c */
59/* Force a proper event-channel callback from Xen. */
60extern void force_evtchn_callback(void);
61
62/* Turn jiffies into Xen system time. */
63u64 jiffies_to_st(unsigned long jiffies);
64
65
66#define MULTI_UVMFLAGS_INDEX 3
67#define MULTI_UVMDOMID_INDEX 4
68
69enum xen_domain_type {
70 XEN_NATIVE,
71 XEN_PV_DOMAIN,
72 XEN_HVM_DOMAIN,
73};
74
75extern enum xen_domain_type xen_domain_type;
76
77#define xen_domain() (xen_domain_type != XEN_NATIVE)
78#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
79#define xen_initial_domain() (xen_pv_domain() && xen_start_info->flags & SIF_INITDOMAIN)
80#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
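/*
 * Hypothetical example of how the predicates above gate Xen-specific
 * code paths (pr_info() comes from linux/kernel.h, included above):
 */
static inline void example_report_domain(void)
{
	if (!xen_domain())
		return;		/* running on bare metal */

	if (xen_initial_domain())
		pr_info("Xen: running as the initial (control) domain\n");
	else if (xen_pv_domain())
		pr_info("Xen: running as a paravirtualised guest\n");
	else if (xen_hvm_domain())
		pr_info("Xen: running as an HVM guest\n");
}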
81
82#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
new file mode 100644
index 000000000000..e8506c1f0c55
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface.h
@@ -0,0 +1,175 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef _ASM_X86_XEN_INTERFACE_H
10#define _ASM_X86_XEN_INTERFACE_H
11
12#ifdef __XEN__
13#define __DEFINE_GUEST_HANDLE(name, type) \
14 typedef struct { type *p; } __guest_handle_ ## name
15#else
16#define __DEFINE_GUEST_HANDLE(name, type) \
17 typedef type * __guest_handle_ ## name
18#endif
19
20#define DEFINE_GUEST_HANDLE_STRUCT(name) \
21 __DEFINE_GUEST_HANDLE(name, struct name)
22#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name)
23#define GUEST_HANDLE(name) __guest_handle_ ## name
24
25#ifdef __XEN__
26#if defined(__i386__)
27#define set_xen_guest_handle(hnd, val) \
28 do { \
29 if (sizeof(hnd) == 8) \
30 *(uint64_t *)&(hnd) = 0; \
31 (hnd).p = val; \
32 } while (0)
33#elif defined(__x86_64__)
34#define set_xen_guest_handle(hnd, val) do { (hnd).p = val; } while (0)
35#endif
36#else
37#if defined(__i386__)
38#define set_xen_guest_handle(hnd, val) \
39 do { \
40 if (sizeof(hnd) == 8) \
41 *(uint64_t *)&(hnd) = 0; \
42 (hnd) = val; \
43 } while (0)
44#elif defined(__x86_64__)
45#define set_xen_guest_handle(hnd, val) do { (hnd) = val; } while (0)
46#endif
47#endif
48
49#ifndef __ASSEMBLY__
50/* Guest handles for primitive C types. */
51__DEFINE_GUEST_HANDLE(uchar, unsigned char);
52__DEFINE_GUEST_HANDLE(uint, unsigned int);
53__DEFINE_GUEST_HANDLE(ulong, unsigned long);
54DEFINE_GUEST_HANDLE(char);
55DEFINE_GUEST_HANDLE(int);
56DEFINE_GUEST_HANDLE(long);
57DEFINE_GUEST_HANDLE(void);
58#endif
59
60#ifndef HYPERVISOR_VIRT_START
61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
62#endif
63
64#ifndef machine_to_phys_mapping
65#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
66#endif
67
68/* Maximum number of virtual CPUs in multi-processor guests. */
69#define MAX_VIRT_CPUS 32
70
71/*
72 * SEGMENT DESCRIPTOR TABLES
73 */
74/*
75 * A number of GDT entries are reserved by Xen. These are not situated at the
76 * start of the GDT because some stupid OSes export hard-coded selector values
77 * in their ABI. These hard-coded values are always near the start of the GDT,
78 * so Xen places itself out of the way, at the far end of the GDT.
79 */
80#define FIRST_RESERVED_GDT_PAGE 14
81#define FIRST_RESERVED_GDT_BYTE (FIRST_RESERVED_GDT_PAGE * 4096)
82#define FIRST_RESERVED_GDT_ENTRY (FIRST_RESERVED_GDT_BYTE / 8)
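/*
 * Worked example of the constants above: page 14 of the GDT starts at
 * byte 14 * 4096 = 57344, i.e. descriptor index 57344 / 8 = 7168, so a
 * guest may freely use GDT entries 0 .. 7167.
 */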
83
84/*
85 * Send an array of these to HYPERVISOR_set_trap_table()
86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows:
89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter
93 */
94#define TI_GET_DPL(_ti) ((_ti)->flags & 3)
95#define TI_GET_IF(_ti) ((_ti)->flags & 4)
96#define TI_SET_DPL(_ti, _dpl) ((_ti)->flags |= (_dpl))
97#define TI_SET_IF(_ti, _if) ((_ti)->flags |= ((!!(_if))<<2))
98
99#ifndef __ASSEMBLY__
100struct trap_info {
101 uint8_t vector; /* exception vector */
102 uint8_t flags; /* 0-3: privilege level; 4: clear event enable? */
103 uint16_t cs; /* code selector */
104 unsigned long address; /* code offset */
105};
106DEFINE_GUEST_HANDLE_STRUCT(trap_info);
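/*
 * Minimal sketch (hypothetical values): building one entry of the virtual
 * IDT and registering the zero-terminated table.  It assumes the
 * HYPERVISOR_set_trap_table() wrapper from asm/xen/hypercall.h and the
 * FLAT_KERNEL_CS selector from the per-width interface headers.
 */
static inline void example_register_int3(unsigned long handler)
{
	struct trap_info traps[2] = {
		{ .vector = 3, .cs = FLAT_KERNEL_CS, .address = handler },
		{ }	/* zero-filled terminator */
	};

	TI_SET_DPL(&traps[0], 3);	/* may be entered from ring 3 */

	HYPERVISOR_set_trap_table(traps);
}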
107
108struct arch_shared_info {
109 unsigned long max_pfn; /* max pfn that appears in table */
110 /* Frame containing list of mfns containing list of mfns containing p2m. */
111 unsigned long pfn_to_mfn_frame_list_list;
112 unsigned long nmi_reason;
113};
114#endif /* !__ASSEMBLY__ */
115
116#ifdef CONFIG_X86_32
117#include "interface_32.h"
118#else
119#include "interface_64.h"
120#endif
121
122#ifndef __ASSEMBLY__
123/*
124 * The following is all CPU context. Note that the fpu_ctxt block is filled
125 * in by FXSAVE if the CPU has feature FXSR; otherwise FSAVE is used.
126 */
127struct vcpu_guest_context {
128 /* FPU registers come first so they can be aligned for FXSAVE/FXRSTOR. */
129 struct { char x[512]; } fpu_ctxt; /* User-level FPU registers */
130#define VGCF_I387_VALID (1<<0)
131#define VGCF_HVM_GUEST (1<<1)
132#define VGCF_IN_KERNEL (1<<2)
133 unsigned long flags; /* VGCF_* flags */
134 struct cpu_user_regs user_regs; /* User-level CPU registers */
135 struct trap_info trap_ctxt[256]; /* Virtual IDT */
136 unsigned long ldt_base, ldt_ents; /* LDT (linear address, # ents) */
137 unsigned long gdt_frames[16], gdt_ents; /* GDT (machine frames, # ents) */
138 unsigned long kernel_ss, kernel_sp; /* Virtual TSS (only SS1/SP1) */
139 /* NB. User pagetable on x86/64 is placed in ctrlreg[1]. */
140 unsigned long ctrlreg[8]; /* CR0-CR7 (control registers) */
141 unsigned long debugreg[8]; /* DB0-DB7 (debug registers) */
142#ifdef __i386__
143 unsigned long event_callback_cs; /* CS:EIP of event callback */
144 unsigned long event_callback_eip;
145 unsigned long failsafe_callback_cs; /* CS:EIP of failsafe callback */
146 unsigned long failsafe_callback_eip;
147#else
148 unsigned long event_callback_eip;
149 unsigned long failsafe_callback_eip;
150 unsigned long syscall_callback_eip;
151#endif
152 unsigned long vm_assist; /* VMASST_TYPE_* bitmap */
153#ifdef __x86_64__
154 /* Segment base addresses. */
155 uint64_t fs_base;
156 uint64_t gs_base_kernel;
157 uint64_t gs_base_user;
158#endif
159};
160DEFINE_GUEST_HANDLE_STRUCT(vcpu_guest_context);
161#endif /* !__ASSEMBLY__ */
162
163/*
164 * Prefix forces emulation of some non-trapping instructions.
165 * Currently only CPUID.
166 */
167#ifdef __ASSEMBLY__
168#define XEN_EMULATE_PREFIX .byte 0x0f,0x0b,0x78,0x65,0x6e ;
169#define XEN_CPUID XEN_EMULATE_PREFIX cpuid
170#else
171#define XEN_EMULATE_PREFIX ".byte 0x0f,0x0b,0x78,0x65,0x6e ; "
172#define XEN_CPUID XEN_EMULATE_PREFIX "cpuid"
173#endif
174
175#endif /* _ASM_X86_XEN_INTERFACE_H */
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
new file mode 100644
index 000000000000..42a7e004ae5c
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -0,0 +1,97 @@
1/******************************************************************************
2 * arch-x86_32.h
3 *
4 * Guest OS interface to x86 32-bit Xen.
5 *
6 * Copyright (c) 2004, K A Fraser
7 */
8
9#ifndef _ASM_X86_XEN_INTERFACE_32_H
10#define _ASM_X86_XEN_INTERFACE_32_H
11
12
13/*
14 * These flat segments are in the Xen-private section of every GDT. Since these
15 * are also present in the initial GDT, many OSes will be able to avoid
16 * installing their own GDT.
17 */
18#define FLAT_RING1_CS 0xe019 /* GDT index 259 */
19#define FLAT_RING1_DS 0xe021 /* GDT index 260 */
20#define FLAT_RING1_SS 0xe021 /* GDT index 260 */
21#define FLAT_RING3_CS 0xe02b /* GDT index 261 */
22#define FLAT_RING3_DS 0xe033 /* GDT index 262 */
23#define FLAT_RING3_SS 0xe033 /* GDT index 262 */
24
25#define FLAT_KERNEL_CS FLAT_RING1_CS
26#define FLAT_KERNEL_DS FLAT_RING1_DS
27#define FLAT_KERNEL_SS FLAT_RING1_SS
28#define FLAT_USER_CS FLAT_RING3_CS
29#define FLAT_USER_DS FLAT_RING3_DS
30#define FLAT_USER_SS FLAT_RING3_SS
31
32/* And the trap vector is... */
33#define TRAP_INSTR "int $0x82"
34
35/*
36 * Virtual addresses beyond this are not modifiable by guest OSes. The
37 * machine->physical mapping table starts at this address, read-only.
38 */
39#define __HYPERVISOR_VIRT_START 0xF5800000
40
41#ifndef __ASSEMBLY__
42
43struct cpu_user_regs {
44 uint32_t ebx;
45 uint32_t ecx;
46 uint32_t edx;
47 uint32_t esi;
48 uint32_t edi;
49 uint32_t ebp;
50 uint32_t eax;
51 uint16_t error_code; /* private */
52 uint16_t entry_vector; /* private */
53 uint32_t eip;
54 uint16_t cs;
55 uint8_t saved_upcall_mask;
56 uint8_t _pad0;
57 uint32_t eflags; /* eflags.IF == !saved_upcall_mask */
58 uint32_t esp;
59 uint16_t ss, _pad1;
60 uint16_t es, _pad2;
61 uint16_t ds, _pad3;
62 uint16_t fs, _pad4;
63 uint16_t gs, _pad5;
64};
65DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
66
67typedef uint64_t tsc_timestamp_t; /* RDTSC timestamp */
68
69struct arch_vcpu_info {
70 unsigned long cr2;
71 unsigned long pad[5]; /* sizeof(struct vcpu_info) == 64 */
72};
73
74struct xen_callback {
75 unsigned long cs;
76 unsigned long eip;
77};
78typedef struct xen_callback xen_callback_t;
79
80#define XEN_CALLBACK(__cs, __eip) \
81 ((struct xen_callback){ .cs = (__cs), .eip = (unsigned long)(__eip) })
82#endif /* !__ASSEMBLY__ */
83
84
85/*
86 * Page-directory addresses above 4GB do not fit into architectural %cr3.
87 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
88 * must use the following accessor macros to pack/unpack valid MFNs.
89 *
90 * Note that Xen is using the fact that the pagetable base is always
91 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
92 * of cr3.
93 */
94#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
95#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
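/*
 * Worked example (hypothetical MFN): a page directory in machine frame
 * 0x123456, i.e. at machine address 0x123456000, which lies above 4GB:
 *
 *   xen_pfn_to_cr3(0x123456): 0x123456 << 12 truncates to 0x23456000,
 *                             0x123456 >> 20 == 0x1,
 *                             so cr3 == 0x23456001;
 *   xen_cr3_to_pfn(0x23456001) == 0x23456 | 0x100000 == 0x123456 again.
 */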
96
97#endif /* _ASM_X86_XEN_INTERFACE_32_H */
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
new file mode 100644
index 000000000000..100d2662b97c
--- /dev/null
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -0,0 +1,159 @@
1#ifndef _ASM_X86_XEN_INTERFACE_64_H
2#define _ASM_X86_XEN_INTERFACE_64_H
3
4/*
5 * 64-bit segment selectors
6 * These flat segments are in the Xen-private section of every GDT. Since these
7 * are also present in the initial GDT, many OSes will be able to avoid
8 * installing their own GDT.
9 */
10
11#define FLAT_RING3_CS32 0xe023 /* GDT index 260 */
12#define FLAT_RING3_CS64 0xe033 /* GDT index 261 */
13#define FLAT_RING3_DS32 0xe02b /* GDT index 262 */
14#define FLAT_RING3_DS64 0x0000 /* NULL selector */
15#define FLAT_RING3_SS32 0xe02b /* GDT index 262 */
16#define FLAT_RING3_SS64 0xe02b /* GDT index 262 */
17
18#define FLAT_KERNEL_DS64 FLAT_RING3_DS64
19#define FLAT_KERNEL_DS32 FLAT_RING3_DS32
20#define FLAT_KERNEL_DS FLAT_KERNEL_DS64
21#define FLAT_KERNEL_CS64 FLAT_RING3_CS64
22#define FLAT_KERNEL_CS32 FLAT_RING3_CS32
23#define FLAT_KERNEL_CS FLAT_KERNEL_CS64
24#define FLAT_KERNEL_SS64 FLAT_RING3_SS64
25#define FLAT_KERNEL_SS32 FLAT_RING3_SS32
26#define FLAT_KERNEL_SS FLAT_KERNEL_SS64
27
28#define FLAT_USER_DS64 FLAT_RING3_DS64
29#define FLAT_USER_DS32 FLAT_RING3_DS32
30#define FLAT_USER_DS FLAT_USER_DS64
31#define FLAT_USER_CS64 FLAT_RING3_CS64
32#define FLAT_USER_CS32 FLAT_RING3_CS32
33#define FLAT_USER_CS FLAT_USER_CS64
34#define FLAT_USER_SS64 FLAT_RING3_SS64
35#define FLAT_USER_SS32 FLAT_RING3_SS32
36#define FLAT_USER_SS FLAT_USER_SS64
37
38#define __HYPERVISOR_VIRT_START 0xFFFF800000000000
39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
42
43#ifndef HYPERVISOR_VIRT_START
44#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
45#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
46#endif
47
48#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
49#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
50#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
51#ifndef machine_to_phys_mapping
52#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
53#endif
54
55/*
56 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
57 * @which == SEGBASE_* ; @base == 64-bit base address
58 * Returns 0 on success.
59 */
60#define SEGBASE_FS 0
61#define SEGBASE_GS_USER 1
62#define SEGBASE_GS_KERNEL 2
63#define SEGBASE_GS_USER_SEL 3 /* Set user %gs specified in base[15:0] */
64
65/*
66 * int HYPERVISOR_iret(void)
67 * All arguments are on the kernel stack, in the following format.
68 * Never returns if successful. Current kernel context is lost.
69 * The saved CS is mapped as follows:
70 * RING0 -> RING3 kernel mode.
71 * RING1 -> RING3 kernel mode.
72 * RING2 -> RING3 kernel mode.
73 * RING3 -> RING3 user mode.
74 * However RING0 indicates that the guest kernel should return to itself
75 * directly with
76 * orb $3,1*8(%rsp)
77 * iretq
78 * If flags contains VGCF_in_syscall:
79 * Restore RAX, RIP, RFLAGS, RSP.
80 * Discard R11, RCX, CS, SS.
81 * Otherwise:
82 * Restore RAX, R11, RCX, CS:RIP, RFLAGS, SS:RSP.
83 * All other registers are saved on hypercall entry and restored to user.
84 */
85/* Guest exited in SYSCALL context? Return to guest with SYSRET? */
86#define _VGCF_in_syscall 8
87#define VGCF_in_syscall (1<<_VGCF_in_syscall)
88#define VGCF_IN_SYSCALL VGCF_in_syscall
89
90#ifndef __ASSEMBLY__
91
92struct iret_context {
93 /* Top of stack (%rsp at point of hypercall). */
94 uint64_t rax, r11, rcx, flags, rip, cs, rflags, rsp, ss;
95 /* Bottom of iret stack frame. */
96};
97
98#if defined(__GNUC__) && !defined(__STRICT_ANSI__)
99/* Anonymous union includes both 32- and 64-bit names (e.g., eax/rax). */
100#define __DECL_REG(name) union { \
101 uint64_t r ## name, e ## name; \
102 uint32_t _e ## name; \
103}
104#else
105/* Non-gcc sources must always use the proper 64-bit name (e.g., rax). */
106#define __DECL_REG(name) uint64_t r ## name
107#endif
108
109struct cpu_user_regs {
110 uint64_t r15;
111 uint64_t r14;
112 uint64_t r13;
113 uint64_t r12;
114 __DECL_REG(bp);
115 __DECL_REG(bx);
116 uint64_t r11;
117 uint64_t r10;
118 uint64_t r9;
119 uint64_t r8;
120 __DECL_REG(ax);
121 __DECL_REG(cx);
122 __DECL_REG(dx);
123 __DECL_REG(si);
124 __DECL_REG(di);
125 uint32_t error_code; /* private */
126 uint32_t entry_vector; /* private */
127 __DECL_REG(ip);
128 uint16_t cs, _pad0[1];
129 uint8_t saved_upcall_mask;
130 uint8_t _pad1[3];
131 __DECL_REG(flags); /* rflags.IF == !saved_upcall_mask */
132 __DECL_REG(sp);
133 uint16_t ss, _pad2[3];
134 uint16_t es, _pad3[3];
135 uint16_t ds, _pad4[3];
136 uint16_t fs, _pad5[3]; /* Non-zero => takes precedence over fs_base. */
137 uint16_t gs, _pad6[3]; /* Non-zero => takes precedence over gs_base_user. */
138};
139DEFINE_GUEST_HANDLE_STRUCT(cpu_user_regs);
140
141#undef __DECL_REG
142
143#define xen_pfn_to_cr3(pfn) ((unsigned long)(pfn) << 12)
144#define xen_cr3_to_pfn(cr3) ((unsigned long)(cr3) >> 12)
145
146struct arch_vcpu_info {
147 unsigned long cr2;
148 unsigned long pad; /* sizeof(vcpu_info_t) == 64 */
149};
150
151typedef unsigned long xen_callback_t;
152
153#define XEN_CALLBACK(__cs, __rip) \
154 ((unsigned long)(__rip))
155
156#endif /* !__ASSEMBLY__ */
157
158
159#endif /* _ASM_X86_XEN_INTERFACE_64_H */
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
new file mode 100644
index 000000000000..bc628998a1b9
--- /dev/null
+++ b/arch/x86/include/asm/xen/page.h
@@ -0,0 +1,165 @@
1#ifndef _ASM_X86_XEN_PAGE_H
2#define _ASM_X86_XEN_PAGE_H
3
4#include <linux/pfn.h>
5
6#include <asm/uaccess.h>
7#include <asm/pgtable.h>
8
9#include <xen/features.h>
10
11/* Xen machine address */
12typedef struct xmaddr {
13 phys_addr_t maddr;
14} xmaddr_t;
15
16/* Xen pseudo-physical address */
17typedef struct xpaddr {
18 phys_addr_t paddr;
19} xpaddr_t;
20
21#define XMADDR(x) ((xmaddr_t) { .maddr = (x) })
22#define XPADDR(x) ((xpaddr_t) { .paddr = (x) })
23
24/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
25#define INVALID_P2M_ENTRY (~0UL)
26#define FOREIGN_FRAME_BIT (1UL<<31)
27#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
28
29/* Maximum amount of memory we can handle in a domain in pages */
30#define MAX_DOMAIN_PAGES \
31 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
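/*
 * For example (hypothetical configuration), CONFIG_XEN_MAX_DOMAIN_MEMORY=8
 * with 4KB pages gives 8 * 2^30 / 4096 = 2097152 addressable frames.
 */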
32
33
34extern unsigned long get_phys_to_machine(unsigned long pfn);
35extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn);
36
37static inline unsigned long pfn_to_mfn(unsigned long pfn)
38{
39 if (xen_feature(XENFEAT_auto_translated_physmap))
40 return pfn;
41
42 return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT;
43}
44
45static inline int phys_to_machine_mapping_valid(unsigned long pfn)
46{
47 if (xen_feature(XENFEAT_auto_translated_physmap))
48 return 1;
49
50 return get_phys_to_machine(pfn) != INVALID_P2M_ENTRY;
51}
52
53static inline unsigned long mfn_to_pfn(unsigned long mfn)
54{
55 unsigned long pfn;
56
57 if (xen_feature(XENFEAT_auto_translated_physmap))
58 return mfn;
59
60#if 0
61 if (unlikely((mfn >> machine_to_phys_order) != 0))
62 return max_mapnr;
63#endif
64
65 pfn = 0;
66 /*
67 * The array access can fail (e.g., device space beyond end of RAM).
68 * In such cases it doesn't matter what we return (we return garbage),
69 * but we must handle the fault without crashing!
70 */
71 __get_user(pfn, &machine_to_phys_mapping[mfn]);
72
73 return pfn;
74}
75
76static inline xmaddr_t phys_to_machine(xpaddr_t phys)
77{
78 unsigned offset = phys.paddr & ~PAGE_MASK;
79 return XMADDR(PFN_PHYS(pfn_to_mfn(PFN_DOWN(phys.paddr))) | offset);
80}
81
82static inline xpaddr_t machine_to_phys(xmaddr_t machine)
83{
84 unsigned offset = machine.maddr & ~PAGE_MASK;
85 return XPADDR(PFN_PHYS(mfn_to_pfn(PFN_DOWN(machine.maddr))) | offset);
86}
87
88/*
89 * We detect special mappings in one of two ways:
90 * 1. If the MFN is an I/O page then Xen will set the m2p entry
91 * to be outside our maximum possible pseudophys range.
92 * 2. If the MFN belongs to a different domain then we will certainly
93 * not have MFN in our p2m table. Conversely, if the page is ours,
94 * then we'll have p2m(m2p(MFN))==MFN.
95 * If we detect a special mapping then it doesn't have a 'struct page'.
96 * We force !pfn_valid() by returning an out-of-range pointer.
97 *
98 * NB. These checks require that, for any MFN that is not in our reservation,
99 * there is no PFN such that p2m(PFN) == MFN. Otherwise we can get confused if
100 * we are foreign-mapping the MFN, and the other domain has m2p(MFN) == PFN.
101 * Yikes! Various places must poke in INVALID_P2M_ENTRY for safety.
102 *
103 * NB2. When deliberately mapping foreign pages into the p2m table, you *must*
104 * use FOREIGN_FRAME(). This will cause pte_pfn() to choke on it, as we
105 * require. In all the cases we care about, the FOREIGN_FRAME bit is
106 * masked (e.g., pfn_to_mfn()) so behaviour there is correct.
107 */
108static inline unsigned long mfn_to_local_pfn(unsigned long mfn)
109{
110 extern unsigned long max_mapnr;
111 unsigned long pfn = mfn_to_pfn(mfn);
112 if ((pfn < max_mapnr)
113 && !xen_feature(XENFEAT_auto_translated_physmap)
114 && (get_phys_to_machine(pfn) != mfn))
115 return max_mapnr; /* force !pfn_valid() */
116 /* XXX fixme; not true with sparsemem */
117 return pfn;
118}
119
120/* VIRT <-> MACHINE conversion */
121#define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v))))
122#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v))))
123#define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT))
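/*
 * Expanding the macros above for a hypothetical kernel pointer 'buf' shows
 * that the sub-page offset is carried through unchanged:
 *
 *   virt_to_machine(buf).maddr ==
 *	(pfn_to_mfn(PFN_DOWN(__pa(buf))) << PAGE_SHIFT) | (__pa(buf) & ~PAGE_MASK)
 */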
124
125static inline unsigned long pte_mfn(pte_t pte)
126{
127 return (pte.pte & PTE_PFN_MASK) >> PAGE_SHIFT;
128}
129
130static inline pte_t mfn_pte(unsigned long page_nr, pgprot_t pgprot)
131{
132 pte_t pte;
133
134 pte.pte = ((phys_addr_t)page_nr << PAGE_SHIFT) |
135 (pgprot_val(pgprot) & __supported_pte_mask);
136
137 return pte;
138}
139
140static inline pteval_t pte_val_ma(pte_t pte)
141{
142 return pte.pte;
143}
144
145static inline pte_t __pte_ma(pteval_t x)
146{
147 return (pte_t) { .pte = x };
148}
149
150#define pmd_val_ma(v) ((v).pmd)
151#ifdef __PAGETABLE_PUD_FOLDED
152#define pud_val_ma(v) ((v).pgd.pgd)
153#else
154#define pud_val_ma(v) ((v).pud)
155#endif
156#define __pmd_ma(x) ((pmd_t) { (x) } )
157
158#define pgd_val_ma(x) ((x).pgd)
159
160
161xmaddr_t arbitrary_virt_to_machine(void *address);
162void make_lowmem_page_readonly(void *vaddr);
163void make_lowmem_page_readwrite(void *vaddr);
164
165#endif /* _ASM_X86_XEN_PAGE_H */
diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h
new file mode 100644
index 000000000000..11b3bb86e17b
--- /dev/null
+++ b/arch/x86/include/asm/xor.h
@@ -0,0 +1,5 @@
1#ifdef CONFIG_X86_32
2# include "xor_32.h"
3#else
4# include "xor_64.h"
5#endif
diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h
new file mode 100644
index 000000000000..133b40a0f495
--- /dev/null
+++ b/arch/x86/include/asm/xor_32.h
@@ -0,0 +1,888 @@
1#ifndef _ASM_X86_XOR_32_H
2#define _ASM_X86_XOR_32_H
3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17/*
18 * High-speed RAID5 checksumming functions utilizing MMX instructions.
19 * Copyright (C) 1998 Ingo Molnar.
20 */
21
22#define LD(x, y) " movq 8*("#x")(%1), %%mm"#y" ;\n"
23#define ST(x, y) " movq %%mm"#y", 8*("#x")(%1) ;\n"
24#define XO1(x, y) " pxor 8*("#x")(%2), %%mm"#y" ;\n"
25#define XO2(x, y) " pxor 8*("#x")(%3), %%mm"#y" ;\n"
26#define XO3(x, y) " pxor 8*("#x")(%4), %%mm"#y" ;\n"
27#define XO4(x, y) " pxor 8*("#x")(%5), %%mm"#y" ;\n"
28
29#include <asm/i387.h>
30
31static void
32xor_pII_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
33{
34 unsigned long lines = bytes >> 7;
35
36 kernel_fpu_begin();
37
38 asm volatile(
39#undef BLOCK
40#define BLOCK(i) \
41 LD(i, 0) \
42 LD(i + 1, 1) \
43 LD(i + 2, 2) \
44 LD(i + 3, 3) \
45 XO1(i, 0) \
46 ST(i, 0) \
47 XO1(i+1, 1) \
48 ST(i+1, 1) \
49 XO1(i + 2, 2) \
50 ST(i + 2, 2) \
51 XO1(i + 3, 3) \
52 ST(i + 3, 3)
53
54 " .align 32 ;\n"
55 " 1: ;\n"
56
57 BLOCK(0)
58 BLOCK(4)
59 BLOCK(8)
60 BLOCK(12)
61
62 " addl $128, %1 ;\n"
63 " addl $128, %2 ;\n"
64 " decl %0 ;\n"
65 " jnz 1b ;\n"
66 : "+r" (lines),
67 "+r" (p1), "+r" (p2)
68 :
69 : "memory");
70
71 kernel_fpu_end();
72}
73
74static void
75xor_pII_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
76 unsigned long *p3)
77{
78 unsigned long lines = bytes >> 7;
79
80 kernel_fpu_begin();
81
82 asm volatile(
83#undef BLOCK
84#define BLOCK(i) \
85 LD(i, 0) \
86 LD(i + 1, 1) \
87 LD(i + 2, 2) \
88 LD(i + 3, 3) \
89 XO1(i, 0) \
90 XO1(i + 1, 1) \
91 XO1(i + 2, 2) \
92 XO1(i + 3, 3) \
93 XO2(i, 0) \
94 ST(i, 0) \
95 XO2(i + 1, 1) \
96 ST(i + 1, 1) \
97 XO2(i + 2, 2) \
98 ST(i + 2, 2) \
99 XO2(i + 3, 3) \
100 ST(i + 3, 3)
101
102 " .align 32 ;\n"
103 " 1: ;\n"
104
105 BLOCK(0)
106 BLOCK(4)
107 BLOCK(8)
108 BLOCK(12)
109
110 " addl $128, %1 ;\n"
111 " addl $128, %2 ;\n"
112 " addl $128, %3 ;\n"
113 " decl %0 ;\n"
114 " jnz 1b ;\n"
115 : "+r" (lines),
116 "+r" (p1), "+r" (p2), "+r" (p3)
117 :
118 : "memory");
119
120 kernel_fpu_end();
121}
122
123static void
124xor_pII_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
125 unsigned long *p3, unsigned long *p4)
126{
127 unsigned long lines = bytes >> 7;
128
129 kernel_fpu_begin();
130
131 asm volatile(
132#undef BLOCK
133#define BLOCK(i) \
134 LD(i, 0) \
135 LD(i + 1, 1) \
136 LD(i + 2, 2) \
137 LD(i + 3, 3) \
138 XO1(i, 0) \
139 XO1(i + 1, 1) \
140 XO1(i + 2, 2) \
141 XO1(i + 3, 3) \
142 XO2(i, 0) \
143 XO2(i + 1, 1) \
144 XO2(i + 2, 2) \
145 XO2(i + 3, 3) \
146 XO3(i, 0) \
147 ST(i, 0) \
148 XO3(i + 1, 1) \
149 ST(i + 1, 1) \
150 XO3(i + 2, 2) \
151 ST(i + 2, 2) \
152 XO3(i + 3, 3) \
153 ST(i + 3, 3)
154
155 " .align 32 ;\n"
156 " 1: ;\n"
157
158 BLOCK(0)
159 BLOCK(4)
160 BLOCK(8)
161 BLOCK(12)
162
163 " addl $128, %1 ;\n"
164 " addl $128, %2 ;\n"
165 " addl $128, %3 ;\n"
166 " addl $128, %4 ;\n"
167 " decl %0 ;\n"
168 " jnz 1b ;\n"
169 : "+r" (lines),
170 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
171 :
172 : "memory");
173
174 kernel_fpu_end();
175}
176
177
178static void
179xor_pII_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
180 unsigned long *p3, unsigned long *p4, unsigned long *p5)
181{
182 unsigned long lines = bytes >> 7;
183
184 kernel_fpu_begin();
185
186 /* Make sure GCC forgets anything it knows about p4 or p5,
187 such that it won't pass to the asm volatile below a
188 register that is shared with any other variable. That's
189 because we modify p4 and p5 there, but we can't mark them
190 as read/write, otherwise we'd overflow the 10-asm-operands
191 limit of GCC < 3.1. */
192 asm("" : "+r" (p4), "+r" (p5));
193
194 asm volatile(
195#undef BLOCK
196#define BLOCK(i) \
197 LD(i, 0) \
198 LD(i + 1, 1) \
199 LD(i + 2, 2) \
200 LD(i + 3, 3) \
201 XO1(i, 0) \
202 XO1(i + 1, 1) \
203 XO1(i + 2, 2) \
204 XO1(i + 3, 3) \
205 XO2(i, 0) \
206 XO2(i + 1, 1) \
207 XO2(i + 2, 2) \
208 XO2(i + 3, 3) \
209 XO3(i, 0) \
210 XO3(i + 1, 1) \
211 XO3(i + 2, 2) \
212 XO3(i + 3, 3) \
213 XO4(i, 0) \
214 ST(i, 0) \
215 XO4(i + 1, 1) \
216 ST(i + 1, 1) \
217 XO4(i + 2, 2) \
218 ST(i + 2, 2) \
219 XO4(i + 3, 3) \
220 ST(i + 3, 3)
221
222 " .align 32 ;\n"
223 " 1: ;\n"
224
225 BLOCK(0)
226 BLOCK(4)
227 BLOCK(8)
228 BLOCK(12)
229
230 " addl $128, %1 ;\n"
231 " addl $128, %2 ;\n"
232 " addl $128, %3 ;\n"
233 " addl $128, %4 ;\n"
234 " addl $128, %5 ;\n"
235 " decl %0 ;\n"
236 " jnz 1b ;\n"
237 : "+r" (lines),
238 "+r" (p1), "+r" (p2), "+r" (p3)
239 : "r" (p4), "r" (p5)
240 : "memory");
241
242 /* p4 and p5 were modified, and now the variables are dead.
243 Clobber them just to be sure nobody does something stupid
244 like assuming they have some legal value. */
245 asm("" : "=r" (p4), "=r" (p5));
246
247 kernel_fpu_end();
248}
249
250#undef LD
251#undef XO1
252#undef XO2
253#undef XO3
254#undef XO4
255#undef ST
256#undef BLOCK
257
258static void
259xor_p5_mmx_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
260{
261 unsigned long lines = bytes >> 6;
262
263 kernel_fpu_begin();
264
265 asm volatile(
266 " .align 32 ;\n"
267 " 1: ;\n"
268 " movq (%1), %%mm0 ;\n"
269 " movq 8(%1), %%mm1 ;\n"
270 " pxor (%2), %%mm0 ;\n"
271 " movq 16(%1), %%mm2 ;\n"
272 " movq %%mm0, (%1) ;\n"
273 " pxor 8(%2), %%mm1 ;\n"
274 " movq 24(%1), %%mm3 ;\n"
275 " movq %%mm1, 8(%1) ;\n"
276 " pxor 16(%2), %%mm2 ;\n"
277 " movq 32(%1), %%mm4 ;\n"
278 " movq %%mm2, 16(%1) ;\n"
279 " pxor 24(%2), %%mm3 ;\n"
280 " movq 40(%1), %%mm5 ;\n"
281 " movq %%mm3, 24(%1) ;\n"
282 " pxor 32(%2), %%mm4 ;\n"
283 " movq 48(%1), %%mm6 ;\n"
284 " movq %%mm4, 32(%1) ;\n"
285 " pxor 40(%2), %%mm5 ;\n"
286 " movq 56(%1), %%mm7 ;\n"
287 " movq %%mm5, 40(%1) ;\n"
288 " pxor 48(%2), %%mm6 ;\n"
289 " pxor 56(%2), %%mm7 ;\n"
290 " movq %%mm6, 48(%1) ;\n"
291 " movq %%mm7, 56(%1) ;\n"
292
293 " addl $64, %1 ;\n"
294 " addl $64, %2 ;\n"
295 " decl %0 ;\n"
296 " jnz 1b ;\n"
297 : "+r" (lines),
298 "+r" (p1), "+r" (p2)
299 :
300 : "memory");
301
302 kernel_fpu_end();
303}
304
305static void
306xor_p5_mmx_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
307 unsigned long *p3)
308{
309 unsigned long lines = bytes >> 6;
310
311 kernel_fpu_begin();
312
313 asm volatile(
314 " .align 32,0x90 ;\n"
315 " 1: ;\n"
316 " movq (%1), %%mm0 ;\n"
317 " movq 8(%1), %%mm1 ;\n"
318 " pxor (%2), %%mm0 ;\n"
319 " movq 16(%1), %%mm2 ;\n"
320 " pxor 8(%2), %%mm1 ;\n"
321 " pxor (%3), %%mm0 ;\n"
322 " pxor 16(%2), %%mm2 ;\n"
323 " movq %%mm0, (%1) ;\n"
324 " pxor 8(%3), %%mm1 ;\n"
325 " pxor 16(%3), %%mm2 ;\n"
326 " movq 24(%1), %%mm3 ;\n"
327 " movq %%mm1, 8(%1) ;\n"
328 " movq 32(%1), %%mm4 ;\n"
329 " movq 40(%1), %%mm5 ;\n"
330 " pxor 24(%2), %%mm3 ;\n"
331 " movq %%mm2, 16(%1) ;\n"
332 " pxor 32(%2), %%mm4 ;\n"
333 " pxor 24(%3), %%mm3 ;\n"
334 " pxor 40(%2), %%mm5 ;\n"
335 " movq %%mm3, 24(%1) ;\n"
336 " pxor 32(%3), %%mm4 ;\n"
337 " pxor 40(%3), %%mm5 ;\n"
338 " movq 48(%1), %%mm6 ;\n"
339 " movq %%mm4, 32(%1) ;\n"
340 " movq 56(%1), %%mm7 ;\n"
341 " pxor 48(%2), %%mm6 ;\n"
342 " movq %%mm5, 40(%1) ;\n"
343 " pxor 56(%2), %%mm7 ;\n"
344 " pxor 48(%3), %%mm6 ;\n"
345 " pxor 56(%3), %%mm7 ;\n"
346 " movq %%mm6, 48(%1) ;\n"
347 " movq %%mm7, 56(%1) ;\n"
348
349 " addl $64, %1 ;\n"
350 " addl $64, %2 ;\n"
351 " addl $64, %3 ;\n"
352 " decl %0 ;\n"
353 " jnz 1b ;\n"
354 : "+r" (lines),
355 "+r" (p1), "+r" (p2), "+r" (p3)
356 :
357 : "memory" );
358
359 kernel_fpu_end();
360}
361
362static void
363xor_p5_mmx_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
364 unsigned long *p3, unsigned long *p4)
365{
366 unsigned long lines = bytes >> 6;
367
368 kernel_fpu_begin();
369
370 asm volatile(
371 " .align 32,0x90 ;\n"
372 " 1: ;\n"
373 " movq (%1), %%mm0 ;\n"
374 " movq 8(%1), %%mm1 ;\n"
375 " pxor (%2), %%mm0 ;\n"
376 " movq 16(%1), %%mm2 ;\n"
377 " pxor 8(%2), %%mm1 ;\n"
378 " pxor (%3), %%mm0 ;\n"
379 " pxor 16(%2), %%mm2 ;\n"
380 " pxor 8(%3), %%mm1 ;\n"
381 " pxor (%4), %%mm0 ;\n"
382 " movq 24(%1), %%mm3 ;\n"
383 " pxor 16(%3), %%mm2 ;\n"
384 " pxor 8(%4), %%mm1 ;\n"
385 " movq %%mm0, (%1) ;\n"
386 " movq 32(%1), %%mm4 ;\n"
387 " pxor 24(%2), %%mm3 ;\n"
388 " pxor 16(%4), %%mm2 ;\n"
389 " movq %%mm1, 8(%1) ;\n"
390 " movq 40(%1), %%mm5 ;\n"
391 " pxor 32(%2), %%mm4 ;\n"
392 " pxor 24(%3), %%mm3 ;\n"
393 " movq %%mm2, 16(%1) ;\n"
394 " pxor 40(%2), %%mm5 ;\n"
395 " pxor 32(%3), %%mm4 ;\n"
396 " pxor 24(%4), %%mm3 ;\n"
397 " movq %%mm3, 24(%1) ;\n"
398 " movq 56(%1), %%mm7 ;\n"
399 " movq 48(%1), %%mm6 ;\n"
400 " pxor 40(%3), %%mm5 ;\n"
401 " pxor 32(%4), %%mm4 ;\n"
402 " pxor 48(%2), %%mm6 ;\n"
403 " movq %%mm4, 32(%1) ;\n"
404 " pxor 56(%2), %%mm7 ;\n"
405 " pxor 40(%4), %%mm5 ;\n"
406 " pxor 48(%3), %%mm6 ;\n"
407 " pxor 56(%3), %%mm7 ;\n"
408 " movq %%mm5, 40(%1) ;\n"
409 " pxor 48(%4), %%mm6 ;\n"
410 " pxor 56(%4), %%mm7 ;\n"
411 " movq %%mm6, 48(%1) ;\n"
412 " movq %%mm7, 56(%1) ;\n"
413
414 " addl $64, %1 ;\n"
415 " addl $64, %2 ;\n"
416 " addl $64, %3 ;\n"
417 " addl $64, %4 ;\n"
418 " decl %0 ;\n"
419 " jnz 1b ;\n"
420 : "+r" (lines),
421 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
422 :
423 : "memory");
424
425 kernel_fpu_end();
426}
427
428static void
429xor_p5_mmx_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
430 unsigned long *p3, unsigned long *p4, unsigned long *p5)
431{
432 unsigned long lines = bytes >> 6;
433
434 kernel_fpu_begin();
435
436 /* Make sure GCC forgets anything it knows about p4 or p5,
437 such that it won't pass to the asm volatile below a
438 register that is shared with any other variable. That's
439 because we modify p4 and p5 there, but we can't mark them
440 as read/write, otherwise we'd overflow the 10-asm-operands
441 limit of GCC < 3.1. */
442 asm("" : "+r" (p4), "+r" (p5));
443
444 asm volatile(
445 " .align 32,0x90 ;\n"
446 " 1: ;\n"
447 " movq (%1), %%mm0 ;\n"
448 " movq 8(%1), %%mm1 ;\n"
449 " pxor (%2), %%mm0 ;\n"
450 " pxor 8(%2), %%mm1 ;\n"
451 " movq 16(%1), %%mm2 ;\n"
452 " pxor (%3), %%mm0 ;\n"
453 " pxor 8(%3), %%mm1 ;\n"
454 " pxor 16(%2), %%mm2 ;\n"
455 " pxor (%4), %%mm0 ;\n"
456 " pxor 8(%4), %%mm1 ;\n"
457 " pxor 16(%3), %%mm2 ;\n"
458 " movq 24(%1), %%mm3 ;\n"
459 " pxor (%5), %%mm0 ;\n"
460 " pxor 8(%5), %%mm1 ;\n"
461 " movq %%mm0, (%1) ;\n"
462 " pxor 16(%4), %%mm2 ;\n"
463 " pxor 24(%2), %%mm3 ;\n"
464 " movq %%mm1, 8(%1) ;\n"
465 " pxor 16(%5), %%mm2 ;\n"
466 " pxor 24(%3), %%mm3 ;\n"
467 " movq 32(%1), %%mm4 ;\n"
468 " movq %%mm2, 16(%1) ;\n"
469 " pxor 24(%4), %%mm3 ;\n"
470 " pxor 32(%2), %%mm4 ;\n"
471 " movq 40(%1), %%mm5 ;\n"
472 " pxor 24(%5), %%mm3 ;\n"
473 " pxor 32(%3), %%mm4 ;\n"
474 " pxor 40(%2), %%mm5 ;\n"
475 " movq %%mm3, 24(%1) ;\n"
476 " pxor 32(%4), %%mm4 ;\n"
477 " pxor 40(%3), %%mm5 ;\n"
478 " movq 48(%1), %%mm6 ;\n"
479 " movq 56(%1), %%mm7 ;\n"
480 " pxor 32(%5), %%mm4 ;\n"
481 " pxor 40(%4), %%mm5 ;\n"
482 " pxor 48(%2), %%mm6 ;\n"
483 " pxor 56(%2), %%mm7 ;\n"
484 " movq %%mm4, 32(%1) ;\n"
485 " pxor 48(%3), %%mm6 ;\n"
486 " pxor 56(%3), %%mm7 ;\n"
487 " pxor 40(%5), %%mm5 ;\n"
488 " pxor 48(%4), %%mm6 ;\n"
489 " pxor 56(%4), %%mm7 ;\n"
490 " movq %%mm5, 40(%1) ;\n"
491 " pxor 48(%5), %%mm6 ;\n"
492 " pxor 56(%5), %%mm7 ;\n"
493 " movq %%mm6, 48(%1) ;\n"
494 " movq %%mm7, 56(%1) ;\n"
495
496 " addl $64, %1 ;\n"
497 " addl $64, %2 ;\n"
498 " addl $64, %3 ;\n"
499 " addl $64, %4 ;\n"
500 " addl $64, %5 ;\n"
501 " decl %0 ;\n"
502 " jnz 1b ;\n"
503 : "+r" (lines),
504 "+r" (p1), "+r" (p2), "+r" (p3)
505 : "r" (p4), "r" (p5)
506 : "memory");
507
508 /* p4 and p5 were modified, and now the variables are dead.
509 Clobber them just to be sure nobody does something stupid
510 like assuming they have some legal value. */
511 asm("" : "=r" (p4), "=r" (p5));
512
513 kernel_fpu_end();
514}
515
516static struct xor_block_template xor_block_pII_mmx = {
517 .name = "pII_mmx",
518 .do_2 = xor_pII_mmx_2,
519 .do_3 = xor_pII_mmx_3,
520 .do_4 = xor_pII_mmx_4,
521 .do_5 = xor_pII_mmx_5,
522};
523
524static struct xor_block_template xor_block_p5_mmx = {
525 .name = "p5_mmx",
526 .do_2 = xor_p5_mmx_2,
527 .do_3 = xor_p5_mmx_3,
528 .do_4 = xor_p5_mmx_4,
529 .do_5 = xor_p5_mmx_5,
530};
531
532/*
533 * Cache avoiding checksumming functions utilizing KNI instructions
534 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
535 */
536
537#define XMMS_SAVE \
538do { \
539 preempt_disable(); \
540 cr0 = read_cr0(); \
541 clts(); \
542 asm volatile( \
543 "movups %%xmm0,(%0) ;\n\t" \
544 "movups %%xmm1,0x10(%0) ;\n\t" \
545 "movups %%xmm2,0x20(%0) ;\n\t" \
546 "movups %%xmm3,0x30(%0) ;\n\t" \
547 : \
548 : "r" (xmm_save) \
549 : "memory"); \
550} while (0)
551
552#define XMMS_RESTORE \
553do { \
554 asm volatile( \
555 "sfence ;\n\t" \
556 "movups (%0),%%xmm0 ;\n\t" \
557 "movups 0x10(%0),%%xmm1 ;\n\t" \
558 "movups 0x20(%0),%%xmm2 ;\n\t" \
559 "movups 0x30(%0),%%xmm3 ;\n\t" \
560 : \
561 : "r" (xmm_save) \
562 : "memory"); \
563 write_cr0(cr0); \
564 preempt_enable(); \
565} while (0)
566
567#define ALIGN16 __attribute__((aligned(16)))
568
569#define OFFS(x) "16*("#x")"
570#define PF_OFFS(x) "256+16*("#x")"
571#define PF0(x) " prefetchnta "PF_OFFS(x)"(%1) ;\n"
572#define LD(x, y) " movaps "OFFS(x)"(%1), %%xmm"#y" ;\n"
573#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%1) ;\n"
574#define PF1(x) " prefetchnta "PF_OFFS(x)"(%2) ;\n"
575#define PF2(x) " prefetchnta "PF_OFFS(x)"(%3) ;\n"
576#define PF3(x) " prefetchnta "PF_OFFS(x)"(%4) ;\n"
577#define PF4(x) " prefetchnta "PF_OFFS(x)"(%5) ;\n"
578#define PF5(x) " prefetchnta "PF_OFFS(x)"(%6) ;\n"
579#define XO1(x, y) " xorps "OFFS(x)"(%2), %%xmm"#y" ;\n"
580#define XO2(x, y) " xorps "OFFS(x)"(%3), %%xmm"#y" ;\n"
581#define XO3(x, y) " xorps "OFFS(x)"(%4), %%xmm"#y" ;\n"
582#define XO4(x, y) " xorps "OFFS(x)"(%5), %%xmm"#y" ;\n"
583#define XO5(x, y) " xorps "OFFS(x)"(%6), %%xmm"#y" ;\n"
584
585
586static void
587xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
588{
589 unsigned long lines = bytes >> 8;
590 char xmm_save[16*4] ALIGN16;
591 int cr0;
592
593 XMMS_SAVE;
594
595 asm volatile(
596#undef BLOCK
597#define BLOCK(i) \
598 LD(i, 0) \
599 LD(i + 1, 1) \
600 PF1(i) \
601 PF1(i + 2) \
602 LD(i + 2, 2) \
603 LD(i + 3, 3) \
604 PF0(i + 4) \
605 PF0(i + 6) \
606 XO1(i, 0) \
607 XO1(i + 1, 1) \
608 XO1(i + 2, 2) \
609 XO1(i + 3, 3) \
610 ST(i, 0) \
611 ST(i + 1, 1) \
612 ST(i + 2, 2) \
613 ST(i + 3, 3) \
614
615
616 PF0(0)
617 PF0(2)
618
619 " .align 32 ;\n"
620 " 1: ;\n"
621
622 BLOCK(0)
623 BLOCK(4)
624 BLOCK(8)
625 BLOCK(12)
626
627 " addl $256, %1 ;\n"
628 " addl $256, %2 ;\n"
629 " decl %0 ;\n"
630 " jnz 1b ;\n"
631 : "+r" (lines),
632 "+r" (p1), "+r" (p2)
633 :
634 : "memory");
635
636 XMMS_RESTORE;
637}
638
639static void
640xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
641 unsigned long *p3)
642{
643 unsigned long lines = bytes >> 8;
644 char xmm_save[16*4] ALIGN16;
645 int cr0;
646
647 XMMS_SAVE;
648
649 asm volatile(
650#undef BLOCK
651#define BLOCK(i) \
652 PF1(i) \
653 PF1(i + 2) \
654 LD(i,0) \
655 LD(i + 1, 1) \
656 LD(i + 2, 2) \
657 LD(i + 3, 3) \
658 PF2(i) \
659 PF2(i + 2) \
660 PF0(i + 4) \
661 PF0(i + 6) \
662 XO1(i,0) \
663 XO1(i + 1, 1) \
664 XO1(i + 2, 2) \
665 XO1(i + 3, 3) \
666 XO2(i,0) \
667 XO2(i + 1, 1) \
668 XO2(i + 2, 2) \
669 XO2(i + 3, 3) \
670 ST(i,0) \
671 ST(i + 1, 1) \
672 ST(i + 2, 2) \
673 ST(i + 3, 3) \
674
675
676 PF0(0)
677 PF0(2)
678
679 " .align 32 ;\n"
680 " 1: ;\n"
681
682 BLOCK(0)
683 BLOCK(4)
684 BLOCK(8)
685 BLOCK(12)
686
687 " addl $256, %1 ;\n"
688 " addl $256, %2 ;\n"
689 " addl $256, %3 ;\n"
690 " decl %0 ;\n"
691 " jnz 1b ;\n"
692 : "+r" (lines),
693 "+r" (p1), "+r"(p2), "+r"(p3)
694 :
695 : "memory" );
696
697 XMMS_RESTORE;
698}
699
700static void
701xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
702 unsigned long *p3, unsigned long *p4)
703{
704 unsigned long lines = bytes >> 8;
705 char xmm_save[16*4] ALIGN16;
706 int cr0;
707
708 XMMS_SAVE;
709
710 asm volatile(
711#undef BLOCK
712#define BLOCK(i) \
713 PF1(i) \
714 PF1(i + 2) \
715 LD(i,0) \
716 LD(i + 1, 1) \
717 LD(i + 2, 2) \
718 LD(i + 3, 3) \
719 PF2(i) \
720 PF2(i + 2) \
721 XO1(i,0) \
722 XO1(i + 1, 1) \
723 XO1(i + 2, 2) \
724 XO1(i + 3, 3) \
725 PF3(i) \
726 PF3(i + 2) \
727 PF0(i + 4) \
728 PF0(i + 6) \
729 XO2(i,0) \
730 XO2(i + 1, 1) \
731 XO2(i + 2, 2) \
732 XO2(i + 3, 3) \
733 XO3(i,0) \
734 XO3(i + 1, 1) \
735 XO3(i + 2, 2) \
736 XO3(i + 3, 3) \
737 ST(i,0) \
738 ST(i + 1, 1) \
739 ST(i + 2, 2) \
740 ST(i + 3, 3) \
741
742
743 PF0(0)
744 PF0(2)
745
746 " .align 32 ;\n"
747 " 1: ;\n"
748
749 BLOCK(0)
750 BLOCK(4)
751 BLOCK(8)
752 BLOCK(12)
753
754 " addl $256, %1 ;\n"
755 " addl $256, %2 ;\n"
756 " addl $256, %3 ;\n"
757 " addl $256, %4 ;\n"
758 " decl %0 ;\n"
759 " jnz 1b ;\n"
760 : "+r" (lines),
761 "+r" (p1), "+r" (p2), "+r" (p3), "+r" (p4)
762 :
763 : "memory" );
764
765 XMMS_RESTORE;
766}
767
768static void
769xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
770 unsigned long *p3, unsigned long *p4, unsigned long *p5)
771{
772 unsigned long lines = bytes >> 8;
773 char xmm_save[16*4] ALIGN16;
774 int cr0;
775
776 XMMS_SAVE;
777
778 /* Make sure GCC forgets anything it knows about p4 or p5,
779 such that it won't pass to the asm volatile below a
780 register that is shared with any other variable. That's
781 because we modify p4 and p5 there, but we can't mark them
782 as read/write, otherwise we'd overflow the 10-asm-operands
783 limit of GCC < 3.1. */
784 asm("" : "+r" (p4), "+r" (p5));
785
786 asm volatile(
787#undef BLOCK
788#define BLOCK(i) \
789 PF1(i) \
790 PF1(i + 2) \
791 LD(i,0) \
792 LD(i + 1, 1) \
793 LD(i + 2, 2) \
794 LD(i + 3, 3) \
795 PF2(i) \
796 PF2(i + 2) \
797 XO1(i,0) \
798 XO1(i + 1, 1) \
799 XO1(i + 2, 2) \
800 XO1(i + 3, 3) \
801 PF3(i) \
802 PF3(i + 2) \
803 XO2(i,0) \
804 XO2(i + 1, 1) \
805 XO2(i + 2, 2) \
806 XO2(i + 3, 3) \
807 PF4(i) \
808 PF4(i + 2) \
809 PF0(i + 4) \
810 PF0(i + 6) \
811 XO3(i,0) \
812 XO3(i + 1, 1) \
813 XO3(i + 2, 2) \
814 XO3(i + 3, 3) \
815 XO4(i,0) \
816 XO4(i + 1, 1) \
817 XO4(i + 2, 2) \
818 XO4(i + 3, 3) \
819 ST(i,0) \
820 ST(i + 1, 1) \
821 ST(i + 2, 2) \
822 ST(i + 3, 3) \
823
824
825 PF0(0)
826 PF0(2)
827
828 " .align 32 ;\n"
829 " 1: ;\n"
830
831 BLOCK(0)
832 BLOCK(4)
833 BLOCK(8)
834 BLOCK(12)
835
836 " addl $256, %1 ;\n"
837 " addl $256, %2 ;\n"
838 " addl $256, %3 ;\n"
839 " addl $256, %4 ;\n"
840 " addl $256, %5 ;\n"
841 " decl %0 ;\n"
842 " jnz 1b ;\n"
843 : "+r" (lines),
844 "+r" (p1), "+r" (p2), "+r" (p3)
845 : "r" (p4), "r" (p5)
846 : "memory");
847
848 /* p4 and p5 were modified, and now the variables are dead.
849 Clobber them just to be sure nobody does something stupid
850 like assuming they have some legal value. */
851 asm("" : "=r" (p4), "=r" (p5));
852
853 XMMS_RESTORE;
854}
855
856static struct xor_block_template xor_block_pIII_sse = {
857 .name = "pIII_sse",
858 .do_2 = xor_sse_2,
859 .do_3 = xor_sse_3,
860 .do_4 = xor_sse_4,
861 .do_5 = xor_sse_5,
862};
863
864/* Also try the generic routines. */
865#include <asm-generic/xor.h>
866
867#undef XOR_TRY_TEMPLATES
868#define XOR_TRY_TEMPLATES \
869do { \
870 xor_speed(&xor_block_8regs); \
871 xor_speed(&xor_block_8regs_p); \
872 xor_speed(&xor_block_32regs); \
873 xor_speed(&xor_block_32regs_p); \
874 if (cpu_has_xmm) \
875 xor_speed(&xor_block_pIII_sse); \
876 if (cpu_has_mmx) { \
877 xor_speed(&xor_block_pII_mmx); \
878 xor_speed(&xor_block_p5_mmx); \
879 } \
880} while (0)
881
882/* We force the use of the SSE xor block because it can write around L2.
883 We may also be able to load into the L1 only depending on how the cpu
884 deals with a load to a line that is being prefetched. */
885#define XOR_SELECT_TEMPLATE(FASTEST) \
886 (cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
887
888#endif /* _ASM_X86_XOR_32_H */
diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h
new file mode 100644
index 000000000000..1549b5e261f6
--- /dev/null
+++ b/arch/x86/include/asm/xor_64.h
@@ -0,0 +1,361 @@
1#ifndef _ASM_X86_XOR_64_H
2#define _ASM_X86_XOR_64_H
3
4/*
5 * Optimized RAID-5 checksumming functions for MMX and SSE.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2, or (at your option)
10 * any later version.
11 *
12 * You should have received a copy of the GNU General Public License
13 * (for example /usr/src/linux/COPYING); if not, write to the Free
14 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
15 */
16
17
18/*
19 * Cache avoiding checksumming functions utilizing KNI instructions
20 * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)
21 */
22
23/*
24 * Based on
25 * High-speed RAID5 checksumming functions utilizing SSE instructions.
26 * Copyright (C) 1998 Ingo Molnar.
27 */
28
29/*
30 * x86-64 changes / gcc fixes from Andi Kleen.
31 * Copyright 2002 Andi Kleen, SuSE Labs.
32 *
33 * This hasn't been optimized for the hammer yet, but there are likely
34 * no advantages to be gained from x86-64 here anyway.
35 */
36
37typedef struct {
38 unsigned long a, b;
39} __attribute__((aligned(16))) xmm_store_t;
40
41/* Doesn't use gcc to save the XMM registers, because there is no easy way to
42 tell it to do a clts before the register saving. */
43#define XMMS_SAVE \
44do { \
45 preempt_disable(); \
46 asm volatile( \
47 "movq %%cr0,%0 ;\n\t" \
48 "clts ;\n\t" \
49 "movups %%xmm0,(%1) ;\n\t" \
50 "movups %%xmm1,0x10(%1) ;\n\t" \
51 "movups %%xmm2,0x20(%1) ;\n\t" \
52 "movups %%xmm3,0x30(%1) ;\n\t" \
53 : "=&r" (cr0) \
54 : "r" (xmm_save) \
55 : "memory"); \
56} while (0)
57
58#define XMMS_RESTORE \
59do { \
60 asm volatile( \
61 "sfence ;\n\t" \
62 "movups (%1),%%xmm0 ;\n\t" \
63 "movups 0x10(%1),%%xmm1 ;\n\t" \
64 "movups 0x20(%1),%%xmm2 ;\n\t" \
65 "movups 0x30(%1),%%xmm3 ;\n\t" \
66 "movq %0,%%cr0 ;\n\t" \
67 : \
68 : "r" (cr0), "r" (xmm_save) \
69 : "memory"); \
70 preempt_enable(); \
71} while (0)
72
73#define OFFS(x) "16*("#x")"
74#define PF_OFFS(x) "256+16*("#x")"
75#define PF0(x) " prefetchnta "PF_OFFS(x)"(%[p1]) ;\n"
76#define LD(x, y) " movaps "OFFS(x)"(%[p1]), %%xmm"#y" ;\n"
77#define ST(x, y) " movaps %%xmm"#y", "OFFS(x)"(%[p1]) ;\n"
78#define PF1(x) " prefetchnta "PF_OFFS(x)"(%[p2]) ;\n"
79#define PF2(x) " prefetchnta "PF_OFFS(x)"(%[p3]) ;\n"
80#define PF3(x) " prefetchnta "PF_OFFS(x)"(%[p4]) ;\n"
81#define PF4(x) " prefetchnta "PF_OFFS(x)"(%[p5]) ;\n"
82#define PF5(x) " prefetchnta "PF_OFFS(x)"(%[p6]) ;\n"
83#define XO1(x, y) " xorps "OFFS(x)"(%[p2]), %%xmm"#y" ;\n"
84#define XO2(x, y) " xorps "OFFS(x)"(%[p3]), %%xmm"#y" ;\n"
85#define XO3(x, y) " xorps "OFFS(x)"(%[p4]), %%xmm"#y" ;\n"
86#define XO4(x, y) " xorps "OFFS(x)"(%[p5]), %%xmm"#y" ;\n"
87#define XO5(x, y) " xorps "OFFS(x)"(%[p6]), %%xmm"#y" ;\n"
88
89
90static void
91xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)
92{
93 unsigned int lines = bytes >> 8;
94 unsigned long cr0;
95 xmm_store_t xmm_save[4];
96
97 XMMS_SAVE;
98
99 asm volatile(
100#undef BLOCK
101#define BLOCK(i) \
102 LD(i, 0) \
103 LD(i + 1, 1) \
104 PF1(i) \
105 PF1(i + 2) \
106 LD(i + 2, 2) \
107 LD(i + 3, 3) \
108 PF0(i + 4) \
109 PF0(i + 6) \
110 XO1(i, 0) \
111 XO1(i + 1, 1) \
112 XO1(i + 2, 2) \
113 XO1(i + 3, 3) \
114 ST(i, 0) \
115 ST(i + 1, 1) \
116 ST(i + 2, 2) \
117 ST(i + 3, 3) \
118
119
120 PF0(0)
121 PF0(2)
122
123 " .align 32 ;\n"
124 " 1: ;\n"
125
126 BLOCK(0)
127 BLOCK(4)
128 BLOCK(8)
129 BLOCK(12)
130
131 " addq %[inc], %[p1] ;\n"
132 " addq %[inc], %[p2] ;\n"
133 " decl %[cnt] ; jnz 1b"
134 : [p1] "+r" (p1), [p2] "+r" (p2), [cnt] "+r" (lines)
135 : [inc] "r" (256UL)
136 : "memory");
137
138 XMMS_RESTORE;
139}
140
141static void
142xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,
143 unsigned long *p3)
144{
145 unsigned int lines = bytes >> 8;
146 xmm_store_t xmm_save[4];
147 unsigned long cr0;
148
149 XMMS_SAVE;
150
151 asm volatile(
152#undef BLOCK
153#define BLOCK(i) \
154 PF1(i) \
155 PF1(i + 2) \
156 LD(i, 0) \
157 LD(i + 1, 1) \
158 LD(i + 2, 2) \
159 LD(i + 3, 3) \
160 PF2(i) \
161 PF2(i + 2) \
162 PF0(i + 4) \
163 PF0(i + 6) \
164 XO1(i, 0) \
165 XO1(i + 1, 1) \
166 XO1(i + 2, 2) \
167 XO1(i + 3, 3) \
168 XO2(i, 0) \
169 XO2(i + 1, 1) \
170 XO2(i + 2, 2) \
171 XO2(i + 3, 3) \
172 ST(i, 0) \
173 ST(i + 1, 1) \
174 ST(i + 2, 2) \
175 ST(i + 3, 3) \
176
177
178 PF0(0)
179 PF0(2)
180
181 " .align 32 ;\n"
182 " 1: ;\n"
183
184 BLOCK(0)
185 BLOCK(4)
186 BLOCK(8)
187 BLOCK(12)
188
189 " addq %[inc], %[p1] ;\n"
190 " addq %[inc], %[p2] ;\n"
191 " addq %[inc], %[p3] ;\n"
192 " decl %[cnt] ; jnz 1b"
193 : [cnt] "+r" (lines),
194 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)
195 : [inc] "r" (256UL)
196 : "memory");
197 XMMS_RESTORE;
198}
199
200static void
201xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,
202 unsigned long *p3, unsigned long *p4)
203{
204 unsigned int lines = bytes >> 8;
205 xmm_store_t xmm_save[4];
206 unsigned long cr0;
207
208 XMMS_SAVE;
209
210 asm volatile(
211#undef BLOCK
212#define BLOCK(i) \
213 PF1(i) \
214 PF1(i + 2) \
215 LD(i, 0) \
216 LD(i + 1, 1) \
217 LD(i + 2, 2) \
218 LD(i + 3, 3) \
219 PF2(i) \
220 PF2(i + 2) \
221 XO1(i, 0) \
222 XO1(i + 1, 1) \
223 XO1(i + 2, 2) \
224 XO1(i + 3, 3) \
225 PF3(i) \
226 PF3(i + 2) \
227 PF0(i + 4) \
228 PF0(i + 6) \
229 XO2(i, 0) \
230 XO2(i + 1, 1) \
231 XO2(i + 2, 2) \
232 XO2(i + 3, 3) \
233 XO3(i, 0) \
234 XO3(i + 1, 1) \
235 XO3(i + 2, 2) \
236 XO3(i + 3, 3) \
237 ST(i, 0) \
238 ST(i + 1, 1) \
239 ST(i + 2, 2) \
240 ST(i + 3, 3) \
241
242
243 PF0(0)
244 PF0(2)
245
246 " .align 32 ;\n"
247 " 1: ;\n"
248
249 BLOCK(0)
250 BLOCK(4)
251 BLOCK(8)
252 BLOCK(12)
253
254 " addq %[inc], %[p1] ;\n"
255 " addq %[inc], %[p2] ;\n"
256 " addq %[inc], %[p3] ;\n"
257 " addq %[inc], %[p4] ;\n"
258 " decl %[cnt] ; jnz 1b"
259 : [cnt] "+c" (lines),
260 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4)
261 : [inc] "r" (256UL)
262 : "memory" );
263
264 XMMS_RESTORE;
265}
266
267static void
268xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,
269 unsigned long *p3, unsigned long *p4, unsigned long *p5)
270{
271 unsigned int lines = bytes >> 8;
272 xmm_store_t xmm_save[4];
273 unsigned long cr0;
274
275 XMMS_SAVE;
276
277 asm volatile(
278#undef BLOCK
279#define BLOCK(i) \
280 PF1(i) \
281 PF1(i + 2) \
282 LD(i, 0) \
283 LD(i + 1, 1) \
284 LD(i + 2, 2) \
285 LD(i + 3, 3) \
286 PF2(i) \
287 PF2(i + 2) \
288 XO1(i, 0) \
289 XO1(i + 1, 1) \
290 XO1(i + 2, 2) \
291 XO1(i + 3, 3) \
292 PF3(i) \
293 PF3(i + 2) \
294 XO2(i, 0) \
295 XO2(i + 1, 1) \
296 XO2(i + 2, 2) \
297 XO2(i + 3, 3) \
298 PF4(i) \
299 PF4(i + 2) \
300 PF0(i + 4) \
301 PF0(i + 6) \
302 XO3(i, 0) \
303 XO3(i + 1, 1) \
304 XO3(i + 2, 2) \
305 XO3(i + 3, 3) \
306 XO4(i, 0) \
307 XO4(i + 1, 1) \
308 XO4(i + 2, 2) \
309 XO4(i + 3, 3) \
310 ST(i, 0) \
311 ST(i + 1, 1) \
312 ST(i + 2, 2) \
313 ST(i + 3, 3) \
314
315
316 PF0(0)
317 PF0(2)
318
319 " .align 32 ;\n"
320 " 1: ;\n"
321
322 BLOCK(0)
323 BLOCK(4)
324 BLOCK(8)
325 BLOCK(12)
326
327 " addq %[inc], %[p1] ;\n"
328 " addq %[inc], %[p2] ;\n"
329 " addq %[inc], %[p3] ;\n"
330 " addq %[inc], %[p4] ;\n"
331 " addq %[inc], %[p5] ;\n"
332 " decl %[cnt] ; jnz 1b"
333 : [cnt] "+c" (lines),
334 [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3), [p4] "+r" (p4),
335 [p5] "+r" (p5)
336 : [inc] "r" (256UL)
337 : "memory");
338
339 XMMS_RESTORE;
340}
341
342static struct xor_block_template xor_block_sse = {
343 .name = "generic_sse",
344 .do_2 = xor_sse_2,
345 .do_3 = xor_sse_3,
346 .do_4 = xor_sse_4,
347 .do_5 = xor_sse_5,
348};
349
350#undef XOR_TRY_TEMPLATES
351#define XOR_TRY_TEMPLATES \
352do { \
353 xor_speed(&xor_block_sse); \
354} while (0)
355
356/* We force the use of the SSE xor block because it can write around L2.
357 We may also be able to load into the L1 only depending on how the cpu
358 deals with a load to a line that is being prefetched. */
359#define XOR_SELECT_TEMPLATE(FASTEST) (&xor_block_sse)
360
361#endif /* _ASM_X86_XOR_64_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
new file mode 100644
index 000000000000..08e9a1ac07a9
--- /dev/null
+++ b/arch/x86/include/asm/xsave.h
@@ -0,0 +1,118 @@
1#ifndef __ASM_X86_XSAVE_H
2#define __ASM_X86_XSAVE_H
3
4#include <linux/types.h>
5#include <asm/processor.h>
6#include <asm/i387.h>
7
8#define XSTATE_FP 0x1
9#define XSTATE_SSE 0x2
10
11#define XSTATE_FPSSE (XSTATE_FP | XSTATE_SSE)
12
13#define FXSAVE_SIZE 512
14
15/*
16 * These are the features that the OS can handle currently.
17 */
18#define XCNTXT_MASK (XSTATE_FP | XSTATE_SSE)
19
20#ifdef CONFIG_X86_64
21#define REX_PREFIX "0x48, "
22#else
23#define REX_PREFIX
24#endif
25
26extern unsigned int xstate_size;
27extern u64 pcntxt_mask;
28extern struct xsave_struct *init_xstate_buf;
29
30extern void xsave_cntxt_init(void);
31extern void xsave_init(void);
32extern int init_fpu(struct task_struct *child);
33extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
34 void __user *fpstate,
35 struct _fpx_sw_bytes *sw);
36
37static inline int xrstor_checking(struct xsave_struct *fx)
38{
39 int err;
40
41 asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
42 "2:\n"
43 ".section .fixup,\"ax\"\n"
44 "3: movl $-1,%[err]\n"
45 " jmp 2b\n"
46 ".previous\n"
47 _ASM_EXTABLE(1b, 3b)
48 : [err] "=r" (err)
49 : "D" (fx), "m" (*fx), "a" (-1), "d" (-1), "0" (0)
50 : "memory");
51
52 return err;
53}
54
55static inline int xsave_user(struct xsave_struct __user *buf)
56{
57 int err;
58 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n"
59 "2:\n"
60 ".section .fixup,\"ax\"\n"
61 "3: movl $-1,%[err]\n"
62 " jmp 2b\n"
63 ".previous\n"
64 ".section __ex_table,\"a\"\n"
65 _ASM_ALIGN "\n"
66 _ASM_PTR "1b,3b\n"
67 ".previous"
68 : [err] "=r" (err)
69 : "D" (buf), "a" (-1), "d" (-1), "0" (0)
70 : "memory");
71 if (unlikely(err) && __clear_user(buf, xstate_size))
72 err = -EFAULT;
73 /* No need to clear here because the caller clears USED_MATH */
74 return err;
75}
76
77static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)
78{
79 int err;
80 struct xsave_struct *xstate = ((__force struct xsave_struct *)buf);
81 u32 lmask = mask;
82 u32 hmask = mask >> 32;
83
84 __asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n"
85 "2:\n"
86 ".section .fixup,\"ax\"\n"
87 "3: movl $-1,%[err]\n"
88 " jmp 2b\n"
89 ".previous\n"
90 ".section __ex_table,\"a\"\n"
91 _ASM_ALIGN "\n"
92 _ASM_PTR "1b,3b\n"
93 ".previous"
94 : [err] "=r" (err)
95 : "D" (xstate), "a" (lmask), "d" (hmask), "0" (0)
96 : "memory"); /* memory required? */
97 return err;
98}
99
100static inline void xrstor_state(struct xsave_struct *fx, u64 mask)
101{
102 u32 lmask = mask;
103 u32 hmask = mask >> 32;
104
105 asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t"
106 : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask)
107 : "memory");
108}
109
110static inline void xsave(struct task_struct *tsk)
111{
112 /* This, however, we can work around by forcing the compiler to select
113 an addressing mode that doesn't require extended registers. */
114 __asm__ __volatile__(".byte " REX_PREFIX "0x0f,0xae,0x27"
115 : : "D" (&(tsk->thread.xstate->xsave)),
116 "a" (-1), "d"(-1) : "memory");
117}
118#endif
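The xsave/xrstor instructions take the requested feature mask in edx:eax, which is why xrestore_user() and xrstor_state() above split the 64-bit mask into lmask/hmask halves. A standalone sketch of that split (demo values only):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t mask = 0x3;                     /* XSTATE_FP | XSTATE_SSE */
        uint32_t lmask = (uint32_t)mask;         /* loaded into eax */
        uint32_t hmask = (uint32_t)(mask >> 32); /* loaded into edx */

        printf("eax=%#x edx=%#x\n", lmask, hmask);
        return 0;
}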
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index da140611bb57..d7e5a58ee22f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -7,9 +7,10 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu
7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) 7CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE)
8 8
9ifdef CONFIG_FTRACE 9ifdef CONFIG_FTRACE
10# Do not profile debug utilities 10# Do not profile debug and lowlevel utilities
11CFLAGS_REMOVE_tsc.o = -pg 11CFLAGS_REMOVE_tsc.o = -pg
12CFLAGS_REMOVE_rtc.o = -pg 12CFLAGS_REMOVE_rtc.o = -pg
13CFLAGS_REMOVE_paravirt-spinlocks.o = -pg
13endif 14endif
14 15
15# 16#
@@ -22,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp)
22CFLAGS_tsc.o := $(nostackp) 23CFLAGS_tsc.o := $(nostackp)
23 24
24obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o 25obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o
25obj-y += traps_$(BITS).o irq_$(BITS).o 26obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
26obj-y += time_$(BITS).o ioport.o ldt.o 27obj-y += time_$(BITS).o ioport.o ldt.o
27obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o 28obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o
28obj-$(CONFIG_X86_VISWS) += visws_quirks.o 29obj-$(CONFIG_X86_VISWS) += visws_quirks.o
@@ -37,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o
37 38
38obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 39obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
39obj-y += process.o 40obj-y += process.o
40obj-y += i387.o 41obj-y += i387.o xsave.o
41obj-y += ptrace.o 42obj-y += ptrace.o
42obj-y += ds.o 43obj-y += ds.o
43obj-$(CONFIG_X86_32) += tls.o 44obj-$(CONFIG_X86_32) += tls.o
@@ -50,7 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o
50obj-$(CONFIG_MCA) += mca_32.o 51obj-$(CONFIG_MCA) += mca_32.o
51obj-$(CONFIG_X86_MSR) += msr.o 52obj-$(CONFIG_X86_MSR) += msr.o
52obj-$(CONFIG_X86_CPUID) += cpuid.o 53obj-$(CONFIG_X86_CPUID) += cpuid.o
53obj-$(CONFIG_MICROCODE) += microcode.o
54obj-$(CONFIG_PCI) += early-quirks.o 54obj-$(CONFIG_PCI) += early-quirks.o
55apm-y := apm_32.o 55apm-y := apm_32.o
56obj-$(CONFIG_APM) += apm.o 56obj-$(CONFIG_APM) += apm.o
@@ -60,14 +60,15 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o
60obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o 60obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o
61obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o 61obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
62obj-$(CONFIG_X86_MPPARSE) += mpparse.o 62obj-$(CONFIG_X86_MPPARSE) += mpparse.o
63obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o 63obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o
64obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o 64obj-$(CONFIG_X86_IO_APIC) += io_apic.o
65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 65obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o 66obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o
67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o 67obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o
68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o 68obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 69obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
70obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 70obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
71obj-$(CONFIG_X86_ES7000) += es7000_32.o
71obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o 72obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o
72obj-y += vsmp_64.o 73obj-y += vsmp_64.o
73obj-$(CONFIG_KPROBES) += kprobes.o 74obj-$(CONFIG_KPROBES) += kprobes.o
@@ -88,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
88obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o 89obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
89obj-$(CONFIG_KVM_GUEST) += kvm.o 90obj-$(CONFIG_KVM_GUEST) += kvm.o
90obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 91obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
91obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 92obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o
92obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o 93obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
93 94
94obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 95obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
@@ -98,10 +99,18 @@ scx200-y += scx200_32.o
98 99
99obj-$(CONFIG_OLPC) += olpc.o 100obj-$(CONFIG_OLPC) += olpc.o
100 101
102microcode-y := microcode_core.o
103microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
104microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
105obj-$(CONFIG_MICROCODE) += microcode.o
106
101### 107###
102# 64 bit specific files 108# 64 bit specific files
103ifeq ($(CONFIG_X86_64),y) 109ifeq ($(CONFIG_X86_64),y)
104 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o 110 obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o
111 obj-y += bios_uv.o uv_irq.o uv_sysfs.o
112 obj-y += genx2apic_cluster.o
113 obj-y += genx2apic_phys.o
105 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o 114 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
106 obj-$(CONFIG_AUDIT) += audit_64.o 115 obj-$(CONFIG_AUDIT) += audit_64.o
107 116
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index f489d7a9be92..8c1f76abae9e 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled);
58#ifdef CONFIG_X86_64 58#ifdef CONFIG_X86_64
59 59
60#include <asm/proto.h> 60#include <asm/proto.h>
61#include <asm/genapic.h>
62 61
63#else /* X86 */ 62#else /* X86 */
64 63
@@ -154,10 +153,21 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size)
154} 153}
155 154
156#ifdef CONFIG_PCI_MMCONFIG 155#ifdef CONFIG_PCI_MMCONFIG
156
157static int acpi_mcfg_64bit_base_addr __initdata = FALSE;
158
157/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 159/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */
158struct acpi_mcfg_allocation *pci_mmcfg_config; 160struct acpi_mcfg_allocation *pci_mmcfg_config;
159int pci_mmcfg_config_num; 161int pci_mmcfg_config_num;
160 162
163static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
164{
165 if (!strcmp(mcfg->header.oem_id, "SGI"))
166 acpi_mcfg_64bit_base_addr = TRUE;
167
168 return 0;
169}
170
161int __init acpi_parse_mcfg(struct acpi_table_header *header) 171int __init acpi_parse_mcfg(struct acpi_table_header *header)
162{ 172{
163 struct acpi_table_mcfg *mcfg; 173 struct acpi_table_mcfg *mcfg;
@@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header)
190 } 200 }
191 201
192 memcpy(pci_mmcfg_config, &mcfg[1], config_size); 202 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
203
204 acpi_mcfg_oem_check(mcfg);
205
193 for (i = 0; i < pci_mmcfg_config_num; ++i) { 206 for (i = 0; i < pci_mmcfg_config_num; ++i) {
194 if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { 207 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
208 !acpi_mcfg_64bit_base_addr) {
195 printk(KERN_ERR PREFIX 209 printk(KERN_ERR PREFIX
196 "MMCONFIG not in low 4GB of memory\n"); 210 "MMCONFIG not in low 4GB of memory\n");
197 kfree(pci_mmcfg_config); 211 kfree(pci_mmcfg_config);
@@ -239,10 +253,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
239 return; 253 return;
240 } 254 }
241 255
242#ifdef CONFIG_X86_32
243 if (boot_cpu_physical_apicid != -1U) 256 if (boot_cpu_physical_apicid != -1U)
244 ver = apic_version[boot_cpu_physical_apicid]; 257 ver = apic_version[boot_cpu_physical_apicid];
245#endif
246 258
247 generic_processor_info(id, ver); 259 generic_processor_info(id, ver);
248} 260}
@@ -761,11 +773,9 @@ static void __init acpi_register_lapic_address(unsigned long address)
761 773
762 set_fixmap_nocache(FIX_APIC_BASE, address); 774 set_fixmap_nocache(FIX_APIC_BASE, address);
763 if (boot_cpu_physical_apicid == -1U) { 775 if (boot_cpu_physical_apicid == -1U) {
764 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 776 boot_cpu_physical_apicid = read_apic_id();
765#ifdef CONFIG_X86_32
766 apic_version[boot_cpu_physical_apicid] = 777 apic_version[boot_cpu_physical_apicid] =
767 GET_APIC_VERSION(apic_read(APIC_LVR)); 778 GET_APIC_VERSION(apic_read(APIC_LVR));
768#endif
769 } 779 }
770} 780}
771 781
@@ -1021,7 +1031,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1021 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; 1031 mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA;
1022#endif 1032#endif
1023 set_bit(MP_ISA_BUS, mp_bus_not_pci); 1033 set_bit(MP_ISA_BUS, mp_bus_not_pci);
1024 Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); 1034 pr_debug("Bus #%d is ISA\n", MP_ISA_BUS);
1025 1035
1026#ifdef CONFIG_X86_ES7000 1036#ifdef CONFIG_X86_ES7000
1027 /* 1037 /*
@@ -1127,8 +1137,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
1127 return gsi; 1137 return gsi;
1128 } 1138 }
1129 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { 1139 if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
1130 Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", 1140 pr_debug("Pin %d-%d already programmed\n",
1131 mp_ioapic_routing[ioapic].apic_id, ioapic_pin); 1141 mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
1132#ifdef CONFIG_X86_32 1142#ifdef CONFIG_X86_32
1133 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); 1143 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
1134#else 1144#else
@@ -1247,7 +1257,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1247 1257
1248 count = 1258 count =
1249 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, 1259 acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr,
1250 NR_IRQ_VECTORS); 1260 nr_irqs);
1251 if (count < 0) { 1261 if (count < 0) {
1252 printk(KERN_ERR PREFIX 1262 printk(KERN_ERR PREFIX
1253 "Error parsing interrupt source overrides entry\n"); 1263 "Error parsing interrupt source overrides entry\n");
@@ -1267,7 +1277,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1267 1277
1268 count = 1278 count =
1269 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, 1279 acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src,
1270 NR_IRQ_VECTORS); 1280 nr_irqs);
1271 if (count < 0) { 1281 if (count < 0) {
1272 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); 1282 printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n");
1273 /* TBD: Cleanup to allow fallback to MPS */ 1283 /* TBD: Cleanup to allow fallback to MPS */
@@ -1337,7 +1347,9 @@ static void __init acpi_process_madt(void)
1337 acpi_ioapic = 1; 1347 acpi_ioapic = 1;
1338 1348
1339 smp_found_config = 1; 1349 smp_found_config = 1;
1350#ifdef CONFIG_X86_32
1340 setup_apic_routing(); 1351 setup_apic_routing();
1352#endif
1341 } 1353 }
1342 } 1354 }
1343 if (error == -EINVAL) { 1355 if (error == -EINVAL) {
@@ -1407,8 +1419,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d)
1407 */ 1419 */
1408static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1420static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
1409{ 1421{
1410 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); 1422 /*
1411 acpi_skip_timer_override = 1; 1423 * The ati_ixp4x0_rev() early PCI quirk should have set
1424 * the acpi_skip_timer_override flag already:
1425 */
1426 if (!acpi_skip_timer_override) {
1427 WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n");
1428 pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n",
1429 d->ident);
1430 acpi_skip_timer_override = 1;
1431 }
1412 return 0; 1432 return 0;
1413} 1433}
1414 1434
@@ -1579,6 +1599,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1579 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), 1599 DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"),
1580 }, 1600 },
1581 }, 1601 },
1602 {}
1603};
1604
1605/* second table for DMI checks that should run after early-quirks */
1606static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1582 /* 1607 /*
1583 * HP laptops which use a DSDT reporting as HP/SB400/10000, 1608 * HP laptops which use a DSDT reporting as HP/SB400/10000,
1584 * which includes some code which overrides all temperature 1609 * which includes some code which overrides all temperature
@@ -1591,6 +1616,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1591 */ 1616 */
1592 { 1617 {
1593 .callback = dmi_ignore_irq0_timer_override, 1618 .callback = dmi_ignore_irq0_timer_override,
1619 .ident = "HP nx6115 laptop",
1620 .matches = {
1621 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1622 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"),
1623 },
1624 },
1625 {
1626 .callback = dmi_ignore_irq0_timer_override,
1594 .ident = "HP NX6125 laptop", 1627 .ident = "HP NX6125 laptop",
1595 .matches = { 1628 .matches = {
1596 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), 1629 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
@@ -1605,6 +1638,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1605 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), 1638 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"),
1606 }, 1639 },
1607 }, 1640 },
1641 {
1642 .callback = dmi_ignore_irq0_timer_override,
1643 .ident = "HP 6715b laptop",
1644 .matches = {
1645 DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"),
1646 DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"),
1647 },
1648 },
1608 {} 1649 {}
1609}; 1650};
1610 1651
@@ -1691,6 +1732,9 @@ int __init early_acpi_boot_init(void)
1691 1732
1692int __init acpi_boot_init(void) 1733int __init acpi_boot_init(void)
1693{ 1734{
1735 /* those are executed after early-quirks are executed */
1736 dmi_check_system(acpi_dmi_table_late);
1737
1694 /* 1738 /*
1695 * If acpi_disabled, bail out 1739 * If acpi_disabled, bail out
1696 * One exception: acpi=ht continues far enough to enumerate LAPICs 1740 * One exception: acpi=ht continues far enough to enumerate LAPICs
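The acpi_dmi_table_late[] split above relies on the usual DMI pattern: a zero-terminated table of matches, each carrying a callback that runs when the machine's DMI strings match. A simplified stand-alone sketch of that pattern (the struct layout and matching rule here are stand-ins, not the kernel's dmi_system_id API):

#include <stdio.h>
#include <string.h>

struct dmi_entry {
        int (*callback)(const struct dmi_entry *d);
        const char *ident;
        const char *vendor;
        const char *product;
};

static int ignore_irq0_override(const struct dmi_entry *d)
{
        printf("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident);
        return 0;
}

static const struct dmi_entry table_late[] = {
        { ignore_irq0_override, "HP nx6115 laptop",
          "Hewlett-Packard", "HP Compaq nx6115" },
        { ignore_irq0_override, "HP NX6125 laptop",
          "Hewlett-Packard", "HP Compaq nx6125" },
        { 0 }
};

/* Poor man's dmi_check_system(): run the callback of every matching entry. */
static void check_system(const struct dmi_entry *t,
                         const char *vendor, const char *product)
{
        for (; t->ident; t++)
                if (!strcmp(t->vendor, vendor) && !strcmp(t->product, product))
                        t->callback(t);
}

int main(void)
{
        check_system(table_late, "Hewlett-Packard", "HP Compaq nx6115");
        return 0;
}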
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 868de3d5c39d..806b4e9051b4 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -9,6 +9,8 @@
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/dmi.h> 10#include <linux/dmi.h>
11#include <linux/cpumask.h> 11#include <linux/cpumask.h>
12#include <asm/segment.h>
13#include <asm/desc.h>
12 14
13#include "realmode/wakeup.h" 15#include "realmode/wakeup.h"
14#include "sleep.h" 16#include "sleep.h"
@@ -19,19 +21,10 @@ unsigned long acpi_realmode_flags;
19/* address in low memory of the wakeup routine. */ 21/* address in low memory of the wakeup routine. */
20static unsigned long acpi_realmode; 22static unsigned long acpi_realmode;
21 23
22#ifdef CONFIG_64BIT 24#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
23static char temp_stack[10240]; 25static char temp_stack[4096];
24#endif 26#endif
25 27
26/* XXX: this macro should move to asm-x86/segment.h and be shared with the
27 boot code... */
28#define GDT_ENTRY(flags, base, limit) \
29 (((u64)(base & 0xff000000) << 32) | \
30 ((u64)flags << 40) | \
31 ((u64)(limit & 0x00ff0000) << 32) | \
32 ((u64)(base & 0x00ffffff) << 16) | \
33 ((u64)(limit & 0x0000ffff)))
34
35/** 28/**
36 * acpi_save_state_mem - save kernel state 29 * acpi_save_state_mem - save kernel state
37 * 30 *
@@ -94,7 +87,7 @@ int acpi_save_state_mem(void)
94#endif /* !CONFIG_64BIT */ 87#endif /* !CONFIG_64BIT */
95 88
96 header->pmode_cr0 = read_cr0(); 89 header->pmode_cr0 = read_cr0();
97 header->pmode_cr4 = read_cr4(); 90 header->pmode_cr4 = read_cr4_safe();
98 header->realmode_flags = acpi_realmode_flags; 91 header->realmode_flags = acpi_realmode_flags;
99 header->real_magic = 0x12345678; 92 header->real_magic = 0x12345678;
100 93
@@ -105,7 +98,9 @@ int acpi_save_state_mem(void)
105#else /* CONFIG_64BIT */ 98#else /* CONFIG_64BIT */
106 header->trampoline_segment = setup_trampoline() >> 4; 99 header->trampoline_segment = setup_trampoline() >> 4;
107#ifdef CONFIG_SMP 100#ifdef CONFIG_SMP
108 stack_start.sp = temp_stack + 4096; 101 stack_start.sp = temp_stack + sizeof(temp_stack);
102 early_gdt_descr.address =
103 (unsigned long)get_cpu_gdt_table(smp_processor_id());
109#endif 104#endif
110 initial_code = (unsigned long)wakeup_long64; 105 initial_code = (unsigned long)wakeup_long64;
111 saved_magic = 0x123456789abcdef0; 106 saved_magic = 0x123456789abcdef0;
@@ -158,6 +153,10 @@ static int __init acpi_sleep_setup(char *str)
158 acpi_realmode_flags |= 2; 153 acpi_realmode_flags |= 2;
159 if (strncmp(str, "s3_beep", 7) == 0) 154 if (strncmp(str, "s3_beep", 7) == 0)
160 acpi_realmode_flags |= 4; 155 acpi_realmode_flags |= 4;
156#ifdef CONFIG_HIBERNATION
157 if (strncmp(str, "s4_nohwsig", 10) == 0)
158 acpi_no_s4_hw_signature();
159#endif
161 if (strncmp(str, "old_ordering", 12) == 0) 160 if (strncmp(str, "old_ordering", 12) == 0)
162 acpi_old_suspend_ordering(); 161 acpi_old_suspend_ordering();
163 str = strchr(str, ','); 162 str = strchr(str, ',');
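The GDT_ENTRY() macro deleted above now lives in <asm/segment.h>; it packs the scattered base/limit/flags fields of an x86 segment descriptor into a single 64-bit value. Reproduced here with a small host-side demo (flat 32-bit kernel code segment):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t u64;

#define GDT_ENTRY(flags, base, limit)                   \
        ((((u64)(base) & 0xff000000) << 32) |           \
         ((u64)(flags) << 40) |                         \
         (((u64)(limit) & 0x00ff0000) << 32) |          \
         (((u64)(base) & 0x00ffffff) << 16) |           \
         ((u64)(limit) & 0x0000ffff))

int main(void)
{
        /* flags 0xc09b, base 0, limit 0xfffff -> 0x00cf9b000000ffff */
        printf("%#018llx\n",
               (unsigned long long)GDT_ENTRY(0xc09b, 0, 0xfffff));
        return 0;
}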
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 2763cb37b553..a84ac7b570e6 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = {
145extern char __vsyscall_0; 145extern char __vsyscall_0;
146const unsigned char *const *find_nop_table(void) 146const unsigned char *const *find_nop_table(void)
147{ 147{
148 return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || 148 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL &&
149 boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; 149 boot_cpu_has(X86_FEATURE_NOPL))
150 return p6_nops;
151 else
152 return k8_nops;
150} 153}
151 154
152#else /* CONFIG_X86_64 */ 155#else /* CONFIG_X86_64 */
153 156
154static const struct nop {
155 int cpuid;
156 const unsigned char *const *noptable;
157} noptypes[] = {
158 { X86_FEATURE_K8, k8_nops },
159 { X86_FEATURE_K7, k7_nops },
160 { X86_FEATURE_P4, p6_nops },
161 { X86_FEATURE_P3, p6_nops },
162 { -1, NULL }
163};
164
165const unsigned char *const *find_nop_table(void) 157const unsigned char *const *find_nop_table(void)
166{ 158{
167 const unsigned char *const *noptable = intel_nops; 159 if (boot_cpu_has(X86_FEATURE_K8))
168 int i; 160 return k8_nops;
169 161 else if (boot_cpu_has(X86_FEATURE_K7))
170 for (i = 0; noptypes[i].cpuid >= 0; i++) { 162 return k7_nops;
171 if (boot_cpu_has(noptypes[i].cpuid)) { 163 else if (boot_cpu_has(X86_FEATURE_NOPL))
172 noptable = noptypes[i].noptable; 164 return p6_nops;
173 break; 165 else
174 } 166 return intel_nops;
175 }
176 return noptable;
177} 167}
178 168
179#endif /* CONFIG_X86_64 */ 169#endif /* CONFIG_X86_64 */
@@ -241,25 +231,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
241 continue; 231 continue;
242 if (*ptr > text_end) 232 if (*ptr > text_end)
243 continue; 233 continue;
244 text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ 234 /* turn DS segment override prefix into lock prefix */
235 text_poke(*ptr, ((unsigned char []){0xf0}), 1);
245 }; 236 };
246} 237}
247 238
248static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) 239static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
249{ 240{
250 u8 **ptr; 241 u8 **ptr;
251 char insn[1];
252 242
253 if (noreplace_smp) 243 if (noreplace_smp)
254 return; 244 return;
255 245
256 add_nops(insn, 1);
257 for (ptr = start; ptr < end; ptr++) { 246 for (ptr = start; ptr < end; ptr++) {
258 if (*ptr < text) 247 if (*ptr < text)
259 continue; 248 continue;
260 if (*ptr > text_end) 249 if (*ptr > text_end)
261 continue; 250 continue;
262 text_poke(*ptr, insn, 1); 251 /* turn lock prefix into DS segment override prefix */
252 text_poke(*ptr, ((unsigned char []){0x3E}), 1);
263 }; 253 };
264} 254}
265 255
@@ -454,7 +444,7 @@ void __init alternative_instructions(void)
454 _text, _etext); 444 _text, _etext);
455 445
456 /* Only switch to UP mode if we don't immediately boot others */ 446 /* Only switch to UP mode if we don't immediately boot others */
457 if (num_possible_cpus() == 1 || setup_max_cpus <= 1) 447 if (num_present_cpus() == 1 || setup_max_cpus <= 1)
458 alternatives_smp_switch(0); 448 alternatives_smp_switch(0);
459 } 449 }
460#endif 450#endif
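alternatives_smp_lock()/alternatives_smp_unlock() above flip a single byte at each recorded site: 0xf0 (lock prefix) while other CPUs may be running, 0x3e (a harmless DS segment override) once the system is effectively UP. A toy illustration patching a plain buffer rather than live kernel text (the kernel goes through text_poke() for that):

#include <stdio.h>

/* ds-prefixed cmpxchg %edx,(%rdi); only the first byte is ever patched */
static unsigned char insn[] = { 0x3e, 0x0f, 0xb1, 0x17 };

static void smp_lock(unsigned char *site)   { *site = 0xf0; }
static void smp_unlock(unsigned char *site) { *site = 0x3e; }

int main(void)
{
        smp_lock(insn);
        printf("byte 0 is now %#04x (lock prefix)\n", insn[0]);
        smp_unlock(insn);
        printf("byte 0 is now %#04x (ds override)\n", insn[0]);
        return 0;
}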
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f2766d84c7a0..a8fd9ebdc8e2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -23,36 +23,149 @@
23#include <linux/scatterlist.h> 23#include <linux/scatterlist.h>
24#include <linux/iommu-helper.h> 24#include <linux/iommu-helper.h>
25#include <asm/proto.h> 25#include <asm/proto.h>
26#include <asm/gart.h> 26#include <asm/iommu.h>
27#include <asm/amd_iommu_types.h> 27#include <asm/amd_iommu_types.h>
28#include <asm/amd_iommu.h> 28#include <asm/amd_iommu.h>
29 29
30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 30#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
31 31
32#define to_pages(addr, size) \ 32#define EXIT_LOOP_COUNT 10000000
33 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
34 33
35static DEFINE_RWLOCK(amd_iommu_devtable_lock); 34static DEFINE_RWLOCK(amd_iommu_devtable_lock);
36 35
37struct command { 36/* A list of preallocated protection domains */
37static LIST_HEAD(iommu_pd_list);
38static DEFINE_SPINLOCK(iommu_pd_list_lock);
39
40/*
41 * general struct to manage commands sent to an IOMMU
42 */
43struct iommu_cmd {
38 u32 data[4]; 44 u32 data[4];
39}; 45};
40 46
41static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 47static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
42 struct unity_map_entry *e); 48 struct unity_map_entry *e);
43 49
50/* returns !0 if the IOMMU is caching non-present entries in its TLB */
44static int iommu_has_npcache(struct amd_iommu *iommu) 51static int iommu_has_npcache(struct amd_iommu *iommu)
45{ 52{
46 return iommu->cap & IOMMU_CAP_NPCACHE; 53 return iommu->cap & IOMMU_CAP_NPCACHE;
47} 54}
48 55
49static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 56/****************************************************************************
57 *
58 * Interrupt handling functions
59 *
60 ****************************************************************************/
61
62static void iommu_print_event(void *__evt)
63{
64 u32 *event = __evt;
65 int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK;
66 int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK;
67 int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK;
68 int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK;
69 u64 address = (u64)(((u64)event[3]) << 32) | event[2];
70
71 printk(KERN_ERR "AMD IOMMU: Event logged [");
72
73 switch (type) {
74 case EVENT_TYPE_ILL_DEV:
75 printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x "
76 "address=0x%016llx flags=0x%04x]\n",
77 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
78 address, flags);
79 break;
80 case EVENT_TYPE_IO_FAULT:
81 printk("IO_PAGE_FAULT device=%02x:%02x.%x "
82 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
83 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
84 domid, address, flags);
85 break;
86 case EVENT_TYPE_DEV_TAB_ERR:
87 printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
88 "address=0x%016llx flags=0x%04x]\n",
89 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
90 address, flags);
91 break;
92 case EVENT_TYPE_PAGE_TAB_ERR:
93 printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x "
94 "domain=0x%04x address=0x%016llx flags=0x%04x]\n",
95 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
96 domid, address, flags);
97 break;
98 case EVENT_TYPE_ILL_CMD:
99 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
100 break;
101 case EVENT_TYPE_CMD_HARD_ERR:
102 printk("COMMAND_HARDWARE_ERROR address=0x%016llx "
103 "flags=0x%04x]\n", address, flags);
104 break;
105 case EVENT_TYPE_IOTLB_INV_TO:
106 printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x "
107 "address=0x%016llx]\n",
108 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
109 address);
110 break;
111 case EVENT_TYPE_INV_DEV_REQ:
112 printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x "
113 "address=0x%016llx flags=0x%04x]\n",
114 PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid),
115 address, flags);
116 break;
117 default:
118 printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type);
119 }
120}
121
122static void iommu_poll_events(struct amd_iommu *iommu)
123{
124 u32 head, tail;
125 unsigned long flags;
126
127 spin_lock_irqsave(&iommu->lock, flags);
128
129 head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
130 tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET);
131
132 while (head != tail) {
133 iommu_print_event(iommu->evt_buf + head);
134 head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size;
135 }
136
137 writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET);
138
139 spin_unlock_irqrestore(&iommu->lock, flags);
140}
141
142irqreturn_t amd_iommu_int_handler(int irq, void *data)
143{
144 struct amd_iommu *iommu;
145
146 list_for_each_entry(iommu, &amd_iommu_list, list)
147 iommu_poll_events(iommu);
148
149 return IRQ_HANDLED;
150}
151
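iommu_poll_events() above walks the MMIO event log as a plain ring buffer: consume entries from head to tail in EVENT_ENTRY_SIZE steps, wrapping at the buffer size, then write the new head back. The same walk on an ordinary array (sizes here are made up for the demo):

#include <stdio.h>

#define ENTRY_SIZE      16U
#define BUF_SIZE        (8U * ENTRY_SIZE)

static unsigned char evt_buf[BUF_SIZE];

static void print_event(const void *evt)
{
        printf("event at offset %ld\n",
               (long)((const unsigned char *)evt - evt_buf));
}

static unsigned poll_events(unsigned head, unsigned tail)
{
        while (head != tail) {
                print_event(evt_buf + head);
                head = (head + ENTRY_SIZE) % BUF_SIZE;
        }
        return head;    /* the driver writes this back to the head register */
}

int main(void)
{
        poll_events(0, 3 * ENTRY_SIZE); /* three pending entries */
        return 0;
}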
152/****************************************************************************
153 *
154 * IOMMU command queuing functions
155 *
156 ****************************************************************************/
157
158/*
159 * Writes the command to the IOMMU's command buffer and informs the
160 * hardware about the new command. Must be called with iommu->lock held.
161 */
162static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
50{ 163{
51 u32 tail, head; 164 u32 tail, head;
52 u8 *target; 165 u8 *target;
53 166
54 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 167 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
55 target = (iommu->cmd_buf + tail); 168 target = iommu->cmd_buf + tail;
56 memcpy_toio(target, cmd, sizeof(*cmd)); 169 memcpy_toio(target, cmd, sizeof(*cmd));
57 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 170 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
58 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 171 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
@@ -63,7 +176,11 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
63 return 0; 176 return 0;
64} 177}
65 178
66static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) 179/*
180 * General queuing function for commands. Takes iommu->lock and calls
181 * __iommu_queue_command().
182 */
183static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
67{ 184{
68 unsigned long flags; 185 unsigned long flags;
69 int ret; 186 int ret;
@@ -75,35 +192,59 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd)
75 return ret; 192 return ret;
76} 193}
77 194
195/*
196 * This function is called whenever we need to ensure that the IOMMU has
197 * completed execution of all commands we sent. It sends a
198 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
199 * us about that by writing a value to a physical address we pass with
200 * the command.
201 */
78static int iommu_completion_wait(struct amd_iommu *iommu) 202static int iommu_completion_wait(struct amd_iommu *iommu)
79{ 203{
80 int ret; 204 int ret = 0, ready = 0;
81 struct command cmd; 205 unsigned status = 0;
82 volatile u64 ready = 0; 206 struct iommu_cmd cmd;
83 unsigned long ready_phys = virt_to_phys(&ready); 207 unsigned long flags, i = 0;
84 208
85 memset(&cmd, 0, sizeof(cmd)); 209 memset(&cmd, 0, sizeof(cmd));
86 cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; 210 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
87 cmd.data[1] = HIGH_U32(ready_phys);
88 cmd.data[2] = 1; /* value written to 'ready' */
89 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); 211 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
90 212
91 iommu->need_sync = 0; 213 iommu->need_sync = 0;
92 214
93 ret = iommu_queue_command(iommu, &cmd); 215 spin_lock_irqsave(&iommu->lock, flags);
216
217 ret = __iommu_queue_command(iommu, &cmd);
94 218
95 if (ret) 219 if (ret)
96 return ret; 220 goto out;
221
222 while (!ready && (i < EXIT_LOOP_COUNT)) {
223 ++i;
224 /* wait for the bit to become one */
225 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
226 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
227 }
228
229 /* set bit back to zero */
230 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
231 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
97 232
98 while (!ready) 233 if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit()))
99 cpu_relax(); 234 printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n");
235out:
236 spin_unlock_irqrestore(&iommu->lock, flags);
100 237
101 return 0; 238 return 0;
102} 239}
103 240
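The rewritten iommu_completion_wait() above replaces the old unbounded "while (!ready) cpu_relax();" with a poll that gives up after EXIT_LOOP_COUNT iterations and warns. The control flow in isolation (read_status() and the bit mask are stand-ins for the MMIO status register):

#include <stdio.h>

#define EXIT_LOOP_COUNT 10000000UL
#define COM_WAIT_INT    0x04U           /* stand-in for the status bit */

static unsigned read_status(void)
{
        return COM_WAIT_INT;            /* pretend the IOMMU finished at once */
}

int main(void)
{
        unsigned long i = 0;
        unsigned status = 0, ready = 0;

        while (!ready && i < EXIT_LOOP_COUNT) {
                ++i;
                status = read_status();
                ready = status & COM_WAIT_INT;
        }

        if (!ready)
                fprintf(stderr, "completion wait loop failed\n");
        else
                printf("completed after %lu poll(s), status=%#x\n", i, status);
        return 0;
}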
241/*
242 * Command send function for invalidating a device table entry
243 */
104static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 244static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
105{ 245{
106 struct command cmd; 246 struct iommu_cmd cmd;
247 int ret;
107 248
108 BUG_ON(iommu == NULL); 249 BUG_ON(iommu == NULL);
109 250
@@ -111,37 +252,50 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid)
111 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 252 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
112 cmd.data[0] = devid; 253 cmd.data[0] = devid;
113 254
255 ret = iommu_queue_command(iommu, &cmd);
256
114 iommu->need_sync = 1; 257 iommu->need_sync = 1;
115 258
116 return iommu_queue_command(iommu, &cmd); 259 return ret;
117} 260}
118 261
262/*
263 * Generic command send function for invalidating TLB entries
264 */
119static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 265static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
120 u64 address, u16 domid, int pde, int s) 266 u64 address, u16 domid, int pde, int s)
121{ 267{
122 struct command cmd; 268 struct iommu_cmd cmd;
269 int ret;
123 270
124 memset(&cmd, 0, sizeof(cmd)); 271 memset(&cmd, 0, sizeof(cmd));
125 address &= PAGE_MASK; 272 address &= PAGE_MASK;
126 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); 273 CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES);
127 cmd.data[1] |= domid; 274 cmd.data[1] |= domid;
128 cmd.data[2] = LOW_U32(address); 275 cmd.data[2] = lower_32_bits(address);
129 cmd.data[3] = HIGH_U32(address); 276 cmd.data[3] = upper_32_bits(address);
130 if (s) 277 if (s) /* size bit - we flush more than one 4kb page */
131 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; 278 cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
132 if (pde) 279 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
133 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; 280 cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
134 281
282 ret = iommu_queue_command(iommu, &cmd);
283
135 iommu->need_sync = 1; 284 iommu->need_sync = 1;
136 285
137 return iommu_queue_command(iommu, &cmd); 286 return ret;
138} 287}
139 288
289/*
290 * TLB invalidation function which is called from the mapping functions.
291 * It invalidates a single PTE if the range to flush is within a single
292 * page. Otherwise it flushes the whole TLB of the IOMMU.
293 */
140static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 294static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
141 u64 address, size_t size) 295 u64 address, size_t size)
142{ 296{
143 int s = 0; 297 int s = 0;
144 unsigned pages = to_pages(address, size); 298 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE);
145 299
146 address &= PAGE_MASK; 300 address &= PAGE_MASK;
147 301
@@ -159,6 +313,28 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
159 return 0; 313 return 0;
160} 314}
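Several sites in this patch switch from the local to_pages() macro to the shared iommu_num_pages() helper. Both compute the same thing: how many pages the byte range [addr, addr + size) touches, counting partial first and last pages. A standalone equivalent with a couple of sample values:

#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)
#define PAGE_MASK       (~(PAGE_SIZE - 1))

static unsigned long num_pages(unsigned long addr, unsigned long size)
{
        unsigned long span = (addr & ~PAGE_MASK) + size;

        return (span + PAGE_SIZE - 1) >> PAGE_SHIFT;    /* round up */
}

int main(void)
{
        printf("%lu\n", num_pages(0x1ff0, 0x20));       /* crosses a page -> 2 */
        printf("%lu\n", num_pages(0x2000, 0x1000));     /* exactly one    -> 1 */
        return 0;
}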
161 315
316/* Flush the whole IO/TLB for a given protection domain */
317static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid)
318{
319 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
320
321 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1);
322}
323
324/****************************************************************************
325 *
326 * The functions below are used to create the page table mappings for
327 * unity mapped regions.
328 *
329 ****************************************************************************/
330
331/*
332 * Generic mapping function. It maps a physical address into a DMA
333 * address space. It allocates the page table pages if necessary.
334 * In the future it can be extended to a generic mapping function
335 * supporting all features of AMD IOMMU page tables like level skipping
336 * and full 64 bit address spaces.
337 */
162static int iommu_map(struct protection_domain *dom, 338static int iommu_map(struct protection_domain *dom,
163 unsigned long bus_addr, 339 unsigned long bus_addr,
164 unsigned long phys_addr, 340 unsigned long phys_addr,
@@ -209,6 +385,10 @@ static int iommu_map(struct protection_domain *dom,
209 return 0; 385 return 0;
210} 386}
211 387
388/*
389 * This function checks if a specific unity mapping entry is needed for
390 * this specific IOMMU.
391 */
212static int iommu_for_unity_map(struct amd_iommu *iommu, 392static int iommu_for_unity_map(struct amd_iommu *iommu,
213 struct unity_map_entry *entry) 393 struct unity_map_entry *entry)
214{ 394{
@@ -223,6 +403,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
223 return 0; 403 return 0;
224} 404}
225 405
406/*
407 * Init the unity mappings for a specific IOMMU in the system
408 *
409 * Basically iterates over all unity mapping entries and applies them to
410 * the default DMA domain of that IOMMU if necessary.
411 */
226static int iommu_init_unity_mappings(struct amd_iommu *iommu) 412static int iommu_init_unity_mappings(struct amd_iommu *iommu)
227{ 413{
228 struct unity_map_entry *entry; 414 struct unity_map_entry *entry;
@@ -239,6 +425,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu)
239 return 0; 425 return 0;
240} 426}
241 427
428/*
429 * This function actually applies the mapping to the page table of the
430 * dma_ops domain.
431 */
242static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, 432static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
243 struct unity_map_entry *e) 433 struct unity_map_entry *e)
244{ 434{
@@ -261,6 +451,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
261 return 0; 451 return 0;
262} 452}
263 453
454/*
455 * Inits the unity mappings required for a specific device
456 */
264static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 457static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
265 u16 devid) 458 u16 devid)
266{ 459{
@@ -278,33 +471,48 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
278 return 0; 471 return 0;
279} 472}
280 473
281static unsigned long dma_mask_to_pages(unsigned long mask) 474/****************************************************************************
282{ 475 *
283 return (mask >> PAGE_SHIFT) + 476 * The next functions belong to the address allocator for the dma_ops
284 (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); 477 * interface functions. They work like the allocators in the other IOMMU
285} 478 * drivers. It's basically a bitmap which marks the allocated pages in
479 * the aperture. Maybe it could be enhanced in the future to a more
480 * efficient allocator.
481 *
482 ****************************************************************************/
286 483
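The comment above sums the allocator up: one bit per aperture page, a next_bit cursor, and a wrap-around retry when the first scan fails. A minimal userspace sketch of that scheme (the real code defers the scanning to iommu_area_alloc() and additionally honours boundary and alignment masks):

#include <stdio.h>
#include <string.h>

#define APERTURE_PAGES  64U

static unsigned char bitmap[APERTURE_PAGES];    /* one byte per page here */
static unsigned next_bit;

static long alloc_range(unsigned start, unsigned limit, unsigned pages)
{
        unsigned run = 0, i;

        for (i = start; i < limit; i++) {
                run = bitmap[i] ? 0 : run + 1;
                if (run == pages) {
                        memset(bitmap + i + 1 - pages, 1, pages);
                        return (long)(i + 1 - pages);
                }
        }
        return -1;
}

static long alloc_addresses(unsigned pages)
{
        long addr = alloc_range(next_bit, APERTURE_PAGES, pages);

        if (addr == -1)                 /* wrap around and retry from 0 */
                addr = alloc_range(0, APERTURE_PAGES, pages);
        if (addr != -1)
                next_bit = (unsigned)addr + pages;
        return addr;
}

static void free_addresses(unsigned addr, unsigned pages)
{
        memset(bitmap + addr, 0, pages);
}

int main(void)
{
        printf("first:  %ld\n", alloc_addresses(4));
        printf("second: %ld\n", alloc_addresses(8));
        free_addresses(0, 4);
        printf("third:  %ld\n", alloc_addresses(16));
        return 0;
}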
484/*
485 * The address allocator core function.
486 *
487 * called with domain->lock held
488 */
287static unsigned long dma_ops_alloc_addresses(struct device *dev, 489static unsigned long dma_ops_alloc_addresses(struct device *dev,
288 struct dma_ops_domain *dom, 490 struct dma_ops_domain *dom,
289 unsigned int pages) 491 unsigned int pages,
492 unsigned long align_mask,
493 u64 dma_mask)
290{ 494{
291 unsigned long limit = dma_mask_to_pages(*dev->dma_mask); 495 unsigned long limit;
292 unsigned long address; 496 unsigned long address;
293 unsigned long size = dom->aperture_size >> PAGE_SHIFT;
294 unsigned long boundary_size; 497 unsigned long boundary_size;
295 498
296 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 499 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
297 PAGE_SIZE) >> PAGE_SHIFT; 500 PAGE_SIZE) >> PAGE_SHIFT;
298 limit = limit < size ? limit : size; 501 limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
502 dma_mask >> PAGE_SHIFT);
299 503
300 if (dom->next_bit >= limit) 504 if (dom->next_bit >= limit) {
301 dom->next_bit = 0; 505 dom->next_bit = 0;
506 dom->need_flush = true;
507 }
302 508
303 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 509 address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
304 0 , boundary_size, 0); 510 0 , boundary_size, align_mask);
305 if (address == -1) 511 if (address == -1) {
306 address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 512 address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
307 0, boundary_size, 0); 513 0, boundary_size, align_mask);
514 dom->need_flush = true;
515 }
308 516
309 if (likely(address != -1)) { 517 if (likely(address != -1)) {
310 dom->next_bit = address + pages; 518 dom->next_bit = address + pages;
@@ -317,6 +525,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
317 return address; 525 return address;
318} 526}
319 527
528/*
529 * The address free function.
530 *
531 * called with domain->lock held
532 */
320static void dma_ops_free_addresses(struct dma_ops_domain *dom, 533static void dma_ops_free_addresses(struct dma_ops_domain *dom,
321 unsigned long address, 534 unsigned long address,
322 unsigned int pages) 535 unsigned int pages)
@@ -325,6 +538,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
325 iommu_area_free(dom->bitmap, address, pages); 538 iommu_area_free(dom->bitmap, address, pages);
326} 539}
327 540
541/****************************************************************************
542 *
543 * The next functions belong to the domain allocation. A domain is
544 * allocated for every IOMMU as the default domain. If device isolation
545 * is enabled, every device gets its own domain. The most important thing
546 * about domains is the page table mapping the DMA address space they
547 * contain.
548 *
549 ****************************************************************************/
550
328static u16 domain_id_alloc(void) 551static u16 domain_id_alloc(void)
329{ 552{
330 unsigned long flags; 553 unsigned long flags;
@@ -342,6 +565,10 @@ static u16 domain_id_alloc(void)
342 return id; 565 return id;
343} 566}
344 567
568/*
569 * Used to reserve address ranges in the aperture (e.g. for exclusion
570 * ranges).
571 */
345static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, 572static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
346 unsigned long start_page, 573 unsigned long start_page,
347 unsigned int pages) 574 unsigned int pages)
@@ -351,7 +578,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
351 if (start_page + pages > last_page) 578 if (start_page + pages > last_page)
352 pages = last_page - start_page; 579 pages = last_page - start_page;
353 580
354 set_bit_string(dom->bitmap, start_page, pages); 581 iommu_area_reserve(dom->bitmap, start_page, pages);
355} 582}
356 583
357static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) 584static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
@@ -382,6 +609,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom)
382 free_page((unsigned long)p1); 609 free_page((unsigned long)p1);
383} 610}
384 611
612/*
613 * Free a domain, only used if something went wrong in the
614 * allocation path and we need to free an already allocated page table
615 */
385static void dma_ops_domain_free(struct dma_ops_domain *dom) 616static void dma_ops_domain_free(struct dma_ops_domain *dom)
386{ 617{
387 if (!dom) 618 if (!dom)
@@ -396,6 +627,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
396 kfree(dom); 627 kfree(dom);
397} 628}
398 629
630/*
631 * Allocates a new protection domain usable for the dma_ops functions.
632 * It also intializes the page table and the address allocator data
633 * structures required for the dma_ops interface
634 */
399static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, 635static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
400 unsigned order) 636 unsigned order)
401{ 637{
@@ -436,14 +672,24 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
436 dma_dom->bitmap[0] = 1; 672 dma_dom->bitmap[0] = 1;
437 dma_dom->next_bit = 0; 673 dma_dom->next_bit = 0;
438 674
675 dma_dom->need_flush = false;
676 dma_dom->target_dev = 0xffff;
677
678 /* Initialize the exclusion range if necessary */
439 if (iommu->exclusion_start && 679 if (iommu->exclusion_start &&
440 iommu->exclusion_start < dma_dom->aperture_size) { 680 iommu->exclusion_start < dma_dom->aperture_size) {
441 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 681 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
442 int pages = to_pages(iommu->exclusion_start, 682 int pages = iommu_num_pages(iommu->exclusion_start,
443 iommu->exclusion_length); 683 iommu->exclusion_length,
684 PAGE_SIZE);
444 dma_ops_reserve_addresses(dma_dom, startpage, pages); 685 dma_ops_reserve_addresses(dma_dom, startpage, pages);
445 } 686 }
446 687
688 /*
689 * At the last step, build the page tables so we don't need to
690 * allocate page table pages in the dma_ops mapping/unmapping
691 * path.
692 */
447 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); 693 num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
448 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), 694 dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
449 GFP_KERNEL); 695 GFP_KERNEL);
@@ -472,6 +718,10 @@ free_dma_dom:
472 return NULL; 718 return NULL;
473} 719}
474 720
721/*
722 * Find out the protection domain structure for a given PCI device. This
723 * will give us the pointer to the page table root for example.
724 */
475static struct protection_domain *domain_for_device(u16 devid) 725static struct protection_domain *domain_for_device(u16 devid)
476{ 726{
477 struct protection_domain *dom; 727 struct protection_domain *dom;
@@ -484,6 +734,10 @@ static struct protection_domain *domain_for_device(u16 devid)
484 return dom; 734 return dom;
485} 735}
486 736
737/*
738 * If a device is not yet associated with a domain, this function
739 * assigns it to one and makes the mapping visible to the hardware
740 */
487static void set_device_domain(struct amd_iommu *iommu, 741static void set_device_domain(struct amd_iommu *iommu,
488 struct protection_domain *domain, 742 struct protection_domain *domain,
489 u16 devid) 743 u16 devid)
@@ -492,12 +746,13 @@ static void set_device_domain(struct amd_iommu *iommu,
492 746
493 u64 pte_root = virt_to_phys(domain->pt_root); 747 u64 pte_root = virt_to_phys(domain->pt_root);
494 748
495 pte_root |= (domain->mode & 0x07) << 9; 749 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
496 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; 750 << DEV_ENTRY_MODE_SHIFT;
751 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
497 752
498 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 753 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
499 amd_iommu_dev_table[devid].data[0] = pte_root; 754 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
500 amd_iommu_dev_table[devid].data[1] = pte_root >> 32; 755 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
501 amd_iommu_dev_table[devid].data[2] = domain->id; 756 amd_iommu_dev_table[devid].data[2] = domain->id;
502 757
503 amd_iommu_pd_table[devid] = domain; 758 amd_iommu_pd_table[devid] = domain;
@@ -508,6 +763,58 @@ static void set_device_domain(struct amd_iommu *iommu,
508 iommu->need_sync = 1; 763 iommu->need_sync = 1;
509} 764}
510 765
766/*****************************************************************************
767 *
768 * The next functions belong to the dma_ops mapping/unmapping code.
769 *
770 *****************************************************************************/
771
772/*
773 * This function checks if the driver got a valid device from the caller to
774 * avoid dereferencing invalid pointers.
775 */
776static bool check_device(struct device *dev)
777{
778 if (!dev || !dev->dma_mask)
779 return false;
780
781 return true;
782}
783
784/*
785 * In this function the list of preallocated protection domains is traversed to
786 * find the domain for a specific device
787 */
788static struct dma_ops_domain *find_protection_domain(u16 devid)
789{
790 struct dma_ops_domain *entry, *ret = NULL;
791 unsigned long flags;
792
793 if (list_empty(&iommu_pd_list))
794 return NULL;
795
796 spin_lock_irqsave(&iommu_pd_list_lock, flags);
797
798 list_for_each_entry(entry, &iommu_pd_list, list) {
799 if (entry->target_dev == devid) {
800 ret = entry;
801 list_del(&ret->list);
802 break;
803 }
804 }
805
806 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
807
808 return ret;
809}
810
811/*
812 * In the dma_ops path we only have the struct device. This function
813 * finds the corresponding IOMMU, the protection domain and the
814 * requestor id for a given device.
815 * If the device is not yet associated with a domain this is also done
816 * in this function.
817 */
511static int get_device_resources(struct device *dev, 818static int get_device_resources(struct device *dev,
512 struct amd_iommu **iommu, 819 struct amd_iommu **iommu,
513 struct protection_domain **domain, 820 struct protection_domain **domain,
@@ -517,26 +824,30 @@ static int get_device_resources(struct device *dev,
517 struct pci_dev *pcidev; 824 struct pci_dev *pcidev;
518 u16 _bdf; 825 u16 _bdf;
519 826
520 BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); 827 *iommu = NULL;
828 *domain = NULL;
829 *bdf = 0xffff;
830
831 if (dev->bus != &pci_bus_type)
832 return 0;
521 833
522 pcidev = to_pci_dev(dev); 834 pcidev = to_pci_dev(dev);
523 _bdf = (pcidev->bus->number << 8) | pcidev->devfn; 835 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
524 836
525 if (_bdf >= amd_iommu_last_bdf) { 837 /* device not translated by any IOMMU in the system? */
526 *iommu = NULL; 838 if (_bdf > amd_iommu_last_bdf)
527 *domain = NULL;
528 *bdf = 0xffff;
529 return 0; 839 return 0;
530 }
531 840
532 *bdf = amd_iommu_alias_table[_bdf]; 841 *bdf = amd_iommu_alias_table[_bdf];
533 842
534 *iommu = amd_iommu_rlookup_table[*bdf]; 843 *iommu = amd_iommu_rlookup_table[*bdf];
535 if (*iommu == NULL) 844 if (*iommu == NULL)
536 return 0; 845 return 0;
537 dma_dom = (*iommu)->default_dom;
538 *domain = domain_for_device(*bdf); 846 *domain = domain_for_device(*bdf);
539 if (*domain == NULL) { 847 if (*domain == NULL) {
848 dma_dom = find_protection_domain(*bdf);
849 if (!dma_dom)
850 dma_dom = (*iommu)->default_dom;
540 *domain = &dma_dom->domain; 851 *domain = &dma_dom->domain;
541 set_device_domain(*iommu, *domain, *bdf); 852 set_device_domain(*iommu, *domain, *bdf);
542 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " 853 printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
@@ -547,6 +858,10 @@ static int get_device_resources(struct device *dev,
547 return 1; 858 return 1;
548} 859}
549 860
861/*
862 * This is the generic map function. It maps one 4kb page at paddr to
863 * the given address in the DMA address space for the domain.
864 */
550static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 865static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
551 struct dma_ops_domain *dom, 866 struct dma_ops_domain *dom,
552 unsigned long address, 867 unsigned long address,
@@ -578,6 +893,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
578 return (dma_addr_t)address; 893 return (dma_addr_t)address;
579} 894}
580 895
896/*
897 * The generic unmapping function for one page in the DMA address space.
898 */
581static void dma_ops_domain_unmap(struct amd_iommu *iommu, 899static void dma_ops_domain_unmap(struct amd_iommu *iommu,
582 struct dma_ops_domain *dom, 900 struct dma_ops_domain *dom,
583 unsigned long address) 901 unsigned long address)
@@ -597,22 +915,35 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
597 *pte = 0ULL; 915 *pte = 0ULL;
598} 916}
599 917
918/*
919 * This function contains common code for mapping of a physically
920 * contiguous memory region into the DMA address space. It is used by all
921 * mapping functions provided by this IOMMU driver.
922 * Must be called with the domain lock held.
923 */
600static dma_addr_t __map_single(struct device *dev, 924static dma_addr_t __map_single(struct device *dev,
601 struct amd_iommu *iommu, 925 struct amd_iommu *iommu,
602 struct dma_ops_domain *dma_dom, 926 struct dma_ops_domain *dma_dom,
603 phys_addr_t paddr, 927 phys_addr_t paddr,
604 size_t size, 928 size_t size,
605 int dir) 929 int dir,
930 bool align,
931 u64 dma_mask)
606{ 932{
607 dma_addr_t offset = paddr & ~PAGE_MASK; 933 dma_addr_t offset = paddr & ~PAGE_MASK;
608 dma_addr_t address, start; 934 dma_addr_t address, start;
609 unsigned int pages; 935 unsigned int pages;
936 unsigned long align_mask = 0;
610 int i; 937 int i;
611 938
612 pages = to_pages(paddr, size); 939 pages = iommu_num_pages(paddr, size, PAGE_SIZE);
613 paddr &= PAGE_MASK; 940 paddr &= PAGE_MASK;
614 941
615 address = dma_ops_alloc_addresses(dev, dma_dom, pages); 942 if (align)
943 align_mask = (1UL << get_order(size)) - 1;
944
945 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
946 dma_mask);
616 if (unlikely(address == bad_dma_address)) 947 if (unlikely(address == bad_dma_address))
617 goto out; 948 goto out;
618 949
@@ -624,10 +955,20 @@ static dma_addr_t __map_single(struct device *dev,
624 } 955 }
625 address += offset; 956 address += offset;
626 957
958 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
959 iommu_flush_tlb(iommu, dma_dom->domain.id);
960 dma_dom->need_flush = false;
961 } else if (unlikely(iommu_has_npcache(iommu)))
962 iommu_flush_pages(iommu, dma_dom->domain.id, address, size);
963
627out: 964out:
628 return address; 965 return address;
629} 966}
630 967
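The flush logic added at the end of __map_single() above boils down to a two-way decision: if the address allocator wrapped around (need_flush) and lazy unmap flushing is not in effect, flush the whole domain TLB; otherwise only IOMMUs that cache non-present entries need a ranged flush of the new mapping. The same decision in isolation (the stubs stand in for iommu_flush_tlb()/iommu_flush_pages()):

#include <stdbool.h>
#include <stdio.h>

static void flush_tlb(void)     { puts("full domain TLB flush"); }
static void flush_pages(void)   { puts("ranged flush of the new mapping"); }

static void flush_after_map(bool need_flush, bool unmap_flush, bool npcache)
{
        if (need_flush && !unmap_flush)
                flush_tlb();
        else if (npcache)
                flush_pages();
}

int main(void)
{
        flush_after_map(true, false, false);    /* allocator wrapped around */
        flush_after_map(false, false, true);    /* IOMMU caches non-present */
        return 0;
}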
968/*
969 * Does the reverse of the __map_single function. Must be called with
970 * the domain lock held too
971 */
631static void __unmap_single(struct amd_iommu *iommu, 972static void __unmap_single(struct amd_iommu *iommu,
632 struct dma_ops_domain *dma_dom, 973 struct dma_ops_domain *dma_dom,
633 dma_addr_t dma_addr, 974 dma_addr_t dma_addr,
@@ -640,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu,
640 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) 981 if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size))
641 return; 982 return;
642 983
643 pages = to_pages(dma_addr, size); 984 pages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
644 dma_addr &= PAGE_MASK; 985 dma_addr &= PAGE_MASK;
645 start = dma_addr; 986 start = dma_addr;
646 987
@@ -650,8 +991,14 @@ static void __unmap_single(struct amd_iommu *iommu,
650 } 991 }
651 992
652 dma_ops_free_addresses(dma_dom, dma_addr, pages); 993 dma_ops_free_addresses(dma_dom, dma_addr, pages);
994
995 if (amd_iommu_unmap_flush)
996 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size);
653} 997}
654 998
999/*
1000 * The exported map_single function for dma_ops.
1001 */
655static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, 1002static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
656 size_t size, int dir) 1003 size_t size, int dir)
657{ 1004{
@@ -660,21 +1007,26 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr,
660 struct protection_domain *domain; 1007 struct protection_domain *domain;
661 u16 devid; 1008 u16 devid;
662 dma_addr_t addr; 1009 dma_addr_t addr;
1010 u64 dma_mask;
1011
1012 if (!check_device(dev))
1013 return bad_dma_address;
1014
1015 dma_mask = *dev->dma_mask;
663 1016
664 get_device_resources(dev, &iommu, &domain, &devid); 1017 get_device_resources(dev, &iommu, &domain, &devid);
665 1018
666 if (iommu == NULL || domain == NULL) 1019 if (iommu == NULL || domain == NULL)
1020 /* device not handled by any AMD IOMMU */
667 return (dma_addr_t)paddr; 1021 return (dma_addr_t)paddr;
668 1022
669 spin_lock_irqsave(&domain->lock, flags); 1023 spin_lock_irqsave(&domain->lock, flags);
670 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); 1024 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false,
1025 dma_mask);
671 if (addr == bad_dma_address) 1026 if (addr == bad_dma_address)
672 goto out; 1027 goto out;
673 1028
674 if (iommu_has_npcache(iommu)) 1029 if (unlikely(iommu->need_sync))
675 iommu_flush_pages(iommu, domain->id, addr, size);
676
677 if (iommu->need_sync)
678 iommu_completion_wait(iommu); 1030 iommu_completion_wait(iommu);
679 1031
680out: 1032out:
@@ -683,6 +1035,9 @@ out:
683 return addr; 1035 return addr;
684} 1036}
685 1037
1038/*
1039 * The exported unmap_single function for dma_ops.
1040 */
686static void unmap_single(struct device *dev, dma_addr_t dma_addr, 1041static void unmap_single(struct device *dev, dma_addr_t dma_addr,
687 size_t size, int dir) 1042 size_t size, int dir)
688{ 1043{
@@ -691,21 +1046,25 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr,
691 struct protection_domain *domain; 1046 struct protection_domain *domain;
692 u16 devid; 1047 u16 devid;
693 1048
694 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1049 if (!check_device(dev) ||
1050 !get_device_resources(dev, &iommu, &domain, &devid))
1051 /* device not handled by any AMD IOMMU */
695 return; 1052 return;
696 1053
697 spin_lock_irqsave(&domain->lock, flags); 1054 spin_lock_irqsave(&domain->lock, flags);
698 1055
699 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1056 __unmap_single(iommu, domain->priv, dma_addr, size, dir);
700 1057
701 iommu_flush_pages(iommu, domain->id, dma_addr, size); 1058 if (unlikely(iommu->need_sync))
702
703 if (iommu->need_sync)
704 iommu_completion_wait(iommu); 1059 iommu_completion_wait(iommu);
705 1060
706 spin_unlock_irqrestore(&domain->lock, flags); 1061 spin_unlock_irqrestore(&domain->lock, flags);
707} 1062}
708 1063
1064/*
1065 * This is a special map_sg function which is used if we should map a
1066 * device which is not handled by an AMD IOMMU in the system.
1067 */
709static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, 1068static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
710 int nelems, int dir) 1069 int nelems, int dir)
711{ 1070{
@@ -720,6 +1079,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist,
720 return nelems; 1079 return nelems;
721} 1080}
722 1081
1082/*
1083 * The exported map_sg function for dma_ops (handles scatter-gather
1084 * lists).
1085 */
723static int map_sg(struct device *dev, struct scatterlist *sglist, 1086static int map_sg(struct device *dev, struct scatterlist *sglist,
724 int nelems, int dir) 1087 int nelems, int dir)
725{ 1088{
@@ -731,6 +1094,12 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
731 struct scatterlist *s; 1094 struct scatterlist *s;
732 phys_addr_t paddr; 1095 phys_addr_t paddr;
733 int mapped_elems = 0; 1096 int mapped_elems = 0;
1097 u64 dma_mask;
1098
1099 if (!check_device(dev))
1100 return 0;
1101
1102 dma_mask = *dev->dma_mask;
734 1103
735 get_device_resources(dev, &iommu, &domain, &devid); 1104 get_device_resources(dev, &iommu, &domain, &devid);
736 1105
@@ -743,19 +1112,17 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
743 paddr = sg_phys(s); 1112 paddr = sg_phys(s);
744 1113
745 s->dma_address = __map_single(dev, iommu, domain->priv, 1114 s->dma_address = __map_single(dev, iommu, domain->priv,
746 paddr, s->length, dir); 1115 paddr, s->length, dir, false,
1116 dma_mask);
747 1117
748 if (s->dma_address) { 1118 if (s->dma_address) {
749 s->dma_length = s->length; 1119 s->dma_length = s->length;
750 mapped_elems++; 1120 mapped_elems++;
751 } else 1121 } else
752 goto unmap; 1122 goto unmap;
753 if (iommu_has_npcache(iommu))
754 iommu_flush_pages(iommu, domain->id, s->dma_address,
755 s->dma_length);
756 } 1123 }
757 1124
758 if (iommu->need_sync) 1125 if (unlikely(iommu->need_sync))
759 iommu_completion_wait(iommu); 1126 iommu_completion_wait(iommu);
760 1127
761out: 1128out:
@@ -775,6 +1142,10 @@ unmap:
775 goto out; 1142 goto out;
776} 1143}
777 1144
1145/*
1146 * The exported unmap_sg function for dma_ops (handles scatter-gather
1147 * lists).
1148 */
778static void unmap_sg(struct device *dev, struct scatterlist *sglist, 1149static void unmap_sg(struct device *dev, struct scatterlist *sglist,
779 int nelems, int dir) 1150 int nelems, int dir)
780{ 1151{
@@ -785,7 +1156,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
785 u16 devid; 1156 u16 devid;
786 int i; 1157 int i;
787 1158
788 if (!get_device_resources(dev, &iommu, &domain, &devid)) 1159 if (!check_device(dev) ||
1160 !get_device_resources(dev, &iommu, &domain, &devid))
789 return; 1161 return;
790 1162
791 spin_lock_irqsave(&domain->lock, flags); 1163 spin_lock_irqsave(&domain->lock, flags);
@@ -793,17 +1165,18 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
793 for_each_sg(sglist, s, nelems, i) { 1165 for_each_sg(sglist, s, nelems, i) {
794 __unmap_single(iommu, domain->priv, s->dma_address, 1166 __unmap_single(iommu, domain->priv, s->dma_address,
795 s->dma_length, dir); 1167 s->dma_length, dir);
796 iommu_flush_pages(iommu, domain->id, s->dma_address,
797 s->dma_length);
798 s->dma_address = s->dma_length = 0; 1168 s->dma_address = s->dma_length = 0;
799 } 1169 }
800 1170
801 if (iommu->need_sync) 1171 if (unlikely(iommu->need_sync))
802 iommu_completion_wait(iommu); 1172 iommu_completion_wait(iommu);
803 1173
804 spin_unlock_irqrestore(&domain->lock, flags); 1174 spin_unlock_irqrestore(&domain->lock, flags);
805} 1175}
806 1176
1177/*
1178 * The exported alloc_coherent function for dma_ops.
1179 */
807static void *alloc_coherent(struct device *dev, size_t size, 1180static void *alloc_coherent(struct device *dev, size_t size,
808 dma_addr_t *dma_addr, gfp_t flag) 1181 dma_addr_t *dma_addr, gfp_t flag)
809{ 1182{
@@ -813,25 +1186,33 @@ static void *alloc_coherent(struct device *dev, size_t size,
813 struct protection_domain *domain; 1186 struct protection_domain *domain;
814 u16 devid; 1187 u16 devid;
815 phys_addr_t paddr; 1188 phys_addr_t paddr;
1189 u64 dma_mask = dev->coherent_dma_mask;
1190
1191 if (!check_device(dev))
1192 return NULL;
1193
1194 if (!get_device_resources(dev, &iommu, &domain, &devid))
1195 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
816 1196
1197 flag |= __GFP_ZERO;
817 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 1198 virt_addr = (void *)__get_free_pages(flag, get_order(size));
818 if (!virt_addr) 1199 if (!virt_addr)
819 return 0; 1200 return 0;
820 1201
821 memset(virt_addr, 0, size);
822 paddr = virt_to_phys(virt_addr); 1202 paddr = virt_to_phys(virt_addr);
823 1203
824 get_device_resources(dev, &iommu, &domain, &devid);
825
826 if (!iommu || !domain) { 1204 if (!iommu || !domain) {
827 *dma_addr = (dma_addr_t)paddr; 1205 *dma_addr = (dma_addr_t)paddr;
828 return virt_addr; 1206 return virt_addr;
829 } 1207 }
830 1208
1209 if (!dma_mask)
1210 dma_mask = *dev->dma_mask;
1211
831 spin_lock_irqsave(&domain->lock, flags); 1212 spin_lock_irqsave(&domain->lock, flags);
832 1213
833 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 1214 *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
834 size, DMA_BIDIRECTIONAL); 1215 size, DMA_BIDIRECTIONAL, true, dma_mask);
835 1216
836 if (*dma_addr == bad_dma_address) { 1217 if (*dma_addr == bad_dma_address) {
837 free_pages((unsigned long)virt_addr, get_order(size)); 1218 free_pages((unsigned long)virt_addr, get_order(size));
@@ -839,10 +1220,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
839 goto out; 1220 goto out;
840 } 1221 }
841 1222
842 if (iommu_has_npcache(iommu)) 1223 if (unlikely(iommu->need_sync))
843 iommu_flush_pages(iommu, domain->id, *dma_addr, size);
844
845 if (iommu->need_sync)
846 iommu_completion_wait(iommu); 1224 iommu_completion_wait(iommu);
847 1225
848out: 1226out:
@@ -851,6 +1229,9 @@ out:
851 return virt_addr; 1229 return virt_addr;
852} 1230}
853 1231
1232/*
1233 * The exported free_coherent function for dma_ops.
1234 */
854static void free_coherent(struct device *dev, size_t size, 1235static void free_coherent(struct device *dev, size_t size,
855 void *virt_addr, dma_addr_t dma_addr) 1236 void *virt_addr, dma_addr_t dma_addr)
856{ 1237{
@@ -859,6 +1240,9 @@ static void free_coherent(struct device *dev, size_t size,
859 struct protection_domain *domain; 1240 struct protection_domain *domain;
860 u16 devid; 1241 u16 devid;
861 1242
1243 if (!check_device(dev))
1244 return;
1245
862 get_device_resources(dev, &iommu, &domain, &devid); 1246 get_device_resources(dev, &iommu, &domain, &devid);
863 1247
864 if (!iommu || !domain) 1248 if (!iommu || !domain)
@@ -867,9 +1251,8 @@ static void free_coherent(struct device *dev, size_t size,
867 spin_lock_irqsave(&domain->lock, flags); 1251 spin_lock_irqsave(&domain->lock, flags);
868 1252
869 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 1253 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
870 iommu_flush_pages(iommu, domain->id, dma_addr, size);
871 1254
872 if (iommu->need_sync) 1255 if (unlikely(iommu->need_sync))
873 iommu_completion_wait(iommu); 1256 iommu_completion_wait(iommu);
874 1257
875 spin_unlock_irqrestore(&domain->lock, flags); 1258 spin_unlock_irqrestore(&domain->lock, flags);
@@ -879,6 +1262,32 @@ free_mem:
879} 1262}
880 1263
881/* 1264/*
1265 * This function is called by the DMA layer to find out if we can handle a
1266 * particular device. It is part of the dma_ops.
1267 */
1268static int amd_iommu_dma_supported(struct device *dev, u64 mask)
1269{
1270 u16 bdf;
1271 struct pci_dev *pcidev;
1272
1273 /* No device or no PCI device */
1274 if (!dev || dev->bus != &pci_bus_type)
1275 return 0;
1276
1277 pcidev = to_pci_dev(dev);
1278
1279 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1280
1281 /* Out of our scope? */
1282 if (bdf > amd_iommu_last_bdf)
1283 return 0;
1284
1285 return 1;
1286}
1287
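calc_devid() itself is not shown in this hunk; judging from the DEVID(bus, devfn) macro this series removes from amd_iommu_init.c further down, it presumably packs a PCI bus/devfn pair into the 16-bit device id that indexes the IOMMU tables. A minimal sketch under that assumption:

	static inline u16 calc_devid(u8 bus, u8 devfn)
	{
		/* same packing as the old DEVID(bus, devfn) macro:
		 * bus number in the upper byte, devfn in the lower byte */
		return ((u16)bus << 8) | devfn;
	}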
1288/*
1289 * The function for pre-allocating protection domains.
1290 *
882 * If the driver core informs the DMA layer if a driver grabs a device 1291 * If the driver core informs the DMA layer if a driver grabs a device
883 * we don't need to preallocate the protection domains anymore. 1292 * we don't need to preallocate the protection domains anymore.
884 * For now we have to. 1293 * For now we have to.
@@ -893,7 +1302,7 @@ void prealloc_protection_domains(void)
893 1302
894 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 1303 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
895 devid = (dev->bus->number << 8) | dev->devfn; 1304 devid = (dev->bus->number << 8) | dev->devfn;
896 if (devid >= amd_iommu_last_bdf) 1305 if (devid > amd_iommu_last_bdf)
897 continue; 1306 continue;
898 devid = amd_iommu_alias_table[devid]; 1307 devid = amd_iommu_alias_table[devid];
899 if (domain_for_device(devid)) 1308 if (domain_for_device(devid))
@@ -905,10 +1314,9 @@ void prealloc_protection_domains(void)
905 if (!dma_dom) 1314 if (!dma_dom)
906 continue; 1315 continue;
907 init_unity_mappings_for_device(dma_dom, devid); 1316 init_unity_mappings_for_device(dma_dom, devid);
908 set_device_domain(iommu, &dma_dom->domain, devid); 1317 dma_dom->target_dev = devid;
909 printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", 1318
910 dma_dom->domain.id); 1319 list_add_tail(&dma_dom->list, &iommu_pd_list);
911 print_devid(devid, 1);
912 } 1320 }
913} 1321}
914 1322
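With set_device_domain() gone from this path, a pre-allocated domain now only records its target_dev and sits on iommu_pd_list; presumably a later lookup walks that list and matches target_dev when the device first asks for resources. A hedged sketch of such a helper (the name and the struct dma_ops_domain type are inferred from dma_ops_domain_alloc(); locking is omitted):

	static struct dma_ops_domain *find_prealloc_domain(u16 devid)
	{
		struct dma_ops_domain *entry;

		/* walk the list filled by prealloc_protection_domains() */
		list_for_each_entry(entry, &iommu_pd_list, list)
			if (entry->target_dev == devid)
				return entry;

		return NULL;
	}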
@@ -919,14 +1327,23 @@ static struct dma_mapping_ops amd_iommu_dma_ops = {
919 .unmap_single = unmap_single, 1327 .unmap_single = unmap_single,
920 .map_sg = map_sg, 1328 .map_sg = map_sg,
921 .unmap_sg = unmap_sg, 1329 .unmap_sg = unmap_sg,
1330 .dma_supported = amd_iommu_dma_supported,
922}; 1331};
923 1332
1333/*
1334 * The function which hooks the AMD IOMMU driver into dma_ops.
1335 */
924int __init amd_iommu_init_dma_ops(void) 1336int __init amd_iommu_init_dma_ops(void)
925{ 1337{
926 struct amd_iommu *iommu; 1338 struct amd_iommu *iommu;
927 int order = amd_iommu_aperture_order; 1339 int order = amd_iommu_aperture_order;
928 int ret; 1340 int ret;
929 1341
1342 /*
1343 * First, allocate a default protection domain for every IOMMU we
1344 * found in the system. Devices not assigned to any other
1345 * protection domain will be assigned to the default one.
1346 */
930 list_for_each_entry(iommu, &amd_iommu_list, list) { 1347 list_for_each_entry(iommu, &amd_iommu_list, list) {
931 iommu->default_dom = dma_ops_domain_alloc(iommu, order); 1348 iommu->default_dom = dma_ops_domain_alloc(iommu, order);
932 if (iommu->default_dom == NULL) 1349 if (iommu->default_dom == NULL)
@@ -936,6 +1353,10 @@ int __init amd_iommu_init_dma_ops(void)
936 goto free_domains; 1353 goto free_domains;
937 } 1354 }
938 1355
1356 /*
1357 * If device isolation is enabled, pre-allocate the protection
1358 * domains for each device.
1359 */
939 if (amd_iommu_isolate) 1360 if (amd_iommu_isolate)
940 prealloc_protection_domains(); 1361 prealloc_protection_domains();
941 1362
@@ -947,6 +1368,7 @@ int __init amd_iommu_init_dma_ops(void)
947 gart_iommu_aperture = 0; 1368 gart_iommu_aperture = 0;
948#endif 1369#endif
949 1370
1371 /* Finally make the AMD IOMMU dma_ops visible to the drivers */
950 dma_ops = &amd_iommu_dma_ops; 1372 dma_ops = &amd_iommu_dma_ops;
951 1373
952 return 0; 1374 return 0;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 2a13e430437d..0cdcda35a05f 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -22,23 +22,17 @@
22#include <linux/gfp.h> 22#include <linux/gfp.h>
23#include <linux/list.h> 23#include <linux/list.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h>
26#include <linux/msi.h>
25#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
26#include <asm/amd_iommu_types.h> 28#include <asm/amd_iommu_types.h>
27#include <asm/amd_iommu.h> 29#include <asm/amd_iommu.h>
28#include <asm/gart.h> 30#include <asm/iommu.h>
29 31
30/* 32/*
31 * definitions for the ACPI scanning code 33 * definitions for the ACPI scanning code
32 */ 34 */
33#define UPDATE_LAST_BDF(x) do {\
34 if ((x) > amd_iommu_last_bdf) \
35 amd_iommu_last_bdf = (x); \
36 } while (0);
37
38#define DEVID(bus, devfn) (((bus) << 8) | (devfn))
39#define PCI_BUS(x) (((x) >> 8) & 0xff)
40#define IVRS_HEADER_LENGTH 48 35#define IVRS_HEADER_LENGTH 48
41#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x))))
42 36
43#define ACPI_IVHD_TYPE 0x10 37#define ACPI_IVHD_TYPE 0x10
44#define ACPI_IVMD_TYPE_ALL 0x20 38#define ACPI_IVMD_TYPE_ALL 0x20
@@ -71,6 +65,17 @@
71#define ACPI_DEVFLAG_LINT1 0x80 65#define ACPI_DEVFLAG_LINT1 0x80
72#define ACPI_DEVFLAG_ATSDIS 0x10000000 66#define ACPI_DEVFLAG_ATSDIS 0x10000000
73 67
68/*
69 * ACPI table definitions
70 *
71 * These data structures are laid over the table to parse the important values
72 * out of it.
73 */
74
75/*
76 * structure describing one IOMMU in the ACPI table. Typically followed by one
77 * or more ivhd_entry structures.
78 */
74struct ivhd_header { 79struct ivhd_header {
75 u8 type; 80 u8 type;
76 u8 flags; 81 u8 flags;
@@ -83,6 +88,10 @@ struct ivhd_header {
83 u32 reserved; 88 u32 reserved;
84} __attribute__((packed)); 89} __attribute__((packed));
85 90
91/*
92 * A device entry describing which devices a specific IOMMU translates and
93 * which requestor ids they use.
94 */
86struct ivhd_entry { 95struct ivhd_entry {
87 u8 type; 96 u8 type;
88 u16 devid; 97 u16 devid;
@@ -90,6 +99,10 @@ struct ivhd_entry {
90 u32 ext; 99 u32 ext;
91} __attribute__((packed)); 100} __attribute__((packed));
92 101
102/*
103 * An AMD IOMMU memory definition structure. It defines things like exclusion
104 * ranges for devices and regions that should be unity mapped.
105 */
93struct ivmd_header { 106struct ivmd_header {
94 u8 type; 107 u8 type;
95 u8 flags; 108 u8 flags;
@@ -103,22 +116,81 @@ struct ivmd_header {
103 116
104static int __initdata amd_iommu_detected; 117static int __initdata amd_iommu_detected;
105 118
106u16 amd_iommu_last_bdf; 119u16 amd_iommu_last_bdf; /* largest PCI device id we have
107struct list_head amd_iommu_unity_map; 120 to handle */
108unsigned amd_iommu_aperture_order = 26; 121LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
109int amd_iommu_isolate; 122 we find in ACPI */
123unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
124int amd_iommu_isolate; /* if 1, device isolation is enabled */
125bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
126
127LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
128 system */
110 129
111struct list_head amd_iommu_list; 130/*
131 * Pointer to the device table which is shared by all AMD IOMMUs.
132 * It is indexed by the PCI device id or the HT unit id and contains
133 * information about the domain the device belongs to as well as the
134 * page table root pointer.
135 */
112struct dev_table_entry *amd_iommu_dev_table; 136struct dev_table_entry *amd_iommu_dev_table;
137
138/*
139 * The alias table is a driver specific data structure which contains the
140 * mappings of the PCI device ids to the actual requestor ids on the IOMMU.
141 * More than one device can share the same requestor id.
142 */
113u16 *amd_iommu_alias_table; 143u16 *amd_iommu_alias_table;
144
145/*
146 * The rlookup table is used to find the IOMMU which is responsible
147 * for a specific device. It is also indexed by the PCI device id.
148 */
114struct amd_iommu **amd_iommu_rlookup_table; 149struct amd_iommu **amd_iommu_rlookup_table;
150
151/*
152 * The pd table (protection domain table) is used to find the protection domain
153 * data structure a device belongs to. Indexed with the PCI device id too.
154 */
115struct protection_domain **amd_iommu_pd_table; 155struct protection_domain **amd_iommu_pd_table;
156
157/*
158 * AMD IOMMU allows up to 2^16 different protection domains. This bitmap
159 * tracks which of them are already in use.
160 */
116unsigned long *amd_iommu_pd_alloc_bitmap; 161unsigned long *amd_iommu_pd_alloc_bitmap;
117 162
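The comments above describe parallel tables that are all indexed by the 16-bit PCI device id; a small sketch of how a lookup is meant to work (the helper names here are made up purely for illustration):

	static inline struct amd_iommu *iommu_for_devid(u16 devid)
	{
		return amd_iommu_rlookup_table[devid];	/* IOMMU handling the device */
	}

	static inline struct protection_domain *domain_for_devid(u16 devid)
	{
		return amd_iommu_pd_table[devid];	/* domain the device belongs to */
	}

	static inline u16 requestor_id(u16 devid)
	{
		return amd_iommu_alias_table[devid];	/* id the IOMMU actually sees */
	}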
118static u32 dev_table_size; 163static u32 dev_table_size; /* size of the device table */
119static u32 alias_table_size; 164static u32 alias_table_size; /* size of the alias table */
120static u32 rlookup_table_size; 165static u32 rlookup_table_size; /* size of the rlookup table */
166
167static inline void update_last_devid(u16 devid)
168{
169 if (devid > amd_iommu_last_bdf)
170 amd_iommu_last_bdf = devid;
171}
172
173static inline unsigned long tbl_size(int entry_size)
174{
175 unsigned shift = PAGE_SHIFT +
176 get_order(amd_iommu_last_bdf * entry_size);
177
178 return 1UL << shift;
179}
121 180
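tbl_size() rounds each per-device table up to a whole power-of-two number of pages. A quick worked example, assuming the usual PAGE_SHIFT of 12 and a 32-byte device table entry (both assumptions, not taken from this patch):

	/* amd_iommu_last_bdf = 0xffff, entry_size = 32:
	 *   0xffff * 32        = 2097120 bytes
	 *   get_order(2097120) = 9        (512 pages of 4 KiB)
	 *   shift = 12 + 9     = 21
	 *   tbl_size()         = 1UL << 21 = 2 MiB
	 * so even the worst-case device table fits in one 2 MiB allocation.
	 */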
181/****************************************************************************
182 *
183 * AMD IOMMU MMIO register space handling functions
184 *
185 * These functions are used to program the IOMMU device registers in
186 * MMIO space required by the driver.
187 *
188 ****************************************************************************/
189
190/*
191 * This function sets the exclusion range in the IOMMU. DMA accesses to the
192 * exclusion range are passed through untranslated.
193 */
122static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) 194static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
123{ 195{
124 u64 start = iommu->exclusion_start & PAGE_MASK; 196 u64 start = iommu->exclusion_start & PAGE_MASK;
@@ -137,9 +209,10 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
137 &entry, sizeof(entry)); 209 &entry, sizeof(entry));
138} 210}
139 211
212/* Programs the physical address of the device table into the IOMMU hardware */
140static void __init iommu_set_device_table(struct amd_iommu *iommu) 213static void __init iommu_set_device_table(struct amd_iommu *iommu)
141{ 214{
142 u32 entry; 215 u64 entry;
143 216
144 BUG_ON(iommu->mmio_base == NULL); 217 BUG_ON(iommu->mmio_base == NULL);
145 218
@@ -149,6 +222,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
149 &entry, sizeof(entry)); 222 &entry, sizeof(entry));
150} 223}
151 224
225/* Generic functions to enable/disable certain features of the IOMMU. */
152static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) 226static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
153{ 227{
154 u32 ctrl; 228 u32 ctrl;
@@ -162,20 +236,35 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
162{ 236{
163 u32 ctrl; 237 u32 ctrl;
164 238
165 ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); 239 ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET);
166 ctrl &= ~(1 << bit); 240 ctrl &= ~(1 << bit);
167 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); 241 writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET);
168} 242}
169 243
244/* Function to enable the hardware */
170void __init iommu_enable(struct amd_iommu *iommu) 245void __init iommu_enable(struct amd_iommu *iommu)
171{ 246{
172 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); 247 printk(KERN_INFO "AMD IOMMU: Enabling IOMMU "
173 print_devid(iommu->devid, 0); 248 "at %02x:%02x.%x cap 0x%hx\n",
174 printk(" cap 0x%hx\n", iommu->cap_ptr); 249 iommu->dev->bus->number,
250 PCI_SLOT(iommu->dev->devfn),
251 PCI_FUNC(iommu->dev->devfn),
252 iommu->cap_ptr);
175 253
176 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 254 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
177} 255}
178 256
257/* Function to enable IOMMU event logging and event interrupts */
258void __init iommu_enable_event_logging(struct amd_iommu *iommu)
259{
260 iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
261 iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
262}
263
264/*
265 * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in
266 * the system has one.
267 */
179static u8 * __init iommu_map_mmio_space(u64 address) 268static u8 * __init iommu_map_mmio_space(u64 address)
180{ 269{
181 u8 *ret; 270 u8 *ret;
@@ -199,16 +288,41 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu)
199 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); 288 release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH);
200} 289}
201 290
291/****************************************************************************
292 *
293 * The functions below belong to the first pass of AMD IOMMU ACPI table
294 * parsing. In this pass we try to find out the highest device id this
295 * code has to handle. Upon this information the size of the shared data
296 * structures is determined later.
297 *
298 ****************************************************************************/
299
300/*
301 * This function calculates the length of a given IVHD entry
302 */
303static inline int ivhd_entry_length(u8 *ivhd)
304{
305 return 0x04 << (*ivhd >> 6);
306}
307
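The length encoding handled by ivhd_entry_length() packs the entry size into the two most significant bits of the type byte:

	/*   *ivhd >> 6 == 0  ->  0x04 << 0 ==  4-byte entry
	 *   *ivhd >> 6 == 1  ->  0x04 << 1 ==  8-byte entry
	 *   *ivhd >> 6 == 2  ->  0x04 << 2 == 16-byte entry
	 *   *ivhd >> 6 == 3  ->  0x04 << 3 == 32-byte entry
	 */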
308/*
309 * This function reads the last device id the IOMMU has to handle from the PCI
310 * capability header for this IOMMU
311 */
202static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) 312static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr)
203{ 313{
204 u32 cap; 314 u32 cap;
205 315
206 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 316 cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET);
207 UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); 317 update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap)));
208 318
209 return 0; 319 return 0;
210} 320}
211 321
322/*
323 * After reading the highest device id from the IOMMU PCI capability header
324 * this function checks whether a higher device id is defined in the ACPI table
325 */
212static int __init find_last_devid_from_ivhd(struct ivhd_header *h) 326static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
213{ 327{
214 u8 *p = (void *)h, *end = (void *)h; 328 u8 *p = (void *)h, *end = (void *)h;
@@ -229,12 +343,13 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
229 case IVHD_DEV_RANGE_END: 343 case IVHD_DEV_RANGE_END:
230 case IVHD_DEV_ALIAS: 344 case IVHD_DEV_ALIAS:
231 case IVHD_DEV_EXT_SELECT: 345 case IVHD_DEV_EXT_SELECT:
232 UPDATE_LAST_BDF(dev->devid); 346 /* all the above subfield types refer to device ids */
347 update_last_devid(dev->devid);
233 break; 348 break;
234 default: 349 default:
235 break; 350 break;
236 } 351 }
237 p += 0x04 << (*p >> 6); 352 p += ivhd_entry_length(p);
238 } 353 }
239 354
240 WARN_ON(p != end); 355 WARN_ON(p != end);
@@ -242,6 +357,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h)
242 return 0; 357 return 0;
243} 358}
244 359
360/*
361 * Iterate over all IVHD entries in the ACPI table and find the highest device
362 * id which we need to handle. Since this is the first of the three functions
363 * which parse the ACPI table, the table checksum is also verified here.
364 */
245static int __init find_last_devid_acpi(struct acpi_table_header *table) 365static int __init find_last_devid_acpi(struct acpi_table_header *table)
246{ 366{
247 int i; 367 int i;
@@ -277,19 +397,31 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
277 return 0; 397 return 0;
278} 398}
279 399
400/****************************************************************************
401 *
402 * The following functions belong to the code path which parses the ACPI table
403 * the second time. In this ACPI parsing iteration we allocate IOMMU specific
404 * data structures, initialize the device/alias/rlookup table and also
405 * basically initialize the hardware.
406 *
407 ****************************************************************************/
408
409/*
410 * Allocates the command buffer. This buffer is per AMD IOMMU. We can
411 * write commands to that buffer later and the IOMMU will execute them
412 * asynchronously
413 */
280static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) 414static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
281{ 415{
282 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, 416 u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
283 get_order(CMD_BUFFER_SIZE)); 417 get_order(CMD_BUFFER_SIZE));
284 u64 entry = 0; 418 u64 entry;
285 419
286 if (cmd_buf == NULL) 420 if (cmd_buf == NULL)
287 return NULL; 421 return NULL;
288 422
289 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 423 iommu->cmd_buf_size = CMD_BUFFER_SIZE;
290 424
291 memset(cmd_buf, 0, CMD_BUFFER_SIZE);
292
293 entry = (u64)virt_to_phys(cmd_buf); 425 entry = (u64)virt_to_phys(cmd_buf);
294 entry |= MMIO_CMD_SIZE_512; 426 entry |= MMIO_CMD_SIZE_512;
295 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, 427 memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
@@ -302,11 +434,35 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
302 434
303static void __init free_command_buffer(struct amd_iommu *iommu) 435static void __init free_command_buffer(struct amd_iommu *iommu)
304{ 436{
305 if (iommu->cmd_buf) 437 free_pages((unsigned long)iommu->cmd_buf,
306 free_pages((unsigned long)iommu->cmd_buf, 438 get_order(iommu->cmd_buf_size));
307 get_order(CMD_BUFFER_SIZE)); 439}
440
441/* allocates the memory where the IOMMU will log its events to */
442static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
443{
444 u64 entry;
445 iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
446 get_order(EVT_BUFFER_SIZE));
447
448 if (iommu->evt_buf == NULL)
449 return NULL;
450
451 entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
452 memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
453 &entry, sizeof(entry));
454
455 iommu->evt_buf_size = EVT_BUFFER_SIZE;
456
457 return iommu->evt_buf;
458}
459
460static void __init free_event_buffer(struct amd_iommu *iommu)
461{
462 free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE));
308} 463}
309 464
465/* sets a specific bit in the device table entry. */
310static void set_dev_entry_bit(u16 devid, u8 bit) 466static void set_dev_entry_bit(u16 devid, u8 bit)
311{ 467{
312 int i = (bit >> 5) & 0x07; 468 int i = (bit >> 5) & 0x07;
@@ -315,7 +471,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit)
315 amd_iommu_dev_table[devid].data[i] |= (1 << _bit); 471 amd_iommu_dev_table[devid].data[i] |= (1 << _bit);
316} 472}
317 473
318static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) 474/* Writes the specific IOMMU for a device into the rlookup table */
475static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid)
476{
477 amd_iommu_rlookup_table[devid] = iommu;
478}
479
480/*
481 * This function takes the device specific flags read from the ACPI
482 * table and sets up the device table entry with that information
483 */
484static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu,
485 u16 devid, u32 flags, u32 ext_flags)
319{ 486{
320 if (flags & ACPI_DEVFLAG_INITPASS) 487 if (flags & ACPI_DEVFLAG_INITPASS)
321 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); 488 set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS);
@@ -331,13 +498,14 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags)
331 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); 498 set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS);
332 if (flags & ACPI_DEVFLAG_LINT1) 499 if (flags & ACPI_DEVFLAG_LINT1)
333 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); 500 set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS);
334}
335 501
336static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) 502 set_iommu_for_device(iommu, devid);
337{
338 amd_iommu_rlookup_table[devid] = iommu;
339} 503}
340 504
505/*
506 * Reads the device exclusion range from ACPI and initialize IOMMU with
507 * it
508 */
341static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) 509static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
342{ 510{
343 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 511 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid];
@@ -346,27 +514,45 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
346 return; 514 return;
347 515
348 if (iommu) { 516 if (iommu) {
517 /*
518 * We can only configure exclusion ranges per IOMMU, not
519 * per device. But we can enable the exclusion range per
520 * device, which is done here.
521 */
349 set_dev_entry_bit(m->devid, DEV_ENTRY_EX); 522 set_dev_entry_bit(m->devid, DEV_ENTRY_EX);
350 iommu->exclusion_start = m->range_start; 523 iommu->exclusion_start = m->range_start;
351 iommu->exclusion_length = m->range_length; 524 iommu->exclusion_length = m->range_length;
352 } 525 }
353} 526}
354 527
528/*
529 * This function reads some important data from the IOMMU PCI space and
530 * initializes the driver data structure with it. It reads the hardware
531 * capabilities and the first/last device entries
532 */
355static void __init init_iommu_from_pci(struct amd_iommu *iommu) 533static void __init init_iommu_from_pci(struct amd_iommu *iommu)
356{ 534{
357 int bus = PCI_BUS(iommu->devid);
358 int dev = PCI_SLOT(iommu->devid);
359 int fn = PCI_FUNC(iommu->devid);
360 int cap_ptr = iommu->cap_ptr; 535 int cap_ptr = iommu->cap_ptr;
361 u32 range; 536 u32 range, misc;
362 537
363 iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); 538 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
364 539 &iommu->cap);
365 range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); 540 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET,
366 iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); 541 &range);
367 iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); 542 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET,
543 &misc);
544
545 iommu->first_device = calc_devid(MMIO_GET_BUS(range),
546 MMIO_GET_FD(range));
547 iommu->last_device = calc_devid(MMIO_GET_BUS(range),
548 MMIO_GET_LD(range));
549 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
368} 550}
369 551
552/*
553 * Takes a pointer to an AMD IOMMU entry in the ACPI table and
554 * initializes the hardware and our data structures with it.
555 */
370static void __init init_iommu_from_acpi(struct amd_iommu *iommu, 556static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
371 struct ivhd_header *h) 557 struct ivhd_header *h)
372{ 558{
@@ -374,7 +560,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
374 u8 *end = p, flags = 0; 560 u8 *end = p, flags = 0;
375 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 561 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
376 u32 ext_flags = 0; 562 u32 ext_flags = 0;
377 bool alias = 0; 563 bool alias = false;
378 struct ivhd_entry *e; 564 struct ivhd_entry *e;
379 565
380 /* 566 /*
@@ -414,22 +600,23 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
414 case IVHD_DEV_ALL: 600 case IVHD_DEV_ALL:
415 for (dev_i = iommu->first_device; 601 for (dev_i = iommu->first_device;
416 dev_i <= iommu->last_device; ++dev_i) 602 dev_i <= iommu->last_device; ++dev_i)
417 set_dev_entry_from_acpi(dev_i, e->flags, 0); 603 set_dev_entry_from_acpi(iommu, dev_i,
604 e->flags, 0);
418 break; 605 break;
419 case IVHD_DEV_SELECT: 606 case IVHD_DEV_SELECT:
420 devid = e->devid; 607 devid = e->devid;
421 set_dev_entry_from_acpi(devid, e->flags, 0); 608 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
422 break; 609 break;
423 case IVHD_DEV_SELECT_RANGE_START: 610 case IVHD_DEV_SELECT_RANGE_START:
424 devid_start = e->devid; 611 devid_start = e->devid;
425 flags = e->flags; 612 flags = e->flags;
426 ext_flags = 0; 613 ext_flags = 0;
427 alias = 0; 614 alias = false;
428 break; 615 break;
429 case IVHD_DEV_ALIAS: 616 case IVHD_DEV_ALIAS:
430 devid = e->devid; 617 devid = e->devid;
431 devid_to = e->ext >> 8; 618 devid_to = e->ext >> 8;
432 set_dev_entry_from_acpi(devid, e->flags, 0); 619 set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
433 amd_iommu_alias_table[devid] = devid_to; 620 amd_iommu_alias_table[devid] = devid_to;
434 break; 621 break;
435 case IVHD_DEV_ALIAS_RANGE: 622 case IVHD_DEV_ALIAS_RANGE:
@@ -437,24 +624,25 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
437 flags = e->flags; 624 flags = e->flags;
438 devid_to = e->ext >> 8; 625 devid_to = e->ext >> 8;
439 ext_flags = 0; 626 ext_flags = 0;
440 alias = 1; 627 alias = true;
441 break; 628 break;
442 case IVHD_DEV_EXT_SELECT: 629 case IVHD_DEV_EXT_SELECT:
443 devid = e->devid; 630 devid = e->devid;
444 set_dev_entry_from_acpi(devid, e->flags, e->ext); 631 set_dev_entry_from_acpi(iommu, devid, e->flags,
632 e->ext);
445 break; 633 break;
446 case IVHD_DEV_EXT_SELECT_RANGE: 634 case IVHD_DEV_EXT_SELECT_RANGE:
447 devid_start = e->devid; 635 devid_start = e->devid;
448 flags = e->flags; 636 flags = e->flags;
449 ext_flags = e->ext; 637 ext_flags = e->ext;
450 alias = 0; 638 alias = false;
451 break; 639 break;
452 case IVHD_DEV_RANGE_END: 640 case IVHD_DEV_RANGE_END:
453 devid = e->devid; 641 devid = e->devid;
454 for (dev_i = devid_start; dev_i <= devid; ++dev_i) { 642 for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
455 if (alias) 643 if (alias)
456 amd_iommu_alias_table[dev_i] = devid_to; 644 amd_iommu_alias_table[dev_i] = devid_to;
457 set_dev_entry_from_acpi( 645 set_dev_entry_from_acpi(iommu,
458 amd_iommu_alias_table[dev_i], 646 amd_iommu_alias_table[dev_i],
459 flags, ext_flags); 647 flags, ext_flags);
460 } 648 }
@@ -463,10 +651,11 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
463 break; 651 break;
464 } 652 }
465 653
466 p += 0x04 << (e->type >> 6); 654 p += ivhd_entry_length(p);
467 } 655 }
468} 656}
469 657
658/* Initializes the device->iommu mapping for the driver */
470static int __init init_iommu_devices(struct amd_iommu *iommu) 659static int __init init_iommu_devices(struct amd_iommu *iommu)
471{ 660{
472 u16 i; 661 u16 i;
@@ -480,6 +669,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu)
480static void __init free_iommu_one(struct amd_iommu *iommu) 669static void __init free_iommu_one(struct amd_iommu *iommu)
481{ 670{
482 free_command_buffer(iommu); 671 free_command_buffer(iommu);
672 free_event_buffer(iommu);
483 iommu_unmap_mmio_space(iommu); 673 iommu_unmap_mmio_space(iommu);
484} 674}
485 675
@@ -494,6 +684,11 @@ static void __init free_iommu_all(void)
494 } 684 }
495} 685}
496 686
687/*
688 * This function glues the initialization of one IOMMU
689 * together and also allocates the command buffer and programs the
690 * hardware. It does NOT enable the IOMMU. This is done afterwards.
691 */
497static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 692static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
498{ 693{
499 spin_lock_init(&iommu->lock); 694 spin_lock_init(&iommu->lock);
@@ -502,8 +697,12 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
502 /* 697 /*
503 * Copy data from ACPI table entry to the iommu struct 698 * Copy data from ACPI table entry to the iommu struct
504 */ 699 */
505 iommu->devid = h->devid; 700 iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff);
701 if (!iommu->dev)
702 return 1;
703
506 iommu->cap_ptr = h->cap_ptr; 704 iommu->cap_ptr = h->cap_ptr;
705 iommu->pci_seg = h->pci_seg;
507 iommu->mmio_phys = h->mmio_phys; 706 iommu->mmio_phys = h->mmio_phys;
508 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); 707 iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys);
509 if (!iommu->mmio_base) 708 if (!iommu->mmio_base)
@@ -514,13 +713,23 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
514 if (!iommu->cmd_buf) 713 if (!iommu->cmd_buf)
515 return -ENOMEM; 714 return -ENOMEM;
516 715
716 iommu->evt_buf = alloc_event_buffer(iommu);
717 if (!iommu->evt_buf)
718 return -ENOMEM;
719
720 iommu->int_enabled = false;
721
517 init_iommu_from_pci(iommu); 722 init_iommu_from_pci(iommu);
518 init_iommu_from_acpi(iommu, h); 723 init_iommu_from_acpi(iommu, h);
519 init_iommu_devices(iommu); 724 init_iommu_devices(iommu);
520 725
521 return 0; 726 return pci_enable_device(iommu->dev);
522} 727}
523 728
729/*
730 * Iterates over all IOMMU entries in the ACPI table, allocates the
731 * IOMMU structure and initializes it with init_iommu_one()
732 */
524static int __init init_iommu_all(struct acpi_table_header *table) 733static int __init init_iommu_all(struct acpi_table_header *table)
525{ 734{
526 u8 *p = (u8 *)table, *end = (u8 *)table; 735 u8 *p = (u8 *)table, *end = (u8 *)table;
@@ -528,8 +737,6 @@ static int __init init_iommu_all(struct acpi_table_header *table)
528 struct amd_iommu *iommu; 737 struct amd_iommu *iommu;
529 int ret; 738 int ret;
530 739
531 INIT_LIST_HEAD(&amd_iommu_list);
532
533 end += table->length; 740 end += table->length;
534 p += IVRS_HEADER_LENGTH; 741 p += IVRS_HEADER_LENGTH;
535 742
@@ -555,6 +762,103 @@ static int __init init_iommu_all(struct acpi_table_header *table)
555 return 0; 762 return 0;
556} 763}
557 764
765/****************************************************************************
766 *
767 * The following functions initialize the MSI interrupts for all IOMMUs
768 * in the system. It's a bit challenging because there could be multiple
769 * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per
770 * pci_dev.
771 *
772 ****************************************************************************/
773
774static int __init iommu_setup_msix(struct amd_iommu *iommu)
775{
776 struct amd_iommu *curr;
777 struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
778 int nvec = 0, i;
779
780 list_for_each_entry(curr, &amd_iommu_list, list) {
781 if (curr->dev == iommu->dev) {
782 entries[nvec].entry = curr->evt_msi_num;
783 entries[nvec].vector = 0;
784 curr->int_enabled = true;
785 nvec++;
786 }
787 }
788
789 if (pci_enable_msix(iommu->dev, entries, nvec)) {
790 pci_disable_msix(iommu->dev);
791 return 1;
792 }
793
794 for (i = 0; i < nvec; ++i) {
795 int r = request_irq(entries[i].vector, amd_iommu_int_handler,
796 IRQF_SAMPLE_RANDOM,
797 "AMD IOMMU",
798 NULL);
799 if (r)
800 goto out_free;
801 }
802
803 return 0;
804
805out_free:
806 for (i -= 1; i >= 0; --i)
807 free_irq(entries[i].vector, NULL);
808
809 pci_disable_msix(iommu->dev);
810
811 return 1;
812}
813
814static int __init iommu_setup_msi(struct amd_iommu *iommu)
815{
816 int r;
817 struct amd_iommu *curr;
818
819 list_for_each_entry(curr, &amd_iommu_list, list) {
820 if (curr->dev == iommu->dev)
821 curr->int_enabled = true;
822 }
823
824
825 if (pci_enable_msi(iommu->dev))
826 return 1;
827
828 r = request_irq(iommu->dev->irq, amd_iommu_int_handler,
829 IRQF_SAMPLE_RANDOM,
830 "AMD IOMMU",
831 NULL);
832
833 if (r) {
834 pci_disable_msi(iommu->dev);
835 return 1;
836 }
837
838 return 0;
839}
840
841static int __init iommu_init_msi(struct amd_iommu *iommu)
842{
843 if (iommu->int_enabled)
844 return 0;
845
846 if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
847 return iommu_setup_msix(iommu);
848 else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
849 return iommu_setup_msi(iommu);
850
851 return 1;
852}
853
854/****************************************************************************
855 *
856 * The next functions belong to the third pass of parsing the ACPI
857 * table. In this last pass the memory mapping requirements are
858 * gathered (like exclusion and unity mapping ranges).
859 *
860 ****************************************************************************/
861
558static void __init free_unity_maps(void) 862static void __init free_unity_maps(void)
559{ 863{
560 struct unity_map_entry *entry, *next; 864 struct unity_map_entry *entry, *next;
@@ -565,6 +869,7 @@ static void __init free_unity_maps(void)
565 } 869 }
566} 870}
567 871
872/* called when we find an exclusion range definition in ACPI */
568static int __init init_exclusion_range(struct ivmd_header *m) 873static int __init init_exclusion_range(struct ivmd_header *m)
569{ 874{
570 int i; 875 int i;
@@ -574,7 +879,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
574 set_device_exclusion_range(m->devid, m); 879 set_device_exclusion_range(m->devid, m);
575 break; 880 break;
576 case ACPI_IVMD_TYPE_ALL: 881 case ACPI_IVMD_TYPE_ALL:
577 for (i = 0; i < amd_iommu_last_bdf; ++i) 882 for (i = 0; i <= amd_iommu_last_bdf; ++i)
578 set_device_exclusion_range(i, m); 883 set_device_exclusion_range(i, m);
579 break; 884 break;
580 case ACPI_IVMD_TYPE_RANGE: 885 case ACPI_IVMD_TYPE_RANGE:
@@ -588,6 +893,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
588 return 0; 893 return 0;
589} 894}
590 895
896/* called for unity map ACPI definition */
591static int __init init_unity_map_range(struct ivmd_header *m) 897static int __init init_unity_map_range(struct ivmd_header *m)
592{ 898{
593 struct unity_map_entry *e = 0; 899 struct unity_map_entry *e = 0;
@@ -619,13 +925,12 @@ static int __init init_unity_map_range(struct ivmd_header *m)
619 return 0; 925 return 0;
620} 926}
621 927
928/* iterates over all memory definitions we find in the ACPI table */
622static int __init init_memory_definitions(struct acpi_table_header *table) 929static int __init init_memory_definitions(struct acpi_table_header *table)
623{ 930{
624 u8 *p = (u8 *)table, *end = (u8 *)table; 931 u8 *p = (u8 *)table, *end = (u8 *)table;
625 struct ivmd_header *m; 932 struct ivmd_header *m;
626 933
627 INIT_LIST_HEAD(&amd_iommu_unity_map);
628
629 end += table->length; 934 end += table->length;
630 p += IVRS_HEADER_LENGTH; 935 p += IVRS_HEADER_LENGTH;
631 936
@@ -642,12 +947,32 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
642 return 0; 947 return 0;
643} 948}
644 949
950/*
951 * Init the device table to not allow DMA access for devices and
952 * suppress all page faults
953 */
954static void init_device_table(void)
955{
956 u16 devid;
957
958 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
959 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
960 set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION);
961 }
962}
963
964/*
965 * This function finally enables all IOMMUs found in the system after
966 * they have been initialized
967 */
645static void __init enable_iommus(void) 968static void __init enable_iommus(void)
646{ 969{
647 struct amd_iommu *iommu; 970 struct amd_iommu *iommu;
648 971
649 list_for_each_entry(iommu, &amd_iommu_list, list) { 972 list_for_each_entry(iommu, &amd_iommu_list, list) {
650 iommu_set_exclusion_range(iommu); 973 iommu_set_exclusion_range(iommu);
974 iommu_init_msi(iommu);
975 iommu_enable_event_logging(iommu);
651 iommu_enable(iommu); 976 iommu_enable(iommu);
652 } 977 }
653} 978}
@@ -678,6 +1003,34 @@ static struct sys_device device_amd_iommu = {
678 .cls = &amd_iommu_sysdev_class, 1003 .cls = &amd_iommu_sysdev_class,
679}; 1004};
680 1005
1006/*
1007 * This is the core init function for AMD IOMMU hardware in the system.
1008 * This function is called from the generic x86 DMA layer initialization
1009 * code.
1010 *
1011 * This function basically parses the ACPI table for AMD IOMMU (IVRS)
1012 * three times:
1013 *
1014 * Pass 1) Find the highest PCI device id the driver has to handle.
1015 *         Based on this information, the size of the data
1016 *         structures that need to be allocated is determined.
1017 *
1018 * Pass 2) Initialize the data structures just allocated with the
1019 * information in the ACPI table about available AMD IOMMUs
1020 * in the system. It also maps the PCI devices in the
1021 * system to specific IOMMUs
1022 *
1023 * Pass 3) After the basic data structures are allocated and
1024 * initialized we update them with information about memory
1025 * remapping requirements parsed out of the ACPI table in
1026 * this last pass.
1027 *
1028 * After that the hardware is initialized and ready to go. In the last
1029 * step we do some Linux specific things like registering the driver in
1030 * the dma_ops interface and initializing the suspend/resume support
1031 * functions. Finally it prints some information about AMD IOMMUs and
1032 * the driver state and enables the hardware.
1033 */
681int __init amd_iommu_init(void) 1034int __init amd_iommu_init(void)
682{ 1035{
683 int i, ret = 0; 1036 int i, ret = 0;
@@ -699,14 +1052,14 @@ int __init amd_iommu_init(void)
699 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 1052 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
700 return -ENODEV; 1053 return -ENODEV;
701 1054
702 dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); 1055 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
703 alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); 1056 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
704 rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); 1057 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
705 1058
706 ret = -ENOMEM; 1059 ret = -ENOMEM;
707 1060
708 /* Device table - directly used by all IOMMUs */ 1061 /* Device table - directly used by all IOMMUs */
709 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, 1062 amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
710 get_order(dev_table_size)); 1063 get_order(dev_table_size));
711 if (amd_iommu_dev_table == NULL) 1064 if (amd_iommu_dev_table == NULL)
712 goto out; 1065 goto out;
@@ -730,27 +1083,26 @@ int __init amd_iommu_init(void)
730 * Protection Domain table - maps devices to protection domains 1083 * Protection Domain table - maps devices to protection domains
731 * This table has the same size as the rlookup_table 1084 * This table has the same size as the rlookup_table
732 */ 1085 */
733 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, 1086 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
734 get_order(rlookup_table_size)); 1087 get_order(rlookup_table_size));
735 if (amd_iommu_pd_table == NULL) 1088 if (amd_iommu_pd_table == NULL)
736 goto free; 1089 goto free;
737 1090
738 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, 1091 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1092 GFP_KERNEL | __GFP_ZERO,
739 get_order(MAX_DOMAIN_ID/8)); 1093 get_order(MAX_DOMAIN_ID/8));
740 if (amd_iommu_pd_alloc_bitmap == NULL) 1094 if (amd_iommu_pd_alloc_bitmap == NULL)
741 goto free; 1095 goto free;
742 1096
1097 /* init the device table */
1098 init_device_table();
1099
743 /* 1100 /*
744 * memory is allocated now; initialize the device table with all zeroes 1101 * let all alias entries point to themselves
745 * and let all alias entries point to itself
746 */ 1102 */
747 memset(amd_iommu_dev_table, 0, dev_table_size); 1103 for (i = 0; i <= amd_iommu_last_bdf; ++i)
748 for (i = 0; i < amd_iommu_last_bdf; ++i)
749 amd_iommu_alias_table[i] = i; 1104 amd_iommu_alias_table[i] = i;
750 1105
751 memset(amd_iommu_pd_table, 0, rlookup_table_size);
752 memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8);
753
754 /* 1106 /*
755 * never allocate domain 0 because its used as the non-allocated and 1107 * never allocate domain 0 because its used as the non-allocated and
756 * error value placeholder 1108 * error value placeholder
@@ -768,15 +1120,15 @@ int __init amd_iommu_init(void)
768 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1120 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
769 goto free; 1121 goto free;
770 1122
771 ret = amd_iommu_init_dma_ops(); 1123 ret = sysdev_class_register(&amd_iommu_sysdev_class);
772 if (ret) 1124 if (ret)
773 goto free; 1125 goto free;
774 1126
775 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1127 ret = sysdev_register(&device_amd_iommu);
776 if (ret) 1128 if (ret)
777 goto free; 1129 goto free;
778 1130
779 ret = sysdev_register(&device_amd_iommu); 1131 ret = amd_iommu_init_dma_ops();
780 if (ret) 1132 if (ret)
781 goto free; 1133 goto free;
782 1134
@@ -791,28 +1143,29 @@ int __init amd_iommu_init(void)
791 else 1143 else
792 printk("disabled\n"); 1144 printk("disabled\n");
793 1145
1146 if (amd_iommu_unmap_flush)
1147 printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n");
1148 else
1149 printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n");
1150
794out: 1151out:
795 return ret; 1152 return ret;
796 1153
797free: 1154free:
798 if (amd_iommu_pd_alloc_bitmap) 1155 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
799 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); 1156 get_order(MAX_DOMAIN_ID/8));
800 1157
801 if (amd_iommu_pd_table) 1158 free_pages((unsigned long)amd_iommu_pd_table,
802 free_pages((unsigned long)amd_iommu_pd_table, 1159 get_order(rlookup_table_size));
803 get_order(rlookup_table_size));
804 1160
805 if (amd_iommu_rlookup_table) 1161 free_pages((unsigned long)amd_iommu_rlookup_table,
806 free_pages((unsigned long)amd_iommu_rlookup_table, 1162 get_order(rlookup_table_size));
807 get_order(rlookup_table_size));
808 1163
809 if (amd_iommu_alias_table) 1164 free_pages((unsigned long)amd_iommu_alias_table,
810 free_pages((unsigned long)amd_iommu_alias_table, 1165 get_order(alias_table_size));
811 get_order(alias_table_size));
812 1166
813 if (amd_iommu_dev_table) 1167 free_pages((unsigned long)amd_iommu_dev_table,
814 free_pages((unsigned long)amd_iommu_dev_table, 1168 get_order(dev_table_size));
815 get_order(dev_table_size));
816 1169
817 free_iommu_all(); 1170 free_iommu_all();
818 1171
@@ -821,6 +1174,13 @@ free:
821 goto out; 1174 goto out;
822} 1175}
823 1176
1177/****************************************************************************
1178 *
1179 * Early detect code. This code runs at IOMMU detection time in the DMA
1180 * layer. It simply checks whether an IVRS ACPI table is present in order
1181 * to detect AMD IOMMUs.
1182 *
1183 ****************************************************************************/
824static int __init early_amd_iommu_detect(struct acpi_table_header *table) 1184static int __init early_amd_iommu_detect(struct acpi_table_header *table)
825{ 1185{
826 return 0; 1186 return 0;
@@ -828,7 +1188,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
828 1188
829void __init amd_iommu_detect(void) 1189void __init amd_iommu_detect(void)
830{ 1190{
831 if (swiotlb || no_iommu || iommu_detected) 1191 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture))
832 return; 1192 return;
833 1193
834 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1194 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
@@ -841,11 +1201,20 @@ void __init amd_iommu_detect(void)
841 } 1201 }
842} 1202}
843 1203
1204/****************************************************************************
1205 *
1206 * Parsing functions for the AMD IOMMU specific kernel command line
1207 * options.
1208 *
1209 ****************************************************************************/
1210
844static int __init parse_amd_iommu_options(char *str) 1211static int __init parse_amd_iommu_options(char *str)
845{ 1212{
846 for (; *str; ++str) { 1213 for (; *str; ++str) {
847 if (strcmp(str, "isolate") == 0) 1214 if (strncmp(str, "isolate", 7) == 0)
848 amd_iommu_isolate = 1; 1215 amd_iommu_isolate = 1;
1216 if (strncmp(str, "fullflush", 11) == 0)
1217 amd_iommu_unmap_flush = true;
849 } 1218 }
850 1219
851 return 1; 1220 return 1;
@@ -853,20 +1222,10 @@ static int __init parse_amd_iommu_options(char *str)
853 1222
854static int __init parse_amd_iommu_size_options(char *str) 1223static int __init parse_amd_iommu_size_options(char *str)
855{ 1224{
856 for (; *str; ++str) { 1225 unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
857 if (strcmp(str, "32M") == 0) 1226
858 amd_iommu_aperture_order = 25; 1227 if ((order > 24) && (order < 31))
859 if (strcmp(str, "64M") == 0) 1228 amd_iommu_aperture_order = order;
860 amd_iommu_aperture_order = 26;
861 if (strcmp(str, "128M") == 0)
862 amd_iommu_aperture_order = 27;
863 if (strcmp(str, "256M") == 0)
864 amd_iommu_aperture_order = 28;
865 if (strcmp(str, "512M") == 0)
866 amd_iommu_aperture_order = 29;
867 if (strcmp(str, "1G") == 0)
868 amd_iommu_aperture_order = 30;
869 }
870 1229
871 return 1; 1230 return 1;
872} 1231}
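The rewritten parser accepts any size string memparse() understands (e.g. "32M", "512M", "1G") instead of a fixed list of literals. A worked example of the resulting orders, assuming PAGE_SHIFT is 12:

	/*   "32M" -> get_order(32M) = 13 -> order 25
	 *   "64M" -> get_order(64M) = 14 -> order 26  (the default)
	 *   "1G"  -> get_order(1G)  = 18 -> order 30
	 * anything that computes to an order outside 25..30 is ignored by the
	 * (order > 24) && (order < 31) range check.
	 */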
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 9f907806c1a5..9a32b37ee2ee 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -21,6 +21,7 @@
21#include <linux/suspend.h> 21#include <linux/suspend.h>
22#include <asm/e820.h> 22#include <asm/e820.h>
23#include <asm/io.h> 23#include <asm/io.h>
24#include <asm/iommu.h>
24#include <asm/gart.h> 25#include <asm/gart.h>
25#include <asm/pci-direct.h> 26#include <asm/pci-direct.h>
26#include <asm/dma.h> 27#include <asm/dma.h>
@@ -454,11 +455,11 @@ out:
454 force_iommu || 455 force_iommu ||
455 valid_agp || 456 valid_agp ||
456 fallback_aper_force) { 457 fallback_aper_force) {
457 printk(KERN_ERR 458 printk(KERN_INFO
458 "Your BIOS doesn't leave a aperture memory hole\n"); 459 "Your BIOS doesn't leave a aperture memory hole\n");
459 printk(KERN_ERR 460 printk(KERN_INFO
460 "Please enable the IOMMU option in the BIOS setup\n"); 461 "Please enable the IOMMU option in the BIOS setup\n");
461 printk(KERN_ERR 462 printk(KERN_INFO
462 "This costs you %d MB of RAM\n", 463 "This costs you %d MB of RAM\n",
463 32 << fallback_aper_order); 464 32 << fallback_aper_order);
464 465
diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic.c
index a437d027f20b..04a7f960bbc0 100644
--- a/arch/x86/kernel/apic_32.c
+++ b/arch/x86/kernel/apic.c
@@ -23,11 +23,13 @@
23#include <linux/mc146818rtc.h> 23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h> 24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h> 25#include <linux/sysdev.h>
26#include <linux/ioport.h>
26#include <linux/cpu.h> 27#include <linux/cpu.h>
27#include <linux/clockchips.h> 28#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h> 29#include <linux/acpi_pmtmr.h>
29#include <linux/module.h> 30#include <linux/module.h>
30#include <linux/dmi.h> 31#include <linux/dmi.h>
32#include <linux/dmar.h>
31 33
32#include <asm/atomic.h> 34#include <asm/atomic.h>
33#include <asm/smp.h> 35#include <asm/smp.h>
@@ -36,8 +38,14 @@
36#include <asm/desc.h> 38#include <asm/desc.h>
37#include <asm/arch_hooks.h> 39#include <asm/arch_hooks.h>
38#include <asm/hpet.h> 40#include <asm/hpet.h>
41#include <asm/pgalloc.h>
39#include <asm/i8253.h> 42#include <asm/i8253.h>
40#include <asm/nmi.h> 43#include <asm/nmi.h>
44#include <asm/idle.h>
45#include <asm/proto.h>
46#include <asm/timex.h>
47#include <asm/apic.h>
48#include <asm/i8259.h>
41 49
42#include <mach_apic.h> 50#include <mach_apic.h>
43#include <mach_apicdef.h> 51#include <mach_apicdef.h>
@@ -50,20 +58,60 @@
50# error SPURIOUS_APIC_VECTOR definition error 58# error SPURIOUS_APIC_VECTOR definition error
51#endif 59#endif
52 60
53unsigned long mp_lapic_addr; 61#ifdef CONFIG_X86_32
54
55/* 62/*
56 * Knob to control our willingness to enable the local APIC. 63 * Knob to control our willingness to enable the local APIC.
57 * 64 *
58 * +1=force-enable 65 * +1=force-enable
59 */ 66 */
60static int force_enable_local_apic; 67static int force_enable_local_apic;
61int disable_apic; 68/*
69 * APIC command line parameters
70 */
71static int __init parse_lapic(char *arg)
72{
73 force_enable_local_apic = 1;
74 return 0;
75}
76early_param("lapic", parse_lapic);
77/* Local APIC was disabled by the BIOS and enabled by the kernel */
78static int enabled_via_apicbase;
79
80#endif
62 81
63/* Local APIC timer verification ok */ 82#ifdef CONFIG_X86_64
64static int local_apic_timer_verify_ok; 83static int apic_calibrate_pmtmr __initdata;
84static __init int setup_apicpmtimer(char *s)
85{
86 apic_calibrate_pmtmr = 1;
87 notsc_setup(NULL);
88 return 0;
89}
90__setup("apicpmtimer", setup_apicpmtimer);
91#endif
92
93#ifdef CONFIG_X86_64
94#define HAVE_X2APIC
95#endif
96
97#ifdef HAVE_X2APIC
98int x2apic;
99/* x2apic enabled before OS handover */
100int x2apic_preenabled;
101int disable_x2apic;
102static __init int setup_nox2apic(char *str)
103{
104 disable_x2apic = 1;
105 setup_clear_cpu_cap(X86_FEATURE_X2APIC);
106 return 0;
107}
108early_param("nox2apic", setup_nox2apic);
109#endif
110
111unsigned long mp_lapic_addr;
112int disable_apic;
65/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 113/* Disable local APIC timer from the kernel commandline or via dmi quirk */
66static int local_apic_timer_disabled; 114static int disable_apic_timer __cpuinitdata;
67/* Local APIC timer works in C2 */ 115/* Local APIC timer works in C2 */
68int local_apic_timer_c2_ok; 116int local_apic_timer_c2_ok;
69EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 117EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -75,7 +123,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE};
75/* 123/*
76 * Debug level, exported for io_apic.c 124 * Debug level, exported for io_apic.c
77 */ 125 */
78int apic_verbosity; 126unsigned int apic_verbosity;
79 127
80int pic_mode; 128int pic_mode;
81 129
@@ -112,9 +160,6 @@ static struct clock_event_device lapic_clockevent = {
112}; 160};
113static DEFINE_PER_CPU(struct clock_event_device, lapic_events); 161static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
114 162
115/* Local APIC was disabled by the BIOS and enabled by the kernel */
116static int enabled_via_apicbase;
117
118static unsigned long apic_phys; 163static unsigned long apic_phys;
119 164
120/* 165/*
@@ -130,7 +175,11 @@ static inline int lapic_get_version(void)
130 */ 175 */
131static inline int lapic_is_integrated(void) 176static inline int lapic_is_integrated(void)
132{ 177{
178#ifdef CONFIG_X86_64
179 return 1;
180#else
133 return APIC_INTEGRATED(lapic_get_version()); 181 return APIC_INTEGRATED(lapic_get_version());
182#endif
134} 183}
135 184
136/* 185/*
@@ -145,13 +194,18 @@ static int modern_apic(void)
145 return lapic_get_version() >= 0x14; 194 return lapic_get_version() >= 0x14;
146} 195}
147 196
148void apic_wait_icr_idle(void) 197/*
198 * Paravirt kernels might also be using the ops below. So we still
199 * use the generic apic_read()/apic_write(), which might be pointing to
200 * different ops in the PARAVIRT case.
201 */
202void xapic_wait_icr_idle(void)
149{ 203{
150 while (apic_read(APIC_ICR) & APIC_ICR_BUSY) 204 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
151 cpu_relax(); 205 cpu_relax();
152} 206}
153 207
154u32 safe_apic_wait_icr_idle(void) 208u32 safe_xapic_wait_icr_idle(void)
155{ 209{
156 u32 send_status; 210 u32 send_status;
157 int timeout; 211 int timeout;
@@ -167,19 +221,88 @@ u32 safe_apic_wait_icr_idle(void)
167 return send_status; 221 return send_status;
168} 222}
169 223
224void xapic_icr_write(u32 low, u32 id)
225{
226 apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id));
227 apic_write(APIC_ICR, low);
228}
229
230u64 xapic_icr_read(void)
231{
232 u32 icr1, icr2;
233
234 icr2 = apic_read(APIC_ICR2);
235 icr1 = apic_read(APIC_ICR);
236
237 return icr1 | ((u64)icr2 << 32);
238}
239
240static struct apic_ops xapic_ops = {
241 .read = native_apic_mem_read,
242 .write = native_apic_mem_write,
243 .icr_read = xapic_icr_read,
244 .icr_write = xapic_icr_write,
245 .wait_icr_idle = xapic_wait_icr_idle,
246 .safe_wait_icr_idle = safe_xapic_wait_icr_idle,
247};
248
249struct apic_ops __read_mostly *apic_ops = &xapic_ops;
250EXPORT_SYMBOL_GPL(apic_ops);
251
252#ifdef HAVE_X2APIC
253static void x2apic_wait_icr_idle(void)
254{
255 /* no need to wait for icr idle in x2apic */
256 return;
257}
258
259static u32 safe_x2apic_wait_icr_idle(void)
260{
261 /* no need to wait for icr idle in x2apic */
262 return 0;
263}
264
265void x2apic_icr_write(u32 low, u32 id)
266{
267 wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low);
268}
269
270u64 x2apic_icr_read(void)
271{
272 unsigned long val;
273
274 rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val);
275 return val;
276}
277
278static struct apic_ops x2apic_ops = {
279 .read = native_apic_msr_read,
280 .write = native_apic_msr_write,
281 .icr_read = x2apic_icr_read,
282 .icr_write = x2apic_icr_write,
283 .wait_icr_idle = x2apic_wait_icr_idle,
284 .safe_wait_icr_idle = safe_x2apic_wait_icr_idle,
285};
286#endif
287
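The block above replaces the direct ICR helpers with an apic_ops table so the same call sites can drive either the MMIO-mapped xAPIC or the MSR-based x2APIC, and it shows how the 64-bit ICR is split across ICR/ICR2 in one mode but written as a single MSR in the other. Below is a minimal, self-contained C sketch of that indirection pattern only; the fake register array, the fake MSR and all of the names (apic_like_ops, mmio_*, msr_*) are invented for illustration and are not the kernel's.

#include <stdint.h>
#include <stdio.h>

/* One table of operations per access method; callers only see 'ops'. */
struct apic_like_ops {
        uint32_t (*read)(unsigned reg);
        void     (*write)(unsigned reg, uint32_t val);
        uint64_t (*icr_read)(void);
        void     (*icr_write)(uint32_t low, uint32_t id);
};

/* Fake backing store standing in for MMIO registers. */
static uint32_t fake_regs[0x400 / 4];

#define REG_ICR   0x300   /* low half of the interrupt command register  */
#define REG_ICR2  0x310   /* high half: destination field in bits 56..63 */

static uint32_t mmio_read(unsigned reg)              { return fake_regs[reg / 4]; }
static void     mmio_write(unsigned reg, uint32_t v) { fake_regs[reg / 4] = v; }

/* xAPIC style: the 64-bit ICR is split across two 32-bit registers. */
static uint64_t mmio_icr_read(void)
{
        uint32_t lo = mmio_read(REG_ICR);
        uint32_t hi = mmio_read(REG_ICR2);
        return (uint64_t)hi << 32 | lo;
}

static void mmio_icr_write(uint32_t low, uint32_t id)
{
        mmio_write(REG_ICR2, id << 24);   /* destination in the top byte */
        mmio_write(REG_ICR, low);
}

/* x2APIC style: the whole ICR is one 64-bit MSR write (emulated here). */
static uint64_t fake_msr_icr;

static uint64_t msr_icr_read(void) { return fake_msr_icr; }
static void     msr_icr_write(uint32_t low, uint32_t id)
{
        fake_msr_icr = (uint64_t)id << 32 | low;
}

static const struct apic_like_ops mmio_ops = {
        mmio_read, mmio_write, mmio_icr_read, mmio_icr_write
};
static const struct apic_like_ops msr_ops = {
        mmio_read, mmio_write, msr_icr_read, msr_icr_write
};

/* Switched once at boot in the real code; here just flipped by hand. */
static const struct apic_like_ops *ops = &mmio_ops;

int main(void)
{
        ops->icr_write(0x4500, 3);        /* INIT-style low word, dest 3 */
        printf("mmio ICR = %#llx\n", (unsigned long long)ops->icr_read());

        ops = &msr_ops;                   /* pretend x2APIC was detected */
        ops->icr_write(0x4500, 3);
        printf("msr  ICR = %#llx\n", (unsigned long long)ops->icr_read());
        return 0;
}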
170/** 288/**
171 * enable_NMI_through_LVT0 - enable NMI through local vector table 0 289 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
172 */ 290 */
173void __cpuinit enable_NMI_through_LVT0(void) 291void __cpuinit enable_NMI_through_LVT0(void)
174{ 292{
175 unsigned int v = APIC_DM_NMI; 293 unsigned int v;
294
295 /* unmask and set to NMI */
296 v = APIC_DM_NMI;
176 297
177 /* Level triggered for 82489DX */ 298 /* Level triggered for 82489DX (32bit mode) */
178 if (!lapic_is_integrated()) 299 if (!lapic_is_integrated())
179 v |= APIC_LVT_LEVEL_TRIGGER; 300 v |= APIC_LVT_LEVEL_TRIGGER;
180 apic_write_around(APIC_LVT0, v); 301
302 apic_write(APIC_LVT0, v);
181} 303}
182 304
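enable_NMI_through_LVT0() above just builds one LVT entry value. As a rough illustration of the bit layout it relies on (vector in bits 7:0, delivery mode in bits 10:8 with 100b meaning NMI, level-trigger in bit 15, mask in bit 16, per the Intel SDM), here is a small standalone sketch; the constant and function names are mine, not the kernel's.

#include <stdint.h>
#include <stdio.h>

/* Local vector table entry layout (Intel SDM): */
#define LVT_VECTOR_MASK    0x000000ffu   /* bits  7:0  interrupt vector    */
#define LVT_DM_NMI         (0x4u << 8)   /* bits 10:8  delivery mode = NMI */
#define LVT_LEVEL_TRIGGER  (1u << 15)    /* bit  15    level-triggered     */
#define LVT_MASKED         (1u << 16)    /* bit  16    entry masked        */

static uint32_t build_lvt0_nmi(int integrated_apic)
{
        uint32_t v = LVT_DM_NMI;         /* unmask, deliver as NMI         */

        if (!integrated_apic)            /* external 82489DX wants level   */
                v |= LVT_LEVEL_TRIGGER;
        return v;
}

int main(void)
{
        printf("LVT0 (integrated): %#x\n", build_lvt0_nmi(1));
        printf("LVT0 (82489DX)   : %#x\n", build_lvt0_nmi(0));
        printf("masked entry     : %#x\n", build_lvt0_nmi(1) | LVT_MASKED);
        return 0;
}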
305#ifdef CONFIG_X86_32
183/** 306/**
184 * get_physical_broadcast - Get number of physical broadcast IDs 307 * get_physical_broadcast - Get number of physical broadcast IDs
185 */ 308 */
@@ -187,15 +310,20 @@ int get_physical_broadcast(void)
187{ 310{
188 return modern_apic() ? 0xff : 0xf; 311 return modern_apic() ? 0xff : 0xf;
189} 312}
313#endif
190 314
191/** 315/**
192 * lapic_get_maxlvt - get the maximum number of local vector table entries 316 * lapic_get_maxlvt - get the maximum number of local vector table entries
193 */ 317 */
194int lapic_get_maxlvt(void) 318int lapic_get_maxlvt(void)
195{ 319{
196 unsigned int v = apic_read(APIC_LVR); 320 unsigned int v;
197 321
198 /* 82489DXs do not report # of LVT entries. */ 322 v = apic_read(APIC_LVR);
323 /*
324 * - we always have APIC integrated on 64bit mode
325 * - 82489DXs do not report # of LVT entries
326 */
199 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; 327 return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2;
200} 328}
201 329
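lapic_get_maxlvt() above decodes the version register: version in bits 7:0, maximum LVT index in bits 23:16, with a fall-back of 2 for external 82489DX parts that predate the field. A standalone sketch of that decode, assuming the SDM register layout; the macro names are invented.

#include <stdint.h>
#include <stdio.h>

/* APIC version register fields (Intel SDM layout). */
#define LVR_VERSION(v)    ((v) & 0xffu)               /* bits  7:0  */
#define LVR_MAXLVT(v)     (((v) >> 16) & 0xffu)       /* bits 23:16 */
#define LVR_INTEGRATED(v) (LVR_VERSION(v) >= 0x10)    /* !82489DX   */

static int maxlvt_from_lvr(uint32_t lvr)
{
        /* External 82489DX APICs do not report an LVT count; assume 2. */
        return LVR_INTEGRATED(lvr) ? (int)LVR_MAXLVT(lvr) : 2;
}

int main(void)
{
        printf("maxlvt(0x00050014) = %d\n", maxlvt_from_lvr(0x00050014)); /* 5 */
        printf("maxlvt(0x00000003) = %d\n", maxlvt_from_lvr(0x00000003)); /* 2 */
        return 0;
}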
@@ -203,7 +331,7 @@ int lapic_get_maxlvt(void)
203 * Local APIC timer 331 * Local APIC timer
204 */ 332 */
205 333
206/* Clock divisor is set to 16 */ 334/* Clock divisor */
207#define APIC_DIVISOR 16 335#define APIC_DIVISOR 16
208 336
209/* 337/*
@@ -229,27 +357,61 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
229 if (!irqen) 357 if (!irqen)
230 lvtt_value |= APIC_LVT_MASKED; 358 lvtt_value |= APIC_LVT_MASKED;
231 359
232 apic_write_around(APIC_LVTT, lvtt_value); 360 apic_write(APIC_LVTT, lvtt_value);
233 361
234 /* 362 /*
235 * Divide PICLK by 16 363 * Divide PICLK by 16
236 */ 364 */
237 tmp_value = apic_read(APIC_TDCR); 365 tmp_value = apic_read(APIC_TDCR);
238 apic_write_around(APIC_TDCR, (tmp_value 366 apic_write(APIC_TDCR,
239 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) 367 (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) |
240 | APIC_TDR_DIV_16); 368 APIC_TDR_DIV_16);
241 369
242 if (!oneshot) 370 if (!oneshot)
243 apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); 371 apic_write(APIC_TMICT, clocks / APIC_DIVISOR);
244} 372}
245 373
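__setup_APIC_LVTT() above keeps the usual read-modify-write of the timer divide-configuration register (clear the divide bits, select divide-by-16) and then programs the initial count as clocks/16. A tiny sketch of that sequence against a plain variable instead of a real register; the constants mirror the encodings used above but the names are mine.

#include <stdint.h>
#include <stdio.h>

#define TDR_DIV_1       0xBu    /* divide-by-1 encoding (bits 0,1,3) */
#define TDR_DIV_16      0x3u    /* divide-by-16 encoding             */
#define TDR_DIV_TMBASE  0x4u    /* bit 2, kept clear                 */
#define TIMER_DIVISOR   16u

/* Read-modify-write: clear the divide field, select divide-by-16. */
static uint32_t set_divide_by_16(uint32_t tdcr)
{
        return (tdcr & ~(TDR_DIV_1 | TDR_DIV_TMBASE)) | TDR_DIV_16;
}

int main(void)
{
        uint32_t tdcr = 0xBu;                   /* previously divide-by-1 */
        uint32_t clocks = 1000000;

        printf("TDCR  : %#x -> %#x\n", tdcr, set_divide_by_16(tdcr));
        printf("TMICT : %u\n", clocks / TIMER_DIVISOR);  /* initial count */
        return 0;
}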
246/* 374/*
375 * Setup extended LVT, AMD specific (K8, family 10h)
376 *
377 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
378 * MCE interrupts are supported. Thus MCE offset must be set to 0.
379 *
380 * If mask=1, the LVT entry does not generate interrupts while mask=0
381 * enables the vector. See also the BKDGs.
382 */
383
384#define APIC_EILVT_LVTOFF_MCE 0
385#define APIC_EILVT_LVTOFF_IBS 1
386
387static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
388{
389 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
390 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
391
392 apic_write(reg, v);
393}
394
395u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
396{
397 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
398 return APIC_EILVT_LVTOFF_MCE;
399}
400
401u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
402{
403 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
404 return APIC_EILVT_LVTOFF_IBS;
405}
406EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs);
407
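The extended-LVT helpers moved above compute the register offset as (entry << 4) from the first extended entry and pack mask, message type and vector into one 32-bit value. A standalone sketch of that encoding; the 0x500 base matches the APIC_EILVT0 offset used by the code above, everything else (names, the example entry) is illustrative.

#include <stdint.h>
#include <stdio.h>

#define EILVT0_OFFSET  0x500u            /* first extended LVT register */

/* Each extended LVT entry sits 16 bytes after the previous one. */
static unsigned eilvt_reg(unsigned entry)
{
        return EILVT0_OFFSET + (entry << 4);
}

/* mask -> bit 16, message type -> bits 10:8, vector -> bits 7:0 */
static uint32_t eilvt_value(uint8_t vector, uint8_t msg_type, uint8_t mask)
{
        return ((uint32_t)mask << 16) | ((uint32_t)msg_type << 8) | vector;
}

int main(void)
{
        /* Entry 1 (IBS on family 10h), NMI message type (0x4), unmasked. */
        printf("reg = %#x, value = %#x\n", eilvt_reg(1), eilvt_value(0, 0x4, 0));
        return 0;
}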
408/*
247 * Program the next event, relative to now 409 * Program the next event, relative to now
248 */ 410 */
249static int lapic_next_event(unsigned long delta, 411static int lapic_next_event(unsigned long delta,
250 struct clock_event_device *evt) 412 struct clock_event_device *evt)
251{ 413{
252 apic_write_around(APIC_TMICT, delta); 414 apic_write(APIC_TMICT, delta);
253 return 0; 415 return 0;
254} 416}
255 417
@@ -262,8 +424,8 @@ static void lapic_timer_setup(enum clock_event_mode mode,
262 unsigned long flags; 424 unsigned long flags;
263 unsigned int v; 425 unsigned int v;
264 426
265 /* Lapic used for broadcast ? */ 427 /* Lapic used as dummy for broadcast ? */
266 if (!local_apic_timer_verify_ok) 428 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
267 return; 429 return;
268 430
269 local_irq_save(flags); 431 local_irq_save(flags);
@@ -278,7 +440,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
278 case CLOCK_EVT_MODE_SHUTDOWN: 440 case CLOCK_EVT_MODE_SHUTDOWN:
279 v = apic_read(APIC_LVTT); 441 v = apic_read(APIC_LVTT);
280 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 442 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
281 apic_write_around(APIC_LVTT, v); 443 apic_write(APIC_LVTT, v);
282 break; 444 break;
283 case CLOCK_EVT_MODE_RESUME: 445 case CLOCK_EVT_MODE_RESUME:
284 /* Nothing to do here */ 446 /* Nothing to do here */
@@ -302,7 +464,7 @@ static void lapic_timer_broadcast(cpumask_t mask)
302 * Setup the local APIC timer for this CPU. Copy the initialized values 464
303 * of the boot CPU and register the clock event in the framework. 465 * of the boot CPU and register the clock event in the framework.
304 */ 466 */
305static void __devinit setup_APIC_timer(void) 467static void __cpuinit setup_APIC_timer(void)
306{ 468{
307 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 469 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
308 470
@@ -372,39 +534,53 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
372 } 534 }
373} 535}
374 536
375/* 537static int __init calibrate_by_pmtimer(long deltapm, long *delta)
376 * Setup the boot APIC 538{
377 * 539 const long pm_100ms = PMTMR_TICKS_PER_SEC / 10;
378 * Calibrate and verify the result. 540 const long pm_thresh = pm_100ms / 100;
379 */ 541 unsigned long mult;
380void __init setup_boot_APIC_clock(void) 542 u64 res;
543
544#ifndef CONFIG_X86_PM_TIMER
545 return -1;
546#endif
547
548 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm);
549
550 /* Check, if the PM timer is available */
551 if (!deltapm)
552 return -1;
553
554 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
555
556 if (deltapm > (pm_100ms - pm_thresh) &&
557 deltapm < (pm_100ms + pm_thresh)) {
558 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
559 } else {
560 res = (((u64)deltapm) * mult) >> 22;
561 do_div(res, 1000000);
562 printk(KERN_WARNING "APIC calibration not consistent "
563 "with PM Timer: %ldms instead of 100ms\n",
564 (long)res);
565 /* Correct the lapic counter value */
566 res = (((u64)(*delta)) * pm_100ms);
567 do_div(res, deltapm);
568 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
569 "%lu (%ld)\n", (unsigned long)res, *delta);
570 *delta = (long)res;
571 }
572
573 return 0;
574}
575
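calibrate_by_pmtimer() above cross-checks the LAPIC calibration window against the ACPI PM timer: if the PM timer did not see roughly 100 ms, the LAPIC tick count is rescaled proportionally. A self-contained sketch of that arithmetic, assuming the standard 3.579545 MHz PM-timer rate and a clocksource-style mult/shift conversion; the helper names and example numbers are made up.

#include <stdint.h>
#include <stdio.h>

#define PMTMR_HZ   3579545ULL            /* ACPI PM timer frequency      */
#define PM_100MS   (PMTMR_HZ / 10)       /* expected ticks in 100 ms     */
#define PM_THRESH  (PM_100MS / 100)      /* accept +/- 1 ms of slack     */

/* clocksource-style scaled multiplier: ns = (ticks * mult) >> shift */
static uint64_t hz2mult(uint64_t hz, unsigned shift)
{
        return ((1000000000ULL << shift) + hz / 2) / hz;
}

/*
 * deltapm: PM timer ticks seen in the window, delta: LAPIC ticks seen.
 * Returns the (possibly corrected) LAPIC tick count.
 */
static long pm_crosscheck(long deltapm, long delta)
{
        uint64_t mult = hz2mult(PMTMR_HZ, 22);
        uint64_t ms;

        if (deltapm > (long)(PM_100MS - PM_THRESH) &&
            deltapm < (long)(PM_100MS + PM_THRESH))
                return delta;            /* the window really was ~100 ms */

        ms = ((uint64_t)deltapm * mult >> 22) / 1000000;
        printf("window was %llums, not 100ms; rescaling\n",
               (unsigned long long)ms);

        /* delta ticks happened in deltapm PM ticks; scale to PM_100MS. */
        return (long)((uint64_t)delta * PM_100MS / deltapm);
}

int main(void)
{
        /* Window ran ~10% long: 393750 PM ticks instead of ~357954. */
        printf("corrected lapic delta = %ld\n", pm_crosscheck(393750, 6250000));
        return 0;
}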
576static int __init calibrate_APIC_clock(void)
381{ 577{
382 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 578 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
383 const long pm_100ms = PMTMR_TICKS_PER_SEC/10;
384 const long pm_thresh = pm_100ms/100;
385 void (*real_handler)(struct clock_event_device *dev); 579 void (*real_handler)(struct clock_event_device *dev);
386 unsigned long deltaj; 580 unsigned long deltaj;
387 long delta, deltapm; 581 long delta;
388 int pm_referenced = 0; 582 int pm_referenced = 0;
389 583
390 /*
391 * The local apic timer can be disabled via the kernel
392 * commandline or from the CPU detection code. Register the lapic
393 * timer as a dummy clock event source on SMP systems, so the
394 * broadcast mechanism is used. On UP systems simply ignore it.
395 */
396 if (local_apic_timer_disabled) {
397 /* No broadcast on UP ! */
398 if (num_possible_cpus() > 1) {
399 lapic_clockevent.mult = 1;
400 setup_APIC_timer();
401 }
402 return;
403 }
404
405 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
406 "calibrating APIC timer ...\n");
407
408 local_irq_disable(); 584 local_irq_disable();
409 585
410 /* Replace the global interrupt handler */ 586 /* Replace the global interrupt handler */
@@ -412,10 +588,10 @@ void __init setup_boot_APIC_clock(void)
412 global_clock_event->event_handler = lapic_cal_handler; 588 global_clock_event->event_handler = lapic_cal_handler;
413 589
414 /* 590 /*
415 * Setup the APIC counter to 1e9. There is no way the lapic 591 * Setup the APIC counter to maximum. There is no way the lapic
416 * can underflow in the 100ms detection time frame 592 * can underflow in the 100ms detection time frame
417 */ 593 */
418 __setup_APIC_LVTT(1000000000, 0, 0); 594 __setup_APIC_LVTT(0xffffffff, 0, 0);
419 595
420 /* Let the interrupts run */ 596 /* Let the interrupts run */
421 local_irq_enable(); 597 local_irq_enable();
@@ -432,34 +608,9 @@ void __init setup_boot_APIC_clock(void)
432 delta = lapic_cal_t1 - lapic_cal_t2; 608 delta = lapic_cal_t1 - lapic_cal_t2;
433 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); 609 apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta);
434 610
435 /* Check, if the PM timer is available */ 611 /* we trust the PM based calibration if possible */
436 deltapm = lapic_cal_pm2 - lapic_cal_pm1; 612 pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1,
437 apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); 613 &delta);
438
439 if (deltapm) {
440 unsigned long mult;
441 u64 res;
442
443 mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22);
444
445 if (deltapm > (pm_100ms - pm_thresh) &&
446 deltapm < (pm_100ms + pm_thresh)) {
447 apic_printk(APIC_VERBOSE, "... PM timer result ok\n");
448 } else {
449 res = (((u64) deltapm) * mult) >> 22;
450 do_div(res, 1000000);
451 printk(KERN_WARNING "APIC calibration not consistent "
452 "with PM Timer: %ldms instead of 100ms\n",
453 (long)res);
454 /* Correct the lapic counter value */
455 res = (((u64) delta) * pm_100ms);
456 do_div(res, deltapm);
457 printk(KERN_INFO "APIC delta adjusted to PM-Timer: "
458 "%lu (%ld)\n", (unsigned long) res, delta);
459 delta = (long) res;
460 }
461 pm_referenced = 1;
462 }
463 614
464 /* Calculate the scaled math multiplication factor */ 615 /* Calculate the scaled math multiplication factor */
465 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 616 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
@@ -489,8 +640,6 @@ void __init setup_boot_APIC_clock(void)
489 calibration_result / (1000000 / HZ), 640 calibration_result / (1000000 / HZ),
490 calibration_result % (1000000 / HZ)); 641 calibration_result % (1000000 / HZ));
491 642
492 local_apic_timer_verify_ok = 1;
493
494 /* 643 /*
495 * Do a sanity check on the APIC calibration result 644 * Do a sanity check on the APIC calibration result
496 */ 645 */
@@ -498,13 +647,15 @@ void __init setup_boot_APIC_clock(void)
498 local_irq_enable(); 647 local_irq_enable();
499 printk(KERN_WARNING 648 printk(KERN_WARNING
500 "APIC frequency too slow, disabling apic timer\n"); 649 "APIC frequency too slow, disabling apic timer\n");
501 /* No broadcast on UP ! */ 650 return -1;
502 if (num_possible_cpus() > 1)
503 setup_APIC_timer();
504 return;
505 } 651 }
506 652
507 /* We trust the pm timer based calibration */ 653 levt->features &= ~CLOCK_EVT_FEAT_DUMMY;
654
655 /*
656 * PM timer calibration failed or not turned on
657 * so lets try APIC timer based calibration
658 */
508 if (!pm_referenced) { 659 if (!pm_referenced) {
509 apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); 660 apic_printk(APIC_VERBOSE, "... verify APIC timer\n");
510 661
@@ -536,34 +687,68 @@ void __init setup_boot_APIC_clock(void)
536 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) 687 if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2)
537 apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); 688 apic_printk(APIC_VERBOSE, "... jiffies result ok\n");
538 else 689 else
539 local_apic_timer_verify_ok = 0; 690 levt->features |= CLOCK_EVT_FEAT_DUMMY;
540 } else 691 } else
541 local_irq_enable(); 692 local_irq_enable();
542 693
543 if (!local_apic_timer_verify_ok) { 694 if (levt->features & CLOCK_EVT_FEAT_DUMMY) {
544 printk(KERN_WARNING 695 printk(KERN_WARNING
545 "APIC timer disabled due to verification failure.\n"); 696 "APIC timer disabled due to verification failure.\n");
697 return -1;
698 }
699
700 return 0;
701}
702
703/*
704 * Setup the boot APIC
705 *
706 * Calibrate and verify the result.
707 */
708void __init setup_boot_APIC_clock(void)
709{
710 /*
711 * The local apic timer can be disabled via the kernel
712 * commandline or from the CPU detection code. Register the lapic
713 * timer as a dummy clock event source on SMP systems, so the
714 * broadcast mechanism is used. On UP systems simply ignore it.
715 */
716 if (disable_apic_timer) {
717 printk(KERN_INFO "Disabling APIC timer\n");
546 /* No broadcast on UP ! */ 718 /* No broadcast on UP ! */
547 if (num_possible_cpus() == 1) 719 if (num_possible_cpus() > 1) {
548 return; 720 lapic_clockevent.mult = 1;
549 } else { 721 setup_APIC_timer();
550 /* 722 }
551 * If nmi_watchdog is set to IO_APIC, we need the 723 return;
552 * PIT/HPET going. Otherwise register lapic as a dummy 724 }
553 * device. 725
554 */ 726 apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"
555 if (nmi_watchdog != NMI_IO_APIC) 727 "calibrating APIC timer ...\n");
556 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; 728
557 else 729 if (calibrate_APIC_clock()) {
558 printk(KERN_WARNING "APIC timer registered as dummy," 730 /* No broadcast on UP ! */
559 " due to nmi_watchdog=%d!\n", nmi_watchdog); 731 if (num_possible_cpus() > 1)
732 setup_APIC_timer();
733 return;
560 } 734 }
561 735
736 /*
737 * If nmi_watchdog is set to IO_APIC, we need the
738 * PIT/HPET going. Otherwise register lapic as a dummy
739 * device.
740 */
741 if (nmi_watchdog != NMI_IO_APIC)
742 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
743 else
744 printk(KERN_WARNING "APIC timer registered as dummy,"
745 " due to nmi_watchdog=%d!\n", nmi_watchdog);
746
562 /* Setup the lapic or request the broadcast */ 747 /* Setup the lapic or request the broadcast */
563 setup_APIC_timer(); 748 setup_APIC_timer();
564} 749}
565 750
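Once calibrate_APIC_clock() has a tick count for the calibration window, the code converts it into the clockevent mult/shift pair so a delay in nanoseconds becomes a timer count with one multiply and one shift (div_sc() is essentially (ticks << shift) / nsec). A sketch of that scaled math with made-up calibration numbers; HZ=100 and shift 32 are just example values here, not claims about the configuration.

#include <stdint.h>
#include <stdio.h>

#define CAL_LOOPS  8
#define TICK_NSEC  10000000ULL      /* 10 ms per jiffy at HZ=100 (example) */
#define SHIFT      32

/* div_sc-style: mult = (ticks << shift) / nanoseconds */
static uint64_t scaled_mult(uint64_t ticks, uint64_t nsec, unsigned shift)
{
        return (ticks << shift) / nsec;
}

/* Convert a requested delay into a timer count: (ns * mult) >> shift */
static uint64_t ns_to_count(uint64_t ns, uint64_t mult, unsigned shift)
{
        return (ns * mult) >> shift;
}

int main(void)
{
        /* Say 8 jiffies of calibration counted 5,000,000 LAPIC ticks. */
        uint64_t delta = 5000000;
        uint64_t mult = scaled_mult(delta, TICK_NSEC * CAL_LOOPS, SHIFT);

        printf("ticks for 1 ms  : %llu\n",
               (unsigned long long)ns_to_count(1000000, mult, SHIFT));
        printf("ticks per jiffy : %llu\n",
               (unsigned long long)(delta / CAL_LOOPS));
        return 0;
}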
566void __devinit setup_secondary_APIC_clock(void) 751void __cpuinit setup_secondary_APIC_clock(void)
567{ 752{
568 setup_APIC_timer(); 753 setup_APIC_timer();
569} 754}
@@ -598,7 +783,11 @@ static void local_apic_timer_interrupt(void)
598 /* 783 /*
599 * the NMI deadlock-detector uses this. 784 * the NMI deadlock-detector uses this.
600 */ 785 */
786#ifdef CONFIG_X86_64
787 add_pda(apic_timer_irqs, 1);
788#else
601 per_cpu(irq_stat, cpu).apic_timer_irqs++; 789 per_cpu(irq_stat, cpu).apic_timer_irqs++;
790#endif
602 791
603 evt->event_handler(evt); 792 evt->event_handler(evt);
604} 793}
@@ -625,6 +814,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs)
625 * Besides, if we don't timer interrupts ignore the global 814 * Besides, if we don't timer interrupts ignore the global
626 * interrupt lock, which is the WrongThing (tm) to do. 815 * interrupt lock, which is the WrongThing (tm) to do.
627 */ 816 */
817#ifdef CONFIG_X86_64
818 exit_idle();
819#endif
628 irq_enter(); 820 irq_enter();
629 local_apic_timer_interrupt(); 821 local_apic_timer_interrupt();
630 irq_exit(); 822 irq_exit();
@@ -638,35 +830,6 @@ int setup_profiling_timer(unsigned int multiplier)
638} 830}
639 831
640/* 832/*
641 * Setup extended LVT, AMD specific (K8, family 10h)
642 *
643 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
644 * MCE interrupts are supported. Thus MCE offset must be set to 0.
645 */
646
647#define APIC_EILVT_LVTOFF_MCE 0
648#define APIC_EILVT_LVTOFF_IBS 1
649
650static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
651{
652 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
653 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
654 apic_write(reg, v);
655}
656
657u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
658{
659 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
660 return APIC_EILVT_LVTOFF_MCE;
661}
662
663u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
664{
665 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
666 return APIC_EILVT_LVTOFF_IBS;
667}
668
669/*
670 * Local APIC start and shutdown 833 * Local APIC start and shutdown
671 */ 834 */
672 835
@@ -693,45 +856,41 @@ void clear_local_APIC(void)
693 */ 856 */
694 if (maxlvt >= 3) { 857 if (maxlvt >= 3) {
695 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ 858 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
696 apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); 859 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
697 } 860 }
698 /* 861 /*
699 * Careful: we have to set masks only first to deassert 862 * Careful: we have to set masks only first to deassert
700 * any level-triggered sources. 863 * any level-triggered sources.
701 */ 864 */
702 v = apic_read(APIC_LVTT); 865 v = apic_read(APIC_LVTT);
703 apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); 866 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
704 v = apic_read(APIC_LVT0); 867 v = apic_read(APIC_LVT0);
705 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); 868 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
706 v = apic_read(APIC_LVT1); 869 v = apic_read(APIC_LVT1);
707 apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); 870 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
708 if (maxlvt >= 4) { 871 if (maxlvt >= 4) {
709 v = apic_read(APIC_LVTPC); 872 v = apic_read(APIC_LVTPC);
710 apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); 873 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
711 } 874 }
712 875
713 /* let's not touch this if we didn't frob it */ 876
714#ifdef CONFIG_X86_MCE_P4THERMAL 877#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL)
715 if (maxlvt >= 5) { 878 if (maxlvt >= 5) {
716 v = apic_read(APIC_LVTTHMR); 879 v = apic_read(APIC_LVTTHMR);
717 apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); 880 apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED);
718 } 881 }
719#endif 882#endif
720 /* 883 /*
721 * Clean APIC state for other OSs: 884 * Clean APIC state for other OSs:
722 */ 885 */
723 apic_write_around(APIC_LVTT, APIC_LVT_MASKED); 886 apic_write(APIC_LVTT, APIC_LVT_MASKED);
724 apic_write_around(APIC_LVT0, APIC_LVT_MASKED); 887 apic_write(APIC_LVT0, APIC_LVT_MASKED);
725 apic_write_around(APIC_LVT1, APIC_LVT_MASKED); 888 apic_write(APIC_LVT1, APIC_LVT_MASKED);
726 if (maxlvt >= 3) 889 if (maxlvt >= 3)
727 apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); 890 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
728 if (maxlvt >= 4) 891 if (maxlvt >= 4)
729 apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); 892 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
730 893
731#ifdef CONFIG_X86_MCE_P4THERMAL
732 if (maxlvt >= 5)
733 apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED);
734#endif
735 /* Integrated APIC (!82489DX) ? */ 894 /* Integrated APIC (!82489DX) ? */
736 if (lapic_is_integrated()) { 895 if (lapic_is_integrated()) {
737 if (maxlvt > 3) 896 if (maxlvt > 3)
@@ -746,7 +905,7 @@ void clear_local_APIC(void)
746 */ 905 */
747void disable_local_APIC(void) 906void disable_local_APIC(void)
748{ 907{
749 unsigned long value; 908 unsigned int value;
750 909
751 clear_local_APIC(); 910 clear_local_APIC();
752 911
@@ -756,8 +915,9 @@ void disable_local_APIC(void)
756 */ 915 */
757 value = apic_read(APIC_SPIV); 916 value = apic_read(APIC_SPIV);
758 value &= ~APIC_SPIV_APIC_ENABLED; 917 value &= ~APIC_SPIV_APIC_ENABLED;
759 apic_write_around(APIC_SPIV, value); 918 apic_write(APIC_SPIV, value);
760 919
920#ifdef CONFIG_X86_32
761 /* 921 /*
762 * When LAPIC was disabled by the BIOS and enabled by the kernel, 922 * When LAPIC was disabled by the BIOS and enabled by the kernel,
763 * restore the disabled state. 923 * restore the disabled state.
@@ -769,6 +929,7 @@ void disable_local_APIC(void)
769 l &= ~MSR_IA32_APICBASE_ENABLE; 929 l &= ~MSR_IA32_APICBASE_ENABLE;
770 wrmsr(MSR_IA32_APICBASE, l, h); 930 wrmsr(MSR_IA32_APICBASE, l, h);
771 } 931 }
932#endif
772} 933}
773 934
774/* 935/*
@@ -785,11 +946,15 @@ void lapic_shutdown(void)
785 return; 946 return;
786 947
787 local_irq_save(flags); 948 local_irq_save(flags);
788 clear_local_APIC();
789 949
790 if (enabled_via_apicbase) 950#ifdef CONFIG_X86_32
951 if (!enabled_via_apicbase)
952 clear_local_APIC();
953 else
954#endif
791 disable_local_APIC(); 955 disable_local_APIC();
792 956
957
793 local_irq_restore(flags); 958 local_irq_restore(flags);
794} 959}
795 960
@@ -834,6 +999,12 @@ int __init verify_local_APIC(void)
834 */ 999 */
835 reg0 = apic_read(APIC_ID); 1000 reg0 = apic_read(APIC_ID);
836 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); 1001 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
1002 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
1003 reg1 = apic_read(APIC_ID);
1004 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
1005 apic_write(APIC_ID, reg0);
1006 if (reg1 != (reg0 ^ APIC_ID_MASK))
1007 return 0;
837 1008
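The added lines above sanity-check the ID register by writing it with every writable bit inverted, reading it back, and restoring the old value. The same read/write self-test pattern, sketched against a fake register; ID_MASK and the function names are illustrative.

#include <stdint.h>
#include <stdio.h>

#define ID_MASK  0xFF000000u     /* writable APIC ID field, bits 31:24 */

static uint32_t fake_id_reg = 0x03000000;      /* pretend APIC ID 3 */

static uint32_t reg_read(void)        { return fake_id_reg; }
static void     reg_write(uint32_t v) { fake_id_reg = v; }

/* Returns 1 if the register stores what we write, 0 otherwise. */
static int id_register_is_writable(void)
{
        uint32_t orig = reg_read();
        uint32_t readback;

        reg_write(orig ^ ID_MASK);       /* flip every writable bit      */
        readback = reg_read();
        reg_write(orig);                 /* always restore the old value */

        return readback == (orig ^ ID_MASK);
}

int main(void)
{
        printf("ID register writable: %d\n", id_register_is_writable());
        return 0;
}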
838 /* 1009 /*
839 * The next two are just to see if we have sane values. 1010 * The next two are just to see if we have sane values.
@@ -859,14 +1030,15 @@ void __init sync_Arb_IDs(void)
859 */ 1030 */
860 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 1031 if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
861 return; 1032 return;
1033
862 /* 1034 /*
863 * Wait for idle. 1035 * Wait for idle.
864 */ 1036 */
865 apic_wait_icr_idle(); 1037 apic_wait_icr_idle();
866 1038
867 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); 1039 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
868 apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG 1040 apic_write(APIC_ICR, APIC_DEST_ALLINC |
869 | APIC_DM_INIT); 1041 APIC_INT_LEVELTRIG | APIC_DM_INIT);
870} 1042}
871 1043
872/* 1044/*
@@ -874,7 +1046,7 @@ void __init sync_Arb_IDs(void)
874 */ 1046 */
875void __init init_bsp_APIC(void) 1047void __init init_bsp_APIC(void)
876{ 1048{
877 unsigned long value; 1049 unsigned int value;
878 1050
879 /* 1051 /*
880 * Don't do the setup now if we have a SMP BIOS as the 1052 * Don't do the setup now if we have a SMP BIOS as the
@@ -895,60 +1067,66 @@ void __init init_bsp_APIC(void)
895 value &= ~APIC_VECTOR_MASK; 1067 value &= ~APIC_VECTOR_MASK;
896 value |= APIC_SPIV_APIC_ENABLED; 1068 value |= APIC_SPIV_APIC_ENABLED;
897 1069
1070#ifdef CONFIG_X86_32
898 /* This bit is reserved on P4/Xeon and should be cleared */ 1071 /* This bit is reserved on P4/Xeon and should be cleared */
899 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && 1072 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
900 (boot_cpu_data.x86 == 15)) 1073 (boot_cpu_data.x86 == 15))
901 value &= ~APIC_SPIV_FOCUS_DISABLED; 1074 value &= ~APIC_SPIV_FOCUS_DISABLED;
902 else 1075 else
1076#endif
903 value |= APIC_SPIV_FOCUS_DISABLED; 1077 value |= APIC_SPIV_FOCUS_DISABLED;
904 value |= SPURIOUS_APIC_VECTOR; 1078 value |= SPURIOUS_APIC_VECTOR;
905 apic_write_around(APIC_SPIV, value); 1079 apic_write(APIC_SPIV, value);
906 1080
907 /* 1081 /*
908 * Set up the virtual wire mode. 1082 * Set up the virtual wire mode.
909 */ 1083 */
910 apic_write_around(APIC_LVT0, APIC_DM_EXTINT); 1084 apic_write(APIC_LVT0, APIC_DM_EXTINT);
911 value = APIC_DM_NMI; 1085 value = APIC_DM_NMI;
912 if (!lapic_is_integrated()) /* 82489DX */ 1086 if (!lapic_is_integrated()) /* 82489DX */
913 value |= APIC_LVT_LEVEL_TRIGGER; 1087 value |= APIC_LVT_LEVEL_TRIGGER;
914 apic_write_around(APIC_LVT1, value); 1088 apic_write(APIC_LVT1, value);
915} 1089}
916 1090
917static void __cpuinit lapic_setup_esr(void) 1091static void __cpuinit lapic_setup_esr(void)
918{ 1092{
919 unsigned long oldvalue, value, maxlvt; 1093 unsigned int oldvalue, value, maxlvt;
920 if (lapic_is_integrated() && !esr_disable) {
921 /* !82489DX */
922 maxlvt = lapic_get_maxlvt();
923 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
924 apic_write(APIC_ESR, 0);
925 oldvalue = apic_read(APIC_ESR);
926 1094
927 /* enables sending errors */ 1095 if (!lapic_is_integrated()) {
928 value = ERROR_APIC_VECTOR; 1096 printk(KERN_INFO "No ESR for 82489DX.\n");
929 apic_write_around(APIC_LVTERR, value); 1097 return;
1098 }
1099
1100 if (esr_disable) {
930 /* 1101 /*
931 * spec says clear errors after enabling vector. 1102 * Something untraceable is creating bad interrupts on
1103 * secondary quads ... for the moment, just leave the
1104 * ESR disabled - we can't do anything useful with the
1105 * errors anyway - mbligh
932 */ 1106 */
933 if (maxlvt > 3) 1107 printk(KERN_INFO "Leaving ESR disabled.\n");
934 apic_write(APIC_ESR, 0); 1108 return;
935 value = apic_read(APIC_ESR);
936 if (value != oldvalue)
937 apic_printk(APIC_VERBOSE, "ESR value before enabling "
938 "vector: 0x%08lx after: 0x%08lx\n",
939 oldvalue, value);
940 } else {
941 if (esr_disable)
942 /*
943 * Something untraceable is creating bad interrupts on
944 * secondary quads ... for the moment, just leave the
945 * ESR disabled - we can't do anything useful with the
946 * errors anyway - mbligh
947 */
948 printk(KERN_INFO "Leaving ESR disabled.\n");
949 else
950 printk(KERN_INFO "No ESR for 82489DX.\n");
951 } 1109 }
1110
1111 maxlvt = lapic_get_maxlvt();
1112 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1113 apic_write(APIC_ESR, 0);
1114 oldvalue = apic_read(APIC_ESR);
1115
1116 /* enables sending errors */
1117 value = ERROR_APIC_VECTOR;
1118 apic_write(APIC_LVTERR, value);
1119
1120 /*
1121 * spec says clear errors after enabling vector.
1122 */
1123 if (maxlvt > 3)
1124 apic_write(APIC_ESR, 0);
1125 value = apic_read(APIC_ESR);
1126 if (value != oldvalue)
1127 apic_printk(APIC_VERBOSE, "ESR value before enabling "
1128 "vector: 0x%08x after: 0x%08x\n",
1129 oldvalue, value);
952} 1130}
953 1131
954 1132
@@ -957,24 +1135,27 @@ static void __cpuinit lapic_setup_esr(void)
957 */ 1135 */
958void __cpuinit setup_local_APIC(void) 1136void __cpuinit setup_local_APIC(void)
959{ 1137{
960 unsigned long value, integrated; 1138 unsigned int value;
961 int i, j; 1139 int i, j;
962 1140
1141#ifdef CONFIG_X86_32
963 /* Pound the ESR really hard over the head with a big hammer - mbligh */ 1142 /* Pound the ESR really hard over the head with a big hammer - mbligh */
964 if (esr_disable) { 1143 if (lapic_is_integrated() && esr_disable) {
965 apic_write(APIC_ESR, 0); 1144 apic_write(APIC_ESR, 0);
966 apic_write(APIC_ESR, 0); 1145 apic_write(APIC_ESR, 0);
967 apic_write(APIC_ESR, 0); 1146 apic_write(APIC_ESR, 0);
968 apic_write(APIC_ESR, 0); 1147 apic_write(APIC_ESR, 0);
969 } 1148 }
1149#endif
970 1150
971 integrated = lapic_is_integrated(); 1151 preempt_disable();
972 1152
973 /* 1153 /*
974 * Double-check whether this APIC is really registered. 1154 * Double-check whether this APIC is really registered.
1155 * This is meaningless in clustered apic mode, so we skip it.
975 */ 1156 */
976 if (!apic_id_registered()) 1157 if (!apic_id_registered())
977 WARN_ON_ONCE(1); 1158 BUG();
978 1159
979 /* 1160 /*
980 * Intel recommends to set DFR, LDR and TPR before enabling 1161 * Intel recommends to set DFR, LDR and TPR before enabling
@@ -989,7 +1170,7 @@ void __cpuinit setup_local_APIC(void)
989 */ 1170 */
990 value = apic_read(APIC_TASKPRI); 1171 value = apic_read(APIC_TASKPRI);
991 value &= ~APIC_TPRI_MASK; 1172 value &= ~APIC_TPRI_MASK;
992 apic_write_around(APIC_TASKPRI, value); 1173 apic_write(APIC_TASKPRI, value);
993 1174
994 /* 1175 /*
995 * After a crash, we no longer service the interrupts and a pending 1176 * After a crash, we no longer service the interrupts and a pending
@@ -1020,6 +1201,7 @@ void __cpuinit setup_local_APIC(void)
1020 */ 1201 */
1021 value |= APIC_SPIV_APIC_ENABLED; 1202 value |= APIC_SPIV_APIC_ENABLED;
1022 1203
1204#ifdef CONFIG_X86_32
1023 /* 1205 /*
1024 * Some unknown Intel IO/APIC (or APIC) errata is biting us with 1206 * Some unknown Intel IO/APIC (or APIC) errata is biting us with
1025 * certain networking cards. If high frequency interrupts are 1207 * certain networking cards. If high frequency interrupts are
@@ -1040,14 +1222,19 @@ void __cpuinit setup_local_APIC(void)
1040 * See also the comment in end_level_ioapic_irq(). --macro 1222 * See also the comment in end_level_ioapic_irq(). --macro
1041 */ 1223 */
1042 1224
1043 /* Enable focus processor (bit==0) */ 1225 /*
1226 * - enable focus processor (bit==0)
1227 * - 64bit mode always use processor focus
1228 * so no need to set it
1229 */
1044 value &= ~APIC_SPIV_FOCUS_DISABLED; 1230 value &= ~APIC_SPIV_FOCUS_DISABLED;
1231#endif
1045 1232
1046 /* 1233 /*
1047 * Set spurious IRQ vector 1234 * Set spurious IRQ vector
1048 */ 1235 */
1049 value |= SPURIOUS_APIC_VECTOR; 1236 value |= SPURIOUS_APIC_VECTOR;
1050 apic_write_around(APIC_SPIV, value); 1237 apic_write(APIC_SPIV, value);
1051 1238
1052 /* 1239 /*
1053 * Set up LVT0, LVT1: 1240 * Set up LVT0, LVT1:
@@ -1069,7 +1256,7 @@ void __cpuinit setup_local_APIC(void)
1069 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1256 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
1070 smp_processor_id()); 1257 smp_processor_id());
1071 } 1258 }
1072 apic_write_around(APIC_LVT0, value); 1259 apic_write(APIC_LVT0, value);
1073 1260
1074 /* 1261 /*
1075 * only the BP should see the LINT1 NMI signal, obviously. 1262 * only the BP should see the LINT1 NMI signal, obviously.
@@ -1078,25 +1265,178 @@ void __cpuinit setup_local_APIC(void)
1078 value = APIC_DM_NMI; 1265 value = APIC_DM_NMI;
1079 else 1266 else
1080 value = APIC_DM_NMI | APIC_LVT_MASKED; 1267 value = APIC_DM_NMI | APIC_LVT_MASKED;
1081 if (!integrated) /* 82489DX */ 1268 if (!lapic_is_integrated()) /* 82489DX */
1082 value |= APIC_LVT_LEVEL_TRIGGER; 1269 value |= APIC_LVT_LEVEL_TRIGGER;
1083 apic_write_around(APIC_LVT1, value); 1270 apic_write(APIC_LVT1, value);
1271
1272 preempt_enable();
1084} 1273}
1085 1274
1086void __cpuinit end_local_APIC_setup(void) 1275void __cpuinit end_local_APIC_setup(void)
1087{ 1276{
1088 unsigned long value;
1089
1090 lapic_setup_esr(); 1277 lapic_setup_esr();
1091 /* Disable the local apic timer */ 1278
1092 value = apic_read(APIC_LVTT); 1279#ifdef CONFIG_X86_32
1093 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 1280 {
1094 apic_write_around(APIC_LVTT, value); 1281 unsigned int value;
1282 /* Disable the local apic timer */
1283 value = apic_read(APIC_LVTT);
1284 value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
1285 apic_write(APIC_LVTT, value);
1286 }
1287#endif
1095 1288
1096 setup_apic_nmi_watchdog(NULL); 1289 setup_apic_nmi_watchdog(NULL);
1097 apic_pm_activate(); 1290 apic_pm_activate();
1098} 1291}
1099 1292
1293#ifdef HAVE_X2APIC
1294void check_x2apic(void)
1295{
1296 int msr, msr2;
1297
1298 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1299
1300 if (msr & X2APIC_ENABLE) {
1301 printk("x2apic enabled by BIOS, switching to x2apic ops\n");
1302 x2apic_preenabled = x2apic = 1;
1303 apic_ops = &x2apic_ops;
1304 }
1305}
1306
1307void enable_x2apic(void)
1308{
1309 int msr, msr2;
1310
1311 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1312 if (!(msr & X2APIC_ENABLE)) {
1313 printk("Enabling x2apic\n");
1314 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1315 }
1316}
1317
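check_x2apic()/enable_x2apic() above only look at a few architectural bits of IA32_APIC_BASE (MSR 0x1B). A small sketch that decodes the bits involved, following the Intel SDM layout: BSP flag (bit 8), x2APIC enable (bit 10), global enable (bit 11) and the base address field. The decode helper and the sample value are mine; real code would of course rdmsr/wrmsr instead of passing a test value around.

#include <stdint.h>
#include <stdio.h>

/* IA32_APIC_BASE (MSR 0x1B) fields, per the Intel SDM. */
#define APICBASE_BSP        (1ULL << 8)     /* this CPU is the bootstrap CPU */
#define APICBASE_X2APIC     (1ULL << 10)    /* x2APIC mode enabled           */
#define APICBASE_ENABLE     (1ULL << 11)    /* xAPIC globally enabled        */
#define APICBASE_ADDR_MASK  0xFFFFFF000ULL  /* physical base, bits 35:12     */

static void decode_apicbase(uint64_t v)
{
        printf("base=%#llx enable=%d x2apic=%d bsp=%d\n",
               (unsigned long long)(v & APICBASE_ADDR_MASK),
               !!(v & APICBASE_ENABLE),
               !!(v & APICBASE_X2APIC),
               !!(v & APICBASE_BSP));
}

int main(void)
{
        uint64_t msr = 0xFEE00900ULL;   /* default base, enabled, BSP */

        decode_apicbase(msr);
        decode_apicbase(msr | APICBASE_X2APIC);  /* after enable_x2apic() */
        return 0;
}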
1318void enable_IR_x2apic(void)
1319{
1320#ifdef CONFIG_INTR_REMAP
1321 int ret;
1322 unsigned long flags;
1323
1324 if (!cpu_has_x2apic)
1325 return;
1326
1327 if (!x2apic_preenabled && disable_x2apic) {
1328 printk(KERN_INFO
1329 "Skipped enabling x2apic and Interrupt-remapping "
1330 "because of nox2apic\n");
1331 return;
1332 }
1333
1334 if (x2apic_preenabled && disable_x2apic)
1335 panic("Bios already enabled x2apic, can't enforce nox2apic");
1336
1337 if (!x2apic_preenabled && skip_ioapic_setup) {
1338 printk(KERN_INFO
1339 "Skipped enabling x2apic and Interrupt-remapping "
1340 "because of skipping io-apic setup\n");
1341 return;
1342 }
1343
1344 ret = dmar_table_init();
1345 if (ret) {
1346 printk(KERN_INFO
1347 "dmar_table_init() failed with %d:\n", ret);
1348
1349 if (x2apic_preenabled)
1350 panic("x2apic enabled by bios. But IR enabling failed");
1351 else
1352 printk(KERN_INFO
1353 "Not enabling x2apic,Intr-remapping\n");
1354 return;
1355 }
1356
1357 local_irq_save(flags);
1358 mask_8259A();
1359
1360 ret = save_mask_IO_APIC_setup();
1361 if (ret) {
1362 printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret);
1363 goto end;
1364 }
1365
1366 ret = enable_intr_remapping(1);
1367
1368 if (ret && x2apic_preenabled) {
1369 local_irq_restore(flags);
1370 panic("x2apic enabled by bios. But IR enabling failed");
1371 }
1372
1373 if (ret)
1374 goto end_restore;
1375
1376 if (!x2apic) {
1377 x2apic = 1;
1378 apic_ops = &x2apic_ops;
1379 enable_x2apic();
1380 }
1381
1382end_restore:
1383 if (ret)
1384 /*
1385 * IR enabling failed
1386 */
1387 restore_IO_APIC_setup();
1388 else
1389 reinit_intr_remapped_IO_APIC(x2apic_preenabled);
1390
1391end:
1392 unmask_8259A();
1393 local_irq_restore(flags);
1394
1395 if (!ret) {
1396 if (!x2apic_preenabled)
1397 printk(KERN_INFO
1398 "Enabled x2apic and interrupt-remapping\n");
1399 else
1400 printk(KERN_INFO
1401 "Enabled Interrupt-remapping\n");
1402 } else
1403 printk(KERN_ERR
1404 "Failed to enable Interrupt-remapping and x2apic\n");
1405#else
1406 if (!cpu_has_x2apic)
1407 return;
1408
1409 if (x2apic_preenabled)
1410 panic("x2apic enabled prior OS handover,"
1411 " enable CONFIG_INTR_REMAP");
1412
1413 printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping "
1414 " and x2apic\n");
1415#endif
1416
1417 return;
1418}
1419#endif /* HAVE_X2APIC */
1420
1421#ifdef CONFIG_X86_64
1422/*
1423 * Detect and enable local APICs on non-SMP boards.
1424 * Original code written by Keir Fraser.
1425 * On AMD64 we trust the BIOS - if it says no APIC it is likely
1426 * not correctly set up (usually the APIC timer won't work etc.)
1427 */
1428static int __init detect_init_APIC(void)
1429{
1430 if (!cpu_has_apic) {
1431 printk(KERN_INFO "No local APIC present\n");
1432 return -1;
1433 }
1434
1435 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1436 boot_cpu_physical_apicid = 0;
1437 return 0;
1438}
1439#else
1100/* 1440/*
1101 * Detect and initialize APIC 1441 * Detect and initialize APIC
1102 */ 1442 */
@@ -1175,12 +1515,46 @@ no_apic:
1175 printk(KERN_INFO "No local APIC present or hardware disabled\n"); 1515 printk(KERN_INFO "No local APIC present or hardware disabled\n");
1176 return -1; 1516 return -1;
1177} 1517}
1518#endif
1519
1520#ifdef CONFIG_X86_64
1521void __init early_init_lapic_mapping(void)
1522{
1523 unsigned long phys_addr;
1524
1525 /*
1526 * If no local APIC can be found then go out
1527 * : it means there is no mpatable and MADT
1528 */
1529 if (!smp_found_config)
1530 return;
1531
1532 phys_addr = mp_lapic_addr;
1533
1534 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
1535 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1536 APIC_BASE, phys_addr);
1537
1538 /*
1539 * Fetch the APIC ID of the BSP in case we have a
1540 * default configuration (or the MP table is broken).
1541 */
1542 boot_cpu_physical_apicid = read_apic_id();
1543}
1544#endif
1178 1545
1179/** 1546/**
1180 * init_apic_mappings - initialize APIC mappings 1547 * init_apic_mappings - initialize APIC mappings
1181 */ 1548 */
1182void __init init_apic_mappings(void) 1549void __init init_apic_mappings(void)
1183{ 1550{
1551#ifdef HAVE_X2APIC
1552 if (x2apic) {
1553 boot_cpu_physical_apicid = read_apic_id();
1554 return;
1555 }
1556#endif
1557
1184 /* 1558 /*
1185 * If no local APIC can be found then set up a fake all 1559 * If no local APIC can be found then set up a fake all
1186 * zeroes page to simulate the local APIC and another 1560 * zeroes page to simulate the local APIC and another
@@ -1193,30 +1567,36 @@ void __init init_apic_mappings(void)
1193 apic_phys = mp_lapic_addr; 1567 apic_phys = mp_lapic_addr;
1194 1568
1195 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1569 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
1196 printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, 1570 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1197 apic_phys); 1571 APIC_BASE, apic_phys);
1198 1572
1199 /* 1573 /*
1200 * Fetch the APIC ID of the BSP in case we have a 1574 * Fetch the APIC ID of the BSP in case we have a
1201 * default configuration (or the MP table is broken). 1575 * default configuration (or the MP table is broken).
1202 */ 1576 */
1203 if (boot_cpu_physical_apicid == -1U) 1577 if (boot_cpu_physical_apicid == -1U)
1204 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1578 boot_cpu_physical_apicid = read_apic_id();
1205
1206} 1579}
1207 1580
1208/* 1581/*
1209 * This initializes the IO-APIC and APIC hardware if this is 1582 * This initializes the IO-APIC and APIC hardware if this is
1210 * a UP kernel. 1583 * a UP kernel.
1211 */ 1584 */
1212
1213int apic_version[MAX_APICS]; 1585int apic_version[MAX_APICS];
1214 1586
1215int __init APIC_init_uniprocessor(void) 1587int __init APIC_init_uniprocessor(void)
1216{ 1588{
1217 if (disable_apic) 1589#ifdef CONFIG_X86_64
1218 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1590 if (disable_apic) {
1219 1591 printk(KERN_INFO "Apic disabled\n");
1592 return -1;
1593 }
1594 if (!cpu_has_apic) {
1595 disable_apic = 1;
1596 printk(KERN_INFO "Apic disabled by BIOS\n");
1597 return -1;
1598 }
1599#else
1220 if (!smp_found_config && !cpu_has_apic) 1600 if (!smp_found_config && !cpu_has_apic)
1221 return -1; 1601 return -1;
1222 1602
@@ -1225,39 +1605,68 @@ int __init APIC_init_uniprocessor(void)
1225 */ 1605 */
1226 if (!cpu_has_apic && 1606 if (!cpu_has_apic &&
1227 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { 1607 APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
1228 printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", 1608 printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n",
1229 boot_cpu_physical_apicid); 1609 boot_cpu_physical_apicid);
1230 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 1610 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1231 return -1; 1611 return -1;
1232 } 1612 }
1613#endif
1233 1614
1234 verify_local_APIC(); 1615#ifdef HAVE_X2APIC
1616 enable_IR_x2apic();
1617#endif
1618#ifdef CONFIG_X86_64
1619 setup_apic_routing();
1620#endif
1235 1621
1622 verify_local_APIC();
1236 connect_bsp_APIC(); 1623 connect_bsp_APIC();
1237 1624
1625#ifdef CONFIG_X86_64
1626 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
1627#else
1238 /* 1628 /*
1239 * Hack: In case of kdump, after a crash, kernel might be booting 1629 * Hack: In case of kdump, after a crash, kernel might be booting
1240 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid 1630 * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid
1241 * might be zero if read from MP tables. Get it from LAPIC. 1631 * might be zero if read from MP tables. Get it from LAPIC.
1242 */ 1632 */
1243#ifdef CONFIG_CRASH_DUMP 1633# ifdef CONFIG_CRASH_DUMP
1244 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); 1634 boot_cpu_physical_apicid = read_apic_id();
1635# endif
1245#endif 1636#endif
1246 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 1637 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
1247
1248 setup_local_APIC(); 1638 setup_local_APIC();
1249 1639
1640#ifdef CONFIG_X86_64
1641 /*
1642 * Now enable IO-APICs, actually call clear_IO_APIC
1643 * We need clear_IO_APIC before enabling vector on BP
1644 */
1645 if (!skip_ioapic_setup && nr_ioapics)
1646 enable_IO_APIC();
1647#endif
1648
1250#ifdef CONFIG_X86_IO_APIC 1649#ifdef CONFIG_X86_IO_APIC
1251 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) 1650 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
1252#endif 1651#endif
1253 localise_nmi_watchdog(); 1652 localise_nmi_watchdog();
1254 end_local_APIC_setup(); 1653 end_local_APIC_setup();
1654
1255#ifdef CONFIG_X86_IO_APIC 1655#ifdef CONFIG_X86_IO_APIC
1256 if (smp_found_config) 1656 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1257 if (!skip_ioapic_setup && nr_ioapics) 1657 setup_IO_APIC();
1258 setup_IO_APIC(); 1658# ifdef CONFIG_X86_64
1659 else
1660 nr_ioapics = 0;
1661# endif
1259#endif 1662#endif
1663
1664#ifdef CONFIG_X86_64
1665 setup_boot_APIC_clock();
1666 check_nmi_watchdog();
1667#else
1260 setup_boot_clock(); 1668 setup_boot_clock();
1669#endif
1261 1670
1262 return 0; 1671 return 0;
1263} 1672}
@@ -1271,8 +1680,11 @@ int __init APIC_init_uniprocessor(void)
1271 */ 1680 */
1272void smp_spurious_interrupt(struct pt_regs *regs) 1681void smp_spurious_interrupt(struct pt_regs *regs)
1273{ 1682{
1274 unsigned long v; 1683 u32 v;
1275 1684
1685#ifdef CONFIG_X86_64
1686 exit_idle();
1687#endif
1276 irq_enter(); 1688 irq_enter();
1277 /* 1689 /*
1278 * Check if this really is a spurious interrupt and ACK it 1690 * Check if this really is a spurious interrupt and ACK it
@@ -1283,10 +1695,14 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1283 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) 1695 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
1284 ack_APIC_irq(); 1696 ack_APIC_irq();
1285 1697
1698#ifdef CONFIG_X86_64
1699 add_pda(irq_spurious_count, 1);
1700#else
1286 /* see sw-dev-man vol 3, chapter 7.4.13.5 */ 1701 /* see sw-dev-man vol 3, chapter 7.4.13.5 */
1287 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " 1702 printk(KERN_INFO "spurious APIC interrupt on CPU#%d, "
1288 "should never happen.\n", smp_processor_id()); 1703 "should never happen.\n", smp_processor_id());
1289 __get_cpu_var(irq_stat).irq_spurious_count++; 1704 __get_cpu_var(irq_stat).irq_spurious_count++;
1705#endif
1290 irq_exit(); 1706 irq_exit();
1291} 1707}
1292 1708
@@ -1295,8 +1711,11 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1295 */ 1711 */
1296void smp_error_interrupt(struct pt_regs *regs) 1712void smp_error_interrupt(struct pt_regs *regs)
1297{ 1713{
1298 unsigned long v, v1; 1714 u32 v, v1;
1299 1715
1716#ifdef CONFIG_X86_64
1717 exit_idle();
1718#endif
1300 irq_enter(); 1719 irq_enter();
1301 /* First tickle the hardware, only then report what went on. -- REW */ 1720 /* First tickle the hardware, only then report what went on. -- REW */
1302 v = apic_read(APIC_ESR); 1721 v = apic_read(APIC_ESR);
@@ -1315,64 +1734,17 @@ void smp_error_interrupt(struct pt_regs *regs)
1315 6: Received illegal vector 1734 6: Received illegal vector
1316 7: Illegal register address 1735 7: Illegal register address
1317 */ 1736 */
1318 printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", 1737 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
1319 smp_processor_id(), v , v1); 1738 smp_processor_id(), v , v1);
1320 irq_exit(); 1739 irq_exit();
1321} 1740}
1322 1741
1323#ifdef CONFIG_SMP
1324void __init smp_intr_init(void)
1325{
1326 /*
1327 * IRQ0 must be given a fixed assignment and initialized,
1328 * because it's used before the IO-APIC is set up.
1329 */
1330 set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]);
1331
1332 /*
1333 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
1334 * IPI, driven by wakeup.
1335 */
1336 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
1337
1338 /* IPI for invalidation */
1339 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
1340
1341 /* IPI for generic function call */
1342 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
1343
1344 /* IPI for single call function */
1345 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
1346 call_function_single_interrupt);
1347}
1348#endif
1349
1350/*
1351 * Initialize APIC interrupts
1352 */
1353void __init apic_intr_init(void)
1354{
1355#ifdef CONFIG_SMP
1356 smp_intr_init();
1357#endif
1358 /* self generated IPI for local APIC timer */
1359 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
1360
1361 /* IPI vectors for APIC spurious and error interrupts */
1362 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
1363 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
1364
1365 /* thermal monitor LVT interrupt */
1366#ifdef CONFIG_X86_MCE_P4THERMAL
1367 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
1368#endif
1369}
1370
1371/** 1742/**
1372 * connect_bsp_APIC - attach the APIC to the interrupt system 1743 * connect_bsp_APIC - attach the APIC to the interrupt system
1373 */ 1744 */
1374void __init connect_bsp_APIC(void) 1745void __init connect_bsp_APIC(void)
1375{ 1746{
1747#ifdef CONFIG_X86_32
1376 if (pic_mode) { 1748 if (pic_mode) {
1377 /* 1749 /*
1378 * Do not trust the local APIC being empty at bootup. 1750 * Do not trust the local APIC being empty at bootup.
@@ -1387,6 +1759,7 @@ void __init connect_bsp_APIC(void)
1387 outb(0x70, 0x22); 1759 outb(0x70, 0x22);
1388 outb(0x01, 0x23); 1760 outb(0x01, 0x23);
1389 } 1761 }
1762#endif
1390 enable_apic_mode(); 1763 enable_apic_mode();
1391} 1764}
1392 1765
@@ -1399,6 +1772,9 @@ void __init connect_bsp_APIC(void)
1399 */ 1772 */
1400void disconnect_bsp_APIC(int virt_wire_setup) 1773void disconnect_bsp_APIC(int virt_wire_setup)
1401{ 1774{
1775 unsigned int value;
1776
1777#ifdef CONFIG_X86_32
1402 if (pic_mode) { 1778 if (pic_mode) {
1403 /* 1779 /*
1404 * Put the board back into PIC mode (has an effect only on 1780 * Put the board back into PIC mode (has an effect only on
@@ -1410,56 +1786,53 @@ void disconnect_bsp_APIC(int virt_wire_setup)
1410 "entering PIC mode.\n"); 1786 "entering PIC mode.\n");
1411 outb(0x70, 0x22); 1787 outb(0x70, 0x22);
1412 outb(0x00, 0x23); 1788 outb(0x00, 0x23);
1413 } else { 1789 return;
1414 /* Go back to Virtual Wire compatibility mode */ 1790 }
1415 unsigned long value; 1791#endif
1416 1792
1417 /* For the spurious interrupt use vector F, and enable it */ 1793 /* Go back to Virtual Wire compatibility mode */
1418 value = apic_read(APIC_SPIV); 1794
1419 value &= ~APIC_VECTOR_MASK; 1795 /* For the spurious interrupt use vector F, and enable it */
1420 value |= APIC_SPIV_APIC_ENABLED; 1796 value = apic_read(APIC_SPIV);
1421 value |= 0xf; 1797 value &= ~APIC_VECTOR_MASK;
1422 apic_write_around(APIC_SPIV, value); 1798 value |= APIC_SPIV_APIC_ENABLED;
1423 1799 value |= 0xf;
1424 if (!virt_wire_setup) { 1800 apic_write(APIC_SPIV, value);
1425 /*
1426 * For LVT0 make it edge triggered, active high,
1427 * external and enabled
1428 */
1429 value = apic_read(APIC_LVT0);
1430 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1431 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1432 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1433 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1434 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1435 apic_write_around(APIC_LVT0, value);
1436 } else {
1437 /* Disable LVT0 */
1438 apic_write_around(APIC_LVT0, APIC_LVT_MASKED);
1439 }
1440 1801
1802 if (!virt_wire_setup) {
1441 /* 1803 /*
1442 * For LVT1 make it edge triggered, active high, nmi and 1804 * For LVT0 make it edge triggered, active high,
1443 * enabled 1805 * external and enabled
1444 */ 1806 */
1445 value = apic_read(APIC_LVT1); 1807 value = apic_read(APIC_LVT0);
1446 value &= ~( 1808 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1447 APIC_MODE_MASK | APIC_SEND_PENDING |
1448 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | 1809 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1449 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); 1810 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1450 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; 1811 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1451 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); 1812 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1452 apic_write_around(APIC_LVT1, value); 1813 apic_write(APIC_LVT0, value);
1814 } else {
1815 /* Disable LVT0 */
1816 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1453 } 1817 }
1454}
1455 1818
1456unsigned int __cpuinitdata maxcpus = NR_CPUS; 1819 /*
1820 * For LVT1 make it edge triggered, active high,
1821 * nmi and enabled
1822 */
1823 value = apic_read(APIC_LVT1);
1824 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1825 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1826 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1827 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1828 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1829 apic_write(APIC_LVT1, value);
1830}
1457 1831
1458void __cpuinit generic_processor_info(int apicid, int version) 1832void __cpuinit generic_processor_info(int apicid, int version)
1459{ 1833{
1460 int cpu; 1834 int cpu;
1461 cpumask_t tmp_map; 1835 cpumask_t tmp_map;
1462 physid_mask_t phys_cpu;
1463 1836
1464 /* 1837 /*
1465 * Validate version 1838 * Validate version
@@ -1472,36 +1845,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1472 } 1845 }
1473 apic_version[apicid] = version; 1846 apic_version[apicid] = version;
1474 1847
1475 phys_cpu = apicid_to_cpu_present(apicid);
1476 physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu);
1477
1478 if (num_processors >= NR_CPUS) { 1848 if (num_processors >= NR_CPUS) {
1479 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." 1849 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1480 " Processor ignored.\n", NR_CPUS); 1850 " Processor ignored.\n", NR_CPUS);
1481 return; 1851 return;
1482 } 1852 }
1483 1853
1484 if (num_processors >= maxcpus) {
1485 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1486 " Processor ignored.\n", maxcpus);
1487 return;
1488 }
1489
1490 num_processors++; 1854 num_processors++;
1491 cpus_complement(tmp_map, cpu_present_map); 1855 cpus_complement(tmp_map, cpu_present_map);
1492 cpu = first_cpu(tmp_map); 1856 cpu = first_cpu(tmp_map);
1493 1857
1494 if (apicid == boot_cpu_physical_apicid) 1858 physid_set(apicid, phys_cpu_present_map);
1859 if (apicid == boot_cpu_physical_apicid) {
1495 /* 1860 /*
1496 * x86_bios_cpu_apicid is required to have processors listed 1861 * x86_bios_cpu_apicid is required to have processors listed
1497 * in same order as logical cpu numbers. Hence the first 1862 * in same order as logical cpu numbers. Hence the first
1498 * entry is BSP, and so on. 1863 * entry is BSP, and so on.
1499 */ 1864 */
1500 cpu = 0; 1865 cpu = 0;
1501 1866 }
1502 if (apicid > max_physical_apicid) 1867 if (apicid > max_physical_apicid)
1503 max_physical_apicid = apicid; 1868 max_physical_apicid = apicid;
1504 1869
1870#ifdef CONFIG_X86_32
1505 /* 1871 /*
1506 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y 1872 * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y
1507 * but we need to work other dependencies like SMP_SUSPEND etc 1873 * but we need to work other dependencies like SMP_SUSPEND etc
@@ -1521,7 +1887,9 @@ void __cpuinit generic_processor_info(int apicid, int version)
1521 def_to_bigsmp = 1; 1887 def_to_bigsmp = 1;
1522 } 1888 }
1523 } 1889 }
1524#ifdef CONFIG_SMP 1890#endif
1891
1892#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64)
1525 /* are we being called early in kernel startup? */ 1893 /* are we being called early in kernel startup? */
1526 if (early_per_cpu_ptr(x86_cpu_to_apicid)) { 1894 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1527 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); 1895 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
@@ -1534,16 +1902,29 @@ void __cpuinit generic_processor_info(int apicid, int version)
1534 per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1902 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1535 } 1903 }
1536#endif 1904#endif
1905
1537 cpu_set(cpu, cpu_possible_map); 1906 cpu_set(cpu, cpu_possible_map);
1538 cpu_set(cpu, cpu_present_map); 1907 cpu_set(cpu, cpu_present_map);
1539} 1908}
1540 1909
1910#ifdef CONFIG_X86_64
1911int hard_smp_processor_id(void)
1912{
1913 return read_apic_id();
1914}
1915#endif
1916
1541/* 1917/*
1542 * Power management 1918 * Power management
1543 */ 1919 */
1544#ifdef CONFIG_PM 1920#ifdef CONFIG_PM
1545 1921
1546static struct { 1922static struct {
1923 /*
1924 * 'active' is true if the local APIC was enabled by us and
1925 * not the BIOS; this signifies that we are also responsible
1926 * for disabling it before entering apm/acpi suspend
1927 */
1547 int active; 1928 int active;
1548 /* r/w apic fields */ 1929 /* r/w apic fields */
1549 unsigned int apic_id; 1930 unsigned int apic_id;
@@ -1584,7 +1965,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1584 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); 1965 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1585 apic_pm_state.apic_tmict = apic_read(APIC_TMICT); 1966 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1586 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); 1967 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1587#ifdef CONFIG_X86_MCE_P4THERMAL 1968#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1588 if (maxlvt >= 5) 1969 if (maxlvt >= 5)
1589 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); 1970 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1590#endif 1971#endif
@@ -1608,16 +1989,23 @@ static int lapic_resume(struct sys_device *dev)
1608 1989
1609 local_irq_save(flags); 1990 local_irq_save(flags);
1610 1991
1611 /* 1992#ifdef HAVE_X2APIC
1612 * Make sure the APICBASE points to the right address 1993 if (x2apic)
1613 * 1994 enable_x2apic();
1614 * FIXME! This will be wrong if we ever support suspend on 1995 else
1615 * SMP! We'll need to do this as part of the CPU restore! 1996#endif
1616 */ 1997 {
1617 rdmsr(MSR_IA32_APICBASE, l, h); 1998 /*
1618 l &= ~MSR_IA32_APICBASE_BASE; 1999 * Make sure the APICBASE points to the right address
1619 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; 2000 *
1620 wrmsr(MSR_IA32_APICBASE, l, h); 2001 * FIXME! This will be wrong if we ever support suspend on
2002 * SMP! We'll need to do this as part of the CPU restore!
2003 */
2004 rdmsr(MSR_IA32_APICBASE, l, h);
2005 l &= ~MSR_IA32_APICBASE_BASE;
2006 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
2007 wrmsr(MSR_IA32_APICBASE, l, h);
2008 }
1621 2009
1622 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); 2010 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1623 apic_write(APIC_ID, apic_pm_state.apic_id); 2011 apic_write(APIC_ID, apic_pm_state.apic_id);
@@ -1627,7 +2015,7 @@ static int lapic_resume(struct sys_device *dev)
1627 apic_write(APIC_SPIV, apic_pm_state.apic_spiv); 2015 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1628 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); 2016 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1629 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); 2017 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1630#ifdef CONFIG_X86_MCE_P4THERMAL 2018#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL)
1631 if (maxlvt >= 5) 2019 if (maxlvt >= 5)
1632 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); 2020 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1633#endif 2021#endif
@@ -1641,7 +2029,9 @@ static int lapic_resume(struct sys_device *dev)
1641 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); 2029 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1642 apic_write(APIC_ESR, 0); 2030 apic_write(APIC_ESR, 0);
1643 apic_read(APIC_ESR); 2031 apic_read(APIC_ESR);
2032
1644 local_irq_restore(flags); 2033 local_irq_restore(flags);
2034
1645 return 0; 2035 return 0;
1646} 2036}
1647 2037
@@ -1661,7 +2051,7 @@ static struct sys_device device_lapic = {
1661 .cls = &lapic_sysclass, 2051 .cls = &lapic_sysclass,
1662}; 2052};
1663 2053
1664static void __devinit apic_pm_activate(void) 2054static void __cpuinit apic_pm_activate(void)
1665{ 2055{
1666 apic_pm_state.active = 1; 2056 apic_pm_state.active = 1;
1667} 2057}
@@ -1687,30 +2077,101 @@ static void apic_pm_activate(void) { }
1687 2077
1688#endif /* CONFIG_PM */ 2078#endif /* CONFIG_PM */
1689 2079
2080#ifdef CONFIG_X86_64
1690/* 2081/*
1691 * APIC command line parameters 2082 * apic_is_clustered_box() -- Check if we can expect good TSC
2083 *
2084 * Thus far, the major user of this is IBM's Summit2 series:
2085 *
2086 * Clustered boxes may have unsynced TSC problems if they are
2087 * multi-chassis. Use available data to take a good guess.
2088 * If in doubt, go HPET.
1692 */ 2089 */
1693static int __init parse_lapic(char *arg) 2090__cpuinit int apic_is_clustered_box(void)
1694{ 2091{
1695 force_enable_local_apic = 1; 2092 int i, clusters, zeros;
1696 return 0; 2093 unsigned id;
2094 u16 *bios_cpu_apicid;
2095 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
2096
2097 /*
 2098 * There is no such box with AMD CPUs yet.
 2099 * Some AMD boxes with quad-core CPUs and 8 sockets have APIC IDs
 2100 * in the range [4, 0x23] or [8, 0x27] and could be mistaken for a
 2101 * vSMP box; this still needs checking...
2102 */
2103 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
2104 return 0;
2105
2106 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
2107 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
2108
2109 for (i = 0; i < NR_CPUS; i++) {
2110 /* are we being called early in kernel startup? */
2111 if (bios_cpu_apicid) {
2112 id = bios_cpu_apicid[i];
2113 }
2114 else if (i < nr_cpu_ids) {
2115 if (cpu_present(i))
2116 id = per_cpu(x86_bios_cpu_apicid, i);
2117 else
2118 continue;
2119 }
2120 else
2121 break;
2122
2123 if (id != BAD_APICID)
2124 __set_bit(APIC_CLUSTERID(id), clustermap);
2125 }
2126
2127 /* Problem: Partially populated chassis may not have CPUs in some of
2128 * the APIC clusters they have been allocated. Only present CPUs have
2129 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
2130 * Since clusters are allocated sequentially, count zeros only if
2131 * they are bounded by ones.
2132 */
2133 clusters = 0;
2134 zeros = 0;
2135 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
2136 if (test_bit(i, clustermap)) {
2137 clusters += 1 + zeros;
2138 zeros = 0;
2139 } else
2140 ++zeros;
2141 }
2142
2143 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
2144 * not guaranteed to be synced between boards
2145 */
2146 if (is_vsmp_box() && clusters > 1)
2147 return 1;
2148
2149 /*
 2150 * If clusters > 2, then it should be multi-chassis.
2151 * May have to revisit this when multi-core + hyperthreaded CPUs come
2152 * out, but AFAIK this will work even for them.
2153 */
2154 return (clusters > 2);
1697} 2155}
1698early_param("lapic", parse_lapic); 2156#endif
1699 2157
1700static int __init parse_nolapic(char *arg) 2158/*
2159 * APIC command line parameters
2160 */
2161static int __init setup_disableapic(char *arg)
1701{ 2162{
1702 disable_apic = 1; 2163 disable_apic = 1;
1703 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 2164 setup_clear_cpu_cap(X86_FEATURE_APIC);
1704 return 0; 2165 return 0;
1705} 2166}
1706early_param("nolapic", parse_nolapic); 2167early_param("disableapic", setup_disableapic);
1707 2168
1708static int __init parse_disable_lapic_timer(char *arg) 2169/* same as disableapic, for compatibility */
2170static int __init setup_nolapic(char *arg)
1709{ 2171{
1710 local_apic_timer_disabled = 1; 2172 return setup_disableapic(arg);
1711 return 0;
1712} 2173}
1713early_param("nolapic_timer", parse_disable_lapic_timer); 2174early_param("nolapic", setup_nolapic);
1714 2175
1715static int __init parse_lapic_timer_c2_ok(char *arg) 2176static int __init parse_lapic_timer_c2_ok(char *arg)
1716{ 2177{
@@ -1719,15 +2180,43 @@ static int __init parse_lapic_timer_c2_ok(char *arg)
1719} 2180}
1720early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); 2181early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1721 2182
1722static int __init apic_set_verbosity(char *str) 2183static int __init parse_disable_apic_timer(char *arg)
1723{ 2184{
1724 if (strcmp("debug", str) == 0) 2185 disable_apic_timer = 1;
2186 return 0;
2187}
2188early_param("noapictimer", parse_disable_apic_timer);
2189
2190static int __init parse_nolapic_timer(char *arg)
2191{
2192 disable_apic_timer = 1;
2193 return 0;
2194}
2195early_param("nolapic_timer", parse_nolapic_timer);
2196
2197static int __init apic_set_verbosity(char *arg)
2198{
2199 if (!arg) {
2200#ifdef CONFIG_X86_64
2201 skip_ioapic_setup = 0;
2202 return 0;
2203#endif
2204 return -EINVAL;
2205 }
2206
2207 if (strcmp("debug", arg) == 0)
1725 apic_verbosity = APIC_DEBUG; 2208 apic_verbosity = APIC_DEBUG;
1726 else if (strcmp("verbose", str) == 0) 2209 else if (strcmp("verbose", arg) == 0)
1727 apic_verbosity = APIC_VERBOSE; 2210 apic_verbosity = APIC_VERBOSE;
1728 return 1; 2211 else {
2212 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
2213 " use apic=verbose or apic=debug\n", arg);
2214 return -EINVAL;
2215 }
2216
2217 return 0;
1729} 2218}
1730__setup("apic=", apic_set_verbosity); 2219early_param("apic", apic_set_verbosity);
1731 2220
1732static int __init lapic_insert_resource(void) 2221static int __init lapic_insert_resource(void)
1733{ 2222{
diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c
deleted file mode 100644
index 1e3d32e27c14..000000000000
--- a/arch/x86/kernel/apic_64.c
+++ /dev/null
@@ -1,1393 +0,0 @@
1/*
2 * Local APIC handling, local APIC timers
3 *
4 * (c) 1999, 2000 Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes
7 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
8 * thanks to Eric Gilmore
9 * and Rolf G. Tews
10 * for testing these extensively.
11 * Maciej W. Rozycki : Various updates and fixes.
12 * Mikael Pettersson : Power Management for UP-APIC.
13 * Pavel Machek and
14 * Mikael Pettersson : PM converted to driver model.
15 */
16
17#include <linux/init.h>
18
19#include <linux/mm.h>
20#include <linux/delay.h>
21#include <linux/bootmem.h>
22#include <linux/interrupt.h>
23#include <linux/mc146818rtc.h>
24#include <linux/kernel_stat.h>
25#include <linux/sysdev.h>
26#include <linux/ioport.h>
27#include <linux/clockchips.h>
28#include <linux/acpi_pmtmr.h>
29#include <linux/module.h>
30
31#include <asm/atomic.h>
32#include <asm/smp.h>
33#include <asm/mtrr.h>
34#include <asm/mpspec.h>
35#include <asm/hpet.h>
36#include <asm/pgalloc.h>
37#include <asm/nmi.h>
38#include <asm/idle.h>
39#include <asm/proto.h>
40#include <asm/timex.h>
41#include <asm/apic.h>
42
43#include <mach_ipi.h>
44#include <mach_apic.h>
45
46static int disable_apic_timer __cpuinitdata;
47static int apic_calibrate_pmtmr __initdata;
48int disable_apic;
49
50/* Local APIC timer works in C2 */
51int local_apic_timer_c2_ok;
52EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
53
54/*
55 * Debug level, exported for io_apic.c
56 */
57int apic_verbosity;
58
59/* Have we found an MP table */
60int smp_found_config;
61
62static struct resource lapic_resource = {
63 .name = "Local APIC",
64 .flags = IORESOURCE_MEM | IORESOURCE_BUSY,
65};
66
67static unsigned int calibration_result;
68
69static int lapic_next_event(unsigned long delta,
70 struct clock_event_device *evt);
71static void lapic_timer_setup(enum clock_event_mode mode,
72 struct clock_event_device *evt);
73static void lapic_timer_broadcast(cpumask_t mask);
74static void apic_pm_activate(void);
75
76static struct clock_event_device lapic_clockevent = {
77 .name = "lapic",
78 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
79 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
80 .shift = 32,
81 .set_mode = lapic_timer_setup,
82 .set_next_event = lapic_next_event,
83 .broadcast = lapic_timer_broadcast,
84 .rating = 100,
85 .irq = -1,
86};
87static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
88
89static unsigned long apic_phys;
90
91unsigned long mp_lapic_addr;
92
93unsigned int __cpuinitdata maxcpus = NR_CPUS;
94/*
95 * Get the LAPIC version
96 */
97static inline int lapic_get_version(void)
98{
99 return GET_APIC_VERSION(apic_read(APIC_LVR));
100}
101
102/*
 103 * Check if the APIC is integrated or a separate chip
104 */
105static inline int lapic_is_integrated(void)
106{
107 return 1;
108}
109
110/*
 111 * Check whether this is a modern or a first-generation APIC
112 */
113static int modern_apic(void)
114{
115 /* AMD systems use old APIC versions, so check the CPU */
116 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
117 boot_cpu_data.x86 >= 0xf)
118 return 1;
119 return lapic_get_version() >= 0x14;
120}
121
122void apic_wait_icr_idle(void)
123{
124 while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
125 cpu_relax();
126}
127
128u32 safe_apic_wait_icr_idle(void)
129{
130 u32 send_status;
131 int timeout;
132
133 timeout = 0;
134 do {
135 send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY;
136 if (!send_status)
137 break;
138 udelay(100);
139 } while (timeout++ < 1000);
140
141 return send_status;
142}
143
144/**
145 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
146 */
147void __cpuinit enable_NMI_through_LVT0(void)
148{
149 unsigned int v;
150
151 /* unmask and set to NMI */
152 v = APIC_DM_NMI;
153 apic_write(APIC_LVT0, v);
154}
155
156/**
157 * lapic_get_maxlvt - get the maximum number of local vector table entries
158 */
159int lapic_get_maxlvt(void)
160{
161 unsigned int v, maxlvt;
162
163 v = apic_read(APIC_LVR);
164 maxlvt = GET_APIC_MAXLVT(v);
165 return maxlvt;
166}
167
168/*
169 * This function sets up the local APIC timer, with a timeout of
170 * 'clocks' APIC bus clock. During calibration we actually call
171 * this function twice on the boot CPU, once with a bogus timeout
172 * value, second time for real. The other (noncalibrating) CPUs
173 * call this function only once, with the real, calibrated value.
174 *
175 * We do reads before writes even if unnecessary, to get around the
176 * P5 APIC double write bug.
177 */
178
179static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
180{
181 unsigned int lvtt_value, tmp_value;
182
183 lvtt_value = LOCAL_TIMER_VECTOR;
184 if (!oneshot)
185 lvtt_value |= APIC_LVT_TIMER_PERIODIC;
186 if (!irqen)
187 lvtt_value |= APIC_LVT_MASKED;
188
189 apic_write(APIC_LVTT, lvtt_value);
190
191 /*
192 * Divide PICLK by 16
193 */
194 tmp_value = apic_read(APIC_TDCR);
195 apic_write(APIC_TDCR, (tmp_value
196 & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
197 | APIC_TDR_DIV_16);
198
199 if (!oneshot)
200 apic_write(APIC_TMICT, clocks);
201}
202
203/*
204 * Setup extended LVT, AMD specific (K8, family 10h)
205 *
206 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and
207 * MCE interrupts are supported. Thus MCE offset must be set to 0.
208 */
209
210#define APIC_EILVT_LVTOFF_MCE 0
211#define APIC_EILVT_LVTOFF_IBS 1
212
213static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
214{
215 unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
216 unsigned int v = (mask << 16) | (msg_type << 8) | vector;
217
218 apic_write(reg, v);
219}
220
221u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask)
222{
223 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask);
224 return APIC_EILVT_LVTOFF_MCE;
225}
226
227u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask)
228{
229 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask);
230 return APIC_EILVT_LVTOFF_IBS;
231}
232
233/*
234 * Program the next event, relative to now
235 */
236static int lapic_next_event(unsigned long delta,
237 struct clock_event_device *evt)
238{
239 apic_write(APIC_TMICT, delta);
240 return 0;
241}
242
243/*
244 * Setup the lapic timer in periodic or oneshot mode
245 */
246static void lapic_timer_setup(enum clock_event_mode mode,
247 struct clock_event_device *evt)
248{
249 unsigned long flags;
250 unsigned int v;
251
252 /* Lapic used as dummy for broadcast ? */
253 if (evt->features & CLOCK_EVT_FEAT_DUMMY)
254 return;
255
256 local_irq_save(flags);
257
258 switch (mode) {
259 case CLOCK_EVT_MODE_PERIODIC:
260 case CLOCK_EVT_MODE_ONESHOT:
261 __setup_APIC_LVTT(calibration_result,
262 mode != CLOCK_EVT_MODE_PERIODIC, 1);
263 break;
264 case CLOCK_EVT_MODE_UNUSED:
265 case CLOCK_EVT_MODE_SHUTDOWN:
266 v = apic_read(APIC_LVTT);
267 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
268 apic_write(APIC_LVTT, v);
269 break;
270 case CLOCK_EVT_MODE_RESUME:
271 /* Nothing to do here */
272 break;
273 }
274
275 local_irq_restore(flags);
276}
277
278/*
279 * Local APIC timer broadcast function
280 */
281static void lapic_timer_broadcast(cpumask_t mask)
282{
283#ifdef CONFIG_SMP
284 send_IPI_mask(mask, LOCAL_TIMER_VECTOR);
285#endif
286}
287
288/*
 289 * Set up the local APIC timer for this CPU. Copy the initialized values
290 * of the boot CPU and register the clock event in the framework.
291 */
292static void setup_APIC_timer(void)
293{
294 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
295
296 memcpy(levt, &lapic_clockevent, sizeof(*levt));
297 levt->cpumask = cpumask_of_cpu(smp_processor_id());
298
299 clockevents_register_device(levt);
300}
301
302/*
303 * In this function we calibrate APIC bus clocks to the external
304 * timer. Unfortunately we cannot use jiffies and the timer irq
305 * to calibrate, since some later bootup code depends on getting
306 * the first irq? Ugh.
307 *
308 * We want to do the calibration only once since we
 309 * want to have local timer irqs synchronized. CPUs connected
310 * by the same APIC bus have the very same bus frequency.
311 * And we want to have irqs off anyways, no accidental
312 * APIC irq that way.
313 */
314
315#define TICK_COUNT 100000000
316
317static void __init calibrate_APIC_clock(void)
318{
319 unsigned apic, apic_start;
320 unsigned long tsc, tsc_start;
321 int result;
322
323 local_irq_disable();
324
325 /*
326 * Put whatever arbitrary (but long enough) timeout
327 * value into the APIC clock, we just want to get the
328 * counter running for calibration.
329 *
330 * No interrupt enable !
331 */
332 __setup_APIC_LVTT(250000000, 0, 0);
333
334 apic_start = apic_read(APIC_TMCCT);
335#ifdef CONFIG_X86_PM_TIMER
336 if (apic_calibrate_pmtmr && pmtmr_ioport) {
337 pmtimer_wait(5000); /* 5ms wait */
338 apic = apic_read(APIC_TMCCT);
339 result = (apic_start - apic) * 1000L / 5;
340 } else
341#endif
342 {
343 rdtscll(tsc_start);
344
345 do {
346 apic = apic_read(APIC_TMCCT);
347 rdtscll(tsc);
348 } while ((tsc - tsc_start) < TICK_COUNT &&
349 (apic_start - apic) < TICK_COUNT);
350
351 result = (apic_start - apic) * 1000L * tsc_khz /
352 (tsc - tsc_start);
353 }
354
355 local_irq_enable();
356
357 printk(KERN_DEBUG "APIC timer calibration result %d\n", result);
358
359 printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n",
360 result / 1000 / 1000, result / 1000 % 1000);
361
362 /* Calculate the scaled math multiplication factor */
363 lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC,
364 lapic_clockevent.shift);
365 lapic_clockevent.max_delta_ns =
366 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
367 lapic_clockevent.min_delta_ns =
368 clockevent_delta2ns(0xF, &lapic_clockevent);
369
370 calibration_result = result / HZ;
371}
372
373/*
374 * Setup the boot APIC
375 *
376 * Calibrate and verify the result.
377 */
378void __init setup_boot_APIC_clock(void)
379{
380 /*
381 * The local apic timer can be disabled via the kernel commandline.
382 * Register the lapic timer as a dummy clock event source on SMP
383 * systems, so the broadcast mechanism is used. On UP systems simply
384 * ignore it.
385 */
386 if (disable_apic_timer) {
387 printk(KERN_INFO "Disabling APIC timer\n");
388 /* No broadcast on UP ! */
389 if (num_possible_cpus() > 1) {
390 lapic_clockevent.mult = 1;
391 setup_APIC_timer();
392 }
393 return;
394 }
395
396 printk(KERN_INFO "Using local APIC timer interrupts.\n");
397 calibrate_APIC_clock();
398
399 /*
400 * Do a sanity check on the APIC calibration result
401 */
402 if (calibration_result < (1000000 / HZ)) {
403 printk(KERN_WARNING
404 "APIC frequency too slow, disabling apic timer\n");
405 /* No broadcast on UP ! */
406 if (num_possible_cpus() > 1)
407 setup_APIC_timer();
408 return;
409 }
410
411 /*
412 * If nmi_watchdog is set to IO_APIC, we need the
413 * PIT/HPET going. Otherwise register lapic as a dummy
414 * device.
415 */
416 if (nmi_watchdog != NMI_IO_APIC)
417 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
418 else
419 printk(KERN_WARNING "APIC timer registered as dummy,"
420 " due to nmi_watchdog=%d!\n", nmi_watchdog);
421
422 setup_APIC_timer();
423}
424
425void __cpuinit setup_secondary_APIC_clock(void)
426{
427 setup_APIC_timer();
428}
429
430/*
431 * The guts of the apic timer interrupt
432 */
433static void local_apic_timer_interrupt(void)
434{
435 int cpu = smp_processor_id();
436 struct clock_event_device *evt = &per_cpu(lapic_events, cpu);
437
438 /*
439 * Normally we should not be here till LAPIC has been initialized but
 440 * in some cases, like kdump, it's possible that there is a pending LAPIC
 441 * timer interrupt from the previous kernel's context which is delivered in
 442 * the new kernel the moment interrupts are enabled.
 443 *
 444 * Interrupts are enabled early and the LAPIC is set up much later, hence
 445 * it's possible that when we get here evt->event_handler is NULL.
446 * Check for event_handler being NULL and discard the interrupt as
447 * spurious.
448 */
449 if (!evt->event_handler) {
450 printk(KERN_WARNING
451 "Spurious LAPIC timer interrupt on cpu %d\n", cpu);
452 /* Switch it off */
453 lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt);
454 return;
455 }
456
457 /*
458 * the NMI deadlock-detector uses this.
459 */
460 add_pda(apic_timer_irqs, 1);
461
462 evt->event_handler(evt);
463}
464
465/*
466 * Local APIC timer interrupt. This is the most natural way for doing
467 * local interrupts, but local timer interrupts can be emulated by
468 * broadcast interrupts too. [in case the hw doesn't support APIC timers]
469 *
470 * [ if a single-CPU system runs an SMP kernel then we call the local
471 * interrupt as well. Thus we cannot inline the local irq ... ]
472 */
473void smp_apic_timer_interrupt(struct pt_regs *regs)
474{
475 struct pt_regs *old_regs = set_irq_regs(regs);
476
477 /*
478 * NOTE! We'd better ACK the irq immediately,
479 * because timer handling can be slow.
480 */
481 ack_APIC_irq();
482 /*
483 * update_process_times() expects us to have done irq_enter().
 484 * Besides, if we don't, timer interrupts ignore the global
485 * interrupt lock, which is the WrongThing (tm) to do.
486 */
487 exit_idle();
488 irq_enter();
489 local_apic_timer_interrupt();
490 irq_exit();
491 set_irq_regs(old_regs);
492}
493
494int setup_profiling_timer(unsigned int multiplier)
495{
496 return -EINVAL;
497}
498
499
500/*
501 * Local APIC start and shutdown
502 */
503
504/**
505 * clear_local_APIC - shutdown the local APIC
506 *
 507 * This is called when a CPU is disabled and before rebooting, so the state of
 508 * the local APIC has no dangling leftovers. Also used to clean out any BIOS
509 * leftovers during boot.
510 */
511void clear_local_APIC(void)
512{
513 int maxlvt;
514 u32 v;
515
516 /* APIC hasn't been mapped yet */
517 if (!apic_phys)
518 return;
519
520 maxlvt = lapic_get_maxlvt();
521 /*
522 * Masking an LVT entry can trigger a local APIC error
523 * if the vector is zero. Mask LVTERR first to prevent this.
524 */
525 if (maxlvt >= 3) {
526 v = ERROR_APIC_VECTOR; /* any non-zero vector will do */
527 apic_write(APIC_LVTERR, v | APIC_LVT_MASKED);
528 }
529 /*
530 * Careful: we have to set masks only first to deassert
531 * any level-triggered sources.
532 */
533 v = apic_read(APIC_LVTT);
534 apic_write(APIC_LVTT, v | APIC_LVT_MASKED);
535 v = apic_read(APIC_LVT0);
536 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
537 v = apic_read(APIC_LVT1);
538 apic_write(APIC_LVT1, v | APIC_LVT_MASKED);
539 if (maxlvt >= 4) {
540 v = apic_read(APIC_LVTPC);
541 apic_write(APIC_LVTPC, v | APIC_LVT_MASKED);
542 }
543
544 /*
545 * Clean APIC state for other OSs:
546 */
547 apic_write(APIC_LVTT, APIC_LVT_MASKED);
548 apic_write(APIC_LVT0, APIC_LVT_MASKED);
549 apic_write(APIC_LVT1, APIC_LVT_MASKED);
550 if (maxlvt >= 3)
551 apic_write(APIC_LVTERR, APIC_LVT_MASKED);
552 if (maxlvt >= 4)
553 apic_write(APIC_LVTPC, APIC_LVT_MASKED);
554 apic_write(APIC_ESR, 0);
555 apic_read(APIC_ESR);
556}
557
558/**
559 * disable_local_APIC - clear and disable the local APIC
560 */
561void disable_local_APIC(void)
562{
563 unsigned int value;
564
565 clear_local_APIC();
566
567 /*
568 * Disable APIC (implies clearing of registers
569 * for 82489DX!).
570 */
571 value = apic_read(APIC_SPIV);
572 value &= ~APIC_SPIV_APIC_ENABLED;
573 apic_write(APIC_SPIV, value);
574}
575
576void lapic_shutdown(void)
577{
578 unsigned long flags;
579
580 if (!cpu_has_apic)
581 return;
582
583 local_irq_save(flags);
584
585 disable_local_APIC();
586
587 local_irq_restore(flags);
588}
589
590/*
591 * This is to verify that we're looking at a real local APIC.
592 * Check these against your board if the CPUs aren't getting
593 * started for no apparent reason.
594 */
595int __init verify_local_APIC(void)
596{
597 unsigned int reg0, reg1;
598
599 /*
600 * The version register is read-only in a real APIC.
601 */
602 reg0 = apic_read(APIC_LVR);
603 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
604 apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
605 reg1 = apic_read(APIC_LVR);
606 apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
607
608 /*
609 * The two version reads above should print the same
610 * numbers. If the second one is different, then we
611 * poke at a non-APIC.
612 */
613 if (reg1 != reg0)
614 return 0;
615
616 /*
 617 * Check if the version looks reasonable.
618 */
619 reg1 = GET_APIC_VERSION(reg0);
620 if (reg1 == 0x00 || reg1 == 0xff)
621 return 0;
622 reg1 = lapic_get_maxlvt();
623 if (reg1 < 0x02 || reg1 == 0xff)
624 return 0;
625
626 /*
627 * The ID register is read/write in a real APIC.
628 */
629 reg0 = read_apic_id();
630 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
631 apic_write(APIC_ID, reg0 ^ APIC_ID_MASK);
632 reg1 = read_apic_id();
633 apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
634 apic_write(APIC_ID, reg0);
635 if (reg1 != (reg0 ^ APIC_ID_MASK))
636 return 0;
637
638 /*
639 * The next two are just to see if we have sane values.
640 * They're only really relevant if we're in Virtual Wire
 641 * compatibility mode, but most boxes aren't anymore.
642 */
643 reg0 = apic_read(APIC_LVT0);
644 apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
645 reg1 = apic_read(APIC_LVT1);
646 apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
647
648 return 1;
649}
650
651/**
652 * sync_Arb_IDs - synchronize APIC bus arbitration IDs
653 */
654void __init sync_Arb_IDs(void)
655{
656 /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */
657 if (modern_apic())
658 return;
659
660 /*
661 * Wait for idle.
662 */
663 apic_wait_icr_idle();
664
665 apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n");
666 apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG
667 | APIC_DM_INIT);
668}
669
670/*
671 * An initial setup of the virtual wire mode.
672 */
673void __init init_bsp_APIC(void)
674{
675 unsigned int value;
676
677 /*
 678 * Don't do the setup now if we have an SMP BIOS as the
679 * through-I/O-APIC virtual wire mode might be active.
680 */
681 if (smp_found_config || !cpu_has_apic)
682 return;
683
684 value = apic_read(APIC_LVR);
685
686 /*
687 * Do not trust the local APIC being empty at bootup.
688 */
689 clear_local_APIC();
690
691 /*
692 * Enable APIC.
693 */
694 value = apic_read(APIC_SPIV);
695 value &= ~APIC_VECTOR_MASK;
696 value |= APIC_SPIV_APIC_ENABLED;
697 value |= APIC_SPIV_FOCUS_DISABLED;
698 value |= SPURIOUS_APIC_VECTOR;
699 apic_write(APIC_SPIV, value);
700
701 /*
702 * Set up the virtual wire mode.
703 */
704 apic_write(APIC_LVT0, APIC_DM_EXTINT);
705 value = APIC_DM_NMI;
706 apic_write(APIC_LVT1, value);
707}
708
709/**
710 * setup_local_APIC - setup the local APIC
711 */
712void __cpuinit setup_local_APIC(void)
713{
714 unsigned int value;
715 int i, j;
716
717 preempt_disable();
718 value = apic_read(APIC_LVR);
719
720 BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f);
721
722 /*
723 * Double-check whether this APIC is really registered.
724 * This is meaningless in clustered apic mode, so we skip it.
725 */
726 if (!apic_id_registered())
727 BUG();
728
729 /*
730 * Intel recommends to set DFR, LDR and TPR before enabling
731 * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel
732 * document number 292116). So here it goes...
733 */
734 init_apic_ldr();
735
736 /*
737 * Set Task Priority to 'accept all'. We never change this
738 * later on.
739 */
740 value = apic_read(APIC_TASKPRI);
741 value &= ~APIC_TPRI_MASK;
742 apic_write(APIC_TASKPRI, value);
743
744 /*
745 * After a crash, we no longer service the interrupts and a pending
746 * interrupt from previous kernel might still have ISR bit set.
747 *
748 * Most probably by now CPU has serviced that pending interrupt and
 749 * it might not have done the ack_APIC_irq() because it thought the
 750 * interrupt came from the i8259 as ExtInt. The LAPIC did not get an EOI so it
 751 * does not clear the ISR bit and the CPU thinks it has already serviced
752 * the interrupt. Hence a vector might get locked. It was noticed
753 * for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
754 */
755 for (i = APIC_ISR_NR - 1; i >= 0; i--) {
756 value = apic_read(APIC_ISR + i*0x10);
757 for (j = 31; j >= 0; j--) {
758 if (value & (1<<j))
759 ack_APIC_irq();
760 }
761 }
762
763 /*
764 * Now that we are all set up, enable the APIC
765 */
766 value = apic_read(APIC_SPIV);
767 value &= ~APIC_VECTOR_MASK;
768 /*
769 * Enable APIC
770 */
771 value |= APIC_SPIV_APIC_ENABLED;
772
773 /* We always use processor focus */
774
775 /*
776 * Set spurious IRQ vector
777 */
778 value |= SPURIOUS_APIC_VECTOR;
779 apic_write(APIC_SPIV, value);
780
781 /*
782 * Set up LVT0, LVT1:
783 *
784 * set up through-local-APIC on the BP's LINT0. This is not
785 * strictly necessary in pure symmetric-IO mode, but sometimes
786 * we delegate interrupts to the 8259A.
787 */
788 /*
789 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
790 */
791 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
792 if (!smp_processor_id() && !value) {
793 value = APIC_DM_EXTINT;
794 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n",
795 smp_processor_id());
796 } else {
797 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
798 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n",
799 smp_processor_id());
800 }
801 apic_write(APIC_LVT0, value);
802
803 /*
804 * only the BP should see the LINT1 NMI signal, obviously.
805 */
806 if (!smp_processor_id())
807 value = APIC_DM_NMI;
808 else
809 value = APIC_DM_NMI | APIC_LVT_MASKED;
810 apic_write(APIC_LVT1, value);
811 preempt_enable();
812}
813
814static void __cpuinit lapic_setup_esr(void)
815{
816 unsigned maxlvt = lapic_get_maxlvt();
817
818 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR);
819 /*
820 * spec says clear errors after enabling vector.
821 */
822 if (maxlvt > 3)
823 apic_write(APIC_ESR, 0);
824}
825
826void __cpuinit end_local_APIC_setup(void)
827{
828 lapic_setup_esr();
829 setup_apic_nmi_watchdog(NULL);
830 apic_pm_activate();
831}
832
833/*
834 * Detect and enable local APICs on non-SMP boards.
835 * Original code written by Keir Fraser.
836 * On AMD64 we trust the BIOS - if it says no APIC it is likely
837 * not correctly set up (usually the APIC timer won't work etc.)
838 */
839static int __init detect_init_APIC(void)
840{
841 if (!cpu_has_apic) {
842 printk(KERN_INFO "No local APIC present\n");
843 return -1;
844 }
845
846 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
847 boot_cpu_physical_apicid = 0;
848 return 0;
849}
850
851void __init early_init_lapic_mapping(void)
852{
853 unsigned long phys_addr;
854
855 /*
 856 * If no local APIC can be found then bail out:
 857 * it means there is no mptable and no MADT
858 */
859 if (!smp_found_config)
860 return;
861
862 phys_addr = mp_lapic_addr;
863
864 set_fixmap_nocache(FIX_APIC_BASE, phys_addr);
865 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
866 APIC_BASE, phys_addr);
867
868 /*
869 * Fetch the APIC ID of the BSP in case we have a
870 * default configuration (or the MP table is broken).
871 */
872 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
873}
874
875/**
876 * init_apic_mappings - initialize APIC mappings
877 */
878void __init init_apic_mappings(void)
879{
880 /*
881 * If no local APIC can be found then set up a fake all
882 * zeroes page to simulate the local APIC and another
883 * one for the IO-APIC.
884 */
885 if (!smp_found_config && detect_init_APIC()) {
886 apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
887 apic_phys = __pa(apic_phys);
888 } else
889 apic_phys = mp_lapic_addr;
890
891 set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
892 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
893 APIC_BASE, apic_phys);
894
895 /*
896 * Fetch the APIC ID of the BSP in case we have a
897 * default configuration (or the MP table is broken).
898 */
899 boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id());
900}
901
902/*
903 * This initializes the IO-APIC and APIC hardware if this is
904 * a UP kernel.
905 */
906int __init APIC_init_uniprocessor(void)
907{
908 if (disable_apic) {
909 printk(KERN_INFO "Apic disabled\n");
910 return -1;
911 }
912 if (!cpu_has_apic) {
913 disable_apic = 1;
914 printk(KERN_INFO "Apic disabled by BIOS\n");
915 return -1;
916 }
917
918 verify_local_APIC();
919
920 connect_bsp_APIC();
921
922 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
923 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid));
924
925 setup_local_APIC();
926
927 /*
928 * Now enable IO-APICs, actually call clear_IO_APIC
929 * We need clear_IO_APIC before enabling vector on BP
930 */
931 if (!skip_ioapic_setup && nr_ioapics)
932 enable_IO_APIC();
933
934 if (!smp_found_config || skip_ioapic_setup || !nr_ioapics)
935 localise_nmi_watchdog();
936 end_local_APIC_setup();
937
938 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
939 setup_IO_APIC();
940 else
941 nr_ioapics = 0;
942 setup_boot_APIC_clock();
943 check_nmi_watchdog();
944 return 0;
945}
946
947/*
948 * Local APIC interrupts
949 */
950
951/*
952 * This interrupt should _never_ happen with our APIC/SMP architecture
953 */
954asmlinkage void smp_spurious_interrupt(void)
955{
956 unsigned int v;
957 exit_idle();
958 irq_enter();
959 /*
960 * Check if this really is a spurious interrupt and ACK it
961 * if it is a vectored one. Just in case...
962 * Spurious interrupts should not be ACKed.
963 */
964 v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1));
965 if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f)))
966 ack_APIC_irq();
967
968 add_pda(irq_spurious_count, 1);
969 irq_exit();
970}
971
972/*
973 * This interrupt should never happen with our APIC/SMP architecture
974 */
975asmlinkage void smp_error_interrupt(void)
976{
977 unsigned int v, v1;
978
979 exit_idle();
980 irq_enter();
981 /* First tickle the hardware, only then report what went on. -- REW */
982 v = apic_read(APIC_ESR);
983 apic_write(APIC_ESR, 0);
984 v1 = apic_read(APIC_ESR);
985 ack_APIC_irq();
986 atomic_inc(&irq_err_count);
987
988 /* Here is what the APIC error bits mean:
989 0: Send CS error
990 1: Receive CS error
991 2: Send accept error
992 3: Receive accept error
993 4: Reserved
994 5: Send illegal vector
995 6: Received illegal vector
996 7: Illegal register address
997 */
998 printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n",
999 smp_processor_id(), v , v1);
1000 irq_exit();
1001}
1002
1003/**
 1004 * connect_bsp_APIC - attach the APIC to the interrupt system
 1005 */
1006void __init connect_bsp_APIC(void)
1007{
1008 enable_apic_mode();
1009}
1010
1011void disconnect_bsp_APIC(int virt_wire_setup)
1012{
1013 /* Go back to Virtual Wire compatibility mode */
1014 unsigned long value;
1015
1016 /* For the spurious interrupt use vector F, and enable it */
1017 value = apic_read(APIC_SPIV);
1018 value &= ~APIC_VECTOR_MASK;
1019 value |= APIC_SPIV_APIC_ENABLED;
1020 value |= 0xf;
1021 apic_write(APIC_SPIV, value);
1022
1023 if (!virt_wire_setup) {
1024 /*
1025 * For LVT0 make it edge triggered, active high,
1026 * external and enabled
1027 */
1028 value = apic_read(APIC_LVT0);
1029 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1030 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1031 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1032 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1033 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT);
1034 apic_write(APIC_LVT0, value);
1035 } else {
1036 /* Disable LVT0 */
1037 apic_write(APIC_LVT0, APIC_LVT_MASKED);
1038 }
1039
1040 /* For LVT1 make it edge triggered, active high, nmi and enabled */
1041 value = apic_read(APIC_LVT1);
1042 value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING |
1043 APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR |
1044 APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED);
1045 value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING;
1046 value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI);
1047 apic_write(APIC_LVT1, value);
1048}
1049
1050void __cpuinit generic_processor_info(int apicid, int version)
1051{
1052 int cpu;
1053 cpumask_t tmp_map;
1054
1055 if (num_processors >= NR_CPUS) {
1056 printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached."
1057 " Processor ignored.\n", NR_CPUS);
1058 return;
1059 }
1060
1061 if (num_processors >= maxcpus) {
1062 printk(KERN_WARNING "WARNING: maxcpus limit of %i reached."
1063 " Processor ignored.\n", maxcpus);
1064 return;
1065 }
1066
1067 num_processors++;
1068 cpus_complement(tmp_map, cpu_present_map);
1069 cpu = first_cpu(tmp_map);
1070
1071 physid_set(apicid, phys_cpu_present_map);
1072 if (apicid == boot_cpu_physical_apicid) {
1073 /*
1074 * x86_bios_cpu_apicid is required to have processors listed
1075 * in same order as logical cpu numbers. Hence the first
1076 * entry is BSP, and so on.
1077 */
1078 cpu = 0;
1079 }
1080 if (apicid > max_physical_apicid)
1081 max_physical_apicid = apicid;
1082
1083 /* are we being called early in kernel startup? */
1084 if (early_per_cpu_ptr(x86_cpu_to_apicid)) {
1085 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
1086 u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1087
1088 cpu_to_apicid[cpu] = apicid;
1089 bios_cpu_apicid[cpu] = apicid;
1090 } else {
1091 per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1092 per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1093 }
1094
1095 cpu_set(cpu, cpu_possible_map);
1096 cpu_set(cpu, cpu_present_map);
1097}
1098
1099/*
1100 * Power management
1101 */
1102#ifdef CONFIG_PM
1103
1104static struct {
1105 /* 'active' is true if the local APIC was enabled by us and
1106 not the BIOS; this signifies that we are also responsible
1107 for disabling it before entering apm/acpi suspend */
1108 int active;
1109 /* r/w apic fields */
1110 unsigned int apic_id;
1111 unsigned int apic_taskpri;
1112 unsigned int apic_ldr;
1113 unsigned int apic_dfr;
1114 unsigned int apic_spiv;
1115 unsigned int apic_lvtt;
1116 unsigned int apic_lvtpc;
1117 unsigned int apic_lvt0;
1118 unsigned int apic_lvt1;
1119 unsigned int apic_lvterr;
1120 unsigned int apic_tmict;
1121 unsigned int apic_tdcr;
1122 unsigned int apic_thmr;
1123} apic_pm_state;
1124
1125static int lapic_suspend(struct sys_device *dev, pm_message_t state)
1126{
1127 unsigned long flags;
1128 int maxlvt;
1129
1130 if (!apic_pm_state.active)
1131 return 0;
1132
1133 maxlvt = lapic_get_maxlvt();
1134
1135 apic_pm_state.apic_id = read_apic_id();
1136 apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI);
1137 apic_pm_state.apic_ldr = apic_read(APIC_LDR);
1138 apic_pm_state.apic_dfr = apic_read(APIC_DFR);
1139 apic_pm_state.apic_spiv = apic_read(APIC_SPIV);
1140 apic_pm_state.apic_lvtt = apic_read(APIC_LVTT);
1141 if (maxlvt >= 4)
1142 apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC);
1143 apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0);
1144 apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1);
1145 apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR);
1146 apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
1147 apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
1148#ifdef CONFIG_X86_MCE_INTEL
1149 if (maxlvt >= 5)
1150 apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
1151#endif
1152 local_irq_save(flags);
1153 disable_local_APIC();
1154 local_irq_restore(flags);
1155 return 0;
1156}
1157
1158static int lapic_resume(struct sys_device *dev)
1159{
1160 unsigned int l, h;
1161 unsigned long flags;
1162 int maxlvt;
1163
1164 if (!apic_pm_state.active)
1165 return 0;
1166
1167 maxlvt = lapic_get_maxlvt();
1168
1169 local_irq_save(flags);
1170 rdmsr(MSR_IA32_APICBASE, l, h);
1171 l &= ~MSR_IA32_APICBASE_BASE;
1172 l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr;
1173 wrmsr(MSR_IA32_APICBASE, l, h);
1174 apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
1175 apic_write(APIC_ID, apic_pm_state.apic_id);
1176 apic_write(APIC_DFR, apic_pm_state.apic_dfr);
1177 apic_write(APIC_LDR, apic_pm_state.apic_ldr);
1178 apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri);
1179 apic_write(APIC_SPIV, apic_pm_state.apic_spiv);
1180 apic_write(APIC_LVT0, apic_pm_state.apic_lvt0);
1181 apic_write(APIC_LVT1, apic_pm_state.apic_lvt1);
1182#ifdef CONFIG_X86_MCE_INTEL
1183 if (maxlvt >= 5)
1184 apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr);
1185#endif
1186 if (maxlvt >= 4)
1187 apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc);
1188 apic_write(APIC_LVTT, apic_pm_state.apic_lvtt);
1189 apic_write(APIC_TDCR, apic_pm_state.apic_tdcr);
1190 apic_write(APIC_TMICT, apic_pm_state.apic_tmict);
1191 apic_write(APIC_ESR, 0);
1192 apic_read(APIC_ESR);
1193 apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
1194 apic_write(APIC_ESR, 0);
1195 apic_read(APIC_ESR);
1196 local_irq_restore(flags);
1197 return 0;
1198}
1199
1200static struct sysdev_class lapic_sysclass = {
1201 .name = "lapic",
1202 .resume = lapic_resume,
1203 .suspend = lapic_suspend,
1204};
1205
1206static struct sys_device device_lapic = {
1207 .id = 0,
1208 .cls = &lapic_sysclass,
1209};
1210
1211static void __cpuinit apic_pm_activate(void)
1212{
1213 apic_pm_state.active = 1;
1214}
1215
1216static int __init init_lapic_sysfs(void)
1217{
1218 int error;
1219
1220 if (!cpu_has_apic)
1221 return 0;
1222 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
1223
1224 error = sysdev_class_register(&lapic_sysclass);
1225 if (!error)
1226 error = sysdev_register(&device_lapic);
1227 return error;
1228}
1229device_initcall(init_lapic_sysfs);
1230
1231#else /* CONFIG_PM */
1232
1233static void apic_pm_activate(void) { }
1234
1235#endif /* CONFIG_PM */
1236
1237/*
1238 * apic_is_clustered_box() -- Check if we can expect good TSC
1239 *
1240 * Thus far, the major user of this is IBM's Summit2 series:
1241 *
1242 * Clustered boxes may have unsynced TSC problems if they are
1243 * multi-chassis. Use available data to take a good guess.
1244 * If in doubt, go HPET.
1245 */
1246__cpuinit int apic_is_clustered_box(void)
1247{
1248 int i, clusters, zeros;
1249 unsigned id;
1250 u16 *bios_cpu_apicid;
1251 DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
1252
1253 /*
 1254 * There is no such box with AMD CPUs yet.
 1255 * Some AMD boxes with quad-core CPUs and 8 sockets have APIC IDs
 1256 * in the range [4, 0x23] or [8, 0x27] and could be mistaken for a
 1257 * vSMP box; this still needs checking...
1258 */
1259 if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
1260 return 0;
1261
1262 bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
1263 bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
1264
1265 for (i = 0; i < NR_CPUS; i++) {
1266 /* are we being called early in kernel startup? */
1267 if (bios_cpu_apicid) {
1268 id = bios_cpu_apicid[i];
1269 }
1270 else if (i < nr_cpu_ids) {
1271 if (cpu_present(i))
1272 id = per_cpu(x86_bios_cpu_apicid, i);
1273 else
1274 continue;
1275 }
1276 else
1277 break;
1278
1279 if (id != BAD_APICID)
1280 __set_bit(APIC_CLUSTERID(id), clustermap);
1281 }
1282
1283 /* Problem: Partially populated chassis may not have CPUs in some of
1284 * the APIC clusters they have been allocated. Only present CPUs have
1285 * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap.
1286 * Since clusters are allocated sequentially, count zeros only if
1287 * they are bounded by ones.
1288 */
1289 clusters = 0;
1290 zeros = 0;
1291 for (i = 0; i < NUM_APIC_CLUSTERS; i++) {
1292 if (test_bit(i, clustermap)) {
1293 clusters += 1 + zeros;
1294 zeros = 0;
1295 } else
1296 ++zeros;
1297 }
1298
1299 /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
1300 * not guaranteed to be synced between boards
1301 */
1302 if (is_vsmp_box() && clusters > 1)
1303 return 1;
1304
1305 /*
 1306 * If clusters > 2, then it should be multi-chassis.
1307 * May have to revisit this when multi-core + hyperthreaded CPUs come
1308 * out, but AFAIK this will work even for them.
1309 */
1310 return (clusters > 2);
1311}
1312
1313/*
1314 * APIC command line parameters
1315 */
1316static int __init apic_set_verbosity(char *str)
1317{
1318 if (str == NULL) {
1319 skip_ioapic_setup = 0;
1320 ioapic_force = 1;
1321 return 0;
1322 }
1323 if (strcmp("debug", str) == 0)
1324 apic_verbosity = APIC_DEBUG;
1325 else if (strcmp("verbose", str) == 0)
1326 apic_verbosity = APIC_VERBOSE;
1327 else {
1328 printk(KERN_WARNING "APIC Verbosity level %s not recognised"
1329 " use apic=verbose or apic=debug\n", str);
1330 return -EINVAL;
1331 }
1332
1333 return 0;
1334}
1335early_param("apic", apic_set_verbosity);
1336
1337static __init int setup_disableapic(char *str)
1338{
1339 disable_apic = 1;
1340 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1341 return 0;
1342}
1343early_param("disableapic", setup_disableapic);
1344
1345/* same as disableapic, for compatibility */
1346static __init int setup_nolapic(char *str)
1347{
1348 return setup_disableapic(str);
1349}
1350early_param("nolapic", setup_nolapic);
1351
1352static int __init parse_lapic_timer_c2_ok(char *arg)
1353{
1354 local_apic_timer_c2_ok = 1;
1355 return 0;
1356}
1357early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok);
1358
1359static __init int setup_noapictimer(char *str)
1360{
1361 if (str[0] != ' ' && str[0] != 0)
1362 return 0;
1363 disable_apic_timer = 1;
1364 return 1;
1365}
1366__setup("noapictimer", setup_noapictimer);
1367
1368static __init int setup_apicpmtimer(char *s)
1369{
1370 apic_calibrate_pmtmr = 1;
1371 notsc_setup(NULL);
1372 return 0;
1373}
1374__setup("apicpmtimer", setup_apicpmtimer);
1375
1376static int __init lapic_insert_resource(void)
1377{
1378 if (!apic_phys)
1379 return -1;
1380
1381 /* Put local APIC into the resource map. */
1382 lapic_resource.start = apic_phys;
1383 lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1;
1384 insert_resource(&iomem_resource, &lapic_resource);
1385
1386 return 0;
1387}
1388
1389/*
 1390 * We need to call insert_resource() after e820_reserve_resources(),
 1391 * which uses request_resource()
1392 */
1393late_initcall(lapic_insert_resource);
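The deleted calibrate_APIC_clock() above measured how many APIC timer counts elapse over a TSC-measured interval and converted that into a frequency and a per-tick reload value. A simplified userspace-style sketch of the same arithmetic; the TSC rate and the two sample deltas are made-up numbers, and HZ is assumed to be 250:

/* Sketch of the calibration math from calibrate_APIC_clock(); all inputs
 * are illustrative constants, not values read from the APIC or the TSC.
 * Assumes a 64-bit unsigned long, as the original apic_64.c code did. */
#include <stdio.h>

#define HZ 250UL	/* assumed kernel tick rate */

int main(void)
{
	unsigned long tsc_khz    = 2400000;	/* assumed 2.4 GHz TSC */
	unsigned long tsc_delta  = 100000000;	/* TSC cycles elapsed */
	unsigned long apic_delta = 4166666;	/* APIC timer counts elapsed */

	/* APIC timer frequency in Hz, as in the deleted code:
	 * result = (apic_start - apic) * 1000L * tsc_khz / (tsc - tsc_start) */
	unsigned long result = apic_delta * 1000UL * tsc_khz / tsc_delta;

	printf("Detected %lu.%03lu MHz APIC timer\n",
	       result / 1000 / 1000, result / 1000 % 1000);
	/* the clockevent device is then programmed in counts per jiffy */
	printf("calibration_result = %lu counts per tick\n", result / HZ);
	return 0;
}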
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index bf9b441331e9..5145a6e72bbb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -219,7 +219,6 @@
219#include <linux/time.h> 219#include <linux/time.h>
220#include <linux/sched.h> 220#include <linux/sched.h>
221#include <linux/pm.h> 221#include <linux/pm.h>
222#include <linux/pm_legacy.h>
223#include <linux/capability.h> 222#include <linux/capability.h>
224#include <linux/device.h> 223#include <linux/device.h>
225#include <linux/kernel.h> 224#include <linux/kernel.h>
@@ -229,12 +228,12 @@
229#include <linux/suspend.h> 228#include <linux/suspend.h>
230#include <linux/kthread.h> 229#include <linux/kthread.h>
231#include <linux/jiffies.h> 230#include <linux/jiffies.h>
232#include <linux/smp_lock.h>
233 231
234#include <asm/system.h> 232#include <asm/system.h>
235#include <asm/uaccess.h> 233#include <asm/uaccess.h>
236#include <asm/desc.h> 234#include <asm/desc.h>
237#include <asm/i8253.h> 235#include <asm/i8253.h>
236#include <asm/olpc.h>
238#include <asm/paravirt.h> 237#include <asm/paravirt.h>
239#include <asm/reboot.h> 238#include <asm/reboot.h>
240 239
@@ -2218,7 +2217,7 @@ static int __init apm_init(void)
2218 2217
2219 dmi_check_system(apm_dmi_table); 2218 dmi_check_system(apm_dmi_table);
2220 2219
2221 if (apm_info.bios.version == 0 || paravirt_enabled()) { 2220 if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) {
2222 printk(KERN_INFO "apm: BIOS not found.\n"); 2221 printk(KERN_INFO "apm: BIOS not found.\n");
2223 return -ENODEV; 2222 return -ENODEV;
2224 } 2223 }
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index bacf5deeec2d..7fcf63d22f8b 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -18,9 +18,11 @@
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/bootparam.h> 19#include <asm/bootparam.h>
20 20
21#include <xen/interface/xen.h>
22
21#define __NO_STUBS 1 23#define __NO_STUBS 1
22#undef __SYSCALL 24#undef __SYSCALL
23#undef _ASM_X86_64_UNISTD_H_ 25#undef _ASM_X86_UNISTD_64_H
24#define __SYSCALL(nr, sym) [nr] = 1, 26#define __SYSCALL(nr, sym) [nr] = 1,
25static char syscalls[] = { 27static char syscalls[] = {
26#include <asm/unistd.h> 28#include <asm/unistd.h>
@@ -131,5 +133,14 @@ int main(void)
131 OFFSET(BP_loadflags, boot_params, hdr.loadflags); 133 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
132 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); 134 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
133 OFFSET(BP_version, boot_params, hdr.version); 135 OFFSET(BP_version, boot_params, hdr.version);
136
137 BLANK();
138 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
139#ifdef CONFIG_XEN
140 BLANK();
141 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
142 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
143#undef ENTRY
144#endif
134 return 0; 145 return 0;
135} 146}
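The OFFSET()/DEFINE() lines added above exist so that assembly code can use structure offsets (for example the Xen vcpu_info fields) as plain immediates. Roughly, the kbuild helpers emit marker lines into the compiler's assembly output, which the build then turns into #define entries in asm-offsets.h; the expansion below is recalled from include/linux/kbuild.h of this era and should be treated as an approximation:

/* Approximate shape of the helpers behind asm-offsets_64.c. */
#include <linux/stddef.h>	/* offsetof() */

#define DEFINE(sym, val) \
	asm volatile("\n->" #sym " %0 " #val : : "i" (val))

#define BLANK() asm volatile("\n->" : : )

#define OFFSET(sym, str, mem) \
	DEFINE(sym, offsetof(struct str, mem))

With that, OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask) ends up as a #define XEN_vcpu_info_mask <offset> that entry code can reference directly.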
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
new file mode 100644
index 000000000000..f0dfe6f17e7e
--- /dev/null
+++ b/arch/x86/kernel/bios_uv.c
@@ -0,0 +1,141 @@
1/*
2 * BIOS run time interface routines.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/efi.h>
23#include <asm/efi.h>
24#include <linux/io.h>
25#include <asm/uv/bios.h>
26#include <asm/uv/uv_hub.h>
27
28struct uv_systab uv_systab;
29
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{
32 struct uv_systab *tab = &uv_systab;
33
34 if (!tab->function)
35 /*
36 * BIOS does not support UV systab
37 */
38 return BIOS_STATUS_UNIMPLEMENTED;
39
40 return efi_call6((void *)__va(tab->function),
41 (u64)which, a1, a2, a3, a4, a5);
42}
43
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5)
46{
47 unsigned long bios_flags;
48 s64 ret;
49
50 local_irq_save(bios_flags);
51 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
52 local_irq_restore(bios_flags);
53
54 return ret;
55}
56
57s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
58 u64 a4, u64 a5)
59{
60 s64 ret;
61
62 preempt_disable();
63 ret = uv_bios_call(which, a1, a2, a3, a4, a5);
64 preempt_enable();
65
66 return ret;
67}
68
69
70long sn_partition_id;
71EXPORT_SYMBOL_GPL(sn_partition_id);
72long uv_coherency_id;
73EXPORT_SYMBOL_GPL(uv_coherency_id);
74long uv_region_size;
75EXPORT_SYMBOL_GPL(uv_region_size);
76int uv_type;
77
78
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region)
81{
82 s64 ret;
83 u64 v0, v1;
84 union partition_info_u part;
85
86 ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc,
87 (u64)(&v0), (u64)(&v1), 0, 0);
88 if (ret != BIOS_STATUS_SUCCESS)
89 return ret;
90
91 part.val = v0;
92 if (uvtype)
93 *uvtype = part.hub_version;
94 if (partid)
95 *partid = part.partition_id;
96 if (coher)
97 *coher = part.coherence_id;
98 if (region)
99 *region = part.region_size;
100 return ret;
101}
102
103
104s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
105{
106 return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type,
107 (u64)ticks_per_second, 0, 0, 0);
108}
109EXPORT_SYMBOL_GPL(uv_bios_freq_base);
110
111
112#ifdef CONFIG_EFI
113void uv_bios_init(void)
114{
115 struct uv_systab *tab;
116
117 if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) ||
118 (efi.uv_systab == (unsigned long)NULL)) {
119 printk(KERN_CRIT "No EFI UV System Table.\n");
120 uv_systab.function = (unsigned long)NULL;
121 return;
122 }
123
124 tab = (struct uv_systab *)ioremap(efi.uv_systab,
125 sizeof(struct uv_systab));
126 if (strncmp(tab->signature, "UVST", 4) != 0)
127 printk(KERN_ERR "bad signature in UV system table!");
128
129 /*
130 * Copy table to permanent spot for later use.
131 */
132 memcpy(&uv_systab, tab, sizeof(struct uv_systab));
133 iounmap(tab);
134
135 printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision);
136}
137#else /* !CONFIG_EFI */
138
139void uv_bios_init(void) { }
140#endif
141
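As a usage sketch for the new file above, a platform-setup caller could query the UV BIOS through uv_bios_get_sn_info(); only the function signature and BIOS_STATUS_SUCCESS come from bios_uv.c itself, everything else here (the fc value of 0, the printk wording, the function name) is an illustrative assumption:

/* Hypothetical caller of the UV BIOS wrappers defined above. */
static void __init hypothetical_uv_query_bios(void)
{
	int uvtype;
	long partid, coherence, region;
	s64 ret;

	ret = uv_bios_get_sn_info(0, &uvtype, &partid, &coherence, &region);
	if (ret != BIOS_STATUS_SUCCESS) {
		printk(KERN_ERR "UV BIOS query failed: %lld\n", (long long)ret);
		return;
	}

	printk(KERN_INFO "UV: hub version %d, partition %ld, coherence %ld\n",
	       uvtype, partid, coherence);
}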
diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore
new file mode 100644
index 000000000000..667df55a4399
--- /dev/null
+++ b/arch/x86/kernel/cpu/.gitignore
@@ -0,0 +1 @@
capflags.c
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index ee76eaad3001..82ec6075c057 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -3,22 +3,30 @@
3# 3#
4 4
5obj-y := intel_cacheinfo.o addon_cpuid_features.o 5obj-y := intel_cacheinfo.o addon_cpuid_features.o
6obj-y += proc.o feature_names.o 6obj-y += proc.o capflags.o powerflags.o common.o
7 7
8obj-$(CONFIG_X86_32) += common.o bugs.o 8obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
9obj-$(CONFIG_X86_64) += common_64.o bugs_64.o 9obj-$(CONFIG_X86_64) += bugs_64.o
10obj-$(CONFIG_X86_32) += amd.o 10
11obj-$(CONFIG_X86_64) += amd_64.o 11obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
12obj-$(CONFIG_X86_32) += cyrix.o 12obj-$(CONFIG_CPU_SUP_AMD) += amd.o
13obj-$(CONFIG_X86_32) += centaur.o 13obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
14obj-$(CONFIG_X86_64) += centaur_64.o 14obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o
15obj-$(CONFIG_X86_32) += transmeta.o 15obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o
16obj-$(CONFIG_X86_32) += intel.o 16obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o
17obj-$(CONFIG_X86_64) += intel_64.o 17obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o
18obj-$(CONFIG_X86_32) += umc.o
19 18
20obj-$(CONFIG_X86_MCE) += mcheck/ 19obj-$(CONFIG_X86_MCE) += mcheck/
21obj-$(CONFIG_MTRR) += mtrr/ 20obj-$(CONFIG_MTRR) += mtrr/
22obj-$(CONFIG_CPU_FREQ) += cpufreq/ 21obj-$(CONFIG_CPU_FREQ) += cpufreq/
23 22
24obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 23obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
24
25quiet_cmd_mkcapflags = MKCAP $@
26 cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@
27
28cpufeature = $(src)/../../include/asm/cpufeature.h
29
30targets += capflags.c
31$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE
32 $(call if_changed,mkcapflags)
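The new capflags.c rule above generates the /proc/cpuinfo feature-name table at build time from cpufeature.h instead of keeping a hand-maintained feature_names.c. The generated file is, roughly, one string per feature bit; the excerpt below is a guess at its shape (the real content is whatever mkcapflags.pl emits), shown only to make clear why it can be generated rather than written by hand:

/* Hypothetical excerpt of the generated capflags.c. */
const char * const x86_cap_flags[NCAPINTS*32] = {
	[X86_FEATURE_FPU] = "fpu",
	[X86_FEATURE_VME] = "vme",
	[X86_FEATURE_DE]  = "de",
	[X86_FEATURE_PSE] = "pse",
	/* ... one entry per feature bit that has a /proc/cpuinfo name ... */
};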
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index 84a8220a6072..0d9c993aa93e 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -7,6 +7,8 @@
7#include <asm/pat.h> 7#include <asm/pat.h>
8#include <asm/processor.h> 8#include <asm/processor.h>
9 9
10#include <mach_apic.h>
11
10struct cpuid_bit { 12struct cpuid_bit {
11 u16 feature; 13 u16 feature;
12 u8 reg; 14 u8 reg;
@@ -48,6 +50,92 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
48 } 50 }
49} 51}
50 52
53/* leaf 0xb SMT level */
54#define SMT_LEVEL 0
55
56/* leaf 0xb sub-leaf types */
57#define INVALID_TYPE 0
58#define SMT_TYPE 1
59#define CORE_TYPE 2
60
61#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff)
62#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f)
63#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff)
64
65/*
66 * Check for extended topology enumeration cpuid leaf 0xb and if it
67 * exists, use it for populating initial_apicid and cpu topology
68 * detection.
69 */
70void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
71{
72#ifdef CONFIG_SMP
73 unsigned int eax, ebx, ecx, edx, sub_index;
74 unsigned int ht_mask_width, core_plus_mask_width;
75 unsigned int core_select_mask, core_level_siblings;
76
77 if (c->cpuid_level < 0xb)
78 return;
79
80 cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx);
81
82 /*
83 * check if the cpuid leaf 0xb is actually implemented.
84 */
85 if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE))
86 return;
87
88 set_cpu_cap(c, X86_FEATURE_XTOPOLOGY);
89
90 /*
91 * initial apic id, which also represents 32-bit extended x2apic id.
92 */
93 c->initial_apicid = edx;
94
95 /*
96 * Populate HT related information from sub-leaf level 0.
97 */
98 core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx);
99 core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
100
101 sub_index = 1;
102 do {
103 cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx);
104
105 /*
106 * Check for the Core type in the implemented sub leaves.
107 */
108 if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) {
109 core_level_siblings = LEVEL_MAX_SIBLINGS(ebx);
110 core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax);
111 break;
112 }
113
114 sub_index++;
115 } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE);
116
117 core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width;
118
119#ifdef CONFIG_X86_32
120 c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width)
121 & core_select_mask;
122 c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width);
123#else
124 c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask;
125 c->phys_proc_id = phys_pkg_id(core_plus_mask_width);
126#endif
127 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
128
129
130 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
131 c->phys_proc_id);
132 if (c->x86_max_cores > 1)
133 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
134 c->cpu_core_id);
135 return;
136#endif
137}
138
51#ifdef CONFIG_X86_PAT 139#ifdef CONFIG_X86_PAT
52void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) 140void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
53{ 141{
@@ -56,9 +144,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c)
56 144
57 switch (c->x86_vendor) { 145 switch (c->x86_vendor) {
58 case X86_VENDOR_INTEL: 146 case X86_VENDOR_INTEL:
59 if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) 147 /*
148 * There is a known erratum on Pentium III and Core Solo
149 * and Core Duo CPUs.
150 * " Page with PAT set to WC while associated MTRR is UC
151 * may consolidate to UC "
152 * Because of this erratum, it is better to stick with
153 * setting WC in MTRR rather than using PAT on these CPUs.
154 *
155 * Enable PAT WC only on P4, Core 2 or later CPUs.
156 */
157 if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15))
60 return; 158 return;
61 break; 159
160 pat_disable("PAT WC disabled due to known CPU erratum.");
161 return;
162
62 case X86_VENDOR_AMD: 163 case X86_VENDOR_AMD:
63 case X86_VENDOR_CENTAUR: 164 case X86_VENDOR_CENTAUR:
64 case X86_VENDOR_TRANSMETA: 165 case X86_VENDOR_TRANSMETA:
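The detect_extended_topology() code added above derives the SMT/core/package split from cpuid leaf 0xb: the SMT sub-leaf reports how many low APIC-ID bits select the thread, the core sub-leaf reports the combined thread+core width, and the core-select mask is the difference between the two. A standalone sketch of just that mask arithmetic, with made-up widths and APIC ID:

/* Pure-arithmetic sketch of the mask derivation in detect_extended_topology();
 * the example widths (2 SMT bits, 5 core+SMT bits) and the APIC ID are
 * assumptions for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned int ht_mask_width        = 2;    /* SMT sub-leaf, EAX[4:0] */
	unsigned int core_plus_mask_width = 5;    /* core sub-leaf, EAX[4:0] */
	unsigned int initial_apicid       = 0x1b; /* example x2APIC ID */

	unsigned int core_select_mask =
		(~(-1 << core_plus_mask_width)) >> ht_mask_width;

	unsigned int smt_id  = initial_apicid & ~(-1 << ht_mask_width);
	unsigned int core_id = (initial_apicid >> ht_mask_width) & core_select_mask;
	unsigned int pkg_id  = initial_apicid >> core_plus_mask_width;

	printf("smt %u core %u package %u\n", smt_id, core_id, pkg_id);
	return 0;
}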
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 81a07ca65d44..8f1e31db2ad5 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -1,13 +1,22 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/bitops.h> 2#include <linux/bitops.h>
3#include <linux/mm.h> 3#include <linux/mm.h>
4
4#include <asm/io.h> 5#include <asm/io.h>
5#include <asm/processor.h> 6#include <asm/processor.h>
6#include <asm/apic.h> 7#include <asm/apic.h>
7 8
9#ifdef CONFIG_X86_64
10# include <asm/numa_64.h>
11# include <asm/mmconfig.h>
12# include <asm/cacheflush.h>
13#endif
14
8#include <mach_apic.h> 15#include <mach_apic.h>
16
9#include "cpu.h" 17#include "cpu.h"
10 18
19#ifdef CONFIG_X86_32
11/* 20/*
12 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause 21 * B step AMD K6 before B 9730xxxx have hardware bugs that can cause
13 * misexecution of code under Linux. Owners of such processors should 22 * misexecution of code under Linux. Owners of such processors should
@@ -24,23 +33,273 @@
24extern void vide(void); 33extern void vide(void);
25__asm__(".align 4\nvide: ret"); 34__asm__(".align 4\nvide: ret");
26 35
27int force_mwait __cpuinitdata; 36static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c)
28
29static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
30{ 37{
31 if (cpuid_eax(0x80000000) >= 0x80000007) { 38/*
32 c->x86_power = cpuid_edx(0x80000007); 39 * General Systems BIOSen alias the cpu frequency registers
33 if (c->x86_power & (1<<8)) 40 * of the Elan at 0x000df000. Unfortunately, one of the Linux
34 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 41 * drivers subsequently pokes it, and changes the CPU speed.
42 * Workaround : Remove the unneeded alias.
43 */
44#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
45#define CBAR_ENB (0x80000000)
46#define CBAR_KEY (0X000000CB)
47 if (c->x86_model == 9 || c->x86_model == 10) {
48 if (inl (CBAR) & CBAR_ENB)
49 outl (0 | CBAR_KEY, CBAR);
35 } 50 }
36} 51}
37 52
38static void __cpuinit init_amd(struct cpuinfo_x86 *c) 53
54static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c)
39{ 55{
40 u32 l, h; 56 u32 l, h;
41 int mbytes = num_physpages >> (20-PAGE_SHIFT); 57 int mbytes = num_physpages >> (20-PAGE_SHIFT);
42 int r;
43 58
59 if (c->x86_model < 6) {
60 /* Based on AMD doc 20734R - June 2000 */
61 if (c->x86_model == 0) {
62 clear_cpu_cap(c, X86_FEATURE_APIC);
63 set_cpu_cap(c, X86_FEATURE_PGE);
64 }
65 return;
66 }
67
68 if (c->x86_model == 6 && c->x86_mask == 1) {
69 const int K6_BUG_LOOP = 1000000;
70 int n;
71 void (*f_vide)(void);
72 unsigned long d, d2;
73
74 printk(KERN_INFO "AMD K6 stepping B detected - ");
75
76 /*
77 * It looks like AMD fixed the 2.6.2 bug and improved indirect
78 * calls at the same time.
79 */
80
81 n = K6_BUG_LOOP;
82 f_vide = vide;
83 rdtscl(d);
84 while (n--)
85 f_vide();
86 rdtscl(d2);
87 d = d2-d;
88
89 if (d > 20*K6_BUG_LOOP)
90 printk("system stability may be impaired when more than 32 MB are used.\n");
91 else
92 printk("probably OK (after B9730xxxx).\n");
93 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
94 }
95
96 /* K6 with old style WHCR */
97 if (c->x86_model < 8 ||
98 (c->x86_model == 8 && c->x86_mask < 8)) {
99 /* We can only write allocate on the low 508Mb */
100 if (mbytes > 508)
101 mbytes = 508;
102
103 rdmsr(MSR_K6_WHCR, l, h);
104 if ((l&0x0000FFFF) == 0) {
105 unsigned long flags;
106 l = (1<<0)|((mbytes/4)<<1);
107 local_irq_save(flags);
108 wbinvd();
109 wrmsr(MSR_K6_WHCR, l, h);
110 local_irq_restore(flags);
111 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
112 mbytes);
113 }
114 return;
115 }
116
117 if ((c->x86_model == 8 && c->x86_mask > 7) ||
118 c->x86_model == 9 || c->x86_model == 13) {
119 /* The more serious chips .. */
120
121 if (mbytes > 4092)
122 mbytes = 4092;
123
124 rdmsr(MSR_K6_WHCR, l, h);
125 if ((l&0xFFFF0000) == 0) {
126 unsigned long flags;
127 l = ((mbytes>>2)<<22)|(1<<16);
128 local_irq_save(flags);
129 wbinvd();
130 wrmsr(MSR_K6_WHCR, l, h);
131 local_irq_restore(flags);
132 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
133 mbytes);
134 }
135
136 return;
137 }
138
139 if (c->x86_model == 10) {
140 /* AMD Geode LX is model 10 */
141 /* placeholder for any needed mods */
142 return;
143 }
144}
145
146static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
147{
148 u32 l, h;
149
150 /*
151 * Bit 15 of Athlon specific MSR 15 needs to be 0
152 * to enable SSE on Palomino/Morgan/Barton CPUs.
153 * If the BIOS didn't enable it already, enable it here.
154 */
155 if (c->x86_model >= 6 && c->x86_model <= 10) {
156 if (!cpu_has(c, X86_FEATURE_XMM)) {
157 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
158 rdmsr(MSR_K7_HWCR, l, h);
159 l &= ~0x00008000;
160 wrmsr(MSR_K7_HWCR, l, h);
161 set_cpu_cap(c, X86_FEATURE_XMM);
162 }
163 }
164
165 /*
166 * It's been determined by AMD that Athlons since model 8 stepping 1
167 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
168 * As per AMD technical note 27212 0.2
169 */
170 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
171 rdmsr(MSR_K7_CLK_CTL, l, h);
172 if ((l & 0xfff00000) != 0x20000000) {
173 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
174 ((l & 0x000fffff)|0x20000000));
175 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
176 }
177 }
178
179 set_cpu_cap(c, X86_FEATURE_K7);
180}
181#endif
182
183#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
184static int __cpuinit nearby_node(int apicid)
185{
186 int i, node;
187
188 for (i = apicid - 1; i >= 0; i--) {
189 node = apicid_to_node[i];
190 if (node != NUMA_NO_NODE && node_online(node))
191 return node;
192 }
193 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
194 node = apicid_to_node[i];
195 if (node != NUMA_NO_NODE && node_online(node))
196 return node;
197 }
198 return first_node(node_online_map); /* Shouldn't happen */
199}
200#endif
201
202/*
203 * On an AMD dual core setup the lower bits of the APIC id distinguish the cores.
204 * Assumes number of cores is a power of two.
205 */
206static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
207{
208#ifdef CONFIG_X86_HT
209 unsigned bits;
210
211 bits = c->x86_coreid_bits;
212
213 /* Low order bits define the core id (index of core in socket) */
214 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
215 /* Convert the initial APIC ID into the socket ID */
216 c->phys_proc_id = c->initial_apicid >> bits;
217#endif
218}
219
220static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
221{
222#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
223 int cpu = smp_processor_id();
224 int node;
225 unsigned apicid = hard_smp_processor_id();
226
227 node = c->phys_proc_id;
228 if (apicid_to_node[apicid] != NUMA_NO_NODE)
229 node = apicid_to_node[apicid];
230 if (!node_online(node)) {
231 /* Two possibilities here:
232 - The CPU is missing memory and no node was created.
233 In that case try picking one from a nearby CPU
234 - The APIC IDs differ from the HyperTransport node IDs
235 which the K8 northbridge parsing fills in.
236 Assume they are all increased by a constant offset,
237 but in the same order as the HT nodeids.
238 If that doesn't result in a usable node fall back to the
239 path for the previous case. */
240
241 int ht_nodeid = c->initial_apicid;
242
243 if (ht_nodeid >= 0 &&
244 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
245 node = apicid_to_node[ht_nodeid];
246 /* Pick a nearby node */
247 if (!node_online(node))
248 node = nearby_node(apicid);
249 }
250 numa_set_node(cpu, node);
251
252 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
253#endif
254}
255
256static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
257{
258#ifdef CONFIG_X86_HT
259 unsigned bits, ecx;
260
261 /* Multi core CPU? */
262 if (c->extended_cpuid_level < 0x80000008)
263 return;
264
265 ecx = cpuid_ecx(0x80000008);
266
267 c->x86_max_cores = (ecx & 0xff) + 1;
268
269 /* CPU telling us the core id bits shift? */
270 bits = (ecx >> 12) & 0xF;
271
272 /* Otherwise recompute */
273 if (bits == 0) {
274 while ((1 << bits) < c->x86_max_cores)
275 bits++;
276 }
277
278 c->x86_coreid_bits = bits;
279#endif
280}
281
282static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
283{
284 early_init_amd_mc(c);
285
286 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
287 if (c->x86_power & (1<<8))
288 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
289
290#ifdef CONFIG_X86_64
291 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
292#else
293 /* Set MTRR capability flag if appropriate */
294 if (c->x86 == 5)
295 if (c->x86_model == 13 || c->x86_model == 9 ||
296 (c->x86_model == 8 && c->x86_mask >= 8))
297 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
298#endif
299}
300
301static void __cpuinit init_amd(struct cpuinfo_x86 *c)
302{
44#ifdef CONFIG_SMP 303#ifdef CONFIG_SMP
45 unsigned long long value; 304 unsigned long long value;
46 305
@@ -51,7 +310,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
51 * Errata 63 for SH-B3 steppings 310 * Errata 63 for SH-B3 steppings
52 * Errata 122 for all steppings (F+ have it disabled by default) 311 * Errata 122 for all steppings (F+ have it disabled by default)
53 */ 312 */
54 if (c->x86 == 15) { 313 if (c->x86 == 0xf) {
55 rdmsrl(MSR_K7_HWCR, value); 314 rdmsrl(MSR_K7_HWCR, value);
56 value |= 1 << 6; 315 value |= 1 << 6;
57 wrmsrl(MSR_K7_HWCR, value); 316 wrmsrl(MSR_K7_HWCR, value);
@@ -61,213 +320,119 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
61 early_init_amd(c); 320 early_init_amd(c);
62 321
63 /* 322 /*
64 * FIXME: We should handle the K5 here. Set up the write
65 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
66 * no bus pipeline)
67 */
68
69 /*
70 * Bit 31 in normal CPUID used for nonstandard 3DNow ID; 323 * Bit 31 in normal CPUID used for nonstandard 3DNow ID;
71 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway 324 * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway
72 */ 325 */
73 clear_cpu_cap(c, 0*32+31); 326 clear_cpu_cap(c, 0*32+31);
74 327
75 r = get_model_name(c); 328#ifdef CONFIG_X86_64
76 329 /* On C+ stepping K8 rep microcode works well for copy/memset */
77 switch (c->x86) { 330 if (c->x86 == 0xf) {
78 case 4: 331 u32 level;
79 /*
80 * General Systems BIOSen alias the cpu frequency registers
81 * of the Elan at 0x000df000. Unfortuantly, one of the Linux
82 * drivers subsequently pokes it, and changes the CPU speed.
83 * Workaround : Remove the unneeded alias.
84 */
85#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */
86#define CBAR_ENB (0x80000000)
87#define CBAR_KEY (0X000000CB)
88 if (c->x86_model == 9 || c->x86_model == 10) {
89 if (inl (CBAR) & CBAR_ENB)
90 outl (0 | CBAR_KEY, CBAR);
91 }
92 break;
93 case 5:
94 if (c->x86_model < 6) {
95 /* Based on AMD doc 20734R - June 2000 */
96 if (c->x86_model == 0) {
97 clear_cpu_cap(c, X86_FEATURE_APIC);
98 set_cpu_cap(c, X86_FEATURE_PGE);
99 }
100 break;
101 }
102
103 if (c->x86_model == 6 && c->x86_mask == 1) {
104 const int K6_BUG_LOOP = 1000000;
105 int n;
106 void (*f_vide)(void);
107 unsigned long d, d2;
108
109 printk(KERN_INFO "AMD K6 stepping B detected - ");
110
111 /*
112 * It looks like AMD fixed the 2.6.2 bug and improved indirect
113 * calls at the same time.
114 */
115
116 n = K6_BUG_LOOP;
117 f_vide = vide;
118 rdtscl(d);
119 while (n--)
120 f_vide();
121 rdtscl(d2);
122 d = d2-d;
123
124 if (d > 20*K6_BUG_LOOP)
125 printk("system stability may be impaired when more than 32 MB are used.\n");
126 else
127 printk("probably OK (after B9730xxxx).\n");
128 printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n");
129 }
130
131 /* K6 with old style WHCR */
132 if (c->x86_model < 8 ||
133 (c->x86_model == 8 && c->x86_mask < 8)) {
134 /* We can only write allocate on the low 508Mb */
135 if (mbytes > 508)
136 mbytes = 508;
137
138 rdmsr(MSR_K6_WHCR, l, h);
139 if ((l&0x0000FFFF) == 0) {
140 unsigned long flags;
141 l = (1<<0)|((mbytes/4)<<1);
142 local_irq_save(flags);
143 wbinvd();
144 wrmsr(MSR_K6_WHCR, l, h);
145 local_irq_restore(flags);
146 printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n",
147 mbytes);
148 }
149 break;
150 }
151
152 if ((c->x86_model == 8 && c->x86_mask > 7) ||
153 c->x86_model == 9 || c->x86_model == 13) {
154 /* The more serious chips .. */
155
156 if (mbytes > 4092)
157 mbytes = 4092;
158
159 rdmsr(MSR_K6_WHCR, l, h);
160 if ((l&0xFFFF0000) == 0) {
161 unsigned long flags;
162 l = ((mbytes>>2)<<22)|(1<<16);
163 local_irq_save(flags);
164 wbinvd();
165 wrmsr(MSR_K6_WHCR, l, h);
166 local_irq_restore(flags);
167 printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n",
168 mbytes);
169 }
170
171 /* Set MTRR capability flag if appropriate */
172 if (c->x86_model == 13 || c->x86_model == 9 ||
173 (c->x86_model == 8 && c->x86_mask >= 8))
174 set_cpu_cap(c, X86_FEATURE_K6_MTRR);
175 break;
176 }
177
178 if (c->x86_model == 10) {
179 /* AMD Geode LX is model 10 */
180 /* placeholder for any needed mods */
181 break;
182 }
183 break;
184 case 6: /* An Athlon/Duron */
185 332
186 /* 333 level = cpuid_eax(1);
187 * Bit 15 of Athlon specific MSR 15, needs to be 0 334 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
188 * to enable SSE on Palomino/Morgan/Barton CPU's. 335 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
189 * If the BIOS didn't enable it already, enable it here.
190 */
191 if (c->x86_model >= 6 && c->x86_model <= 10) {
192 if (!cpu_has(c, X86_FEATURE_XMM)) {
193 printk(KERN_INFO "Enabling disabled K7/SSE Support.\n");
194 rdmsr(MSR_K7_HWCR, l, h);
195 l &= ~0x00008000;
196 wrmsr(MSR_K7_HWCR, l, h);
197 set_cpu_cap(c, X86_FEATURE_XMM);
198 }
199 }
200
201 /*
202 * It's been determined by AMD that Athlons since model 8 stepping 1
203 * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx
204 * As per AMD technical note 27212 0.2
205 */
206 if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) {
207 rdmsr(MSR_K7_CLK_CTL, l, h);
208 if ((l & 0xfff00000) != 0x20000000) {
209 printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l,
210 ((l & 0x000fffff)|0x20000000));
211 wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h);
212 }
213 }
214 break;
215 } 336 }
337 if (c->x86 == 0x10 || c->x86 == 0x11)
338 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
339#else
340
341 /*
342 * FIXME: We should handle the K5 here. Set up the write
343 * range and also turn on MSR 83 bits 4 and 31 (write alloc,
344 * no bus pipeline)
345 */
216 346
217 switch (c->x86) { 347 switch (c->x86) {
218 case 15: 348 case 4:
219 /* Use K8 tuning for Fam10h and Fam11h */ 349 init_amd_k5(c);
220 case 0x10:
221 case 0x11:
222 set_cpu_cap(c, X86_FEATURE_K8);
223 break; 350 break;
224 case 6: 351 case 5:
225 set_cpu_cap(c, X86_FEATURE_K7); 352 init_amd_k6(c);
353 break;
354 case 6: /* An Athlon/Duron */
355 init_amd_k7(c);
226 break; 356 break;
227 } 357 }
358
359 /* K6s report MCEs but don't actually have all the MSRs */
360 if (c->x86 < 6)
361 clear_cpu_cap(c, X86_FEATURE_MCE);
362#endif
363
364 /* Enable workaround for FXSAVE leak */
228 if (c->x86 >= 6) 365 if (c->x86 >= 6)
229 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); 366 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
230 367
231 display_cacheinfo(c); 368 if (!c->x86_model_id[0]) {
232 369 switch (c->x86) {
233 if (cpuid_eax(0x80000000) >= 0x80000008) 370 case 0xf:
234 c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; 371 /* Should distinguish Models here, but this is only
372 a fallback anyway. */
373 strcpy(c->x86_model_id, "Hammer");
374 break;
375 }
376 }
235 377
236#ifdef CONFIG_X86_HT 378 display_cacheinfo(c);
237 /*
238 * On a AMD multi core setup the lower bits of the APIC id
239 * distinguish the cores.
240 */
241 if (c->x86_max_cores > 1) {
242 int cpu = smp_processor_id();
243 unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf;
244 379
245 if (bits == 0) { 380 /* Multi core CPU? */
246 while ((1 << bits) < c->x86_max_cores) 381 if (c->extended_cpuid_level >= 0x80000008) {
247 bits++; 382 amd_detect_cmp(c);
248 } 383 srat_detect_node(c);
249 c->cpu_core_id = c->phys_proc_id & ((1<<bits)-1);
250 c->phys_proc_id >>= bits;
251 printk(KERN_INFO "CPU %d(%d) -> Core %d\n",
252 cpu, c->x86_max_cores, c->cpu_core_id);
253 } 384 }
385
386#ifdef CONFIG_X86_32
387 detect_ht(c);
254#endif 388#endif
255 389
256 if (cpuid_eax(0x80000000) >= 0x80000006) { 390 if (c->extended_cpuid_level >= 0x80000006) {
257 if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) 391 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000))
258 num_cache_leaves = 4; 392 num_cache_leaves = 4;
259 else 393 else
260 num_cache_leaves = 3; 394 num_cache_leaves = 3;
261 } 395 }
262 396
263 /* K6s reports MCEs but don't actually have all the MSRs */ 397 if (c->x86 >= 0xf && c->x86 <= 0x11)
264 if (c->x86 < 6) 398 set_cpu_cap(c, X86_FEATURE_K8);
265 clear_cpu_cap(c, X86_FEATURE_MCE);
266 399
267 if (cpu_has_xmm2) 400 if (cpu_has_xmm2) {
401 /* MFENCE stops RDTSC speculation */
268 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); 402 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
403 }
404
405#ifdef CONFIG_X86_64
406 if (c->x86 == 0x10) {
407 /* do this for boot cpu */
408 if (c == &boot_cpu_data)
409 check_enable_amd_mmconf_dmi();
410
411 fam10h_check_enable_mmcfg();
412 }
413
414 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
415 unsigned long long tseg;
416
417 /*
418 * Split up direct mapping around the TSEG SMM area.
419 * Don't do it for gbpages because there seems very little
420 * benefit in doing so.
421 */
422 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
423 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
424 if ((tseg>>PMD_SHIFT) <
425 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
426 ((tseg>>PMD_SHIFT) <
427 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
428 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
429 set_memory_4k((unsigned long)__va(tseg), 1);
430 }
431 }
432#endif
269} 433}
270 434
435#ifdef CONFIG_X86_32
271static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) 436static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size)
272{ 437{
273 /* AMD errata T13 (order #21922) */ 438 /* AMD errata T13 (order #21922) */
@@ -280,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int
280 } 445 }
281 return size; 446 return size;
282} 447}
448#endif
283 449
284static struct cpu_dev amd_cpu_dev __cpuinitdata = { 450static struct cpu_dev amd_cpu_dev __cpuinitdata = {
285 .c_vendor = "AMD", 451 .c_vendor = "AMD",
286 .c_ident = { "AuthenticAMD" }, 452 .c_ident = { "AuthenticAMD" },
453#ifdef CONFIG_X86_32
287 .c_models = { 454 .c_models = {
288 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = 455 { .vendor = X86_VENDOR_AMD, .family = 4, .model_names =
289 { 456 {
@@ -296,9 +463,11 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = {
296 } 463 }
297 }, 464 },
298 }, 465 },
466 .c_size_cache = amd_size_cache,
467#endif
299 .c_early_init = early_init_amd, 468 .c_early_init = early_init_amd,
300 .c_init = init_amd, 469 .c_init = init_amd,
301 .c_size_cache = amd_size_cache, 470 .c_x86_vendor = X86_VENDOR_AMD,
302}; 471};
303 472
304cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); 473cpu_dev_register(amd_cpu_dev);
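As a companion to early_init_amd_mc() and amd_detect_cmp() above, the following user-space sketch shows how the core-id width reported by CPUID 0x80000008 splits the initial APIC ID into core and socket numbers. It assumes an AMD CPU that implements that leaf; the helper is an illustrative stand-in for the kernel's cpuid_ecx()/cpuid_ebx(), not kernel code.

#include <stdio.h>

static unsigned cpuid_reg(unsigned leaf, int reg)	/* reg: 1 = ebx, 2 = ecx */
{
	unsigned a, b, c, d;

	asm volatile("cpuid" : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
			     : "a" (leaf), "c" (0));
	return reg == 1 ? b : c;
}

int main(void)
{
	unsigned ecx = cpuid_reg(0x80000008, 2);
	unsigned max_cores = (ecx & 0xff) + 1;			/* NC field + 1 */
	unsigned bits = (ecx >> 12) & 0xf;			/* ApicIdCoreIdSize */
	unsigned apicid = (cpuid_reg(1, 1) >> 24) & 0xff;	/* initial APIC id */

	if (bits == 0)						/* older parts: recompute */
		while ((1u << bits) < max_cores)
			bits++;

	printf("cores/socket %u, core id %u, socket id %u\n",
	       max_cores, apicid & ((1u << bits) - 1), apicid >> bits);
	return 0;
}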
diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c
deleted file mode 100644
index 7c36fb8a28d4..000000000000
--- a/arch/x86/kernel/cpu/amd_64.c
+++ /dev/null
@@ -1,222 +0,0 @@
1#include <linux/init.h>
2#include <linux/mm.h>
3
4#include <asm/numa_64.h>
5#include <asm/mmconfig.h>
6#include <asm/cacheflush.h>
7
8#include <mach_apic.h>
9
10#include "cpu.h"
11
12int force_mwait __cpuinitdata;
13
14#ifdef CONFIG_NUMA
15static int __cpuinit nearby_node(int apicid)
16{
17 int i, node;
18
19 for (i = apicid - 1; i >= 0; i--) {
20 node = apicid_to_node[i];
21 if (node != NUMA_NO_NODE && node_online(node))
22 return node;
23 }
24 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
25 node = apicid_to_node[i];
26 if (node != NUMA_NO_NODE && node_online(node))
27 return node;
28 }
29 return first_node(node_online_map); /* Shouldn't happen */
30}
31#endif
32
33/*
34 * On a AMD dual core setup the lower bits of the APIC id distingush the cores.
35 * Assumes number of cores is a power of two.
36 */
37static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
38{
39#ifdef CONFIG_SMP
40 unsigned bits;
41#ifdef CONFIG_NUMA
42 int cpu = smp_processor_id();
43 int node = 0;
44 unsigned apicid = hard_smp_processor_id();
45#endif
46 bits = c->x86_coreid_bits;
47
48 /* Low order bits define the core id (index of core in socket) */
49 c->cpu_core_id = c->initial_apicid & ((1 << bits)-1);
50 /* Convert the initial APIC ID into the socket ID */
51 c->phys_proc_id = c->initial_apicid >> bits;
52
53#ifdef CONFIG_NUMA
54 node = c->phys_proc_id;
55 if (apicid_to_node[apicid] != NUMA_NO_NODE)
56 node = apicid_to_node[apicid];
57 if (!node_online(node)) {
58 /* Two possibilities here:
59 - The CPU is missing memory and no node was created.
60 In that case try picking one from a nearby CPU
61 - The APIC IDs differ from the HyperTransport node IDs
62 which the K8 northbridge parsing fills in.
63 Assume they are all increased by a constant offset,
64 but in the same order as the HT nodeids.
65 If that doesn't result in a usable node fall back to the
66 path for the previous case. */
67
68 int ht_nodeid = c->initial_apicid;
69
70 if (ht_nodeid >= 0 &&
71 apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
72 node = apicid_to_node[ht_nodeid];
73 /* Pick a nearby node */
74 if (!node_online(node))
75 node = nearby_node(apicid);
76 }
77 numa_set_node(cpu, node);
78
79 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
80#endif
81#endif
82}
83
84static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c)
85{
86#ifdef CONFIG_SMP
87 unsigned bits, ecx;
88
89 /* Multi core CPU? */
90 if (c->extended_cpuid_level < 0x80000008)
91 return;
92
93 ecx = cpuid_ecx(0x80000008);
94
95 c->x86_max_cores = (ecx & 0xff) + 1;
96
97 /* CPU telling us the core id bits shift? */
98 bits = (ecx >> 12) & 0xF;
99
100 /* Otherwise recompute */
101 if (bits == 0) {
102 while ((1 << bits) < c->x86_max_cores)
103 bits++;
104 }
105
106 c->x86_coreid_bits = bits;
107
108#endif
109}
110
111static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
112{
113 early_init_amd_mc(c);
114
115 /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */
116 if (c->x86_power & (1<<8))
117 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
118}
119
120static void __cpuinit init_amd(struct cpuinfo_x86 *c)
121{
122 unsigned level;
123
124#ifdef CONFIG_SMP
125 unsigned long value;
126
127 /*
128 * Disable TLB flush filter by setting HWCR.FFDIS on K8
129 * bit 6 of msr C001_0015
130 *
131 * Errata 63 for SH-B3 steppings
132 * Errata 122 for all steppings (F+ have it disabled by default)
133 */
134 if (c->x86 == 0xf) {
135 rdmsrl(MSR_K8_HWCR, value);
136 value |= 1 << 6;
137 wrmsrl(MSR_K8_HWCR, value);
138 }
139#endif
140
141 /* Bit 31 in normal CPUID used for nonstandard 3DNow ID;
142 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */
143 clear_cpu_cap(c, 0*32+31);
144
145 /* On C+ stepping K8 rep microcode works well for copy/memset */
146 if (c->x86 == 0xf) {
147 level = cpuid_eax(1);
148 if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)
149 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
150 }
151 if (c->x86 == 0x10 || c->x86 == 0x11)
152 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
153
154 /* Enable workaround for FXSAVE leak */
155 if (c->x86 >= 6)
156 set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK);
157
158 level = get_model_name(c);
159 if (!level) {
160 switch (c->x86) {
161 case 0xf:
162 /* Should distinguish Models here, but this is only
163 a fallback anyways. */
164 strcpy(c->x86_model_id, "Hammer");
165 break;
166 }
167 }
168 display_cacheinfo(c);
169
170 /* Multi core CPU? */
171 if (c->extended_cpuid_level >= 0x80000008)
172 amd_detect_cmp(c);
173
174 if (c->extended_cpuid_level >= 0x80000006 &&
175 (cpuid_edx(0x80000006) & 0xf000))
176 num_cache_leaves = 4;
177 else
178 num_cache_leaves = 3;
179
180 if (c->x86 >= 0xf && c->x86 <= 0x11)
181 set_cpu_cap(c, X86_FEATURE_K8);
182
183 /* MFENCE stops RDTSC speculation */
184 set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC);
185
186 if (c->x86 == 0x10) {
187 /* do this for boot cpu */
188 if (c == &boot_cpu_data)
189 check_enable_amd_mmconf_dmi();
190
191 fam10h_check_enable_mmcfg();
192 }
193
194 if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
195 unsigned long long tseg;
196
197 /*
198 * Split up direct mapping around the TSEG SMM area.
199 * Don't do it for gbpages because there seems very little
200 * benefit in doing so.
201 */
202 if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) {
203 printk(KERN_DEBUG "tseg: %010llx\n", tseg);
204 if ((tseg>>PMD_SHIFT) <
205 (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) ||
206 ((tseg>>PMD_SHIFT) <
207 (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) &&
208 (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT))))
209 set_memory_4k((unsigned long)__va(tseg), 1);
210 }
211 }
212}
213
214static struct cpu_dev amd_cpu_dev __cpuinitdata = {
215 .c_vendor = "AMD",
216 .c_ident = { "AuthenticAMD" },
217 .c_early_init = early_init_amd,
218 .c_init = init_amd,
219};
220
221cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev);
222
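The TSEG comparison retained in init_amd() above (and removed here with amd_64.c) converts both sides to 2 MB units before comparing: a physical address is shifted right by PMD_SHIFT, while a page frame number only needs the remaining PMD_SHIFT - PAGE_SHIFT bits. The sketch below just walks through that arithmetic with made-up sample values; the shift constants are the usual x86 ones.

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KB pages */
#define PMD_SHIFT  21	/* 2 MB large pages */

int main(void)
{
	unsigned long long tseg = 0x7f000000ULL;	/* hypothetical TSEG base address */
	unsigned long long max_pfn_mapped = 0x100000;	/* 4 GB worth of 4 KB page frames */

	unsigned long long tseg_pmd = tseg >> PMD_SHIFT;
	unsigned long long end_pmd = max_pfn_mapped >> (PMD_SHIFT - PAGE_SHIFT);

	printf("TSEG is 2MB page %llu, direct mapping ends at 2MB page %llu\n",
	       tseg_pmd, end_pmd);
	if (tseg_pmd < end_pmd)
		puts("TSEG falls inside the mapping: split it to 4k pages");
	else
		puts("TSEG is outside the mapping: nothing to do");
	return 0;
}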
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 1b1c56bb338f..c8e315f1aa83 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -50,6 +50,8 @@ static double __initdata y = 3145727.0;
50 */ 50 */
51static void __init check_fpu(void) 51static void __init check_fpu(void)
52{ 52{
53 s32 fdiv_bug;
54
53 if (!boot_cpu_data.hard_math) { 55 if (!boot_cpu_data.hard_math) {
54#ifndef CONFIG_MATH_EMULATION 56#ifndef CONFIG_MATH_EMULATION
55 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); 57 printk(KERN_EMERG "No coprocessor found and no math emulation present.\n");
@@ -74,8 +76,10 @@ static void __init check_fpu(void)
74 "fistpl %0\n\t" 76 "fistpl %0\n\t"
75 "fwait\n\t" 77 "fwait\n\t"
76 "fninit" 78 "fninit"
77 : "=m" (*&boot_cpu_data.fdiv_bug) 79 : "=m" (*&fdiv_bug)
78 : "m" (*&x), "m" (*&y)); 80 : "m" (*&x), "m" (*&y));
81
82 boot_cpu_data.fdiv_bug = fdiv_bug;
79 if (boot_cpu_data.fdiv_bug) 83 if (boot_cpu_data.fdiv_bug)
80 printk("Hmm, FPU with FDIV bug.\n"); 84 printk("Hmm, FPU with FDIV bug.\n");
81} 85}
@@ -131,13 +135,7 @@ static void __init check_popad(void)
131 * (for due to lack of "invlpg" and working WP on a i386) 135 * (for due to lack of "invlpg" and working WP on a i386)
132 * - In order to run on anything without a TSC, we need to be 136 * - In order to run on anything without a TSC, we need to be
133 * compiled for a i486. 137 * compiled for a i486.
134 * - In order to support the local APIC on a buggy Pentium machine, 138 */
135 * we need to be compiled with CONFIG_X86_GOOD_APIC disabled,
136 * which happens implicitly if compiled for a Pentium or lower
137 * (unless an advanced selection of CPU features is used) as an
138 * otherwise config implies a properly working local APIC without
139 * the need to do extra reads from the APIC.
140*/
141 139
142static void __init check_config(void) 140static void __init check_config(void)
143{ 141{
@@ -151,21 +149,6 @@ static void __init check_config(void)
151 if (boot_cpu_data.x86 == 3) 149 if (boot_cpu_data.x86 == 3)
152 panic("Kernel requires i486+ for 'invlpg' and other features"); 150 panic("Kernel requires i486+ for 'invlpg' and other features");
153#endif 151#endif
154
155/*
156 * If we were told we had a good local APIC, check for buggy Pentia,
157 * i.e. all B steppings and the C2 stepping of P54C when using their
158 * integrated APIC (see 11AP erratum in "Pentium Processor
159 * Specification Update").
160 */
161#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC)
162 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL
163 && cpu_has_apic
164 && boot_cpu_data.x86 == 5
165 && boot_cpu_data.x86_model == 2
166 && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11))
167 panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!");
168#endif
169} 152}
170 153
171 154
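The check_fpu() change above still relies on the classic FDIV-bug probe; a rough user-space analogue is sketched below. The constants 4195835 and 3145727 are the well-known test pair (the y value also appears in the hunk context); on a correct FPU x - (x/y)*y is exactly zero, while a flawed original Pentium reportedly returns 256. This is an illustration, not the kernel's inline-asm sequence.

#include <stdio.h>

int main(void)
{
	volatile double x = 4195835.0, y = 3145727.0;	/* classic FDIV test pair */
	double z = x - (x / y) * y;			/* 0.0 on a healthy FPU */

	if (z == 0.0)
		puts("FPU looks OK");
	else
		printf("Hmm, FPU with FDIV bug (residue %g)\n", z);
	return 0;
}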
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index e0f45edd6a55..89bfdd9cacc6 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
289 if (c->x86_model >= 6 && c->x86_model < 9) 289 if (c->x86_model >= 6 && c->x86_model < 9)
290 set_cpu_cap(c, X86_FEATURE_3DNOW); 290 set_cpu_cap(c, X86_FEATURE_3DNOW);
291 291
292 get_model_name(c);
293 display_cacheinfo(c); 292 display_cacheinfo(c);
294} 293}
295 294
@@ -314,6 +313,16 @@ enum {
314 EAMD3D = 1<<20, 313 EAMD3D = 1<<20,
315}; 314};
316 315
316static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
317{
318 switch (c->x86) {
319 case 5:
320 /* Emulate MTRRs using Centaur's MCR. */
321 set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR);
322 break;
323 }
324}
325
317static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 326static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
318{ 327{
319 328
@@ -462,8 +471,10 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size)
462static struct cpu_dev centaur_cpu_dev __cpuinitdata = { 471static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
463 .c_vendor = "Centaur", 472 .c_vendor = "Centaur",
464 .c_ident = { "CentaurHauls" }, 473 .c_ident = { "CentaurHauls" },
474 .c_early_init = early_init_centaur,
465 .c_init = init_centaur, 475 .c_init = init_centaur,
466 .c_size_cache = centaur_size_cache, 476 .c_size_cache = centaur_size_cache,
477 .c_x86_vendor = X86_VENDOR_CENTAUR,
467}; 478};
468 479
469cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 480cpu_dev_register(centaur_cpu_dev);
diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c
index 1d181c40e2e1..a1625f5a1e78 100644
--- a/arch/x86/kernel/cpu/centaur_64.c
+++ b/arch/x86/kernel/cpu/centaur_64.c
@@ -16,9 +16,10 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c)
16 16
17static void __cpuinit init_centaur(struct cpuinfo_x86 *c) 17static void __cpuinit init_centaur(struct cpuinfo_x86 *c)
18{ 18{
19 early_init_centaur(c);
20
19 if (c->x86 == 0x6 && c->x86_model >= 0xf) { 21 if (c->x86 == 0x6 && c->x86_model >= 0xf) {
20 c->x86_cache_alignment = c->x86_clflush_size * 2; 22 c->x86_cache_alignment = c->x86_clflush_size * 2;
21 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
22 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 23 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
23 } 24 }
24 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); 25 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
@@ -29,7 +30,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = {
29 .c_ident = { "CentaurHauls" }, 30 .c_ident = { "CentaurHauls" },
30 .c_early_init = early_init_centaur, 31 .c_early_init = early_init_centaur,
31 .c_init = init_centaur, 32 .c_init = init_centaur,
33 .c_x86_vendor = X86_VENDOR_CENTAUR,
32}; 34};
33 35
34cpu_vendor_dev_register(X86_VENDOR_CENTAUR, &centaur_cpu_dev); 36cpu_dev_register(centaur_cpu_dev);
35 37
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
new file mode 100644
index 000000000000..2056ccf572cc
--- /dev/null
+++ b/arch/x86/kernel/cpu/cmpxchg.c
@@ -0,0 +1,72 @@
1/*
2 * cmpxchg*() fallbacks for CPU not supporting these instructions
3 */
4
5#include <linux/kernel.h>
6#include <linux/smp.h>
7#include <linux/module.h>
8
9#ifndef CONFIG_X86_CMPXCHG
10unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
11{
12 u8 prev;
13 unsigned long flags;
14
15 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
16 local_irq_save(flags);
17 prev = *(u8 *)ptr;
18 if (prev == old)
19 *(u8 *)ptr = new;
20 local_irq_restore(flags);
21 return prev;
22}
23EXPORT_SYMBOL(cmpxchg_386_u8);
24
25unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
26{
27 u16 prev;
28 unsigned long flags;
29
30 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
31 local_irq_save(flags);
32 prev = *(u16 *)ptr;
33 if (prev == old)
34 *(u16 *)ptr = new;
35 local_irq_restore(flags);
36 return prev;
37}
38EXPORT_SYMBOL(cmpxchg_386_u16);
39
40unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
41{
42 u32 prev;
43 unsigned long flags;
44
45 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
46 local_irq_save(flags);
47 prev = *(u32 *)ptr;
48 if (prev == old)
49 *(u32 *)ptr = new;
50 local_irq_restore(flags);
51 return prev;
52}
53EXPORT_SYMBOL(cmpxchg_386_u32);
54#endif
55
56#ifndef CONFIG_X86_CMPXCHG64
57unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
58{
59 u64 prev;
60 unsigned long flags;
61
62 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
63 local_irq_save(flags);
64 prev = *(u64 *)ptr;
65 if (prev == old)
66 *(u64 *)ptr = new;
67 local_irq_restore(flags);
68 return prev;
69}
70EXPORT_SYMBOL(cmpxchg_486_u64);
71#endif
72
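The new cmpxchg.c above only supplies the primitive; callers typically wrap it in a read/modify/compare-and-swap retry loop. The sketch below shows that pattern in user space, with gcc's __sync_val_compare_and_swap standing in for the kernel's cmpxchg() (illustration only; on a real 386 the fallback instead disables interrupts around the update, as the file shows).

#include <stdio.h>

static unsigned long counter;

static void add_to_counter(unsigned long delta)
{
	unsigned long old, new_val;

	do {
		old = counter;		/* snapshot the current value */
		new_val = old + delta;	/* compute the desired update */
		/* retry if someone else changed counter since the snapshot */
	} while (__sync_val_compare_and_swap(&counter, old, new_val) != old);
}

int main(void)
{
	add_to_counter(3);
	add_to_counter(7);
	printf("counter = %lu\n", counter);
	return 0;
}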
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 80ab20d4fa39..25581dcb280e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1,27 +1,62 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
2#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
3#include <linux/delay.h> 10#include <linux/delay.h>
4#include <linux/smp.h> 11#include <linux/smp.h>
5#include <linux/module.h>
6#include <linux/percpu.h> 12#include <linux/percpu.h>
7#include <linux/bootmem.h>
8#include <asm/processor.h>
9#include <asm/i387.h> 13#include <asm/i387.h>
10#include <asm/msr.h> 14#include <asm/msr.h>
11#include <asm/io.h> 15#include <asm/io.h>
16#include <asm/linkage.h>
12#include <asm/mmu_context.h> 17#include <asm/mmu_context.h>
13#include <asm/mtrr.h> 18#include <asm/mtrr.h>
14#include <asm/mce.h> 19#include <asm/mce.h>
15#include <asm/pat.h> 20#include <asm/pat.h>
21#include <asm/asm.h>
22#include <asm/numa.h>
16#ifdef CONFIG_X86_LOCAL_APIC 23#ifdef CONFIG_X86_LOCAL_APIC
17#include <asm/mpspec.h> 24#include <asm/mpspec.h>
18#include <asm/apic.h> 25#include <asm/apic.h>
19#include <mach_apic.h> 26#include <mach_apic.h>
27#include <asm/genapic.h>
20#endif 28#endif
21 29
30#include <asm/pda.h>
31#include <asm/pgtable.h>
32#include <asm/processor.h>
33#include <asm/desc.h>
34#include <asm/atomic.h>
35#include <asm/proto.h>
36#include <asm/sections.h>
37#include <asm/setup.h>
38
22#include "cpu.h" 39#include "cpu.h"
23 40
41static struct cpu_dev *this_cpu __cpuinitdata;
42
43#ifdef CONFIG_X86_64
44/* We need valid kernel segments for data and code in long mode too
45 * IRET will check the segment types kkeil 2000/10/28
46 * Also sysret mandates a special GDT layout
47 */
48/* The TLS descriptors are currently at a different place compared to i386.
49 Hopefully nobody expects them at a fixed place (Wine?) */
24DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { 50DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
51 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
52 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
53 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
54 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
55 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
56 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
57} };
58#else
59DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
25 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, 60 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } },
26 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, 61 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } },
27 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, 62 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } },
@@ -55,17 +90,157 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
55 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, 90 [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } },
56 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, 91 [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } },
57} }; 92} };
93#endif
58EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); 94EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
59 95
60__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; 96#ifdef CONFIG_X86_32
61
62static int cachesize_override __cpuinitdata = -1; 97static int cachesize_override __cpuinitdata = -1;
63static int disable_x86_serial_nr __cpuinitdata = 1; 98static int disable_x86_serial_nr __cpuinitdata = 1;
64 99
65struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; 100static int __init cachesize_setup(char *str)
101{
102 get_option(&str, &cachesize_override);
103 return 1;
104}
105__setup("cachesize=", cachesize_setup);
106
107static int __init x86_fxsr_setup(char *s)
108{
109 setup_clear_cpu_cap(X86_FEATURE_FXSR);
110 setup_clear_cpu_cap(X86_FEATURE_XMM);
111 return 1;
112}
113__setup("nofxsr", x86_fxsr_setup);
114
115static int __init x86_sep_setup(char *s)
116{
117 setup_clear_cpu_cap(X86_FEATURE_SEP);
118 return 1;
119}
120__setup("nosep", x86_sep_setup);
121
122/* Standard macro to see if a specific flag is changeable */
123static inline int flag_is_changeable_p(u32 flag)
124{
125 u32 f1, f2;
126
127 /*
128 * Cyrix and IDT cpus allow disabling of CPUID
129 * so the code below may return different results
130 * when it is executed before and after enabling
131 * the CPUID. Add "volatile" to not allow gcc to
132 * optimize the subsequent calls to this function.
133 */
134 asm volatile ("pushfl\n\t"
135 "pushfl\n\t"
136 "popl %0\n\t"
137 "movl %0,%1\n\t"
138 "xorl %2,%0\n\t"
139 "pushl %0\n\t"
140 "popfl\n\t"
141 "pushfl\n\t"
142 "popl %0\n\t"
143 "popfl\n\t"
144 : "=&r" (f1), "=&r" (f2)
145 : "ir" (flag));
146
147 return ((f1^f2) & flag) != 0;
148}
149
150/* Probe for the CPUID instruction */
151static int __cpuinit have_cpuid_p(void)
152{
153 return flag_is_changeable_p(X86_EFLAGS_ID);
154}
155
156static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
157{
158 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) {
159 /* Disable processor serial number */
160 unsigned long lo, hi;
161 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
162 lo |= 0x200000;
163 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
164 printk(KERN_NOTICE "CPU serial number disabled.\n");
165 clear_cpu_cap(c, X86_FEATURE_PN);
166
167 /* Disabling the serial number may affect the cpuid level */
168 c->cpuid_level = cpuid_eax(0);
169 }
170}
171
172static int __init x86_serial_nr_setup(char *s)
173{
174 disable_x86_serial_nr = 0;
175 return 1;
176}
177__setup("serialnumber", x86_serial_nr_setup);
178#else
179static inline int flag_is_changeable_p(u32 flag)
180{
181 return 1;
182}
183/* Probe for the CPUID instruction */
184static inline int have_cpuid_p(void)
185{
186 return 1;
187}
188static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
189{
190}
191#endif
192
193/*
194 * Naming convention should be: <Name> [(<Codename>)]
195 * This table is only used if init_<vendor>() below doesn't set it;
196 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
197 *
198 */
199
200/* Look up CPU names by table lookup. */
201static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
202{
203 struct cpu_model_info *info;
204
205 if (c->x86_model >= 16)
206 return NULL; /* Range check */
207
208 if (!this_cpu)
209 return NULL;
210
211 info = this_cpu->c_models;
212
213 while (info && info->family) {
214 if (info->family == c->x86)
215 return info->model_names[c->x86_model];
216 info++;
217 }
218 return NULL; /* Not found */
219}
220
221__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
222
223/* Current gdt points %fs at the "master" per-cpu area: after this,
224 * it's on the real one. */
225void switch_to_new_gdt(void)
226{
227 struct desc_ptr gdt_descr;
228
229 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
230 gdt_descr.size = GDT_SIZE - 1;
231 load_gdt(&gdt_descr);
232#ifdef CONFIG_X86_32
233 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
234#endif
235}
236
237static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
66 238
67static void __cpuinit default_init(struct cpuinfo_x86 *c) 239static void __cpuinit default_init(struct cpuinfo_x86 *c)
68{ 240{
241#ifdef CONFIG_X86_64
242 display_cacheinfo(c);
243#else
69 /* Not much we can do here... */ 244 /* Not much we can do here... */
70 /* Check if at least it has cpuid */ 245 /* Check if at least it has cpuid */
71 if (c->cpuid_level == -1) { 246 if (c->cpuid_level == -1) {
@@ -75,28 +250,22 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c)
75 else if (c->x86 == 3) 250 else if (c->x86 == 3)
76 strcpy(c->x86_model_id, "386"); 251 strcpy(c->x86_model_id, "386");
77 } 252 }
253#endif
78} 254}
79 255
80static struct cpu_dev __cpuinitdata default_cpu = { 256static struct cpu_dev __cpuinitdata default_cpu = {
81 .c_init = default_init, 257 .c_init = default_init,
82 .c_vendor = "Unknown", 258 .c_vendor = "Unknown",
259 .c_x86_vendor = X86_VENDOR_UNKNOWN,
83}; 260};
84static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
85
86static int __init cachesize_setup(char *str)
87{
88 get_option(&str, &cachesize_override);
89 return 1;
90}
91__setup("cachesize=", cachesize_setup);
92 261
93int __cpuinit get_model_name(struct cpuinfo_x86 *c) 262static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
94{ 263{
95 unsigned int *v; 264 unsigned int *v;
96 char *p, *q; 265 char *p, *q;
97 266
98 if (cpuid_eax(0x80000000) < 0x80000004) 267 if (c->extended_cpuid_level < 0x80000004)
99 return 0; 268 return;
100 269
101 v = (unsigned int *) c->x86_model_id; 270 v = (unsigned int *) c->x86_model_id;
102 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); 271 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
@@ -115,30 +284,34 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c)
115 while (q <= &c->x86_model_id[48]) 284 while (q <= &c->x86_model_id[48])
116 *q++ = '\0'; /* Zero-pad the rest */ 285 *q++ = '\0'; /* Zero-pad the rest */
117 } 286 }
118
119 return 1;
120} 287}
121 288
122
123void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 289void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
124{ 290{
125 unsigned int n, dummy, ecx, edx, l2size; 291 unsigned int n, dummy, ebx, ecx, edx, l2size;
126 292
127 n = cpuid_eax(0x80000000); 293 n = c->extended_cpuid_level;
128 294
129 if (n >= 0x80000005) { 295 if (n >= 0x80000005) {
130 cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); 296 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
131 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", 297 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
132 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); 298 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
133 c->x86_cache_size = (ecx>>24)+(edx>>24); 299 c->x86_cache_size = (ecx>>24) + (edx>>24);
300#ifdef CONFIG_X86_64
301 /* On K8 L1 TLB is inclusive, so don't count it */
302 c->x86_tlbsize = 0;
303#endif
134 } 304 }
135 305
136 if (n < 0x80000006) /* Some chips just have a large L1. */ 306
137 return; 307 return;
138 308
139 ecx = cpuid_ecx(0x80000006); 309 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
140 l2size = ecx >> 16; 310 l2size = ecx >> 16;
141 311
312#ifdef CONFIG_X86_64
313 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
314#else
142 /* do processor-specific cache resizing */ 315 /* do processor-specific cache resizing */
143 if (this_cpu->c_size_cache) 316 if (this_cpu->c_size_cache)
144 l2size = this_cpu->c_size_cache(c, l2size); 317 l2size = this_cpu->c_size_cache(c, l2size);
@@ -149,116 +322,106 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
149 322
150 if (l2size == 0) 323 if (l2size == 0)
151 return; /* Again, no L2 cache is possible */ 324 return; /* Again, no L2 cache is possible */
325#endif
152 326
153 c->x86_cache_size = l2size; 327 c->x86_cache_size = l2size;
154 328
155 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", 329 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
156 l2size, ecx & 0xFF); 330 l2size, ecx & 0xFF);
157} 331}
158 332
159/* 333void __cpuinit detect_ht(struct cpuinfo_x86 *c)
160 * Naming convention should be: <Name> [(<Codename>)]
161 * This table only is used unless init_<vendor>() below doesn't set it;
162 * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used
163 *
164 */
165
166/* Look up CPU names by table lookup. */
167static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c)
168{ 334{
169 struct cpu_model_info *info; 335#ifdef CONFIG_X86_HT
336 u32 eax, ebx, ecx, edx;
337 int index_msb, core_bits;
170 338
171 if (c->x86_model >= 16) 339 if (!cpu_has(c, X86_FEATURE_HT))
172 return NULL; /* Range check */ 340 return;
173 341
174 if (!this_cpu) 342 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
175 return NULL; 343 goto out;
176 344
177 info = this_cpu->c_models; 345 if (cpu_has(c, X86_FEATURE_XTOPOLOGY))
346 return;
178 347
179 while (info && info->family) { 348 cpuid(1, &eax, &ebx, &ecx, &edx);
180 if (info->family == c->x86) 349
181 return info->model_names[c->x86_model]; 350 smp_num_siblings = (ebx & 0xff0000) >> 16;
182 info++; 351
352 if (smp_num_siblings == 1) {
353 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
354 } else if (smp_num_siblings > 1) {
355
356 if (smp_num_siblings > NR_CPUS) {
357 printk(KERN_WARNING "CPU: Unsupported number of siblings %d",
358 smp_num_siblings);
359 smp_num_siblings = 1;
360 return;
361 }
362
363 index_msb = get_count_order(smp_num_siblings);
364#ifdef CONFIG_X86_64
365 c->phys_proc_id = phys_pkg_id(index_msb);
366#else
367 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb);
368#endif
369
370 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
371
372 index_msb = get_count_order(smp_num_siblings);
373
374 core_bits = get_count_order(c->x86_max_cores);
375
376#ifdef CONFIG_X86_64
377 c->cpu_core_id = phys_pkg_id(index_msb) &
378 ((1 << core_bits) - 1);
379#else
380 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) &
381 ((1 << core_bits) - 1);
382#endif
183 } 383 }
184 return NULL; /* Not found */
185}
186 384
385out:
386 if ((c->x86_max_cores * smp_num_siblings) > 1) {
387 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
388 c->phys_proc_id);
389 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
390 c->cpu_core_id);
391 }
392#endif
393}
187 394
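To make the mask arithmetic in detect_ht() above concrete, here is a small worked example with invented numbers: a package reporting 4 logical CPUs (CPUID.1 EBX[23:16]) and 2 cores, so 2 SMT siblings per core. get_count_order() is essentially ceil(log2()); the helper below is an illustrative stand-in, not the kernel function.

#include <stdio.h>

static int count_order(unsigned n)	/* ceil(log2(n)), like get_count_order() */
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned apicid = 5;		/* sample initial APIC id (binary 101) */
	unsigned siblings_per_pkg = 4;	/* from CPUID.1 EBX[23:16] */
	unsigned max_cores = 2;		/* from vendor-specific detection */
	unsigned siblings_per_core, phys_proc_id, cpu_core_id;
	int index_msb, core_bits;

	index_msb = count_order(siblings_per_pkg);
	phys_proc_id = apicid >> index_msb;		/* package id: 5 >> 2 = 1 */

	siblings_per_core = siblings_per_pkg / max_cores;
	index_msb = count_order(siblings_per_core);
	core_bits = count_order(max_cores);
	cpu_core_id = (apicid >> index_msb) & ((1u << core_bits) - 1);	/* core 0 */

	printf("apicid %u -> package %u, core %u\n", apicid, phys_proc_id, cpu_core_id);
	return 0;
}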
188static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) 395static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
189{ 396{
190 char *v = c->x86_vendor_id; 397 char *v = c->x86_vendor_id;
191 int i; 398 int i;
192 static int printed; 399 static int printed;
193 400
194 for (i = 0; i < X86_VENDOR_NUM; i++) { 401 for (i = 0; i < X86_VENDOR_NUM; i++) {
195 if (cpu_devs[i]) { 402 if (!cpu_devs[i])
196 if (!strcmp(v, cpu_devs[i]->c_ident[0]) || 403 break;
197 (cpu_devs[i]->c_ident[1] && 404
198 !strcmp(v, cpu_devs[i]->c_ident[1]))) { 405 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
199 c->x86_vendor = i; 406 (cpu_devs[i]->c_ident[1] &&
200 if (!early) 407 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
201 this_cpu = cpu_devs[i]; 408 this_cpu = cpu_devs[i];
202 return; 409 c->x86_vendor = this_cpu->c_x86_vendor;
203 } 410 return;
204 } 411 }
205 } 412 }
413
206 if (!printed) { 414 if (!printed) {
207 printed++; 415 printed++;
208 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); 416 printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v);
209 printk(KERN_ERR "CPU: Your system may be unstable.\n"); 417 printk(KERN_ERR "CPU: Your system may be unstable.\n");
210 } 418 }
419
211 c->x86_vendor = X86_VENDOR_UNKNOWN; 420 c->x86_vendor = X86_VENDOR_UNKNOWN;
212 this_cpu = &default_cpu; 421 this_cpu = &default_cpu;
213} 422}
214 423
215 424void __cpuinit cpu_detect(struct cpuinfo_x86 *c)
216static int __init x86_fxsr_setup(char *s)
217{
218 setup_clear_cpu_cap(X86_FEATURE_FXSR);
219 setup_clear_cpu_cap(X86_FEATURE_XMM);
220 return 1;
221}
222__setup("nofxsr", x86_fxsr_setup);
223
224
225static int __init x86_sep_setup(char *s)
226{
227 setup_clear_cpu_cap(X86_FEATURE_SEP);
228 return 1;
229}
230__setup("nosep", x86_sep_setup);
231
232
233/* Standard macro to see if a specific flag is changeable */
234static inline int flag_is_changeable_p(u32 flag)
235{
236 u32 f1, f2;
237
238 asm("pushfl\n\t"
239 "pushfl\n\t"
240 "popl %0\n\t"
241 "movl %0,%1\n\t"
242 "xorl %2,%0\n\t"
243 "pushl %0\n\t"
244 "popfl\n\t"
245 "pushfl\n\t"
246 "popl %0\n\t"
247 "popfl\n\t"
248 : "=&r" (f1), "=&r" (f2)
249 : "ir" (flag));
250
251 return ((f1^f2) & flag) != 0;
252}
253
254
255/* Probe for the CPUID instruction */
256static int __cpuinit have_cpuid_p(void)
257{
258 return flag_is_changeable_p(X86_EFLAGS_ID);
259}
260
261void __init cpu_detect(struct cpuinfo_x86 *c)
262{ 425{
263 /* Get vendor name */ 426 /* Get vendor name */
264 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 427 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
@@ -267,50 +430,87 @@ void __init cpu_detect(struct cpuinfo_x86 *c)
267 (unsigned int *)&c->x86_vendor_id[4]); 430 (unsigned int *)&c->x86_vendor_id[4]);
268 431
269 c->x86 = 4; 432 c->x86 = 4;
433 /* Intel-defined flags: level 0x00000001 */
270 if (c->cpuid_level >= 0x00000001) { 434 if (c->cpuid_level >= 0x00000001) {
271 u32 junk, tfms, cap0, misc; 435 u32 junk, tfms, cap0, misc;
272 cpuid(0x00000001, &tfms, &misc, &junk, &cap0); 436 cpuid(0x00000001, &tfms, &misc, &junk, &cap0);
273 c->x86 = (tfms >> 8) & 15; 437 c->x86 = (tfms >> 8) & 0xf;
274 c->x86_model = (tfms >> 4) & 15; 438 c->x86_model = (tfms >> 4) & 0xf;
439 c->x86_mask = tfms & 0xf;
275 if (c->x86 == 0xf) 440 if (c->x86 == 0xf)
276 c->x86 += (tfms >> 20) & 0xff; 441 c->x86 += (tfms >> 20) & 0xff;
277 if (c->x86 >= 0x6) 442 if (c->x86 >= 0x6)
278 c->x86_model += ((tfms >> 16) & 0xF) << 4; 443 c->x86_model += ((tfms >> 16) & 0xf) << 4;
279 c->x86_mask = tfms & 15;
280 if (cap0 & (1<<19)) { 444 if (cap0 & (1<<19)) {
281 c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8;
282 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; 445 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
446 c->x86_cache_alignment = c->x86_clflush_size;
283 } 447 }
284 } 448 }
285} 449}
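The cpu_detect() body above rebuilds family/model/stepping from CPUID leaf 1 EAX, folding in the extended family and extended model fields. A short worked decode follows, using 0x000006fb (a Core 2-class signature) as the sample input; the value is only an example.

#include <stdio.h>

int main(void)
{
	unsigned tfms = 0x000006fb;		/* sample CPUID(1).EAX signature */
	unsigned family = (tfms >> 8) & 0xf;	/* base family: 0x6 */
	unsigned model = (tfms >> 4) & 0xf;	/* base model: 0xf */
	unsigned stepping = tfms & 0xf;		/* stepping: 0xb */

	if (family == 0xf)			/* extended family only when base is 0xf */
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)			/* extended model for family 6 and up */
		model += ((tfms >> 16) & 0xf) << 4;

	printf("family 0x%x, model 0x%x, stepping 0x%x\n", family, model, stepping);
	/* prints: family 0x6, model 0xf, stepping 0xb */
	return 0;
}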
286static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) 450
451static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
287{ 452{
288 u32 tfms, xlvl; 453 u32 tfms, xlvl;
289 unsigned int ebx; 454 u32 ebx;
290 455
291 memset(&c->x86_capability, 0, sizeof c->x86_capability); 456 /* Intel-defined flags: level 0x00000001 */
292 if (have_cpuid_p()) { 457 if (c->cpuid_level >= 0x00000001) {
293 /* Intel-defined flags: level 0x00000001 */ 458 u32 capability, excap;
294 if (c->cpuid_level >= 0x00000001) { 459 cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
295 u32 capability, excap; 460 c->x86_capability[0] = capability;
296 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 461 c->x86_capability[4] = excap;
297 c->x86_capability[0] = capability; 462 }
298 c->x86_capability[4] = excap;
299 }
300 463
301 /* AMD-defined flags: level 0x80000001 */ 464 /* AMD-defined flags: level 0x80000001 */
302 xlvl = cpuid_eax(0x80000000); 465 xlvl = cpuid_eax(0x80000000);
303 if ((xlvl & 0xffff0000) == 0x80000000) { 466 c->extended_cpuid_level = xlvl;
304 if (xlvl >= 0x80000001) { 467 if ((xlvl & 0xffff0000) == 0x80000000) {
305 c->x86_capability[1] = cpuid_edx(0x80000001); 468 if (xlvl >= 0x80000001) {
306 c->x86_capability[6] = cpuid_ecx(0x80000001); 469 c->x86_capability[1] = cpuid_edx(0x80000001);
307 } 470 c->x86_capability[6] = cpuid_ecx(0x80000001);
308 } 471 }
472 }
473
474#ifdef CONFIG_X86_64
475 if (c->extended_cpuid_level >= 0x80000008) {
476 u32 eax = cpuid_eax(0x80000008);
309 477
478 c->x86_virt_bits = (eax >> 8) & 0xff;
479 c->x86_phys_bits = eax & 0xff;
310 } 480 }
481#endif
482
483 if (c->extended_cpuid_level >= 0x80000007)
484 c->x86_power = cpuid_edx(0x80000007);
311 485
312} 486}
313 487
488static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
489{
490#ifdef CONFIG_X86_32
491 int i;
492
493 /*
494 * First of all, decide if this is a 486 or higher
495 * It's a 486 if we can modify the AC flag
496 */
497 if (flag_is_changeable_p(X86_EFLAGS_AC))
498 c->x86 = 4;
499 else
500 c->x86 = 3;
501
502 for (i = 0; i < X86_VENDOR_NUM; i++)
503 if (cpu_devs[i] && cpu_devs[i]->c_identify) {
504 c->x86_vendor_id[0] = 0;
505 cpu_devs[i]->c_identify(c);
506 if (c->x86_vendor_id[0]) {
507 get_cpu_vendor(c);
508 break;
509 }
510 }
511#endif
512}
513
314/* 514/*
315 * Do minimum CPU detection early. 515 * Do minimum CPU detection early.
316 * Fields really needed: vendor, cpuid_level, family, model, mask, 516 * Fields really needed: vendor, cpuid_level, family, model, mask,
@@ -320,109 +520,113 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c)
320 * WARNING: this function is only called on the BP. Don't add code here 520 * WARNING: this function is only called on the BP. Don't add code here
321 * that is supposed to run on all CPUs. 521 * that is supposed to run on all CPUs.
322 */ 522 */
323static void __init early_cpu_detect(void) 523static void __init early_identify_cpu(struct cpuinfo_x86 *c)
324{ 524{
325 struct cpuinfo_x86 *c = &boot_cpu_data; 525#ifdef CONFIG_X86_64
326 526 c->x86_clflush_size = 64;
327 c->x86_cache_alignment = 32; 527#else
328 c->x86_clflush_size = 32; 528 c->x86_clflush_size = 32;
529#endif
530 c->x86_cache_alignment = c->x86_clflush_size;
531
532 memset(&c->x86_capability, 0, sizeof c->x86_capability);
533 c->extended_cpuid_level = 0;
534
535 if (!have_cpuid_p())
536 identify_cpu_without_cpuid(c);
329 537
538 /* cyrix could have cpuid enabled via c_identify()*/
330 if (!have_cpuid_p()) 539 if (!have_cpuid_p())
331 return; 540 return;
332 541
333 cpu_detect(c); 542 cpu_detect(c);
334 543
335 get_cpu_vendor(c, 1); 544 get_cpu_vendor(c);
545
546 get_cpu_cap(c);
336 547
337 if (c->x86_vendor != X86_VENDOR_UNKNOWN && 548 if (this_cpu->c_early_init)
338 cpu_devs[c->x86_vendor]->c_early_init) 549 this_cpu->c_early_init(c);
339 cpu_devs[c->x86_vendor]->c_early_init(c);
340 550
341 early_get_cap(c); 551 validate_pat_support(c);
342} 552}
343 553
344static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 554void __init early_cpu_init(void)
345{ 555{
346 u32 tfms, xlvl; 556 struct cpu_dev **cdev;
347 unsigned int ebx; 557 int count = 0;
348 558
349 if (have_cpuid_p()) { 559 printk("KERNEL supported cpus:\n");
350 /* Get vendor name */ 560 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
351 cpuid(0x00000000, (unsigned int *)&c->cpuid_level, 561 struct cpu_dev *cpudev = *cdev;
352 (unsigned int *)&c->x86_vendor_id[0], 562 unsigned int j;
353 (unsigned int *)&c->x86_vendor_id[8], 563
354 (unsigned int *)&c->x86_vendor_id[4]); 564 if (count >= X86_VENDOR_NUM)
355 565 break;
356 get_cpu_vendor(c, 0); 566 cpu_devs[count] = cpudev;
357 /* Initialize the standard set of capabilities */ 567 count++;
358 /* Note that the vendor-specific code below might override */ 568
359 /* Intel-defined flags: level 0x00000001 */ 569 for (j = 0; j < 2; j++) {
360 if (c->cpuid_level >= 0x00000001) { 570 if (!cpudev->c_ident[j])
361 u32 capability, excap; 571 continue;
362 cpuid(0x00000001, &tfms, &ebx, &excap, &capability); 572 printk(" %s %s\n", cpudev->c_vendor,
363 c->x86_capability[0] = capability; 573 cpudev->c_ident[j]);
364 c->x86_capability[4] = excap;
365 c->x86 = (tfms >> 8) & 15;
366 c->x86_model = (tfms >> 4) & 15;
367 if (c->x86 == 0xf)
368 c->x86 += (tfms >> 20) & 0xff;
369 if (c->x86 >= 0x6)
370 c->x86_model += ((tfms >> 16) & 0xF) << 4;
371 c->x86_mask = tfms & 15;
372 c->initial_apicid = (ebx >> 24) & 0xFF;
373#ifdef CONFIG_X86_HT
374 c->apicid = phys_pkg_id(c->initial_apicid, 0);
375 c->phys_proc_id = c->initial_apicid;
376#else
377 c->apicid = c->initial_apicid;
378#endif
379 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
380 c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8;
381 } else {
382 /* Have CPUID level 0 only - unheard of */
383 c->x86 = 4;
384 }
385
386 /* AMD-defined flags: level 0x80000001 */
387 xlvl = cpuid_eax(0x80000000);
388 if ((xlvl & 0xffff0000) == 0x80000000) {
389 if (xlvl >= 0x80000001) {
390 c->x86_capability[1] = cpuid_edx(0x80000001);
391 c->x86_capability[6] = cpuid_ecx(0x80000001);
392 }
393 if (xlvl >= 0x80000004)
394 get_model_name(c); /* Default name */
395 } 574 }
396
397 init_scattered_cpuid_features(c);
398 } 575 }
399 576
577 early_identify_cpu(&boot_cpu_data);
400} 578}
401 579
402static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) 580/*
581 * The NOPL instruction is supposed to exist on all CPUs with
582 * family >= 6; unfortunately, that's not true in practice because
583 * of early VIA chips and (more importantly) broken virtualizers that
584 * are not easy to detect. In the latter case it doesn't even *fail*
585 * reliably, so probing for it doesn't even work. Disable it completely
586 * unless we can find a reliable way to detect all the broken cases.
587 */
588static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
403{ 589{
404 if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { 590 clear_cpu_cap(c, X86_FEATURE_NOPL);
405 /* Disable processor serial number */
406 unsigned long lo, hi;
407 rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
408 lo |= 0x200000;
409 wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi);
410 printk(KERN_NOTICE "CPU serial number disabled.\n");
411 clear_cpu_cap(c, X86_FEATURE_PN);
412
413 /* Disabling the serial number may affect the cpuid level */
414 c->cpuid_level = cpuid_eax(0);
415 }
416} 591}
417 592
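A minimal sketch of the policy the comment above describes; this is an illustration, not part of the patch. On 64-bit the NOPL instruction is architectural, so a detect routine could simply set the capability there, while 32-bit keeps it cleared until a trustworthy probe exists.

/* Illustrative sketch only -- not this patch's code. */
static void __cpuinit detect_nopl_sketch(struct cpuinfo_x86 *c)
{
#ifdef CONFIG_X86_64
	set_cpu_cap(c, X86_FEATURE_NOPL);	/* architectural on 64-bit */
#else
	clear_cpu_cap(c, X86_FEATURE_NOPL);	/* distrust early VIA CPUs and broken VMs */
#endif
}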
418static int __init x86_serial_nr_setup(char *s) 593static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
419{ 594{
420 disable_x86_serial_nr = 0; 595 c->extended_cpuid_level = 0;
421 return 1; 596
422} 597 if (!have_cpuid_p())
423__setup("serialnumber", x86_serial_nr_setup); 598 identify_cpu_without_cpuid(c);
599
600 /* cyrix could have cpuid enabled via c_identify()*/
601 if (!have_cpuid_p())
602 return;
603
604 cpu_detect(c);
605
606 get_cpu_vendor(c);
424 607
608 get_cpu_cap(c);
425 609
610 if (c->cpuid_level >= 0x00000001) {
611 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF;
612#ifdef CONFIG_X86_32
613# ifdef CONFIG_X86_HT
614 c->apicid = phys_pkg_id(c->initial_apicid, 0);
615# else
616 c->apicid = c->initial_apicid;
617# endif
618#endif
619
620#ifdef CONFIG_X86_HT
621 c->phys_proc_id = c->initial_apicid;
622#endif
623 }
624
625 get_model_name(c); /* Default name */
626
627 init_scattered_cpuid_features(c);
628 detect_nopl(c);
629}
426 630
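For reference, the initial_apicid assignment in generic_identify() above comes straight out of CPUID leaf 1: EBX bits 31..24 carry the APIC ID the processor was assigned at reset. A compact restatement, assuming the usual kernel cpuid helpers:

/* Illustration only: extract the initial APIC ID from CPUID leaf 1. */
static unsigned int initial_apicid_sketch(void)
{
	unsigned int ebx = cpuid_ebx(0x00000001);

	return (ebx >> 24) & 0xff;	/* EBX[31:24] = initial APIC ID */
}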
427/* 631/*
428 * This does the hard work of actually picking apart the CPU stuff... 632 * This does the hard work of actually picking apart the CPU stuff...
@@ -434,30 +638,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
434 c->loops_per_jiffy = loops_per_jiffy; 638 c->loops_per_jiffy = loops_per_jiffy;
435 c->x86_cache_size = -1; 639 c->x86_cache_size = -1;
436 c->x86_vendor = X86_VENDOR_UNKNOWN; 640 c->x86_vendor = X86_VENDOR_UNKNOWN;
437 c->cpuid_level = -1; /* CPUID not detected */
438 c->x86_model = c->x86_mask = 0; /* So far unknown... */ 641 c->x86_model = c->x86_mask = 0; /* So far unknown... */
439 c->x86_vendor_id[0] = '\0'; /* Unset */ 642 c->x86_vendor_id[0] = '\0'; /* Unset */
440 c->x86_model_id[0] = '\0'; /* Unset */ 643 c->x86_model_id[0] = '\0'; /* Unset */
441 c->x86_max_cores = 1; 644 c->x86_max_cores = 1;
645 c->x86_coreid_bits = 0;
646#ifdef CONFIG_X86_64
647 c->x86_clflush_size = 64;
648#else
649 c->cpuid_level = -1; /* CPUID not detected */
442 c->x86_clflush_size = 32; 650 c->x86_clflush_size = 32;
651#endif
652 c->x86_cache_alignment = c->x86_clflush_size;
443 memset(&c->x86_capability, 0, sizeof c->x86_capability); 653 memset(&c->x86_capability, 0, sizeof c->x86_capability);
444 654
445 if (!have_cpuid_p()) {
446 /*
447 * First of all, decide if this is a 486 or higher
448 * It's a 486 if we can modify the AC flag
449 */
450 if (flag_is_changeable_p(X86_EFLAGS_AC))
451 c->x86 = 4;
452 else
453 c->x86 = 3;
454 }
455
456 generic_identify(c); 655 generic_identify(c);
457 656
458 if (this_cpu->c_identify) 657 if (this_cpu->c_identify)
459 this_cpu->c_identify(c); 658 this_cpu->c_identify(c);
460 659
660#ifdef CONFIG_X86_64
661 c->apicid = phys_pkg_id(0);
662#endif
663
461 /* 664 /*
462 * Vendor-specific initialization. In this section we 665 * Vendor-specific initialization. In this section we
463 * canonicalize the feature flags, meaning if there are 666 * canonicalize the feature flags, meaning if there are
@@ -491,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
491 c->x86, c->x86_model); 694 c->x86, c->x86_model);
492 } 695 }
493 696
697#ifdef CONFIG_X86_64
698 detect_ht(c);
699#endif
700
494 /* 701 /*
495 * On SMP, boot_cpu_data holds the common feature set between 702 * On SMP, boot_cpu_data holds the common feature set between
496 * all CPUs; so make sure that we indicate which features are 703 * all CPUs; so make sure that we indicate which features are
@@ -499,7 +706,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
499 */ 706 */
500 if (c != &boot_cpu_data) { 707 if (c != &boot_cpu_data) {
501 /* AND the already accumulated flags with these */ 708 /* AND the already accumulated flags with these */
502 for (i = 0 ; i < NCAPINTS ; i++) 709 for (i = 0; i < NCAPINTS; i++)
503 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 710 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
504 } 711 }
505 712
@@ -507,72 +714,91 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
507 for (i = 0; i < NCAPINTS; i++) 714 for (i = 0; i < NCAPINTS; i++)
508 c->x86_capability[i] &= ~cleared_cpu_caps[i]; 715 c->x86_capability[i] &= ~cleared_cpu_caps[i];
509 716
717#ifdef CONFIG_X86_MCE
510 /* Init Machine Check Exception if available. */ 718 /* Init Machine Check Exception if available. */
511 mcheck_init(c); 719 mcheck_init(c);
720#endif
512 721
513 select_idle_routine(c); 722 select_idle_routine(c);
723
724#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
725 numa_add_cpu(smp_processor_id());
726#endif
514} 727}
515 728
729#ifdef CONFIG_X86_64
730static void vgetcpu_set_mode(void)
731{
732 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
733 vgetcpu_mode = VGETCPU_RDTSCP;
734 else
735 vgetcpu_mode = VGETCPU_LSL;
736}
737#endif
738
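vgetcpu_set_mode() above only selects which instruction the vsyscall will use. A hedged userspace illustration of the RDTSCP path follows; it assumes a compiler that provides x86intrin.h and that the kernel has loaded IA32_TSC_AUX with cpu + (node << 12), which is the conventional encoding rather than something this patch guarantees.

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
	unsigned int aux;
	unsigned long long tsc = __rdtscp(&aux);

	/* Conventional encoding assumed: low 12 bits = CPU, rest = node. */
	printf("tsc=%llu cpu=%u node=%u\n", tsc, aux & 0xfff, aux >> 12);
	return 0;
}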
516void __init identify_boot_cpu(void) 739void __init identify_boot_cpu(void)
517{ 740{
518 identify_cpu(&boot_cpu_data); 741 identify_cpu(&boot_cpu_data);
742#ifdef CONFIG_X86_32
519 sysenter_setup(); 743 sysenter_setup();
520 enable_sep_cpu(); 744 enable_sep_cpu();
745#else
746 vgetcpu_set_mode();
747#endif
521} 748}
522 749
523void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 750void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
524{ 751{
525 BUG_ON(c == &boot_cpu_data); 752 BUG_ON(c == &boot_cpu_data);
526 identify_cpu(c); 753 identify_cpu(c);
754#ifdef CONFIG_X86_32
527 enable_sep_cpu(); 755 enable_sep_cpu();
756#endif
528 mtrr_ap_init(); 757 mtrr_ap_init();
529} 758}
530 759
531#ifdef CONFIG_X86_HT 760struct msr_range {
532void __cpuinit detect_ht(struct cpuinfo_x86 *c) 761 unsigned min;
533{ 762 unsigned max;
534 u32 eax, ebx, ecx, edx; 763};
535 int index_msb, core_bits;
536
537 cpuid(1, &eax, &ebx, &ecx, &edx);
538
539 if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
540 return;
541
542 smp_num_siblings = (ebx & 0xff0000) >> 16;
543 764
544 if (smp_num_siblings == 1) { 765static struct msr_range msr_range_array[] __cpuinitdata = {
545 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 766 { 0x00000000, 0x00000418},
546 } else if (smp_num_siblings > 1) { 767 { 0xc0000000, 0xc000040b},
768 { 0xc0010000, 0xc0010142},
769 { 0xc0011000, 0xc001103b},
770};
547 771
548 if (smp_num_siblings > NR_CPUS) { 772static void __cpuinit print_cpu_msr(void)
549 printk(KERN_WARNING "CPU: Unsupported number of the " 773{
550 "siblings %d", smp_num_siblings); 774 unsigned index;
551 smp_num_siblings = 1; 775 u64 val;
552 return; 776 int i;
777 unsigned index_min, index_max;
778
779 for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) {
780 index_min = msr_range_array[i].min;
781 index_max = msr_range_array[i].max;
782 for (index = index_min; index < index_max; index++) {
783 if (rdmsrl_amd_safe(index, &val))
784 continue;
785 printk(KERN_INFO " MSR%08x: %016llx\n", index, val);
553 } 786 }
787 }
788}
554 789
555 index_msb = get_count_order(smp_num_siblings); 790static int show_msr __cpuinitdata;
556 c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); 791static __init int setup_show_msr(char *arg)
557 792{
558 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 793 int num;
559 c->phys_proc_id);
560
561 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
562
563 index_msb = get_count_order(smp_num_siblings) ;
564
565 core_bits = get_count_order(c->x86_max_cores);
566 794
567 c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & 795 get_option(&arg, &num);
568 ((1 << core_bits) - 1);
569 796
570 if (c->x86_max_cores > 1) 797 if (num > 0)
571 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 798 show_msr = num;
572 c->cpu_core_id); 799 return 1;
573 }
574} 800}
575#endif 801__setup("show_msr=", setup_show_msr);
576 802
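print_cpu_msr() above walks fixed MSR ranges with rdmsrl_amd_safe(). A rough userspace analogue is sketched below; it assumes the msr driver is loaded and /dev/cpu/0/msr is readable, where pread() at an offset equal to the MSR index returns the 8-byte value. The device path, permissions and chosen MSR are assumptions for illustration.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0)
		return 1;
	/* Offset 0x10 = IA32_TIME_STAMP_COUNTER, read as an example. */
	if (pread(fd, &val, sizeof(val), 0x10) == sizeof(val))
		printf("MSR 0x10: %016llx\n", (unsigned long long)val);
	close(fd);
	return 0;
}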
577static __init int setup_noclflush(char *arg) 803static __init int setup_noclflush(char *arg)
578{ 804{
@@ -590,18 +816,26 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
590 else if (c->cpuid_level >= 0) 816 else if (c->cpuid_level >= 0)
591 vendor = c->x86_vendor_id; 817 vendor = c->x86_vendor_id;
592 818
593 if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) 819 if (vendor && !strstr(c->x86_model_id, vendor))
594 printk("%s ", vendor); 820 printk(KERN_CONT "%s ", vendor);
595 821
596 if (!c->x86_model_id[0]) 822 if (c->x86_model_id[0])
597 printk("%d86", c->x86); 823 printk(KERN_CONT "%s", c->x86_model_id);
598 else 824 else
599 printk("%s", c->x86_model_id); 825 printk(KERN_CONT "%d86", c->x86);
600 826
601 if (c->x86_mask || c->cpuid_level >= 0) 827 if (c->x86_mask || c->cpuid_level >= 0)
602 printk(" stepping %02x\n", c->x86_mask); 828 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
603 else 829 else
604 printk("\n"); 830 printk(KERN_CONT "\n");
831
832#ifdef CONFIG_SMP
833 if (c->cpu_index < show_msr)
834 print_cpu_msr();
835#else
836 if (show_msr)
837 print_cpu_msr();
838#endif
605} 839}
606 840
607static __init int setup_disablecpuid(char *arg) 841static __init int setup_disablecpuid(char *arg)
@@ -617,19 +851,89 @@ __setup("clearcpuid=", setup_disablecpuid);
617 851
618cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; 852cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
619 853
620void __init early_cpu_init(void) 854#ifdef CONFIG_X86_64
855struct x8664_pda **_cpu_pda __read_mostly;
856EXPORT_SYMBOL(_cpu_pda);
857
858struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
859
860char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
861
862void __cpuinit pda_init(int cpu)
621{ 863{
622 struct cpu_vendor_dev *cvdev; 864 struct x8664_pda *pda = cpu_pda(cpu);
865
866 /* Setup up data that may be needed in __get_free_pages early */
867 loadsegment(fs, 0);
868 loadsegment(gs, 0);
869 /* Memory clobbers used to order PDA accessed */
870 mb();
871 wrmsrl(MSR_GS_BASE, pda);
872 mb();
873
874 pda->cpunumber = cpu;
875 pda->irqcount = -1;
876 pda->kernelstack = (unsigned long)stack_thread_info() -
877 PDA_STACKOFFSET + THREAD_SIZE;
878 pda->active_mm = &init_mm;
879 pda->mmu_state = 0;
880
881 if (cpu == 0) {
882 /* others are initialized in smpboot.c */
883 pda->pcurrent = &init_task;
884 pda->irqstackptr = boot_cpu_stack;
885 pda->irqstackptr += IRQSTACKSIZE - 64;
886 } else {
887 if (!pda->irqstackptr) {
888 pda->irqstackptr = (char *)
889 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
890 if (!pda->irqstackptr)
891 panic("cannot allocate irqstack for cpu %d",
892 cpu);
893 pda->irqstackptr += IRQSTACKSIZE - 64;
894 }
895
896 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
897 pda->nodenumber = cpu_to_node(cpu);
898 }
899}
900
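Once pda_init() above has written the PDA pointer into MSR_GS_BASE, per-CPU fields become reachable with %gs-relative addressing. A sketch of that access pattern follows; it is illustrative only and not the pda.h accessor macros themselves, with the offset taken via offsetof.

#include <linux/stddef.h>	/* offsetof */

/* Sketch only: read this CPU's number through the GS-based PDA. */
static inline int pda_cpunumber_sketch(void)
{
	int cpu;

	asm("movl %%gs:%c1, %0"
	    : "=r" (cpu)
	    : "i" (offsetof(struct x8664_pda, cpunumber)));
	return cpu;
}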
901char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
902 DEBUG_STKSZ] __page_aligned_bss;
623 903
624 for (cvdev = __x86cpuvendor_start ; 904extern asmlinkage void ignore_sysret(void);
625 cvdev < __x86cpuvendor_end ;
626 cvdev++)
627 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
628 905
629 early_cpu_detect(); 906/* May not be marked __init: used by software suspend */
630 validate_pat_support(&boot_cpu_data); 907void syscall_init(void)
908{
909 /*
910 * LSTAR and STAR live in a bit strange symbiosis.
911 * They both write to the same internal register. STAR allows to
912 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
913 */
914 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
915 wrmsrl(MSR_LSTAR, system_call);
916 wrmsrl(MSR_CSTAR, ignore_sysret);
917
918#ifdef CONFIG_IA32_EMULATION
919 syscall32_cpu_init();
920#endif
921
922 /* Flags to clear on syscall */
923 wrmsrl(MSR_SYSCALL_MASK,
924 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
631} 925}
632 926
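For clarity on the MSR_STAR packing done in syscall_init() above: bits 47:32 seed CS/SS on SYSCALL entry, and bits 63:48 seed them on SYSRET, with the architectural +16/+8 selector offsets for 64-bit returns. A standalone decode is sketched below; 0x10 and 0x23 stand in for __KERNEL_CS and __USER32_CS and are assumed values for illustration only.

#include <stdio.h>

int main(void)
{
	unsigned long long star = (0x23ULL << 48) | (0x10ULL << 32);

	printf("syscall  CS=%#llx SS=%#llx\n",
	       (star >> 32) & 0xffff, ((star >> 32) & 0xffff) + 8);
	printf("sysret64 CS=%#llx SS=%#llx\n",
	       ((star >> 48) & 0xffff) + 16, ((star >> 48) & 0xffff) + 8);
	return 0;
}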
927unsigned long kernel_eflags;
928
929/*
930 * Copies of the original ist values from the tss are only accessed during
931 * debugging, no special alignment required.
932 */
933DEFINE_PER_CPU(struct orig_ist, orig_ist);
934
935#else
936
633/* Make sure %fs is initialized properly in idle threads */ 937/* Make sure %fs is initialized properly in idle threads */
634struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) 938struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
635{ 939{
@@ -637,25 +941,136 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs)
637 regs->fs = __KERNEL_PERCPU; 941 regs->fs = __KERNEL_PERCPU;
638 return regs; 942 return regs;
639} 943}
640 944#endif
641/* Current gdt points %fs at the "master" per-cpu area: after this,
642 * it's on the real one. */
643void switch_to_new_gdt(void)
644{
645 struct desc_ptr gdt_descr;
646
647 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
648 gdt_descr.size = GDT_SIZE - 1;
649 load_gdt(&gdt_descr);
650 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory");
651}
652 945
653/* 946/*
654 * cpu_init() initializes state that is per-CPU. Some data is already 947 * cpu_init() initializes state that is per-CPU. Some data is already
655 * initialized (naturally) in the bootstrap process, such as the GDT 948 * initialized (naturally) in the bootstrap process, such as the GDT
656 * and IDT. We reload them nevertheless, this function acts as a 949 * and IDT. We reload them nevertheless, this function acts as a
657 * 'CPU state barrier', nothing should get across. 950 * 'CPU state barrier', nothing should get across.
951 * A lot of state is already set up in PDA init for 64 bit
658 */ 952 */
953#ifdef CONFIG_X86_64
954void __cpuinit cpu_init(void)
955{
956 int cpu = stack_smp_processor_id();
957 struct tss_struct *t = &per_cpu(init_tss, cpu);
958 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
959 unsigned long v;
960 char *estacks = NULL;
961 struct task_struct *me;
962 int i;
963
964 /* CPU 0 is initialised in head64.c */
965 if (cpu != 0)
966 pda_init(cpu);
967 else
968 estacks = boot_exception_stacks;
969
970 me = current;
971
972 if (cpu_test_and_set(cpu, cpu_initialized))
973 panic("CPU#%d already initialized!\n", cpu);
974
975 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
976
977 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
978
979 /*
980 * Initialize the per-CPU GDT with the boot GDT,
981 * and set up the GDT descriptor:
982 */
983
984 switch_to_new_gdt();
985 load_idt((const struct desc_ptr *)&idt_descr);
986
987 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
988 syscall_init();
989
990 wrmsrl(MSR_FS_BASE, 0);
991 wrmsrl(MSR_KERNEL_GS_BASE, 0);
992 barrier();
993
994 check_efer();
995 if (cpu != 0 && x2apic)
996 enable_x2apic();
997
998 /*
999 * set up and load the per-CPU TSS
1000 */
1001 if (!orig_ist->ist[0]) {
1002 static const unsigned int order[N_EXCEPTION_STACKS] = {
1003 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
1004 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
1005 };
1006 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1007 if (cpu) {
1008 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
1009 if (!estacks)
1010 panic("Cannot allocate exception "
1011 "stack %ld %d\n", v, cpu);
1012 }
1013 estacks += PAGE_SIZE << order[v];
1014 orig_ist->ist[v] = t->x86_tss.ist[v] =
1015 (unsigned long)estacks;
1016 }
1017 }
1018
1019 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
1020 /*
1021 * <= is required because the CPU will access up to
1022 * 8 bits beyond the end of the IO permission bitmap.
1023 */
1024 for (i = 0; i <= IO_BITMAP_LONGS; i++)
1025 t->io_bitmap[i] = ~0UL;
1026
1027 atomic_inc(&init_mm.mm_count);
1028 me->active_mm = &init_mm;
1029 if (me->mm)
1030 BUG();
1031 enter_lazy_tlb(&init_mm, me);
1032
1033 load_sp0(t, &current->thread);
1034 set_tss_desc(cpu, t);
1035 load_TR_desc();
1036 load_LDT(&init_mm.context);
1037
1038#ifdef CONFIG_KGDB
1039 /*
1040 * If the kgdb is connected no debug regs should be altered. This
1041 * is only applicable when KGDB and a KGDB I/O module are built
1042 * into the kernel and you are using early debugging with
1043 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
1044 */
1045 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
1046 arch_kgdb_ops.correct_hw_break();
1047 else {
1048#endif
1049 /*
1050 * Clear all 6 debug registers:
1051 */
1052
1053 set_debugreg(0UL, 0);
1054 set_debugreg(0UL, 1);
1055 set_debugreg(0UL, 2);
1056 set_debugreg(0UL, 3);
1057 set_debugreg(0UL, 6);
1058 set_debugreg(0UL, 7);
1059#ifdef CONFIG_KGDB
1060 /* If the kgdb is connected no debug regs should be altered. */
1061 }
1062#endif
1063
1064 fpu_init();
1065
1066 raw_local_save_flags(kernel_eflags);
1067
1068 if (is_uv_system())
1069 uv_cpu_init();
1070}
1071
1072#else
1073
659void __cpuinit cpu_init(void) 1074void __cpuinit cpu_init(void)
660{ 1075{
661 int cpu = smp_processor_id(); 1076 int cpu = smp_processor_id();
@@ -709,19 +1124,21 @@ void __cpuinit cpu_init(void)
709 /* 1124 /*
710 * Force FPU initialization: 1125 * Force FPU initialization:
711 */ 1126 */
712 current_thread_info()->status = 0; 1127 if (cpu_has_xsave)
1128 current_thread_info()->status = TS_XSAVE;
1129 else
1130 current_thread_info()->status = 0;
713 clear_used_math(); 1131 clear_used_math();
714 mxcsr_feature_mask_init(); 1132 mxcsr_feature_mask_init();
715}
716 1133
717#ifdef CONFIG_HOTPLUG_CPU 1134 /*
718void __cpuinit cpu_uninit(void) 1135 * Boot processor to setup the FP and extended state context info.
719{ 1136 */
720 int cpu = raw_smp_processor_id(); 1137 if (!smp_processor_id())
721 cpu_clear(cpu, cpu_initialized); 1138 init_thread_xstate();
722 1139
723 /* lazy TLB state */ 1140 xsave_init();
724 per_cpu(cpu_tlbstate, cpu).state = 0;
725 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
726} 1141}
1142
1143
727#endif 1144#endif
diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c
deleted file mode 100644
index 7b8cc72feb40..000000000000
--- a/arch/x86/kernel/cpu/common_64.c
+++ /dev/null
@@ -1,681 +0,0 @@
1#include <linux/init.h>
2#include <linux/kernel.h>
3#include <linux/sched.h>
4#include <linux/string.h>
5#include <linux/bootmem.h>
6#include <linux/bitops.h>
7#include <linux/module.h>
8#include <linux/kgdb.h>
9#include <linux/topology.h>
10#include <linux/string.h>
11#include <linux/delay.h>
12#include <linux/smp.h>
13#include <linux/module.h>
14#include <linux/percpu.h>
15#include <asm/processor.h>
16#include <asm/i387.h>
17#include <asm/msr.h>
18#include <asm/io.h>
19#include <asm/mmu_context.h>
20#include <asm/mtrr.h>
21#include <asm/mce.h>
22#include <asm/pat.h>
23#include <asm/numa.h>
24#ifdef CONFIG_X86_LOCAL_APIC
25#include <asm/mpspec.h>
26#include <asm/apic.h>
27#include <mach_apic.h>
28#endif
29#include <asm/pda.h>
30#include <asm/pgtable.h>
31#include <asm/processor.h>
32#include <asm/desc.h>
33#include <asm/atomic.h>
34#include <asm/proto.h>
35#include <asm/sections.h>
36#include <asm/setup.h>
37#include <asm/genapic.h>
38
39#include "cpu.h"
40
41/* We need valid kernel segments for data and code in long mode too
42 * IRET will check the segment types kkeil 2000/10/28
43 * Also sysret mandates a special GDT layout
44 */
45/* The TLS descriptors are currently at a different place compared to i386.
46 Hopefully nobody expects them at a fixed place (Wine?) */
47DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = {
48 [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } },
49 [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } },
50 [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } },
51 [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } },
52 [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } },
53 [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } },
54} };
55EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
56
57__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
58
59/* Current gdt points %fs at the "master" per-cpu area: after this,
60 * it's on the real one. */
61void switch_to_new_gdt(void)
62{
63 struct desc_ptr gdt_descr;
64
65 gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id());
66 gdt_descr.size = GDT_SIZE - 1;
67 load_gdt(&gdt_descr);
68}
69
70struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {};
71
72static void __cpuinit default_init(struct cpuinfo_x86 *c)
73{
74 display_cacheinfo(c);
75}
76
77static struct cpu_dev __cpuinitdata default_cpu = {
78 .c_init = default_init,
79 .c_vendor = "Unknown",
80};
81static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu;
82
83int __cpuinit get_model_name(struct cpuinfo_x86 *c)
84{
85 unsigned int *v;
86
87 if (c->extended_cpuid_level < 0x80000004)
88 return 0;
89
90 v = (unsigned int *) c->x86_model_id;
91 cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]);
92 cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]);
93 cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]);
94 c->x86_model_id[48] = 0;
95 return 1;
96}
97
98
99void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
100{
101 unsigned int n, dummy, ebx, ecx, edx;
102
103 n = c->extended_cpuid_level;
104
105 if (n >= 0x80000005) {
106 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
107 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), "
108 "D cache %dK (%d bytes/line)\n",
109 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
110 c->x86_cache_size = (ecx>>24) + (edx>>24);
111 /* On K8 L1 TLB is inclusive, so don't count it */
112 c->x86_tlbsize = 0;
113 }
114
115 if (n >= 0x80000006) {
116 cpuid(0x80000006, &dummy, &ebx, &ecx, &edx);
117 ecx = cpuid_ecx(0x80000006);
118 c->x86_cache_size = ecx >> 16;
119 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff);
120
121 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
122 c->x86_cache_size, ecx & 0xFF);
123 }
124}
125
126void __cpuinit detect_ht(struct cpuinfo_x86 *c)
127{
128#ifdef CONFIG_SMP
129 u32 eax, ebx, ecx, edx;
130 int index_msb, core_bits;
131
132 cpuid(1, &eax, &ebx, &ecx, &edx);
133
134
135 if (!cpu_has(c, X86_FEATURE_HT))
136 return;
137 if (cpu_has(c, X86_FEATURE_CMP_LEGACY))
138 goto out;
139
140 smp_num_siblings = (ebx & 0xff0000) >> 16;
141
142 if (smp_num_siblings == 1) {
143 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n");
144 } else if (smp_num_siblings > 1) {
145
146 if (smp_num_siblings > NR_CPUS) {
147 printk(KERN_WARNING "CPU: Unsupported number of "
148 "siblings %d", smp_num_siblings);
149 smp_num_siblings = 1;
150 return;
151 }
152
153 index_msb = get_count_order(smp_num_siblings);
154 c->phys_proc_id = phys_pkg_id(index_msb);
155
156 smp_num_siblings = smp_num_siblings / c->x86_max_cores;
157
158 index_msb = get_count_order(smp_num_siblings);
159
160 core_bits = get_count_order(c->x86_max_cores);
161
162 c->cpu_core_id = phys_pkg_id(index_msb) &
163 ((1 << core_bits) - 1);
164 }
165out:
166 if ((c->x86_max_cores * smp_num_siblings) > 1) {
167 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
168 c->phys_proc_id);
169 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
170 c->cpu_core_id);
171 }
172
173#endif
174}
175
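The topology arithmetic in detect_ht() above is easier to see with concrete numbers. A self-contained worked example follows; the APIC ID, sibling count and core count are made up, and count_order() mimics get_count_order().

#include <stdio.h>

static int count_order(unsigned int n)	/* same idea as get_count_order() */
{
	int order = 0;

	while ((1u << order) < n)
		order++;
	return order;
}

int main(void)
{
	unsigned int apicid = 5, siblings = 8, max_cores = 4;

	int index_msb = count_order(siblings);		/* 3 APIC ID bits per package */
	unsigned int phys_proc_id = apicid >> index_msb;

	int core_bits = count_order(max_cores);		/* 2 bits for the core index */
	int thread_bits = count_order(siblings / max_cores);
	unsigned int cpu_core_id = (apicid >> thread_bits) & ((1 << core_bits) - 1);

	printf("package %u core %u\n", phys_proc_id, cpu_core_id);
	return 0;
}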
176static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c)
177{
178 char *v = c->x86_vendor_id;
179 int i;
180 static int printed;
181
182 for (i = 0; i < X86_VENDOR_NUM; i++) {
183 if (cpu_devs[i]) {
184 if (!strcmp(v, cpu_devs[i]->c_ident[0]) ||
185 (cpu_devs[i]->c_ident[1] &&
186 !strcmp(v, cpu_devs[i]->c_ident[1]))) {
187 c->x86_vendor = i;
188 this_cpu = cpu_devs[i];
189 return;
190 }
191 }
192 }
193 if (!printed) {
194 printed++;
195 printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n");
196 printk(KERN_ERR "CPU: Your system may be unstable.\n");
197 }
198 c->x86_vendor = X86_VENDOR_UNKNOWN;
199}
200
201static void __init early_cpu_support_print(void)
202{
203 int i,j;
204 struct cpu_dev *cpu_devx;
205
206 printk("KERNEL supported cpus:\n");
207 for (i = 0; i < X86_VENDOR_NUM; i++) {
208 cpu_devx = cpu_devs[i];
209 if (!cpu_devx)
210 continue;
211 for (j = 0; j < 2; j++) {
212 if (!cpu_devx->c_ident[j])
213 continue;
214 printk(" %s %s\n", cpu_devx->c_vendor,
215 cpu_devx->c_ident[j]);
216 }
217 }
218}
219
220static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c);
221
222void __init early_cpu_init(void)
223{
224 struct cpu_vendor_dev *cvdev;
225
226 for (cvdev = __x86cpuvendor_start ;
227 cvdev < __x86cpuvendor_end ;
228 cvdev++)
229 cpu_devs[cvdev->vendor] = cvdev->cpu_dev;
230 early_cpu_support_print();
231 early_identify_cpu(&boot_cpu_data);
232}
233
234/* Do some early cpuid on the boot CPU to get some parameter that are
235 needed before check_bugs. Everything advanced is in identify_cpu
236 below. */
237static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c)
238{
239 u32 tfms, xlvl;
240
241 c->loops_per_jiffy = loops_per_jiffy;
242 c->x86_cache_size = -1;
243 c->x86_vendor = X86_VENDOR_UNKNOWN;
244 c->x86_model = c->x86_mask = 0; /* So far unknown... */
245 c->x86_vendor_id[0] = '\0'; /* Unset */
246 c->x86_model_id[0] = '\0'; /* Unset */
247 c->x86_clflush_size = 64;
248 c->x86_cache_alignment = c->x86_clflush_size;
249 c->x86_max_cores = 1;
250 c->x86_coreid_bits = 0;
251 c->extended_cpuid_level = 0;
252 memset(&c->x86_capability, 0, sizeof c->x86_capability);
253
254 /* Get vendor name */
255 cpuid(0x00000000, (unsigned int *)&c->cpuid_level,
256 (unsigned int *)&c->x86_vendor_id[0],
257 (unsigned int *)&c->x86_vendor_id[8],
258 (unsigned int *)&c->x86_vendor_id[4]);
259
260 get_cpu_vendor(c);
261
262 /* Initialize the standard set of capabilities */
263 /* Note that the vendor-specific code below might override */
264
265 /* Intel-defined flags: level 0x00000001 */
266 if (c->cpuid_level >= 0x00000001) {
267 __u32 misc;
268 cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4],
269 &c->x86_capability[0]);
270 c->x86 = (tfms >> 8) & 0xf;
271 c->x86_model = (tfms >> 4) & 0xf;
272 c->x86_mask = tfms & 0xf;
273 if (c->x86 == 0xf)
274 c->x86 += (tfms >> 20) & 0xff;
275 if (c->x86 >= 0x6)
276 c->x86_model += ((tfms >> 16) & 0xF) << 4;
277 if (test_cpu_cap(c, X86_FEATURE_CLFLSH))
278 c->x86_clflush_size = ((misc >> 8) & 0xff) * 8;
279 } else {
280 /* Have CPUID level 0 only - unheard of */
281 c->x86 = 4;
282 }
283
284 c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff;
285#ifdef CONFIG_SMP
286 c->phys_proc_id = c->initial_apicid;
287#endif
288 /* AMD-defined flags: level 0x80000001 */
289 xlvl = cpuid_eax(0x80000000);
290 c->extended_cpuid_level = xlvl;
291 if ((xlvl & 0xffff0000) == 0x80000000) {
292 if (xlvl >= 0x80000001) {
293 c->x86_capability[1] = cpuid_edx(0x80000001);
294 c->x86_capability[6] = cpuid_ecx(0x80000001);
295 }
296 if (xlvl >= 0x80000004)
297 get_model_name(c); /* Default name */
298 }
299
300 /* Transmeta-defined flags: level 0x80860001 */
301 xlvl = cpuid_eax(0x80860000);
302 if ((xlvl & 0xffff0000) == 0x80860000) {
303 /* Don't set x86_cpuid_level here for now to not confuse. */
304 if (xlvl >= 0x80860001)
305 c->x86_capability[2] = cpuid_edx(0x80860001);
306 }
307
308 c->extended_cpuid_level = cpuid_eax(0x80000000);
309 if (c->extended_cpuid_level >= 0x80000007)
310 c->x86_power = cpuid_edx(0x80000007);
311
312 if (c->extended_cpuid_level >= 0x80000008) {
313 u32 eax = cpuid_eax(0x80000008);
314
315 c->x86_virt_bits = (eax >> 8) & 0xff;
316 c->x86_phys_bits = eax & 0xff;
317 }
318
319 /* Assume all 64-bit CPUs support 32-bit syscall */
320 set_cpu_cap(c, X86_FEATURE_SYSCALL32);
321
322 if (c->x86_vendor != X86_VENDOR_UNKNOWN &&
323 cpu_devs[c->x86_vendor]->c_early_init)
324 cpu_devs[c->x86_vendor]->c_early_init(c);
325
326 validate_pat_support(c);
327
328 /* early_param could clear that, but recall get it set again */
329 if (disable_apic)
330 clear_cpu_cap(c, X86_FEATURE_APIC);
331}
332
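The signature-decoding block in early_identify_cpu() above is the standard CPUID leaf 1 EAX split: base family in bits 11:8, base model in bits 7:4, the extended family added when the base family is 0xF, and the extended model folded in from bits 19:16 for family >= 6. A compact restatement with an example signature:

#include <stdio.h>

int main(void)
{
	unsigned int tfms = 0x000106a5;	/* example signature */
	unsigned int family = (tfms >> 8) & 0xf;
	unsigned int model = (tfms >> 4) & 0xf;
	unsigned int stepping = tfms & 0xf;

	if (family == 0xf)
		family += (tfms >> 20) & 0xff;
	if (family >= 0x6)
		model += ((tfms >> 16) & 0xf) << 4;

	printf("family %u model %u stepping %u\n", family, model, stepping);
	return 0;
}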
333/*
334 * This does the hard work of actually picking apart the CPU stuff...
335 */
336static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
337{
338 int i;
339
340 early_identify_cpu(c);
341
342 init_scattered_cpuid_features(c);
343
344 c->apicid = phys_pkg_id(0);
345
346 /*
347 * Vendor-specific initialization. In this section we
348 * canonicalize the feature flags, meaning if there are
349 * features a certain CPU supports which CPUID doesn't
350 * tell us, CPUID claiming incorrect flags, or other bugs,
351 * we handle them here.
352 *
353 * At the end of this section, c->x86_capability better
354 * indicate the features this CPU genuinely supports!
355 */
356 if (this_cpu->c_init)
357 this_cpu->c_init(c);
358
359 detect_ht(c);
360
361 /*
362 * On SMP, boot_cpu_data holds the common feature set between
363 * all CPUs; so make sure that we indicate which features are
364 * common between the CPUs. The first time this routine gets
365 * executed, c == &boot_cpu_data.
366 */
367 if (c != &boot_cpu_data) {
368 /* AND the already accumulated flags with these */
369 for (i = 0; i < NCAPINTS; i++)
370 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
371 }
372
373 /* Clear all flags overriden by options */
374 for (i = 0; i < NCAPINTS; i++)
375 c->x86_capability[i] &= ~cleared_cpu_caps[i];
376
377#ifdef CONFIG_X86_MCE
378 mcheck_init(c);
379#endif
380 select_idle_routine(c);
381
382#ifdef CONFIG_NUMA
383 numa_add_cpu(smp_processor_id());
384#endif
385
386}
387
388void __cpuinit identify_boot_cpu(void)
389{
390 identify_cpu(&boot_cpu_data);
391}
392
393void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
394{
395 BUG_ON(c == &boot_cpu_data);
396 identify_cpu(c);
397 mtrr_ap_init();
398}
399
400static __init int setup_noclflush(char *arg)
401{
402 setup_clear_cpu_cap(X86_FEATURE_CLFLSH);
403 return 1;
404}
405__setup("noclflush", setup_noclflush);
406
407void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)
408{
409 if (c->x86_model_id[0])
410 printk(KERN_CONT "%s", c->x86_model_id);
411
412 if (c->x86_mask || c->cpuid_level >= 0)
413 printk(KERN_CONT " stepping %02x\n", c->x86_mask);
414 else
415 printk(KERN_CONT "\n");
416}
417
418static __init int setup_disablecpuid(char *arg)
419{
420 int bit;
421 if (get_option(&arg, &bit) && bit < NCAPINTS*32)
422 setup_clear_cpu_cap(bit);
423 else
424 return 0;
425 return 1;
426}
427__setup("clearcpuid=", setup_disablecpuid);
428
429cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE;
430
431struct x8664_pda **_cpu_pda __read_mostly;
432EXPORT_SYMBOL(_cpu_pda);
433
434struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table };
435
436char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss;
437
438unsigned long __supported_pte_mask __read_mostly = ~0UL;
439EXPORT_SYMBOL_GPL(__supported_pte_mask);
440
441static int do_not_nx __cpuinitdata;
442
443/* noexec=on|off
444Control non executable mappings for 64bit processes.
445
446on Enable(default)
447off Disable
448*/
449static int __init nonx_setup(char *str)
450{
451 if (!str)
452 return -EINVAL;
453 if (!strncmp(str, "on", 2)) {
454 __supported_pte_mask |= _PAGE_NX;
455 do_not_nx = 0;
456 } else if (!strncmp(str, "off", 3)) {
457 do_not_nx = 1;
458 __supported_pte_mask &= ~_PAGE_NX;
459 }
460 return 0;
461}
462early_param("noexec", nonx_setup);
463
464int force_personality32;
465
466/* noexec32=on|off
467Control non executable heap for 32bit processes.
468To control the stack too use noexec=off
469
470on PROT_READ does not imply PROT_EXEC for 32bit processes (default)
471off PROT_READ implies PROT_EXEC
472*/
473static int __init nonx32_setup(char *str)
474{
475 if (!strcmp(str, "on"))
476 force_personality32 &= ~READ_IMPLIES_EXEC;
477 else if (!strcmp(str, "off"))
478 force_personality32 |= READ_IMPLIES_EXEC;
479 return 1;
480}
481__setup("noexec32=", nonx32_setup);
482
483void pda_init(int cpu)
484{
485 struct x8664_pda *pda = cpu_pda(cpu);
486
487 /* Setup up data that may be needed in __get_free_pages early */
488 loadsegment(fs, 0);
489 loadsegment(gs, 0);
490 /* Memory clobbers used to order PDA accessed */
491 mb();
492 wrmsrl(MSR_GS_BASE, pda);
493 mb();
494
495 pda->cpunumber = cpu;
496 pda->irqcount = -1;
497 pda->kernelstack = (unsigned long)stack_thread_info() -
498 PDA_STACKOFFSET + THREAD_SIZE;
499 pda->active_mm = &init_mm;
500 pda->mmu_state = 0;
501
502 if (cpu == 0) {
503 /* others are initialized in smpboot.c */
504 pda->pcurrent = &init_task;
505 pda->irqstackptr = boot_cpu_stack;
506 } else {
507 pda->irqstackptr = (char *)
508 __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER);
509 if (!pda->irqstackptr)
510 panic("cannot allocate irqstack for cpu %d", cpu);
511
512 if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE)
513 pda->nodenumber = cpu_to_node(cpu);
514 }
515
516 pda->irqstackptr += IRQSTACKSIZE-64;
517}
518
519char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ +
520 DEBUG_STKSZ]
521__attribute__((section(".bss.page_aligned")));
522
523extern asmlinkage void ignore_sysret(void);
524
525/* May not be marked __init: used by software suspend */
526void syscall_init(void)
527{
528 /*
529 * LSTAR and STAR live in a bit strange symbiosis.
530 * They both write to the same internal register. STAR allows to
531 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
532 */
533 wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
534 wrmsrl(MSR_LSTAR, system_call);
535 wrmsrl(MSR_CSTAR, ignore_sysret);
536
537#ifdef CONFIG_IA32_EMULATION
538 syscall32_cpu_init();
539#endif
540
541 /* Flags to clear on syscall */
542 wrmsrl(MSR_SYSCALL_MASK,
543 X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL);
544}
545
546void __cpuinit check_efer(void)
547{
548 unsigned long efer;
549
550 rdmsrl(MSR_EFER, efer);
551 if (!(efer & EFER_NX) || do_not_nx)
552 __supported_pte_mask &= ~_PAGE_NX;
553}
554
555unsigned long kernel_eflags;
556
557/*
558 * Copies of the original ist values from the tss are only accessed during
559 * debugging, no special alignment required.
560 */
561DEFINE_PER_CPU(struct orig_ist, orig_ist);
562
563/*
564 * cpu_init() initializes state that is per-CPU. Some data is already
565 * initialized (naturally) in the bootstrap process, such as the GDT
566 * and IDT. We reload them nevertheless, this function acts as a
567 * 'CPU state barrier', nothing should get across.
568 * A lot of state is already set up in PDA init.
569 */
570void __cpuinit cpu_init(void)
571{
572 int cpu = stack_smp_processor_id();
573 struct tss_struct *t = &per_cpu(init_tss, cpu);
574 struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu);
575 unsigned long v;
576 char *estacks = NULL;
577 struct task_struct *me;
578 int i;
579
580 /* CPU 0 is initialised in head64.c */
581 if (cpu != 0)
582 pda_init(cpu);
583 else
584 estacks = boot_exception_stacks;
585
586 me = current;
587
588 if (cpu_test_and_set(cpu, cpu_initialized))
589 panic("CPU#%d already initialized!\n", cpu);
590
591 printk(KERN_INFO "Initializing CPU#%d\n", cpu);
592
593 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
594
595 /*
596 * Initialize the per-CPU GDT with the boot GDT,
597 * and set up the GDT descriptor:
598 */
599
600 switch_to_new_gdt();
601 load_idt((const struct desc_ptr *)&idt_descr);
602
603 memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8);
604 syscall_init();
605
606 wrmsrl(MSR_FS_BASE, 0);
607 wrmsrl(MSR_KERNEL_GS_BASE, 0);
608 barrier();
609
610 check_efer();
611
612 /*
613 * set up and load the per-CPU TSS
614 */
615 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
616 static const unsigned int order[N_EXCEPTION_STACKS] = {
617 [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER,
618 [DEBUG_STACK - 1] = DEBUG_STACK_ORDER
619 };
620 if (cpu) {
621 estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]);
622 if (!estacks)
623 panic("Cannot allocate exception stack %ld %d\n",
624 v, cpu);
625 }
626 estacks += PAGE_SIZE << order[v];
627 orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks;
628 }
629
630 t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
631 /*
632 * <= is required because the CPU will access up to
633 * 8 bits beyond the end of the IO permission bitmap.
634 */
635 for (i = 0; i <= IO_BITMAP_LONGS; i++)
636 t->io_bitmap[i] = ~0UL;
637
638 atomic_inc(&init_mm.mm_count);
639 me->active_mm = &init_mm;
640 if (me->mm)
641 BUG();
642 enter_lazy_tlb(&init_mm, me);
643
644 load_sp0(t, &current->thread);
645 set_tss_desc(cpu, t);
646 load_TR_desc();
647 load_LDT(&init_mm.context);
648
649#ifdef CONFIG_KGDB
650 /*
651 * If the kgdb is connected no debug regs should be altered. This
652 * is only applicable when KGDB and a KGDB I/O module are built
653 * into the kernel and you are using early debugging with
654 * kgdbwait. KGDB will control the kernel HW breakpoint registers.
655 */
656 if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
657 arch_kgdb_ops.correct_hw_break();
658 else {
659#endif
660 /*
661 * Clear all 6 debug registers:
662 */
663
664 set_debugreg(0UL, 0);
665 set_debugreg(0UL, 1);
666 set_debugreg(0UL, 2);
667 set_debugreg(0UL, 3);
668 set_debugreg(0UL, 6);
669 set_debugreg(0UL, 7);
670#ifdef CONFIG_KGDB
671 /* If the kgdb is connected no debug regs should be altered. */
672 }
673#endif
674
675 fpu_init();
676
677 raw_local_save_flags(kernel_eflags);
678
679 if (is_uv_system())
680 uv_cpu_init();
681}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 4d894e8565fe..de4094a39210 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -21,23 +21,16 @@ struct cpu_dev {
21 void (*c_init)(struct cpuinfo_x86 * c); 21 void (*c_init)(struct cpuinfo_x86 * c);
22 void (*c_identify)(struct cpuinfo_x86 * c); 22 void (*c_identify)(struct cpuinfo_x86 * c);
23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); 23 unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size);
24 int c_x86_vendor;
24}; 25};
25 26
26extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; 27#define cpu_dev_register(cpu_devX) \
28 static struct cpu_dev *__cpu_dev_##cpu_devX __used \
29 __attribute__((__section__(".x86_cpu_dev.init"))) = \
30 &cpu_devX;
27 31
28struct cpu_vendor_dev { 32extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[];
29 int vendor;
30 struct cpu_dev *cpu_dev;
31};
32
33#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \
34 static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \
35 __attribute__((__section__(".x86cpuvendor.init"))) = \
36 { cpu_vendor_id, cpu_dev }
37
38extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[];
39 33
40extern int get_model_name(struct cpuinfo_x86 *c);
41extern void display_cacheinfo(struct cpuinfo_x86 *c); 34extern void display_cacheinfo(struct cpuinfo_x86 *c);
42 35
43#endif 36#endif
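The cpu.h change above replaces the cpu_vendor_dev table with a linker-section registry: each vendor file drops a struct cpu_dev pointer into .x86_cpu_dev.init, and early_cpu_init() walks the __x86_cpu_dev_start..__x86_cpu_dev_end array. A hedged sketch of what a registration then looks like; the struct contents are illustrative, not a real vendor driver.

/* Illustrative only -- not an actual vendor driver. */
static struct cpu_dev demo_cpu_dev __cpuinitdata = {
	.c_vendor	= "Demo",
	.c_ident	= { "GenuineDemo" },
	.c_x86_vendor	= X86_VENDOR_UNKNOWN,
};

cpu_dev_register(demo_cpu_dev);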
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index cb7a5715596d..efae3b22a0ff 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -235,9 +235,9 @@ config X86_LONGHAUL
235 If in doubt, say N. 235 If in doubt, say N.
236 236
237config X86_E_POWERSAVER 237config X86_E_POWERSAVER
238 tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" 238 tristate "VIA C7 Enhanced PowerSaver"
239 select CPU_FREQ_TABLE 239 select CPU_FREQ_TABLE
240 depends on X86_32 && EXPERIMENTAL 240 depends on X86_32
241 help 241 help
242 This adds the CPUFreq driver for VIA C7 processors. 242 This adds the CPUFreq driver for VIA C7 processors.
243 243
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index b0c8208df9fa..8e48c5d4467d 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd)
202 cpumask_t saved_mask = current->cpus_allowed; 202 cpumask_t saved_mask = current->cpus_allowed;
203 unsigned int i; 203 unsigned int i;
204 204
205 for_each_cpu_mask(i, cmd->mask) { 205 for_each_cpu_mask_nr(i, cmd->mask) {
206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); 206 set_cpus_allowed_ptr(current, &cpumask_of_cpu(i));
207 do_drv_write(cmd); 207 do_drv_write(cmd);
208 } 208 }
@@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask)
256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and 256 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
257 * no meaning should be associated with absolute values of these MSRs. 257 * no meaning should be associated with absolute values of these MSRs.
258 */ 258 */
259static unsigned int get_measured_perf(unsigned int cpu) 259static unsigned int get_measured_perf(struct cpufreq_policy *policy,
260 unsigned int cpu)
260{ 261{
261 union { 262 union {
262 struct { 263 struct {
@@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu)
326 327
327#endif 328#endif
328 329
329 retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; 330 retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100;
330 331
331 put_cpu(); 332 put_cpu();
332 set_cpus_allowed_ptr(current, &saved_mask); 333 set_cpus_allowed_ptr(current, &saved_mask);
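get_measured_perf() above relies on the documented property that only the IA32_APERF/IA32_MPERF delta ratio carries meaning, never the absolute counter values. A hedged helper showing the same arithmetic on raw counter samples; counter wraparound and the 64/32-bit handling in the real driver are omitted.

/* Sketch only: percentage of maximum frequency from two counter samples. */
static unsigned int aperf_mperf_percent(unsigned long long aperf_prev,
					unsigned long long aperf_now,
					unsigned long long mperf_prev,
					unsigned long long mperf_now)
{
	unsigned long long da = aperf_now - aperf_prev;
	unsigned long long dm = mperf_now - mperf_prev;

	if (!dm)
		return 0;
	return (unsigned int)(da * 100 / dm);	/* ignores overflow for huge deltas */
}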
@@ -451,7 +452,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
451 452
452 freqs.old = perf->states[perf->state].core_frequency * 1000; 453 freqs.old = perf->states[perf->state].core_frequency * 1000;
453 freqs.new = data->freq_table[next_state].frequency; 454 freqs.new = data->freq_table[next_state].frequency;
454 for_each_cpu_mask(i, cmd.mask) { 455 for_each_cpu_mask_nr(i, cmd.mask) {
455 freqs.cpu = i; 456 freqs.cpu = i;
456 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 457 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
457 } 458 }
@@ -466,7 +467,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
466 } 467 }
467 } 468 }
468 469
469 for_each_cpu_mask(i, cmd.mask) { 470 for_each_cpu_mask_nr(i, cmd.mask) {
470 freqs.cpu = i; 471 freqs.cpu = i;
471 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 472 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
472 } 473 }
@@ -779,13 +780,20 @@ static int __init acpi_cpufreq_init(void)
779{ 780{
780 int ret; 781 int ret;
781 782
783 if (acpi_disabled)
784 return 0;
785
782 dprintk("acpi_cpufreq_init\n"); 786 dprintk("acpi_cpufreq_init\n");
783 787
784 ret = acpi_cpufreq_early_init(); 788 ret = acpi_cpufreq_early_init();
785 if (ret) 789 if (ret)
786 return ret; 790 return ret;
787 791
788 return cpufreq_register_driver(&acpi_cpufreq_driver); 792 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
793 if (ret)
794 free_percpu(acpi_perf_data);
795
796 return ret;
789} 797}
790 798
791static void __exit acpi_cpufreq_exit(void) 799static void __exit acpi_cpufreq_exit(void)
@@ -795,8 +803,6 @@ static void __exit acpi_cpufreq_exit(void)
795 cpufreq_unregister_driver(&acpi_cpufreq_driver); 803 cpufreq_unregister_driver(&acpi_cpufreq_driver);
796 804
797 free_percpu(acpi_perf_data); 805 free_percpu(acpi_perf_data);
798
799 return;
800} 806}
801 807
802module_param(acpi_pstate_strict, uint, 0644); 808module_param(acpi_pstate_strict, uint, 0644);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 94619c22f563..fe613c93b366 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -25,8 +25,8 @@
25#include <linux/cpufreq.h> 25#include <linux/cpufreq.h>
26 26
27#include <asm/msr.h> 27#include <asm/msr.h>
28#include <asm/timex.h> 28#include <linux/timex.h>
29#include <asm/io.h> 29#include <linux/io.h>
30 30
31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ 31#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ 32#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
@@ -44,7 +44,7 @@ struct s_elan_multiplier {
44 * It is important that the frequencies 44 * It is important that the frequencies
45 * are listed in ascending order here! 45 * are listed in ascending order here!
46 */ 46 */
47struct s_elan_multiplier elan_multiplier[] = { 47static struct s_elan_multiplier elan_multiplier[] = {
48 {1000, 0x02, 0x18}, 48 {1000, 0x02, 0x18},
49 {2000, 0x02, 0x10}, 49 {2000, 0x02, 0x10},
50 {4000, 0x02, 0x08}, 50 {4000, 0x02, 0x08},
@@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
82 u8 clockspeed_reg; /* Clock Speed Register */ 82 u8 clockspeed_reg; /* Clock Speed Register */
83 83
84 local_irq_disable(); 84 local_irq_disable();
85 outb_p(0x80,REG_CSCIR); 85 outb_p(0x80, REG_CSCIR);
86 clockspeed_reg = inb_p(REG_CSCDR); 86 clockspeed_reg = inb_p(REG_CSCDR);
87 local_irq_enable(); 87 local_irq_enable();
88 88
@@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
98 } 98 }
99 99
100 /* 33 MHz is not 32 MHz... */ 100 /* 33 MHz is not 32 MHz... */
101 if ((clockspeed_reg & 0xE0)==0xA0) 101 if ((clockspeed_reg & 0xE0) == 0xA0)
102 return 33000; 102 return 33000;
103 103
104 return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); 104 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
105} 105}
106 106
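The return expression above decodes bits 7..5 of the clock-speed register into a power-of-two frequency in kHz, with the nominal 32 MHz encoding (0xA0) reported as 33 MHz. A standalone worked example with made-up register values:

#include <stdio.h>

int main(void)
{
	unsigned char regs[] = { 0x40, 0x60, 0xA0 };
	unsigned int i;

	for (i = 0; i < sizeof(regs); i++) {
		unsigned int khz = ((regs[i] & 0xE0) == 0xA0)
			? 33000
			: (1u << ((regs[i] & 0xE0) >> 5)) * 1000;

		printf("reg 0x%02x -> %u kHz\n", regs[i], khz);
	}
	return 0;
}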
107 107
@@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
117 * There is no return value. 117 * There is no return value.
118 */ 118 */
119 119
120static void elanfreq_set_cpu_state (unsigned int state) 120static void elanfreq_set_cpu_state(unsigned int state)
121{ 121{
122 struct cpufreq_freqs freqs; 122 struct cpufreq_freqs freqs;
123 123
@@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state)
144 */ 144 */
145 145
146 local_irq_disable(); 146 local_irq_disable();
147 outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ 147 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
148 outb_p(0x00,REG_CSCDR); 148 outb_p(0x00, REG_CSCDR);
149 local_irq_enable(); /* wait till internal pipelines and */ 149 local_irq_enable(); /* wait till internal pipelines and */
150 udelay(1000); /* buffers have cleaned up */ 150 udelay(1000); /* buffers have cleaned up */
151 151
152 local_irq_disable(); 152 local_irq_disable();
153 153
154 /* now, set the CPU clock speed register (0x80) */ 154 /* now, set the CPU clock speed register (0x80) */
155 outb_p(0x80,REG_CSCIR); 155 outb_p(0x80, REG_CSCIR);
156 outb_p(elan_multiplier[state].val80h,REG_CSCDR); 156 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
157 157
158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ 158 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
159 outb_p(0x40,REG_CSCIR); 159 outb_p(0x40, REG_CSCIR);
160 outb_p(elan_multiplier[state].val40h,REG_CSCDR); 160 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
161 udelay(10000); 161 udelay(10000);
162 local_irq_enable(); 162 local_irq_enable();
163 163
@@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state)
173 * for the hardware supported by the driver. 173 * for the hardware supported by the driver.
174 */ 174 */
175 175
176static int elanfreq_verify (struct cpufreq_policy *policy) 176static int elanfreq_verify(struct cpufreq_policy *policy)
177{ 177{
178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); 178 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
179} 179}
180 180
181static int elanfreq_target (struct cpufreq_policy *policy, 181static int elanfreq_target(struct cpufreq_policy *policy,
182 unsigned int target_freq, 182 unsigned int target_freq,
183 unsigned int relation) 183 unsigned int relation)
184{ 184{
@@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
205 205
206 /* capability check */ 206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) || 207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model!=10)) 208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV; 209 return -ENODEV;
210 210
211 /* max freq */ 211 /* max freq */
@@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
213 max_freq = elanfreq_get_cpu_frequency(0); 213 max_freq = elanfreq_get_cpu_frequency(0);
214 214
215 /* table init */ 215 /* table init */
216 for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { 216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq) 217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; 218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 } 219 }
@@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy)
224 224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); 225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result) 226 if (result)
227 return (result); 227 return result;
228 228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); 229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0; 230 return 0;
@@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup);
260#endif 260#endif
261 261
262 262
263static struct freq_attr* elanfreq_attr[] = { 263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs, 264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL, 265 NULL,
266}; 266};
@@ -284,9 +284,9 @@ static int __init elanfreq_init(void)
284 284
285 /* Test if we have the right hardware */ 285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) || 286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model!=10)) { 287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); 288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV; 289 return -ENODEV;
290 } 290 }
291 return cpufreq_register_driver(&elanfreq_driver); 291 return cpufreq_register_driver(&elanfreq_driver);
292} 292}
@@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void)
298} 298}
299 299
300 300
301module_param (max_freq, int, 0444); 301module_param(max_freq, int, 0444);
302 302
303MODULE_LICENSE("GPL"); 303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>"); 304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, Sven Geggus <sven@geggus.net>");
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 06fcce516d51..b0461856acfb 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * (C) 2001-2004 Dave Jones. <davej@codemonkey.org.uk> 2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com> 3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 * 4 *
5 * Licensed under the terms of the GNU GPL License version 2. 5 * Licensed under the terms of the GNU GPL License version 2.
@@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1019module_param(revid_errata, int, 0644); 1019module_param(revid_errata, int, 0644);
1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); 1020MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1021 1021
1022MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 1022MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); 1023MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors.");
1024MODULE_LICENSE ("GPL"); 1024MODULE_LICENSE ("GPL");
1025 1025
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 199e4e05e5dc..b8e05ee4f736 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
122 return 0; 122 return 0;
123 123
124 /* notifiers */ 124 /* notifiers */
125 for_each_cpu_mask(i, policy->cpus) { 125 for_each_cpu_mask_nr(i, policy->cpus) {
126 freqs.cpu = i; 126 freqs.cpu = i;
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128 } 128 }
@@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy,
130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software 130 /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software
131 * Developer's Manual, Volume 3 131 * Developer's Manual, Volume 3
132 */ 132 */
133 for_each_cpu_mask(i, policy->cpus) 133 for_each_cpu_mask_nr(i, policy->cpus)
134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); 134 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
135 135
136 /* notifiers */ 136 /* notifiers */
137 for_each_cpu_mask(i, policy->cpus) { 137 for_each_cpu_mask_nr(i, policy->cpus) {
138 freqs.cpu = i; 138 freqs.cpu = i;
139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 139 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
140 } 140 }
@@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
171 } 171 }
172 172
173 if (c->x86 != 0xF) { 173 if (c->x86 != 0xF) {
174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@lists.linux.org.uk>\n"); 174 printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to <cpufreq@vger.kernel.org>\n");
175 return 0; 175 return 0;
176 } 176 }
177 177
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index eb9b62b0830c..c1ac5790c63e 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -15,12 +15,11 @@
15#include <linux/slab.h> 15#include <linux/slab.h>
16 16
17#include <asm/msr.h> 17#include <asm/msr.h>
18#include <asm/timex.h> 18#include <linux/timex.h>
19#include <asm/io.h> 19#include <linux/io.h>
20 20
21 21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long 22 as it is unused */
23 as it is unused */
24 23
25static unsigned int busfreq; /* FSB, in 10 kHz */ 24static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier; 25static unsigned int max_multiplier;
@@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void)
53 52
54 msrval = POWERNOW_IOPORT + 0x1; 53 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 54 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue=inl(POWERNOW_IOPORT + 0x8); 55 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0; 56 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 57 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59 58
@@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void)
67 * 66 *
68 * Tries to change the PowerNow! multiplier 67 * Tries to change the PowerNow! multiplier
69 */ 68 */
70static void powernow_k6_set_state (unsigned int best_i) 69static void powernow_k6_set_state(unsigned int best_i)
71{ 70{
72 unsigned long outvalue=0, invalue=0; 71 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval; 72 unsigned long msrval;
74 struct cpufreq_freqs freqs; 73 struct cpufreq_freqs freqs;
75 74
@@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i)
90 89
91 msrval = POWERNOW_IOPORT + 0x1; 90 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ 91 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue=inl(POWERNOW_IOPORT + 0x8); 92 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf; 93 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue; 94 outvalue = outvalue | invalue;
96 outl(outvalue ,(POWERNOW_IOPORT + 0x8)); 95 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0; 96 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ 97 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99 98
@@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy)
124 * 123 *
125 * sets a new CPUFreq policy 124 * sets a new CPUFreq policy
126 */ 125 */
127static int powernow_k6_target (struct cpufreq_policy *policy, 126static int powernow_k6_target(struct cpufreq_policy *policy,
128 unsigned int target_freq, 127 unsigned int target_freq,
129 unsigned int relation) 128 unsigned int relation)
130{ 129{
@@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
152 busfreq = cpu_khz / max_multiplier; 151 busfreq = cpu_khz / max_multiplier;
153 152
154 /* table init */ 153 /* table init */
155 for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { 154 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
156 if (clock_ratio[i].index > max_multiplier) 155 if (clock_ratio[i].index > max_multiplier)
157 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; 156 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
158 else 157 else
@@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
165 164
166 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 165 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
167 if (result) 166 if (result)
168 return (result); 167 return result;
169 168
170 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); 169 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
171 170
@@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
176static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) 175static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
177{ 176{
178 unsigned int i; 177 unsigned int i;
179 for (i=0; i<8; i++) { 178 for (i = 0; i < 8; i++) {
180 if (i==max_multiplier) 179 if (i == max_multiplier)
181 powernow_k6_set_state(i); 180 powernow_k6_set_state(i);
182 } 181 }
183 cpufreq_frequency_table_put_attr(policy->cpu); 182 cpufreq_frequency_table_put_attr(policy->cpu);
@@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu)
189 return busfreq * powernow_k6_get_cpu_multiplier(); 188 return busfreq * powernow_k6_get_cpu_multiplier();
190} 189}
191 190
192static struct freq_attr* powernow_k6_attr[] = { 191static struct freq_attr *powernow_k6_attr[] = {
193 &cpufreq_freq_attr_scaling_available_freqs, 192 &cpufreq_freq_attr_scaling_available_freqs,
194 NULL, 193 NULL,
195}; 194};
@@ -227,7 +226,7 @@ static int __init powernow_k6_init(void)
227 } 226 }
228 227
229 if (cpufreq_register_driver(&powernow_k6_driver)) { 228 if (cpufreq_register_driver(&powernow_k6_driver)) {
230 release_region (POWERNOW_IOPORT, 16); 229 release_region(POWERNOW_IOPORT, 16);
231 return -EINVAL; 230 return -EINVAL;
232 } 231 }
233 232
@@ -243,13 +242,13 @@ static int __init powernow_k6_init(void)
243static void __exit powernow_k6_exit(void) 242static void __exit powernow_k6_exit(void)
244{ 243{
245 cpufreq_unregister_driver(&powernow_k6_driver); 244 cpufreq_unregister_driver(&powernow_k6_driver);
246 release_region (POWERNOW_IOPORT, 16); 245 release_region(POWERNOW_IOPORT, 16);
247} 246}
248 247
249 248
250MODULE_AUTHOR ("Arjan van de Ven <arjanv@redhat.com>, Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 249MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
251MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); 250MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
252MODULE_LICENSE ("GPL"); 251MODULE_LICENSE("GPL");
253 252
254module_init(powernow_k6_init); 253module_init(powernow_k6_init);
255module_exit(powernow_k6_exit); 254module_exit(powernow_k6_exit);
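Note: the multiplier read cleaned up above follows a fixed sequence: program MSR_K6_EPMR with the PowerNow! I/O port plus an enable bit, read the status dword at port+0x8, then disable the port again. A minimal sketch of that sequence, reusing the driver's POWERNOW_IOPORT value (the helper name is hypothetical, not from this patch):

#include <linux/io.h>
#include <asm/msr.h>

#define POWERNOW_IOPORT 0xfff0			/* same unused port as the driver */

static unsigned long powernow_k6_read_epmr(void)
{
	unsigned long val;

	wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x1, 0);	/* enable the PowerNow port */
	val = inl(POWERNOW_IOPORT + 0x8);		/* read the EPMR status dword */
	wrmsr(MSR_K6_EPMR, POWERNOW_IOPORT + 0x0, 0);	/* disable it again */

	return val;
}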
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 0a61159d7b71..7c7d56b43136 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * AMD K7 Powernow driver. 2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs. 3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com> 4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 * 5 *
6 * Licensed under the terms of the GNU GPL License version 2. 6 * Licensed under the terms of the GNU GPL License version 2.
@@ -692,7 +692,7 @@ static void __exit powernow_exit (void)
692module_param(acpi_force, int, 0444); 692module_param(acpi_force, int, 0444);
693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); 693MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
694 694
695MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>"); 695MODULE_AUTHOR ("Dave Jones <davej@redhat.com>");
696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); 696MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors.");
697MODULE_LICENSE ("GPL"); 697MODULE_LICENSE ("GPL");
698 698
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
index f8a63b3664e3..35fb4eaf6e1c 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
@@ -1,5 +1,4 @@
1/* 1/*
2 * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $
3 * (C) 2003 Dave Jones. 2 * (C) 2003 Dave Jones.
4 * 3 *
5 * Licensed under the terms of the GNU GPL License version 2. 4 * Licensed under the terms of the GNU GPL License version 2.
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 206791eb46e3..d3dcd58b87cd 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -7,7 +7,7 @@
7 * Support : mark.langsdorf@amd.com 7 * Support : mark.langsdorf@amd.com
8 * 8 *
9 * Based on the powernow-k7.c module written by Dave Jones. 9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones <davej@codemonkey.org.uk> on behalf of SuSE Labs 10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de> 11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@suse.cz> 12 * (C) 2004 Pavel Machek <pavel@suse.cz>
13 * Licensed under the terms of the GNU GPL License version 2. 13 * Licensed under the terms of the GNU GPL License version 2.
@@ -45,7 +45,6 @@
45#endif 45#endif
46 46
47#define PFX "powernow-k8: " 47#define PFX "powernow-k8: "
48#define BFX PFX "BIOS error: "
49#define VERSION "version 2.20.00" 48#define VERSION "version 2.20.00"
50#include "powernow-k8.h" 49#include "powernow-k8.h"
51 50
@@ -66,7 +65,6 @@ static u32 find_freq_from_fid(u32 fid)
66 return 800 + (fid * 100); 65 return 800 + (fid * 100);
67} 66}
68 67
69
70/* Return a frequency in KHz, given an input fid */ 68/* Return a frequency in KHz, given an input fid */
71static u32 find_khz_freq_from_fid(u32 fid) 69static u32 find_khz_freq_from_fid(u32 fid)
72{ 70{
@@ -78,7 +76,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p
78 return data[pstate].frequency; 76 return data[pstate].frequency;
79} 77}
80 78
81
82/* Return the vco fid for an input fid 79/* Return the vco fid for an input fid
83 * 80 *
84 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids 81 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
@@ -166,7 +163,6 @@ static void fidvid_msr_init(void)
166 wrmsr(MSR_FIDVID_CTL, lo, hi); 163 wrmsr(MSR_FIDVID_CTL, lo, hi);
167} 164}
168 165
169
170/* write the new fid value along with the other control fields to the msr */ 166/* write the new fid value along with the other control fields to the msr */
171static int write_new_fid(struct powernow_k8_data *data, u32 fid) 167static int write_new_fid(struct powernow_k8_data *data, u32 fid)
172{ 168{
@@ -539,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8
539 535
540 for (j = 0; j < data->numps; j++) { 536 for (j = 0; j < data->numps; j++) {
541 if (pst[j].vid > LEAST_VID) { 537 if (pst[j].vid > LEAST_VID) {
542 printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); 538 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
539 j, pst[j].vid);
543 return -EINVAL; 540 return -EINVAL;
544 } 541 }
545 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ 542 if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */
546 printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); 543 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
544 " %d\n", j);
547 return -ENODEV; 545 return -ENODEV;
548 } 546 }
549 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ 547 if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */
550 printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); 548 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
549 " %d\n", j);
551 return -ENODEV; 550 return -ENODEV;
552 } 551 }
553 if (pst[j].fid > MAX_FID) { 552 if (pst[j].fid > MAX_FID) {
554 printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); 553 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
554 " %d\n", j);
555 return -ENODEV; 555 return -ENODEV;
556 } 556 }
557 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { 557 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
558 /* Only first fid is allowed to be in "low" range */ 558 /* Only first fid is allowed to be in "low" range */
559 printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); 559 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
560 "0x%x\n", j, pst[j].fid);
560 return -EINVAL; 561 return -EINVAL;
561 } 562 }
562 if (pst[j].fid < lastfid) 563 if (pst[j].fid < lastfid)
563 lastfid = pst[j].fid; 564 lastfid = pst[j].fid;
564 } 565 }
565 if (lastfid & 1) { 566 if (lastfid & 1) {
566 printk(KERN_ERR BFX "lastfid invalid\n"); 567 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
567 return -EINVAL; 568 return -EINVAL;
568 } 569 }
569 if (lastfid > LO_FID_TABLE_TOP) 570 if (lastfid > LO_FID_TABLE_TOP)
570 printk(KERN_INFO BFX "first fid not from lo freq table\n"); 571 printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n");
571 572
572 return 0; 573 return 0;
573} 574}
@@ -675,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data)
675 676
676 dprintk("table vers: 0x%x\n", psb->tableversion); 677 dprintk("table vers: 0x%x\n", psb->tableversion);
677 if (psb->tableversion != PSB_VERSION_1_4) { 678 if (psb->tableversion != PSB_VERSION_1_4) {
678 printk(KERN_ERR BFX "PSB table is not v1.4\n"); 679 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
679 return -ENODEV; 680 return -ENODEV;
680 } 681 }
681 682
682 dprintk("flags: 0x%x\n", psb->flags1); 683 dprintk("flags: 0x%x\n", psb->flags1);
683 if (psb->flags1) { 684 if (psb->flags1) {
684 printk(KERN_ERR BFX "unknown flags\n"); 685 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
685 return -ENODEV; 686 return -ENODEV;
686 } 687 }
687 688
@@ -708,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data)
708 } 709 }
709 } 710 }
710 if (cpst != 1) { 711 if (cpst != 1) {
711 printk(KERN_ERR BFX "numpst must be 1\n"); 712 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
712 return -ENODEV; 713 return -ENODEV;
713 } 714 }
714 715
@@ -966,7 +967,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
966 freqs.old = find_khz_freq_from_fid(data->currfid); 967 freqs.old = find_khz_freq_from_fid(data->currfid);
967 freqs.new = find_khz_freq_from_fid(fid); 968 freqs.new = find_khz_freq_from_fid(fid);
968 969
969 for_each_cpu_mask(i, *(data->available_cores)) { 970 for_each_cpu_mask_nr(i, *(data->available_cores)) {
970 freqs.cpu = i; 971 freqs.cpu = i;
971 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 972 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
972 } 973 }
@@ -974,7 +975,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i
974 res = transition_fid_vid(data, fid, vid); 975 res = transition_fid_vid(data, fid, vid);
975 freqs.new = find_khz_freq_from_fid(data->currfid); 976 freqs.new = find_khz_freq_from_fid(data->currfid);
976 977
977 for_each_cpu_mask(i, *(data->available_cores)) { 978 for_each_cpu_mask_nr(i, *(data->available_cores)) {
978 freqs.cpu = i; 979 freqs.cpu = i;
979 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 980 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
980 } 981 }
@@ -997,7 +998,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
997 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); 998 freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate);
998 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 999 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
999 1000
1000 for_each_cpu_mask(i, *(data->available_cores)) { 1001 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1001 freqs.cpu = i; 1002 freqs.cpu = i;
1002 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 1003 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1003 } 1004 }
@@ -1005,7 +1006,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i
1005 res = transition_pstate(data, pstate); 1006 res = transition_pstate(data, pstate);
1006 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); 1007 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1007 1008
1008 for_each_cpu_mask(i, *(data->available_cores)) { 1009 for_each_cpu_mask_nr(i, *(data->available_cores)) {
1009 freqs.cpu = i; 1010 freqs.cpu = i;
1010 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 1011 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1011 } 1012 }
@@ -1133,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1133 "ACPI Processor module before starting this " 1134 "ACPI Processor module before starting this "
1134 "driver.\n"); 1135 "driver.\n");
1135#else 1136#else
1136 printk(KERN_ERR PFX "Your BIOS does not provide ACPI " 1137 printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide"
1137 "_PSS objects in a way that Linux understands. " 1138 " ACPI _PSS objects in a way that Linux "
1138 "Please report this to the Linux ACPI maintainers" 1139 "understands. Please report this to the Linux "
1139 " and complain to your BIOS vendor.\n"); 1140 "ACPI maintainers and complain to your BIOS "
1141 "vendor.\n");
1140#endif 1142#endif
1141 kfree(data); 1143 kfree(data);
1142 return -ENODEV; 1144 return -ENODEV;
1143 } 1145 }
1144 if (pol->cpu != 0) { 1146 if (pol->cpu != 0) {
1145 printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " 1147 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1146 "CPU0. Complain to your BIOS vendor.\n"); 1148 "CPU other than CPU0. Complain to your BIOS "
1149 "vendor.\n");
1147 kfree(data); 1150 kfree(data);
1148 return -ENODEV; 1151 return -ENODEV;
1149 } 1152 }
@@ -1196,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1196 1199
1197 /* min/max the cpu is capable of */ 1200 /* min/max the cpu is capable of */
1198 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { 1201 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1199 printk(KERN_ERR PFX "invalid powernow_table\n"); 1202 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1200 powernow_k8_cpu_exit_acpi(data); 1203 powernow_k8_cpu_exit_acpi(data);
1201 kfree(data->powernow_table); 1204 kfree(data->powernow_table);
1202 kfree(data); 1205 kfree(data);
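Note: the private BFX prefix is replaced by FW_BUG from <linux/kernel.h>, which tags messages caused by firmware (BIOS/ACPI) defects rather than kernel bugs so they stand out in the log. A minimal sketch of the resulting message format (the helper is hypothetical, not from this patch):

#include <linux/kernel.h>

#define PFX "powernow-k8: "

static void report_bad_vid(int j, unsigned int vid)
{
	/* prints e.g. "[Firmware Bug]: powernow-k8: vid 3 invalid : 0x1f" */
	printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", j, vid);
}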
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 908dd347c67e..3b5f06423e77 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -26,9 +26,10 @@
26#include <asm/cpufeature.h> 26#include <asm/cpufeature.h>
27 27
28#define PFX "speedstep-centrino: " 28#define PFX "speedstep-centrino: "
29#define MAINTAINER "cpufreq@lists.linux.org.uk" 29#define MAINTAINER "cpufreq@vger.kernel.org"
30 30
31#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) 31#define dprintk(msg...) \
32 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
32 33
33#define INTEL_MSR_RANGE (0xffff) 34#define INTEL_MSR_RANGE (0xffff)
34 35
@@ -66,11 +67,12 @@ struct cpu_model
66 67
67 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ 68 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
68}; 69};
69static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); 70static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
71 const struct cpu_id *x);
70 72
71/* Operating points for current CPU */ 73/* Operating points for current CPU */
72static struct cpu_model *centrino_model[NR_CPUS]; 74static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
73static const struct cpu_id *centrino_cpu[NR_CPUS]; 75static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
74 76
75static struct cpufreq_driver centrino_driver; 77static struct cpufreq_driver centrino_driver;
76 78
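Note: the NR_CPUS-sized arrays become per-CPU variables here; DEFINE_PER_CPU allocates one slot per possible CPU and per_cpu(var, cpu) selects a given CPU's copy, avoiding a large static array on kernels built with a big NR_CPUS. A minimal sketch of the before/after access pattern, assuming the driver's struct cpu_model (the accessor name is hypothetical):

#include <linux/percpu.h>

struct cpu_model;				/* defined earlier in the driver */

/* before: static struct cpu_model *centrino_model[NR_CPUS]; */
static DEFINE_PER_CPU(struct cpu_model *, centrino_model);

static struct cpu_model *centrino_model_of(unsigned int cpu)
{
	/* before: return centrino_model[cpu]; */
	return per_cpu(centrino_model, cpu);
}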
@@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
255 return -ENOENT; 257 return -ENOENT;
256 } 258 }
257 259
258 centrino_model[policy->cpu] = model; 260 per_cpu(centrino_model, policy->cpu) = model;
259 261
260 dprintk("found \"%s\": max frequency: %dkHz\n", 262 dprintk("found \"%s\": max frequency: %dkHz\n",
261 model->model_name, model->max_freq); 263 model->model_name, model->max_freq);
@@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy)
264} 266}
265 267
266#else 268#else
267static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } 269static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
270{
271 return -ENODEV;
272}
268#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ 273#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
269 274
270static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) 275static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
276 const struct cpu_id *x)
271{ 277{
272 if ((c->x86 == x->x86) && 278 if ((c->x86 == x->x86) &&
273 (c->x86_model == x->x86_model) && 279 (c->x86_model == x->x86_model) &&
@@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
286 * for centrino, as some DSDTs are buggy. 292 * for centrino, as some DSDTs are buggy.
287 * Ideally, this can be done using the acpi_data structure. 293 * Ideally, this can be done using the acpi_data structure.
288 */ 294 */
289 if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || 295 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
290 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || 296 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
291 (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { 297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
292 msr = (msr >> 8) & 0xff; 298 msr = (msr >> 8) & 0xff;
293 return msr * 100000; 299 return msr * 100000;
294 } 300 }
295 301
296 if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) 302 if ((!per_cpu(centrino_model, cpu)) ||
303 (!per_cpu(centrino_model, cpu)->op_points))
297 return 0; 304 return 0;
298 305
299 msr &= 0xffff; 306 msr &= 0xffff;
300 for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { 307 for (i = 0;
301 if (msr == centrino_model[cpu]->op_points[i].index) 308 per_cpu(centrino_model, cpu)->op_points[i].frequency
302 return centrino_model[cpu]->op_points[i].frequency; 309 != CPUFREQ_TABLE_END;
310 i++) {
311 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
312 return per_cpu(centrino_model, cpu)->
313 op_points[i].frequency;
303 } 314 }
304 if (failsafe) 315 if (failsafe)
305 return centrino_model[cpu]->op_points[i-1].frequency; 316 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
306 else 317 else
307 return 0; 318 return 0;
308} 319}
@@ -347,7 +358,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
347 int i; 358 int i;
348 359
349 /* Only Intel makes Enhanced Speedstep-capable CPUs */ 360 /* Only Intel makes Enhanced Speedstep-capable CPUs */
350 if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) 361 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
362 !cpu_has(cpu, X86_FEATURE_EST))
351 return -ENODEV; 363 return -ENODEV;
352 364
353 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) 365 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
@@ -361,9 +373,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
361 break; 373 break;
362 374
363 if (i != N_IDS) 375 if (i != N_IDS)
364 centrino_cpu[policy->cpu] = &cpu_ids[i]; 376 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
365 377
366 if (!centrino_cpu[policy->cpu]) { 378 if (!per_cpu(centrino_cpu, policy->cpu)) {
367 dprintk("found unsupported CPU with " 379 dprintk("found unsupported CPU with "
368 "Enhanced SpeedStep: send /proc/cpuinfo to " 380 "Enhanced SpeedStep: send /proc/cpuinfo to "
369 MAINTAINER "\n"); 381 MAINTAINER "\n");
@@ -386,23 +398,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy)
386 /* check to see if it stuck */ 398 /* check to see if it stuck */
387 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 399 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
388 if (!(l & (1<<16))) { 400 if (!(l & (1<<16))) {
389 printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); 401 printk(KERN_INFO PFX
402 "couldn't enable Enhanced SpeedStep\n");
390 return -ENODEV; 403 return -ENODEV;
391 } 404 }
392 } 405 }
393 406
394 freq = get_cur_freq(policy->cpu); 407 freq = get_cur_freq(policy->cpu);
395 408 policy->cpuinfo.transition_latency = 10000;
396 policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ 409 /* 10uS transition latency */
397 policy->cur = freq; 410 policy->cur = freq;
398 411
399 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); 412 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
400 413
401 ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); 414 ret = cpufreq_frequency_table_cpuinfo(policy,
415 per_cpu(centrino_model, policy->cpu)->op_points);
402 if (ret) 416 if (ret)
403 return (ret); 417 return (ret);
404 418
405 cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); 419 cpufreq_frequency_table_get_attr(
420 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
406 421
407 return 0; 422 return 0;
408} 423}
@@ -411,12 +426,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
411{ 426{
412 unsigned int cpu = policy->cpu; 427 unsigned int cpu = policy->cpu;
413 428
414 if (!centrino_model[cpu]) 429 if (!per_cpu(centrino_model, cpu))
415 return -ENODEV; 430 return -ENODEV;
416 431
417 cpufreq_frequency_table_put_attr(cpu); 432 cpufreq_frequency_table_put_attr(cpu);
418 433
419 centrino_model[cpu] = NULL; 434 per_cpu(centrino_model, cpu) = NULL;
420 435
421 return 0; 436 return 0;
422} 437}
@@ -430,17 +445,26 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy)
430 */ 445 */
431static int centrino_verify (struct cpufreq_policy *policy) 446static int centrino_verify (struct cpufreq_policy *policy)
432{ 447{
433 return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); 448 return cpufreq_frequency_table_verify(policy,
449 per_cpu(centrino_model, policy->cpu)->op_points);
434} 450}
435 451
436/** 452/**
437 * centrino_setpolicy - set a new CPUFreq policy 453 * centrino_setpolicy - set a new CPUFreq policy
438 * @policy: new policy 454 * @policy: new policy
439 * @target_freq: the target frequency 455 * @target_freq: the target frequency
440 * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) 456 * @relation: how that frequency relates to achieved frequency
457 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
441 * 458 *
442 * Sets a new CPUFreq policy. 459 * Sets a new CPUFreq policy.
443 */ 460 */
461struct allmasks {
462 cpumask_t online_policy_cpus;
463 cpumask_t saved_mask;
464 cpumask_t set_mask;
465 cpumask_t covered_cpus;
466};
467
444static int centrino_target (struct cpufreq_policy *policy, 468static int centrino_target (struct cpufreq_policy *policy,
445 unsigned int target_freq, 469 unsigned int target_freq,
446 unsigned int relation) 470 unsigned int relation)
@@ -448,48 +472,55 @@ static int centrino_target (struct cpufreq_policy *policy,
448 unsigned int newstate = 0; 472 unsigned int newstate = 0;
449 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; 473 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
450 struct cpufreq_freqs freqs; 474 struct cpufreq_freqs freqs;
451 cpumask_t online_policy_cpus;
452 cpumask_t saved_mask;
453 cpumask_t set_mask;
454 cpumask_t covered_cpus;
455 int retval = 0; 475 int retval = 0;
456 unsigned int j, k, first_cpu, tmp; 476 unsigned int j, k, first_cpu, tmp;
457 477 CPUMASK_ALLOC(allmasks);
458 if (unlikely(centrino_model[cpu] == NULL)) 478 CPUMASK_PTR(online_policy_cpus, allmasks);
459 return -ENODEV; 479 CPUMASK_PTR(saved_mask, allmasks);
480 CPUMASK_PTR(set_mask, allmasks);
481 CPUMASK_PTR(covered_cpus, allmasks);
482
483 if (unlikely(allmasks == NULL))
484 return -ENOMEM;
485
486 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
487 retval = -ENODEV;
488 goto out;
489 }
460 490
461 if (unlikely(cpufreq_frequency_table_target(policy, 491 if (unlikely(cpufreq_frequency_table_target(policy,
462 centrino_model[cpu]->op_points, 492 per_cpu(centrino_model, cpu)->op_points,
463 target_freq, 493 target_freq,
464 relation, 494 relation,
465 &newstate))) { 495 &newstate))) {
466 return -EINVAL; 496 retval = -EINVAL;
497 goto out;
467 } 498 }
468 499
469#ifdef CONFIG_HOTPLUG_CPU 500#ifdef CONFIG_HOTPLUG_CPU
470 /* cpufreq holds the hotplug lock, so we are safe from here on */ 501 /* cpufreq holds the hotplug lock, so we are safe from here on */
471 cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); 502 cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus);
472#else 503#else
473 online_policy_cpus = policy->cpus; 504 *online_policy_cpus = policy->cpus;
474#endif 505#endif
475 506
476 saved_mask = current->cpus_allowed; 507 *saved_mask = current->cpus_allowed;
477 first_cpu = 1; 508 first_cpu = 1;
478 cpus_clear(covered_cpus); 509 cpus_clear(*covered_cpus);
479 for_each_cpu_mask(j, online_policy_cpus) { 510 for_each_cpu_mask_nr(j, *online_policy_cpus) {
480 /* 511 /*
481 * Support for SMP systems. 512 * Support for SMP systems.
482 * Make sure we are running on CPU that wants to change freq 513 * Make sure we are running on CPU that wants to change freq
483 */ 514 */
484 cpus_clear(set_mask); 515 cpus_clear(*set_mask);
485 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) 516 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
486 cpus_or(set_mask, set_mask, online_policy_cpus); 517 cpus_or(*set_mask, *set_mask, *online_policy_cpus);
487 else 518 else
488 cpu_set(j, set_mask); 519 cpu_set(j, *set_mask);
489 520
490 set_cpus_allowed_ptr(current, &set_mask); 521 set_cpus_allowed_ptr(current, set_mask);
491 preempt_disable(); 522 preempt_disable();
492 if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { 523 if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) {
493 dprintk("couldn't limit to CPUs in this domain\n"); 524 dprintk("couldn't limit to CPUs in this domain\n");
494 retval = -EAGAIN; 525 retval = -EAGAIN;
495 if (first_cpu) { 526 if (first_cpu) {
@@ -500,7 +531,7 @@ static int centrino_target (struct cpufreq_policy *policy,
500 break; 531 break;
501 } 532 }
502 533
503 msr = centrino_model[cpu]->op_points[newstate].index; 534 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
504 535
505 if (first_cpu) { 536 if (first_cpu) {
506 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); 537 rdmsr(MSR_IA32_PERF_CTL, oldmsr, h);
@@ -517,7 +548,7 @@ static int centrino_target (struct cpufreq_policy *policy,
517 dprintk("target=%dkHz old=%d new=%d msr=%04x\n", 548 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
518 target_freq, freqs.old, freqs.new, msr); 549 target_freq, freqs.old, freqs.new, msr);
519 550
520 for_each_cpu_mask(k, online_policy_cpus) { 551 for_each_cpu_mask_nr(k, *online_policy_cpus) {
521 freqs.cpu = k; 552 freqs.cpu = k;
522 cpufreq_notify_transition(&freqs, 553 cpufreq_notify_transition(&freqs,
523 CPUFREQ_PRECHANGE); 554 CPUFREQ_PRECHANGE);
@@ -536,11 +567,11 @@ static int centrino_target (struct cpufreq_policy *policy,
536 break; 567 break;
537 } 568 }
538 569
539 cpu_set(j, covered_cpus); 570 cpu_set(j, *covered_cpus);
540 preempt_enable(); 571 preempt_enable();
541 } 572 }
542 573
543 for_each_cpu_mask(k, online_policy_cpus) { 574 for_each_cpu_mask_nr(k, *online_policy_cpus) {
544 freqs.cpu = k; 575 freqs.cpu = k;
545 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 576 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
546 } 577 }
@@ -553,30 +584,32 @@ static int centrino_target (struct cpufreq_policy *policy,
553 * Best effort undo.. 584 * Best effort undo..
554 */ 585 */
555 586
556 if (!cpus_empty(covered_cpus)) { 587 if (!cpus_empty(*covered_cpus))
557 for_each_cpu_mask(j, covered_cpus) { 588 for_each_cpu_mask_nr(j, *covered_cpus) {
558 set_cpus_allowed_ptr(current, 589 set_cpus_allowed_ptr(current,
559 &cpumask_of_cpu(j)); 590 &cpumask_of_cpu(j));
560 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); 591 wrmsr(MSR_IA32_PERF_CTL, oldmsr, h);
561 } 592 }
562 }
563 593
564 tmp = freqs.new; 594 tmp = freqs.new;
565 freqs.new = freqs.old; 595 freqs.new = freqs.old;
566 freqs.old = tmp; 596 freqs.old = tmp;
567 for_each_cpu_mask(j, online_policy_cpus) { 597 for_each_cpu_mask_nr(j, *online_policy_cpus) {
568 freqs.cpu = j; 598 freqs.cpu = j;
569 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 599 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
570 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 600 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
571 } 601 }
572 } 602 }
573 set_cpus_allowed_ptr(current, &saved_mask); 603 set_cpus_allowed_ptr(current, saved_mask);
574 return 0; 604 retval = 0;
605 goto out;
575 606
576migrate_end: 607migrate_end:
577 preempt_enable(); 608 preempt_enable();
578 set_cpus_allowed_ptr(current, &saved_mask); 609 set_cpus_allowed_ptr(current, saved_mask);
579 return 0; 610out:
611 CPUMASK_FREE(allmasks);
612 return retval;
580} 613}
581 614
582static struct freq_attr* centrino_attr[] = { 615static struct freq_attr* centrino_attr[] = {
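Note: centrino_target() now keeps its cpumask_t temporaries in a struct handled through the CPUMASK_ALLOC/CPUMASK_PTR/CPUMASK_FREE helpers from <linux/cpumask.h>: with a large NR_CPUS the struct is kmalloc()ed instead of sitting on the stack, with a small NR_CPUS it degenerates to an ordinary local. A minimal sketch of the pattern, assuming those macro semantics (the struct tag must match the name passed to CPUMASK_ALLOC; the function is hypothetical):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct allmasks {
	cpumask_t saved_mask;
	cpumask_t set_mask;
};

static int allmasks_demo(void)
{
	CPUMASK_ALLOC(allmasks);		/* struct allmasks *allmasks */
	CPUMASK_PTR(saved_mask, allmasks);	/* cpumask_t *saved_mask */
	CPUMASK_PTR(set_mask, allmasks);

	if (allmasks == NULL)			/* kmalloc() may have failed */
		return -ENOMEM;

	cpus_clear(*set_mask);
	cpu_set(0, *set_mask);
	*saved_mask = *set_mask;

	CPUMASK_FREE(allmasks);
	return 0;
}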
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 1b50244b1fdf..04d0376b64b0 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
279 279
280 cpus_allowed = current->cpus_allowed; 280 cpus_allowed = current->cpus_allowed;
281 281
282 for_each_cpu_mask(i, policy->cpus) { 282 for_each_cpu_mask_nr(i, policy->cpus) {
283 freqs.cpu = i; 283 freqs.cpu = i;
284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); 284 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
285 } 285 }
@@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy,
292 /* allow to be run on all CPUs */ 292 /* allow to be run on all CPUs */
293 set_cpus_allowed_ptr(current, &cpus_allowed); 293 set_cpus_allowed_ptr(current, &cpus_allowed);
294 294
295 for_each_cpu_mask(i, policy->cpus) { 295 for_each_cpu_mask_nr(i, policy->cpus) {
296 freqs.cpu = i; 296 freqs.cpu = i;
297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); 297 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
298 } 298 }
@@ -431,7 +431,7 @@ static void __exit speedstep_exit(void)
431} 431}
432 432
433 433
434MODULE_AUTHOR ("Dave Jones <davej@codemonkey.org.uk>, Dominik Brodowski <linux@brodo.de>"); 434MODULE_AUTHOR ("Dave Jones <davej@redhat.com>, Dominik Brodowski <linux@brodo.de>");
435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); 435MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges.");
436MODULE_LICENSE ("GPL"); 436MODULE_LICENSE ("GPL");
437 437
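Note: for_each_cpu_mask_nr() differs from the older for_each_cpu_mask() in scanning only bits below nr_cpu_ids (the number of possible CPUs found at boot) rather than all NR_CPUS bits, which saves pointless iterations on kernels configured for very large CPU counts. A minimal sketch of the notification loop it replaces here (the helper name is hypothetical):

#include <linux/cpufreq.h>
#include <linux/cpumask.h>

static void notify_prechange(struct cpufreq_freqs *freqs, cpumask_t *cpus)
{
	unsigned int i;

	for_each_cpu_mask_nr(i, *cpus) {	/* stops at nr_cpu_ids */
		freqs->cpu = i;
		cpufreq_notify_transition(freqs, CPUFREQ_PRECHANGE);
	}
}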
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 3fd7a67bb06a..ffd0f5ed071a 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -15,13 +15,11 @@
15/* 15/*
16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU 16 * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU
17 */ 17 */
18static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) 18static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
19{ 19{
20 unsigned char ccr2, ccr3; 20 unsigned char ccr2, ccr3;
21 unsigned long flags;
22 21
23 /* we test for DEVID by checking whether CCR3 is writable */ 22 /* we test for DEVID by checking whether CCR3 is writable */
24 local_irq_save(flags);
25 ccr3 = getCx86(CX86_CCR3); 23 ccr3 = getCx86(CX86_CCR3);
26 setCx86(CX86_CCR3, ccr3 ^ 0x80); 24 setCx86(CX86_CCR3, ccr3 ^ 0x80);
27 getCx86(0xc0); /* dummy to change bus */ 25 getCx86(0xc0); /* dummy to change bus */
@@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
44 *dir0 = getCx86(CX86_DIR0); 42 *dir0 = getCx86(CX86_DIR0);
45 *dir1 = getCx86(CX86_DIR1); 43 *dir1 = getCx86(CX86_DIR1);
46 } 44 }
47 local_irq_restore(flags);
48} 45}
49 46
47static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1)
48{
49 unsigned long flags;
50
51 local_irq_save(flags);
52 __do_cyrix_devid(dir0, dir1);
53 local_irq_restore(flags);
54}
50/* 55/*
51 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in 56 * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in
52 * order to identify the Cyrix CPU model after we're out of setup.c 57 * order to identify the Cyrix CPU model after we're out of setup.c
@@ -116,7 +121,7 @@ static void __cpuinit set_cx86_reorder(void)
116 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 121 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
117 122
118 /* Load/Store Serialize to mem access disable (=reorder it) */ 123 /* Load/Store Serialize to mem access disable (=reorder it) */
119 setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); 124 setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80);
120 /* set load/store serialize from 1GB to 4GB */ 125 /* set load/store serialize from 1GB to 4GB */
121 ccr3 |= 0xe0; 126 ccr3 |= 0xe0;
122 setCx86(CX86_CCR3, ccr3); 127 setCx86(CX86_CCR3, ccr3);
@@ -127,28 +132,11 @@ static void __cpuinit set_cx86_memwb(void)
127 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); 132 printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n");
128 133
129 /* CCR2 bit 2: unlock NW bit */ 134 /* CCR2 bit 2: unlock NW bit */
130 setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); 135 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04);
131 /* set 'Not Write-through' */ 136 /* set 'Not Write-through' */
132 write_cr0(read_cr0() | X86_CR0_NW); 137 write_cr0(read_cr0() | X86_CR0_NW);
133 /* CCR2 bit 2: lock NW bit and set WT1 */ 138 /* CCR2 bit 2: lock NW bit and set WT1 */
134 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); 139 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14);
135}
136
137static void __cpuinit set_cx86_inc(void)
138{
139 unsigned char ccr3;
140
141 printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n");
142
143 ccr3 = getCx86(CX86_CCR3);
144 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
145 /* PCR1 -- Performance Control */
146 /* Incrementor on, whatever that is */
147 setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02);
148 /* PCR0 -- Performance Control */
149 /* Incrementor Margin 10 */
150 setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04);
151 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
152} 140}
153 141
154/* 142/*
@@ -162,23 +150,40 @@ static void __cpuinit geode_configure(void)
162 local_irq_save(flags); 150 local_irq_save(flags);
163 151
164 /* Suspend on halt power saving and enable #SUSP pin */ 152 /* Suspend on halt power saving and enable #SUSP pin */
165 setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); 153 setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88);
166 154
167 ccr3 = getCx86(CX86_CCR3); 155 ccr3 = getCx86(CX86_CCR3);
168 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 156 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
169 157
170 158
171 /* FPU fast, DTE cache, Mem bypass */ 159 /* FPU fast, DTE cache, Mem bypass */
172 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); 160 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38);
173 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 161 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
174 162
175 set_cx86_memwb(); 163 set_cx86_memwb();
176 set_cx86_reorder(); 164 set_cx86_reorder();
177 set_cx86_inc();
178 165
179 local_irq_restore(flags); 166 local_irq_restore(flags);
180} 167}
181 168
169static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c)
170{
171 unsigned char dir0, dir0_msn, dir1 = 0;
172
173 __do_cyrix_devid(&dir0, &dir1);
174 dir0_msn = dir0 >> 4; /* identifies CPU "family" */
175
176 switch (dir0_msn) {
177 case 3: /* 6x86/6x86L */
178 /* Emulate MTRRs using Cyrix's ARRs. */
179 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
180 break;
181 case 5: /* 6x86MX/M II */
182 /* Emulate MTRRs using Cyrix's ARRs. */
183 set_cpu_cap(c, X86_FEATURE_CYRIX_ARR);
184 break;
185 }
186}
182 187
183static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) 188static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
184{ 189{
@@ -286,7 +291,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
286 /* GXm supports extended cpuid levels 'ala' AMD */ 291 /* GXm supports extended cpuid levels 'ala' AMD */
287 if (c->cpuid_level == 2) { 292 if (c->cpuid_level == 2) {
288 /* Enable cxMMX extensions (GX1 Datasheet 54) */ 293 /* Enable cxMMX extensions (GX1 Datasheet 54) */
289 setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); 294 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1);
290 295
291 /* 296 /*
292 * GXm : 0x30 ... 0x5f GXm datasheet 51 297 * GXm : 0x30 ... 0x5f GXm datasheet 51
@@ -296,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
296 */ 301 */
297 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) 302 if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f))
298 geode_configure(); 303 geode_configure();
299 get_model_name(c); /* get CPU marketing name */
300 return; 304 return;
301 } else { /* MediaGX */ 305 } else { /* MediaGX */
302 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; 306 Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4';
@@ -309,7 +313,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c)
309 if (dir1 > 7) { 313 if (dir1 > 7) {
310 dir0_msn++; /* M II */ 314 dir0_msn++; /* M II */
311 /* Enable MMX extensions (App note 108) */ 315 /* Enable MMX extensions (App note 108) */
312 setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); 316 setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1);
313 } else { 317 } else {
314 c->coma_bug = 1; /* 6x86MX, it has the bug. */ 318 c->coma_bug = 1; /* 6x86MX, it has the bug. */
315 } 319 }
@@ -424,7 +428,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
424 local_irq_save(flags); 428 local_irq_save(flags);
425 ccr3 = getCx86(CX86_CCR3); 429 ccr3 = getCx86(CX86_CCR3);
426 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ 430 setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */
427 setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ 431 setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */
428 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ 432 setCx86(CX86_CCR3, ccr3); /* disable MAPEN */
429 local_irq_restore(flags); 433 local_irq_restore(flags);
430 } 434 }
@@ -434,16 +438,19 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c)
434static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { 438static struct cpu_dev cyrix_cpu_dev __cpuinitdata = {
435 .c_vendor = "Cyrix", 439 .c_vendor = "Cyrix",
436 .c_ident = { "CyrixInstead" }, 440 .c_ident = { "CyrixInstead" },
441 .c_early_init = early_init_cyrix,
437 .c_init = init_cyrix, 442 .c_init = init_cyrix,
438 .c_identify = cyrix_identify, 443 .c_identify = cyrix_identify,
444 .c_x86_vendor = X86_VENDOR_CYRIX,
439}; 445};
440 446
441cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); 447cpu_dev_register(cyrix_cpu_dev);
442 448
443static struct cpu_dev nsc_cpu_dev __cpuinitdata = { 449static struct cpu_dev nsc_cpu_dev __cpuinitdata = {
444 .c_vendor = "NSC", 450 .c_vendor = "NSC",
445 .c_ident = { "Geode by NSC" }, 451 .c_ident = { "Geode by NSC" },
446 .c_init = init_nsc, 452 .c_init = init_nsc,
453 .c_x86_vendor = X86_VENDOR_NSC,
447}; 454};
448 455
449cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); 456cpu_dev_register(nsc_cpu_dev);
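Note: the DEVID read is split into a raw __do_cyrix_devid() and a do_cyrix_devid() wrapper that brackets it with local_irq_save()/local_irq_restore(), so the new early_init_cyrix() path can call the raw variant directly. A minimal sketch of this common "__helper plus irq-safe wrapper" idiom (names and register access are placeholders, not from this patch):

#include <linux/irqflags.h>

static unsigned char __read_devid(void)
{
	/* raw register access; the caller is responsible for the IRQ state */
	return 0x42;				/* placeholder value */
}

static unsigned char read_devid(void)
{
	unsigned long flags;
	unsigned char id;

	local_irq_save(flags);
	id = __read_devid();
	local_irq_restore(flags);

	return id;
}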
diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c
deleted file mode 100644
index e43ad4ad4cba..000000000000
--- a/arch/x86/kernel/cpu/feature_names.c
+++ /dev/null
@@ -1,83 +0,0 @@
1/*
2 * Strings for the various x86 capability flags.
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9/*
10 * These flag bits must match the definitions in <asm/cpufeature.h>.
11 * NULL means this bit is undefined or reserved; either way it doesn't
12 * have meaning as far as Linux is concerned. Note that it's important
13 * to realize there is a difference between this table and CPUID -- if
14 * applications want to get the raw CPUID data, they should access
15 * /dev/cpu/<cpu_nr>/cpuid instead.
16 */
17const char * const x86_cap_flags[NCAPINTS*32] = {
18 /* Intel-defined */
19 "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce",
20 "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov",
21 "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx",
22 "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe",
23
24 /* AMD-defined */
25 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
26 NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL,
27 NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL,
28 NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm",
29 "3dnowext", "3dnow",
30
31 /* Transmeta-defined */
32 "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL,
33 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
34 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
35 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
36
37 /* Other (Linux-defined) */
38 "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr",
39 NULL, NULL, NULL, NULL,
40 "constant_tsc", "up", NULL, "arch_perfmon",
41 "pebs", "bts", NULL, NULL,
42 "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
43 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
44
45 /* Intel-defined (#2) */
46 "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est",
47 "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL,
48 NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt",
49 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
50
51 /* VIA/Cyrix/Centaur-defined */
52 NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en",
53 "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL,
54 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
55 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
56
57 /* AMD-defined (#2) */
58 "lahf_lm", "cmp_legacy", "svm", "extapic",
59 "cr8_legacy", "abm", "sse4a", "misalignsse",
60 "3dnowprefetch", "osvw", "ibs", "sse5",
61 "skinit", "wdt", NULL, NULL,
62 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
63 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
64
65 /* Auxiliary (Linux-defined) */
66 "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL,
67 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
68 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
69 NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
70};
71
72const char *const x86_power_flags[32] = {
73 "ts", /* temperature sensor */
74 "fid", /* frequency id control */
75 "vid", /* voltage id control */
76 "ttp", /* thermal trip */
77 "tm",
78 "stc",
79 "100mhzsteps",
80 "hwpstate",
81 "", /* tsc invariant mapped to constant_tsc */
82 /* nothing */
83};
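Note: the hand-maintained flag table is removed from this file, while the x86_cap_flags[] declaration stays in <asm/cpufeature.h>; the array is indexed by the X86_FEATURE_* bit number, which is how /proc/cpuinfo turns capability bits into names. A minimal lookup sketch, assuming only that declaration (the helper name is hypothetical):

#include <linux/kernel.h>
#include <asm/cpufeature.h>

static const char *x86_feature_name(unsigned int bit)
{
	/* x86_cap_flags[] is indexed by the X86_FEATURE_* bit number */
	if (bit < NCAPINTS * 32 && x86_cap_flags[bit])
		return x86_cap_flags[bit];

	return NULL;
}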
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 70609efdf1da..cce0b6118d55 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -15,6 +15,11 @@
15#include <asm/ds.h> 15#include <asm/ds.h>
16#include <asm/bugs.h> 16#include <asm/bugs.h>
17 17
18#ifdef CONFIG_X86_64
19#include <asm/topology.h>
20#include <asm/numa_64.h>
21#endif
22
18#include "cpu.h" 23#include "cpu.h"
19 24
20#ifdef CONFIG_X86_LOCAL_APIC 25#ifdef CONFIG_X86_LOCAL_APIC
@@ -23,23 +28,22 @@
23#include <mach_apic.h> 28#include <mach_apic.h>
24#endif 29#endif
25 30
26#ifdef CONFIG_X86_INTEL_USERCOPY
27/*
28 * Alignment at which movsl is preferred for bulk memory copies.
29 */
30struct movsl_mask movsl_mask __read_mostly;
31#endif
32
33static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 31static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
34{ 32{
35 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
36 if (c->x86 == 15 && c->x86_cache_alignment == 64)
37 c->x86_cache_alignment = 128;
38 if ((c->x86 == 0xf && c->x86_model >= 0x03) || 33 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
39 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 34 (c->x86 == 0x6 && c->x86_model >= 0x0e))
40 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 35 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
36
37#ifdef CONFIG_X86_64
38 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
39#else
40 /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */
41 if (c->x86 == 15 && c->x86_cache_alignment == 64)
42 c->x86_cache_alignment = 128;
43#endif
41} 44}
42 45
46#ifdef CONFIG_X86_32
43/* 47/*
44 * Early probe support logic for ppro memory erratum #50 48 * Early probe support logic for ppro memory erratum #50
45 * 49 *
@@ -59,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void)
59 return 0; 63 return 0;
60} 64}
61 65
66#ifdef CONFIG_X86_F00F_BUG
67static void __cpuinit trap_init_f00f_bug(void)
68{
69 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO);
62 70
63/* 71 /*
64 * P4 Xeon errata 037 workaround. 72 * Update the IDT descriptor and reload the IDT so that
65 * Hardware prefetcher may cause stale data to be loaded into the cache. 73 * it uses the read-only mapped virtual address.
66 */ 74 */
67static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) 75 idt_descr.address = fix_to_virt(FIX_F00F_IDT);
76 load_idt(&idt_descr);
77}
78#endif
79
80static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
68{ 81{
69 unsigned long lo, hi; 82 unsigned long lo, hi;
70 83
84#ifdef CONFIG_X86_F00F_BUG
85 /*
86 * All current models of Pentium and Pentium with MMX technology CPUs
87 * have the F0 0F bug, which lets nonprivileged users lock up the system.
88 * Note that the workaround only should be initialized once...
89 */
90 c->f00f_bug = 0;
91 if (!paravirt_enabled() && c->x86 == 5) {
92 static int f00f_workaround_enabled;
93
94 c->f00f_bug = 1;
95 if (!f00f_workaround_enabled) {
96 trap_init_f00f_bug();
97 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
98 f00f_workaround_enabled = 1;
99 }
100 }
101#endif
102
103 /*
104 * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until
105 * model 3 mask 3
106 */
107 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633)
108 clear_cpu_cap(c, X86_FEATURE_SEP);
109
110 /*
111 * P4 Xeon errata 037 workaround.
112 * Hardware prefetcher may cause stale data to be loaded into the cache.
113 */
71 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { 114 if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) {
72 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); 115 rdmsr(MSR_IA32_MISC_ENABLE, lo, hi);
73 if ((lo & (1<<9)) == 0) { 116 if ((lo & (1<<9)) == 0) {
@@ -77,13 +120,68 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c)
77 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); 120 wrmsr (MSR_IA32_MISC_ENABLE, lo, hi);
78 } 121 }
79 } 122 }
123
124 /*
125 * See if we have a good local APIC by checking for buggy Pentia,
126 * i.e. all B steppings and the C2 stepping of P54C when using their
127 * integrated APIC (see 11AP erratum in "Pentium Processor
128 * Specification Update").
129 */
130 if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 &&
131 (c->x86_mask < 0x6 || c->x86_mask == 0xb))
132 set_cpu_cap(c, X86_FEATURE_11AP);
133
134
135#ifdef CONFIG_X86_INTEL_USERCOPY
136 /*
137 * Set up the preferred alignment for movsl bulk memory moves
138 */
139 switch (c->x86) {
140 case 4: /* 486: untested */
141 break;
142 case 5: /* Old Pentia: untested */
143 break;
144 case 6: /* PII/PIII only like movsl with 8-byte alignment */
145 movsl_mask.mask = 7;
146 break;
147 case 15: /* P4 is OK down to 8-byte alignment */
148 movsl_mask.mask = 7;
149 break;
150 }
151#endif
152
153#ifdef CONFIG_X86_NUMAQ
154 numaq_tsc_disable();
155#endif
80} 156}
157#else
158static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
159{
160}
161#endif
81 162
163static void __cpuinit srat_detect_node(void)
164{
165#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
166 unsigned node;
167 int cpu = smp_processor_id();
168 int apicid = hard_smp_processor_id();
169
170 /* Don't do the funky fallback heuristics the AMD version employs
171 for now. */
172 node = apicid_to_node[apicid];
173 if (node == NUMA_NO_NODE || !node_online(node))
174 node = first_node(node_online_map);
175 numa_set_node(cpu, node);
176
177 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
178#endif
179}
82 180
83/* 181/*
84 * find out the number of processor cores on the die 182 * find out the number of processor cores on the die
85 */ 183 */
86static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) 184static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
87{ 185{
88 unsigned int eax, ebx, ecx, edx; 186 unsigned int eax, ebx, ecx, edx;
89 187
@@ -98,45 +196,51 @@ static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c)
98 return 1; 196 return 1;
99} 197}
100 198
101#ifdef CONFIG_X86_F00F_BUG 199static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c)
102static void __cpuinit trap_init_f00f_bug(void)
103{ 200{
104 __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); 201 /* Intel VMX MSR indicated features */
105 202#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000
106 /* 203#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000
107 * Update the IDT descriptor and reload the IDT so that 204#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000
108 * it uses the read-only mapped virtual address. 205#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001
109 */ 206#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002
110 idt_descr.address = fix_to_virt(FIX_F00F_IDT); 207#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020
111 load_idt(&idt_descr); 208
209 u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2;
210
211 clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
212 clear_cpu_cap(c, X86_FEATURE_VNMI);
213 clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
214 clear_cpu_cap(c, X86_FEATURE_EPT);
215 clear_cpu_cap(c, X86_FEATURE_VPID);
216
217 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high);
218 msr_ctl = vmx_msr_high | vmx_msr_low;
219 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)
220 set_cpu_cap(c, X86_FEATURE_TPR_SHADOW);
221 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI)
222 set_cpu_cap(c, X86_FEATURE_VNMI);
223 if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) {
224 rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
225 vmx_msr_low, vmx_msr_high);
226 msr_ctl2 = vmx_msr_high | vmx_msr_low;
227 if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) &&
228 (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW))
229 set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY);
230 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT)
231 set_cpu_cap(c, X86_FEATURE_EPT);
232 if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID)
233 set_cpu_cap(c, X86_FEATURE_VPID);
234 }
112} 235}
113#endif
114 236
115static void __cpuinit init_intel(struct cpuinfo_x86 *c) 237static void __cpuinit init_intel(struct cpuinfo_x86 *c)
116{ 238{
117 unsigned int l2 = 0; 239 unsigned int l2 = 0;
118 char *p = NULL;
119 240
120 early_init_intel(c); 241 early_init_intel(c);
121 242
122#ifdef CONFIG_X86_F00F_BUG 243 intel_workarounds(c);
123 /*
124 * All current models of Pentium and Pentium with MMX technology CPUs
125 * have the F0 0F bug, which lets nonprivileged users lock up the system.
126 * Note that the workaround only should be initialized once...
127 */
128 c->f00f_bug = 0;
129 if (!paravirt_enabled() && c->x86 == 5) {
130 static int f00f_workaround_enabled;
131
132 c->f00f_bug = 1;
133 if (!f00f_workaround_enabled) {
134 trap_init_f00f_bug();
135 printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n");
136 f00f_workaround_enabled = 1;
137 }
138 }
139#endif
140 244
141 l2 = init_intel_cacheinfo(c); 245 l2 = init_intel_cacheinfo(c);
142 if (c->cpuid_level > 9) { 246 if (c->cpuid_level > 9) {
@@ -146,16 +250,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
146 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); 250 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
147 } 251 }
148 252
149 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ 253 if (cpu_has_xmm2)
150 if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) 254 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
151 clear_cpu_cap(c, X86_FEATURE_SEP); 255 if (cpu_has_ds) {
256 unsigned int l1;
257 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
258 if (!(l1 & (1<<11)))
259 set_cpu_cap(c, X86_FEATURE_BTS);
260 if (!(l1 & (1<<12)))
261 set_cpu_cap(c, X86_FEATURE_PEBS);
262 ds_init_intel(c);
263 }
152 264
265#ifdef CONFIG_X86_64
266 if (c->x86 == 15)
267 c->x86_cache_alignment = c->x86_clflush_size * 2;
268 if (c->x86 == 6)
269 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
270#else
153 /* 271 /*
154 * Names for the Pentium II/Celeron processors 272 * Names for the Pentium II/Celeron processors
155 * detectable only by also checking the cache size. 273 * detectable only by also checking the cache size.
156 * Dixon is NOT a Celeron. 274 * Dixon is NOT a Celeron.
157 */ 275 */
158 if (c->x86 == 6) { 276 if (c->x86 == 6) {
277 char *p = NULL;
278
159 switch (c->x86_model) { 279 switch (c->x86_model) {
160 case 5: 280 case 5:
161 if (c->x86_mask == 0) { 281 if (c->x86_mask == 0) {
@@ -178,60 +298,41 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
178 p = "Celeron (Coppermine)"; 298 p = "Celeron (Coppermine)";
179 break; 299 break;
180 } 300 }
181 }
182 301
183 if (p) 302 if (p)
184 strcpy(c->x86_model_id, p); 303 strcpy(c->x86_model_id, p);
185
186 c->x86_max_cores = num_cpu_cores(c);
187
188 detect_ht(c);
189
190 /* Work around errata */
191 Intel_errata_workarounds(c);
192
193#ifdef CONFIG_X86_INTEL_USERCOPY
194 /*
195 * Set up the preferred alignment for movsl bulk memory moves
196 */
197 switch (c->x86) {
198 case 4: /* 486: untested */
199 break;
200 case 5: /* Old Pentia: untested */
201 break;
202 case 6: /* PII/PIII only like movsl with 8-byte alignment */
203 movsl_mask.mask = 7;
204 break;
205 case 15: /* P4 is OK down to 8-byte alignment */
206 movsl_mask.mask = 7;
207 break;
208 } 304 }
209#endif
210 305
211 if (cpu_has_xmm2) 306 if (c->x86 == 15)
212 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
213 if (c->x86 == 15) {
214 set_cpu_cap(c, X86_FEATURE_P4); 307 set_cpu_cap(c, X86_FEATURE_P4);
215 }
216 if (c->x86 == 6) 308 if (c->x86 == 6)
217 set_cpu_cap(c, X86_FEATURE_P3); 309 set_cpu_cap(c, X86_FEATURE_P3);
218 if (cpu_has_ds) {
219 unsigned int l1;
220 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
221 if (!(l1 & (1<<11)))
222 set_cpu_cap(c, X86_FEATURE_BTS);
223 if (!(l1 & (1<<12)))
224 set_cpu_cap(c, X86_FEATURE_PEBS);
225 }
226 310
227 if (cpu_has_bts) 311 if (cpu_has_bts)
228 ds_init_intel(c); 312 ptrace_bts_init_intel(c);
229 313
230#ifdef CONFIG_X86_NUMAQ
231 numaq_tsc_disable();
232#endif 314#endif
315
316 detect_extended_topology(c);
317 if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) {
318 /*
319 * let's use the legacy cpuid vector 0x1 and 0x4 for topology
320 * detection.
321 */
322 c->x86_max_cores = intel_num_cpu_cores(c);
323#ifdef CONFIG_X86_32
324 detect_ht(c);
325#endif
326 }
327
328 /* Work around errata */
329 srat_detect_node();
330
331 if (cpu_has(c, X86_FEATURE_VMX))
332 detect_vmx_virtcap(c);
233} 333}
234 334
335#ifdef CONFIG_X86_32
235static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) 336static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size)
236{ 337{
237 /* 338 /*
@@ -244,10 +345,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i
244 size = 256; 345 size = 256;
245 return size; 346 return size;
246} 347}
348#endif
247 349
248static struct cpu_dev intel_cpu_dev __cpuinitdata = { 350static struct cpu_dev intel_cpu_dev __cpuinitdata = {
249 .c_vendor = "Intel", 351 .c_vendor = "Intel",
250 .c_ident = { "GenuineIntel" }, 352 .c_ident = { "GenuineIntel" },
353#ifdef CONFIG_X86_32
251 .c_models = { 354 .c_models = {
252 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = 355 { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names =
253 { 356 {
@@ -297,76 +400,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = {
297 } 400 }
298 }, 401 },
299 }, 402 },
403 .c_size_cache = intel_size_cache,
404#endif
300 .c_early_init = early_init_intel, 405 .c_early_init = early_init_intel,
301 .c_init = init_intel, 406 .c_init = init_intel,
302 .c_size_cache = intel_size_cache, 407 .c_x86_vendor = X86_VENDOR_INTEL,
303}; 408};
304 409
305cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); 410cpu_dev_register(intel_cpu_dev);
306
307#ifndef CONFIG_X86_CMPXCHG
308unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
309{
310 u8 prev;
311 unsigned long flags;
312
313 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
314 local_irq_save(flags);
315 prev = *(u8 *)ptr;
316 if (prev == old)
317 *(u8 *)ptr = new;
318 local_irq_restore(flags);
319 return prev;
320}
321EXPORT_SYMBOL(cmpxchg_386_u8);
322
323unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
324{
325 u16 prev;
326 unsigned long flags;
327
328 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
329 local_irq_save(flags);
330 prev = *(u16 *)ptr;
331 if (prev == old)
332 *(u16 *)ptr = new;
333 local_irq_restore(flags);
334 return prev;
335}
336EXPORT_SYMBOL(cmpxchg_386_u16);
337
338unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
339{
340 u32 prev;
341 unsigned long flags;
342
343 /* Poor man's cmpxchg for 386. Unsuitable for SMP */
344 local_irq_save(flags);
345 prev = *(u32 *)ptr;
346 if (prev == old)
347 *(u32 *)ptr = new;
348 local_irq_restore(flags);
349 return prev;
350}
351EXPORT_SYMBOL(cmpxchg_386_u32);
352#endif
353
354#ifndef CONFIG_X86_CMPXCHG64
355unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
356{
357 u64 prev;
358 unsigned long flags;
359
360 /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
361 local_irq_save(flags);
362 prev = *(u64 *)ptr;
363 if (prev == old)
364 *(u64 *)ptr = new;
365 local_irq_restore(flags);
366 return prev;
367}
368EXPORT_SYMBOL(cmpxchg_486_u64);
369#endif
370
371/* arch_initcall(intel_cpu_init); */
372 411
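Note (not part of the patch): the BTS/PEBS probing in init_intel() above keys off MSR_IA32_MISC_ENABLE — a set bit 11 means BTS is unavailable, a set bit 12 means PEBS is unavailable. A minimal user-space sketch of the same test follows; it assumes the msr driver is loaded (/dev/cpu/0/msr) and root privileges, and keeps error handling to a minimum.

/*
 * Mirror of the MSR_IA32_MISC_ENABLE check done by init_intel():
 * bit 11 set => BTS unavailable, bit 12 set => PEBS unavailable.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_IA32_MISC_ENABLE 0x1a0

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &val, sizeof(val), MSR_IA32_MISC_ENABLE) != sizeof(val)) {
		perror("rdmsr MSR_IA32_MISC_ENABLE");
		return 1;
	}
	printf("BTS  %s\n", (val & (1ULL << 11)) ? "unavailable" : "available");
	printf("PEBS %s\n", (val & (1ULL << 12)) ? "unavailable" : "available");
	close(fd);
	return 0;
}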
diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c
deleted file mode 100644
index 1019c58d39f0..000000000000
--- a/arch/x86/kernel/cpu/intel_64.c
+++ /dev/null
@@ -1,95 +0,0 @@
1#include <linux/init.h>
2#include <linux/smp.h>
3#include <asm/processor.h>
4#include <asm/ptrace.h>
5#include <asm/topology.h>
6#include <asm/numa_64.h>
7
8#include "cpu.h"
9
10static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
11{
12 if ((c->x86 == 0xf && c->x86_model >= 0x03) ||
13 (c->x86 == 0x6 && c->x86_model >= 0x0e))
14 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
15
16 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
17}
18
19/*
20 * find out the number of processor cores on the die
21 */
22static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c)
23{
24 unsigned int eax, t;
25
26 if (c->cpuid_level < 4)
27 return 1;
28
29 cpuid_count(4, 0, &eax, &t, &t, &t);
30
31 if (eax & 0x1f)
32 return ((eax >> 26) + 1);
33 else
34 return 1;
35}
36
37static void __cpuinit srat_detect_node(void)
38{
39#ifdef CONFIG_NUMA
40 unsigned node;
41 int cpu = smp_processor_id();
42 int apicid = hard_smp_processor_id();
43
44 /* Don't do the funky fallback heuristics the AMD version employs
45 for now. */
46 node = apicid_to_node[apicid];
47 if (node == NUMA_NO_NODE || !node_online(node))
48 node = first_node(node_online_map);
49 numa_set_node(cpu, node);
50
51 printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node);
52#endif
53}
54
55static void __cpuinit init_intel(struct cpuinfo_x86 *c)
56{
57 init_intel_cacheinfo(c);
58 if (c->cpuid_level > 9) {
59 unsigned eax = cpuid_eax(10);
60 /* Check for version and the number of counters */
61 if ((eax & 0xff) && (((eax>>8) & 0xff) > 1))
62 set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
63 }
64
65 if (cpu_has_ds) {
66 unsigned int l1, l2;
67 rdmsr(MSR_IA32_MISC_ENABLE, l1, l2);
68 if (!(l1 & (1<<11)))
69 set_cpu_cap(c, X86_FEATURE_BTS);
70 if (!(l1 & (1<<12)))
71 set_cpu_cap(c, X86_FEATURE_PEBS);
72 }
73
74
75 if (cpu_has_bts)
76 ds_init_intel(c);
77
78 if (c->x86 == 15)
79 c->x86_cache_alignment = c->x86_clflush_size * 2;
80 if (c->x86 == 6)
81 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
82 set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
83 c->x86_max_cores = intel_num_cpu_cores(c);
84
85 srat_detect_node();
86}
87
88static struct cpu_dev intel_cpu_dev __cpuinitdata = {
89 .c_vendor = "Intel",
90 .c_ident = { "GenuineIntel" },
91 .c_early_init = early_init_intel,
92 .c_init = init_intel,
93};
94cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev);
95
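Note (not part of the patch): the deleted intel_64.c — whose logic now lives in the unified intel.c — counted cores per package from CPUID leaf 4: when EAX[4:0] reports a cache, EAX[31:26] + 1 is the core count. A user-space sketch of the same derivation, assuming a toolchain that ships <cpuid.h> with __get_cpuid_count; only meaningful on Intel parts that implement leaf 4.

/* Same arithmetic as intel_num_cpu_cores(): (EAX >> 26) + 1 for leaf 4. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(4, 0, &eax, &ebx, &ecx, &edx) || !(eax & 0x1f)) {
		puts("cores per package: 1 (no CPUID leaf 4 cache info)");
		return 0;
	}
	printf("cores per package: %u\n", (eax >> 26) + 1);
	return 0;
}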
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 2c8afafa18e8..3f46afbb1cf1 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -1,8 +1,8 @@
1/* 1/*
 2 * Routines to identify caches on Intel CPU. 2 * Routines to identify caches on Intel CPU.
3 * 3 *
4 * Changes: 4 * Changes:
5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4) 5 * Venkatesh Pallipadi : Adding cache identification through cpuid(4)
6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. 6 * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure.
7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 7 * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
8 */ 8 */
@@ -13,6 +13,7 @@
13#include <linux/compiler.h> 13#include <linux/compiler.h>
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/pci.h>
16 17
17#include <asm/processor.h> 18#include <asm/processor.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
@@ -130,9 +131,18 @@ struct _cpuid4_info {
130 union _cpuid4_leaf_ebx ebx; 131 union _cpuid4_leaf_ebx ebx;
131 union _cpuid4_leaf_ecx ecx; 132 union _cpuid4_leaf_ecx ecx;
132 unsigned long size; 133 unsigned long size;
134 unsigned long can_disable;
133 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ 135 cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */
134}; 136};
135 137
138#ifdef CONFIG_PCI
139static struct pci_device_id k8_nb_id[] = {
140 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
141 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
142 {}
143};
144#endif
145
136unsigned short num_cache_leaves; 146unsigned short num_cache_leaves;
137 147
138/* AMD doesn't have CPUID4. Emulate it here to report the same 148/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -182,9 +192,10 @@ static unsigned short assocs[] __cpuinitdata = {
182static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; 192static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 };
183static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; 193static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 };
184 194
185static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, 195static void __cpuinit
186 union _cpuid4_leaf_ebx *ebx, 196amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
187 union _cpuid4_leaf_ecx *ecx) 197 union _cpuid4_leaf_ebx *ebx,
198 union _cpuid4_leaf_ecx *ecx)
188{ 199{
189 unsigned dummy; 200 unsigned dummy;
190 unsigned line_size, lines_per_tag, assoc, size_in_kb; 201 unsigned line_size, lines_per_tag, assoc, size_in_kb;
@@ -251,27 +262,40 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
251 (ebx->split.ways_of_associativity + 1) - 1; 262 (ebx->split.ways_of_associativity + 1) - 1;
252} 263}
253 264
254static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) 265static void __cpuinit
266amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf)
267{
268 if (index < 3)
269 return;
270 this_leaf->can_disable = 1;
271}
272
273static int
274__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf)
255{ 275{
256 union _cpuid4_leaf_eax eax; 276 union _cpuid4_leaf_eax eax;
257 union _cpuid4_leaf_ebx ebx; 277 union _cpuid4_leaf_ebx ebx;
258 union _cpuid4_leaf_ecx ecx; 278 union _cpuid4_leaf_ecx ecx;
259 unsigned edx; 279 unsigned edx;
260 280
261 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) 281 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
262 amd_cpuid4(index, &eax, &ebx, &ecx); 282 amd_cpuid4(index, &eax, &ebx, &ecx);
263 else 283 if (boot_cpu_data.x86 >= 0x10)
264 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 284 amd_check_l3_disable(index, this_leaf);
285 } else {
286 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
287 }
288
265 if (eax.split.type == CACHE_TYPE_NULL) 289 if (eax.split.type == CACHE_TYPE_NULL)
266 return -EIO; /* better error ? */ 290 return -EIO; /* better error ? */
267 291
268 this_leaf->eax = eax; 292 this_leaf->eax = eax;
269 this_leaf->ebx = ebx; 293 this_leaf->ebx = ebx;
270 this_leaf->ecx = ecx; 294 this_leaf->ecx = ecx;
271 this_leaf->size = (ecx.split.number_of_sets + 1) * 295 this_leaf->size = (ecx.split.number_of_sets + 1) *
272 (ebx.split.coherency_line_size + 1) * 296 (ebx.split.coherency_line_size + 1) *
273 (ebx.split.physical_line_partition + 1) * 297 (ebx.split.physical_line_partition + 1) *
274 (ebx.split.ways_of_associativity + 1); 298 (ebx.split.ways_of_associativity + 1);
275 return 0; 299 return 0;
276} 300}
277 301
@@ -453,7 +477,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
453 477
454/* pointer to _cpuid4_info array (for each cache leaf) */ 478/* pointer to _cpuid4_info array (for each cache leaf) */
455static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 479static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info);
456#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 480#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y]))
457 481
458#ifdef CONFIG_SMP 482#ifdef CONFIG_SMP
459static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) 483static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
@@ -489,8 +513,8 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index)
489 int sibling; 513 int sibling;
490 514
491 this_leaf = CPUID4_INFO_IDX(cpu, index); 515 this_leaf = CPUID4_INFO_IDX(cpu, index);
492 for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { 516 for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) {
493 sibling_leaf = CPUID4_INFO_IDX(sibling, index); 517 sibling_leaf = CPUID4_INFO_IDX(sibling, index);
494 cpu_clear(cpu, sibling_leaf->shared_cpu_map); 518 cpu_clear(cpu, sibling_leaf->shared_cpu_map);
495 } 519 }
496} 520}
@@ -572,7 +596,7 @@ struct _index_kobject {
572 596
573/* pointer to array of kobjects for cpuX/cache/indexY */ 597/* pointer to array of kobjects for cpuX/cache/indexY */
574static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 598static DEFINE_PER_CPU(struct _index_kobject *, index_kobject);
575#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 599#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y]))
576 600
577#define show_one_plus(file_name, object, val) \ 601#define show_one_plus(file_name, object, val) \
578static ssize_t show_##file_name \ 602static ssize_t show_##file_name \
@@ -637,6 +661,99 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) {
637 } 661 }
638} 662}
639 663
664#define to_object(k) container_of(k, struct _index_kobject, kobj)
665#define to_attr(a) container_of(a, struct _cache_attr, attr)
666
667#ifdef CONFIG_PCI
668static struct pci_dev *get_k8_northbridge(int node)
669{
670 struct pci_dev *dev = NULL;
671 int i;
672
673 for (i = 0; i <= node; i++) {
674 do {
675 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
676 if (!dev)
677 break;
678 } while (!pci_match_id(&k8_nb_id[0], dev));
679 if (!dev)
680 break;
681 }
682 return dev;
683}
684#else
685static struct pci_dev *get_k8_northbridge(int node)
686{
687 return NULL;
688}
689#endif
690
691static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
692{
693 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
694 struct pci_dev *dev = NULL;
695 ssize_t ret = 0;
696 int i;
697
698 if (!this_leaf->can_disable)
699 return sprintf(buf, "Feature not enabled\n");
700
701 dev = get_k8_northbridge(node);
702 if (!dev) {
703 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
704 return -EINVAL;
705 }
706
707 for (i = 0; i < 2; i++) {
708 unsigned int reg;
709
710 pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
711
712 ret += sprintf(buf, "%sEntry: %d\n", buf, i);
713 ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n",
714 buf,
715 reg & 0x80000000 ? "Disabled" : "Allowed",
716 reg & 0x40000000 ? "Disabled" : "Allowed");
717 ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
718 buf, (reg & 0x30000) >> 16, reg & 0xfff);
719 }
720 return ret;
721}
722
723static ssize_t
724store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
725 size_t count)
726{
727 int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map));
728 struct pci_dev *dev = NULL;
729 unsigned int ret, index, val;
730
731 if (!this_leaf->can_disable)
732 return 0;
733
734 if (strlen(buf) > 15)
735 return -EINVAL;
736
737 ret = sscanf(buf, "%x %x", &index, &val);
738 if (ret != 2)
739 return -EINVAL;
740 if (index > 1)
741 return -EINVAL;
742
743 val |= 0xc0000000;
744 dev = get_k8_northbridge(node);
745 if (!dev) {
746 printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
747 return -EINVAL;
748 }
749
750 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
751 wbinvd();
752 pci_write_config_dword(dev, 0x1BC + index * 4, val);
753
754 return 1;
755}
756
640struct _cache_attr { 757struct _cache_attr {
641 struct attribute attr; 758 struct attribute attr;
642 ssize_t (*show)(struct _cpuid4_info *, char *); 759 ssize_t (*show)(struct _cpuid4_info *, char *);
@@ -657,6 +774,8 @@ define_one_ro(size);
657define_one_ro(shared_cpu_map); 774define_one_ro(shared_cpu_map);
658define_one_ro(shared_cpu_list); 775define_one_ro(shared_cpu_list);
659 776
777static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
778
660static struct attribute * default_attrs[] = { 779static struct attribute * default_attrs[] = {
661 &type.attr, 780 &type.attr,
662 &level.attr, 781 &level.attr,
@@ -667,12 +786,10 @@ static struct attribute * default_attrs[] = {
667 &size.attr, 786 &size.attr,
668 &shared_cpu_map.attr, 787 &shared_cpu_map.attr,
669 &shared_cpu_list.attr, 788 &shared_cpu_list.attr,
789 &cache_disable.attr,
670 NULL 790 NULL
671}; 791};
672 792
673#define to_object(k) container_of(k, struct _index_kobject, kobj)
674#define to_attr(a) container_of(a, struct _cache_attr, attr)
675
676static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) 793static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
677{ 794{
678 struct _cache_attr *fattr = to_attr(attr); 795 struct _cache_attr *fattr = to_attr(attr);
@@ -682,14 +799,22 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf)
682 ret = fattr->show ? 799 ret = fattr->show ?
683 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 800 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
684 buf) : 801 buf) :
685 0; 802 0;
686 return ret; 803 return ret;
687} 804}
688 805
689static ssize_t store(struct kobject * kobj, struct attribute * attr, 806static ssize_t store(struct kobject * kobj, struct attribute * attr,
690 const char * buf, size_t count) 807 const char * buf, size_t count)
691{ 808{
692 return 0; 809 struct _cache_attr *fattr = to_attr(attr);
810 struct _index_kobject *this_leaf = to_object(kobj);
811 ssize_t ret;
812
813 ret = fattr->store ?
814 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
815 buf, count) :
816 0;
817 return ret;
693} 818}
694 819
695static struct sysfs_ops sysfs_ops = { 820static struct sysfs_ops sysfs_ops = {
@@ -780,15 +905,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
780 } 905 }
781 kobject_put(per_cpu(cache_kobject, cpu)); 906 kobject_put(per_cpu(cache_kobject, cpu));
782 cpuid4_cache_sysfs_exit(cpu); 907 cpuid4_cache_sysfs_exit(cpu);
783 break; 908 return retval;
784 } 909 }
785 kobject_uevent(&(this_object->kobj), KOBJ_ADD); 910 kobject_uevent(&(this_object->kobj), KOBJ_ADD);
786 } 911 }
787 if (!retval) 912 cpu_set(cpu, cache_dev_map);
788 cpu_set(cpu, cache_dev_map);
789 913
790 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 914 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD);
791 return retval; 915 return 0;
792} 916}
793 917
794static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) 918static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
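Note (not part of the patch): cpuid4_cache_lookup() above computes each cache's size as (sets + 1) * (line size + 1) * (partitions + 1) * (ways + 1) from CPUID leaf 4. A user-space sketch that walks the leaf-4 subleaves and does the same arithmetic; the field positions follow the _cpuid4_leaf_* unions, and <cpuid.h>'s __get_cpuid_count is assumed.

/* Walk CPUID(4) subleaves and recompute the cache size the same way. */
#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx, index;

	for (index = 0; ; index++) {
		if (!__get_cpuid_count(4, index, &eax, &ebx, &ecx, &edx))
			break;
		if ((eax & 0x1f) == 0)			/* CACHE_TYPE_NULL: done */
			break;
		unsigned long size = (unsigned long)(ecx + 1) *
				     ((ebx & 0xfff) + 1) *		/* coherency_line_size   */
				     (((ebx >> 12) & 0x3ff) + 1) *	/* physical_line_partition */
				     (((ebx >> 22) & 0x3ff) + 1);	/* ways_of_associativity */
		printf("leaf 4 index %u: level %u, %lu KB\n",
		       index, (eax >> 5) & 0x7, size >> 10);
	}
	return 0;
}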
diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c
index f390c9f66351..dd3af6e7b39a 100644
--- a/arch/x86/kernel/cpu/mcheck/k7.c
+++ b/arch/x86/kernel/cpu/mcheck/k7.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Athlon/Hammer specific Machine Check Exception Reporting 2 * Athlon specific Machine Check Exception Reporting
3 * (C) Copyright 2002 Dave Jones <davej@codemonkey.org.uk> 3 * (C) Copyright 2002 Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c
index 774d87cfd8cd..0ebf3fc6a610 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_32.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_32.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * mce.c - x86 Machine Check Exception Reporting 2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk> 3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@redhat.com>
4 */ 4 */
5 5
6#include <linux/init.h> 6#include <linux/init.h>
diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c
index c4a7ec31394c..4b031a4ac856 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_64.c
@@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
580 char __user *buf = ubuf; 580 char __user *buf = ubuf;
581 int i, err; 581 int i, err;
582 582
583 cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); 583 cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL);
584 if (!cpu_tsc) 584 if (!cpu_tsc)
585 return -ENOMEM; 585 return -ENOMEM;
586 586
@@ -759,13 +759,18 @@ static struct sysdev_class mce_sysclass = {
759}; 759};
760 760
761DEFINE_PER_CPU(struct sys_device, device_mce); 761DEFINE_PER_CPU(struct sys_device, device_mce);
762void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata;
762 763
763/* Why are there no generic functions for this? */ 764/* Why are there no generic functions for this? */
764#define ACCESSOR(name, var, start) \ 765#define ACCESSOR(name, var, start) \
765 static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ 766 static ssize_t show_ ## name(struct sys_device *s, \
767 struct sysdev_attribute *attr, \
768 char *buf) { \
766 return sprintf(buf, "%lx\n", (unsigned long)var); \ 769 return sprintf(buf, "%lx\n", (unsigned long)var); \
767 } \ 770 } \
768 static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ 771 static ssize_t set_ ## name(struct sys_device *s, \
772 struct sysdev_attribute *attr, \
773 const char *buf, size_t siz) { \
769 char *end; \ 774 char *end; \
770 unsigned long new = simple_strtoul(buf, &end, 0); \ 775 unsigned long new = simple_strtoul(buf, &end, 0); \
771 if (end == buf) return -EINVAL; \ 776 if (end == buf) return -EINVAL; \
@@ -786,14 +791,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart())
786ACCESSOR(bank4ctl,bank[4],mce_restart()) 791ACCESSOR(bank4ctl,bank[4],mce_restart())
787ACCESSOR(bank5ctl,bank[5],mce_restart()) 792ACCESSOR(bank5ctl,bank[5],mce_restart())
788 793
789static ssize_t show_trigger(struct sys_device *s, char *buf) 794static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr,
795 char *buf)
790{ 796{
791 strcpy(buf, trigger); 797 strcpy(buf, trigger);
792 strcat(buf, "\n"); 798 strcat(buf, "\n");
793 return strlen(trigger) + 1; 799 return strlen(trigger) + 1;
794} 800}
795 801
796static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) 802static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr,
803 const char *buf,size_t siz)
797{ 804{
798 char *p; 805 char *p;
799 int len; 806 int len;
@@ -806,12 +813,12 @@ static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz)
806} 813}
807 814
808static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); 815static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger);
809ACCESSOR(tolerant,tolerant,) 816static SYSDEV_INT_ATTR(tolerant, 0644, tolerant);
810ACCESSOR(check_interval,check_interval,mce_restart()) 817ACCESSOR(check_interval,check_interval,mce_restart())
811static struct sysdev_attribute *mce_attributes[] = { 818static struct sysdev_attribute *mce_attributes[] = {
812 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, 819 &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl,
813 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, 820 &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl,
814 &attr_tolerant, &attr_check_interval, &attr_trigger, 821 &attr_tolerant.attr, &attr_check_interval, &attr_trigger,
815 NULL 822 NULL
816}; 823};
817 824
@@ -853,7 +860,7 @@ error:
853 return err; 860 return err;
854} 861}
855 862
856static void mce_remove_device(unsigned int cpu) 863static __cpuinit void mce_remove_device(unsigned int cpu)
857{ 864{
858 int i; 865 int i;
859 866
@@ -877,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb,
877 case CPU_ONLINE: 884 case CPU_ONLINE:
878 case CPU_ONLINE_FROZEN: 885 case CPU_ONLINE_FROZEN:
879 mce_create_device(cpu); 886 mce_create_device(cpu);
887 if (threshold_cpu_callback)
888 threshold_cpu_callback(action, cpu);
880 break; 889 break;
881 case CPU_DEAD: 890 case CPU_DEAD:
882 case CPU_DEAD_FROZEN: 891 case CPU_DEAD_FROZEN:
892 if (threshold_cpu_callback)
893 threshold_cpu_callback(action, cpu);
883 mce_remove_device(cpu); 894 mce_remove_device(cpu);
884 break; 895 break;
885 } 896 }
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
index 7c9a813e1193..5eb390a4b2e9 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c
@@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
527 if (err) 527 if (err)
528 goto out_free; 528 goto out_free;
529 529
530 for_each_cpu_mask(i, b->cpus) { 530 for_each_cpu_mask_nr(i, b->cpus) {
531 if (i == cpu) 531 if (i == cpu)
532 continue; 532 continue;
533 533
@@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
617#endif 617#endif
618 618
619 /* remove all sibling symlinks before unregistering */ 619 /* remove all sibling symlinks before unregistering */
620 for_each_cpu_mask(i, b->cpus) { 620 for_each_cpu_mask_nr(i, b->cpus) {
621 if (i == cpu) 621 if (i == cpu)
622 continue; 622 continue;
623 623
@@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank)
628 deallocate_threshold_block(cpu, bank); 628 deallocate_threshold_block(cpu, bank);
629 629
630free_out: 630free_out:
631 kobject_del(b->kobj);
631 kobject_put(b->kobj); 632 kobject_put(b->kobj);
632 kfree(b); 633 kfree(b);
633 per_cpu(threshold_banks, cpu)[bank] = NULL; 634 per_cpu(threshold_banks, cpu)[bank] = NULL;
@@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu)
645} 646}
646 647
647/* get notified when a cpu comes on/off */ 648/* get notified when a cpu comes on/off */
648static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, 649static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action,
649 unsigned long action, void *hcpu) 650 unsigned int cpu)
650{ 651{
651 /* cpu was unsigned int to begin with */
652 unsigned int cpu = (unsigned long)hcpu;
653
654 if (cpu >= NR_CPUS) 652 if (cpu >= NR_CPUS)
655 goto out; 653 return;
656 654
657 switch (action) { 655 switch (action) {
658 case CPU_ONLINE: 656 case CPU_ONLINE:
@@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb,
666 default: 664 default:
667 break; 665 break;
668 } 666 }
669 out:
670 return NOTIFY_OK;
671} 667}
672 668
673static struct notifier_block threshold_cpu_notifier __cpuinitdata = {
674 .notifier_call = threshold_cpu_callback,
675};
676
677static __init int threshold_init_device(void) 669static __init int threshold_init_device(void)
678{ 670{
679 unsigned lcpu = 0; 671 unsigned lcpu = 0;
@@ -684,7 +676,7 @@ static __init int threshold_init_device(void)
684 if (err) 676 if (err)
685 return err; 677 return err;
686 } 678 }
687 register_hotcpu_notifier(&threshold_cpu_notifier); 679 threshold_cpu_callback = amd_64_threshold_cpu_callback;
688 return 0; 680 return 0;
689} 681}
690 682
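Note (not part of the patch): with this change the generic MCE hotplug notifier in mce_64.c invokes an optional threshold_cpu_callback function pointer, and mce_amd_64.c's threshold_init_device() installs amd_64_threshold_cpu_callback into it instead of registering its own notifier. A stand-alone sketch of that registration pattern; the names mirror the patch, but the action code and "event loop" are simplified stand-ins.

#include <stdio.h>

#define DEMO_CPU_ONLINE 1	/* stand-in action code, not the kernel's CPU_ONLINE value */

/* generic side (mce_64.c): owns the notifier, exposes a hook */
static void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu);

static void mce_cpu_online(unsigned int cpu)
{
	/* ... create the per-CPU MCE sysdev here ... */
	if (threshold_cpu_callback)
		threshold_cpu_callback(DEMO_CPU_ONLINE, cpu);
}

/* vendor side (mce_amd_64.c): fills in the hook at init time */
static void amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu)
{
	printf("threshold callback: action=%lu cpu=%u\n", action, cpu);
}

int main(void)
{
	threshold_cpu_callback = amd_64_threshold_cpu_callback;	/* as in threshold_init_device() */
	mce_cpu_online(0);
	return 0;
}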
diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c
index cc1fccdd31e0..a74af128efc9 100644
--- a/arch/x86/kernel/cpu/mcheck/non-fatal.c
+++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * Non Fatal Machine Check Exception Reporting 2 * Non Fatal Machine Check Exception Reporting
3 * 3 *
4 * (C) Copyright 2002 Dave Jones. <davej@codemonkey.org.uk> 4 * (C) Copyright 2002 Dave Jones. <davej@redhat.com>
5 * 5 *
6 * This file contains routines to check for non-fatal MCEs every 15s 6 * This file contains routines to check for non-fatal MCEs every 15s
7 * 7 *
diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c
index eef001ad3bde..9b60fce09f75 100644
--- a/arch/x86/kernel/cpu/mcheck/p4.c
+++ b/arch/x86/kernel/cpu/mcheck/p4.c
@@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
102 /* The temperature transition interrupt handler setup */ 102 /* The temperature transition interrupt handler setup */
103 h = THERMAL_APIC_VECTOR; /* our delivery vector */ 103 h = THERMAL_APIC_VECTOR; /* our delivery vector */
104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ 104 h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */
105 apic_write_around(APIC_LVTTHMR, h); 105 apic_write(APIC_LVTTHMR, h);
106 106
107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); 107 rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); 108 wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h);
@@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c)
114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); 114 wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h);
115 115
116 l = apic_read(APIC_LVTTHMR); 116 l = apic_read(APIC_LVTTHMR);
117 apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 117 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); 118 printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu);
119 119
120 /* enable thermal throttle processing */ 120 /* enable thermal throttle processing */
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 1f4cc48c14c6..d5ae2243f0b9 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0);
35 35
36#define define_therm_throt_sysdev_show_func(name) \ 36#define define_therm_throt_sysdev_show_func(name) \
37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ 37static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \
38 struct sysdev_attribute *attr, \
38 char *buf) \ 39 char *buf) \
39{ \ 40{ \
40 unsigned int cpu = dev->id; \ 41 unsigned int cpu = dev->id; \
diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl
new file mode 100644
index 000000000000..dfea390e1608
--- /dev/null
+++ b/arch/x86/kernel/cpu/mkcapflags.pl
@@ -0,0 +1,32 @@
1#!/usr/bin/perl
2#
3# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h
4#
5
6($in, $out) = @ARGV;
7
8open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n";
9open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n";
10
11print OUT "#include <asm/cpufeature.h>\n\n";
12print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n";
13
14while (defined($line = <IN>)) {
15 if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) {
16 $macro = $1;
17 $feature = $2;
18 $tail = $3;
19 if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) {
20 $feature = $1;
21 }
22
23 if ($feature ne '') {
24 printf OUT "\t%-32s = \"%s\",\n",
25 "[$macro]", "\L$feature";
26 }
27 }
28}
29print OUT "};\n";
30
31close(IN);
32close(OUT);
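Note (not part of the patch): the new mkcapflags.pl turns the #define X86_FEATURE_* lines of cpufeature.h into a name table indexed by feature bit. A stand-alone sketch of the shape of the table it generates, using placeholder feature indices rather than the kernel's real values; the second entry illustrates the script's rule that a quoted name inside the trailing comment overrides the lower-cased macro suffix.

#include <stdio.h>

#define DEMO_FEATURE_FPU 0	/* placeholder index for X86_FEATURE_FPU */
#define DEMO_FEATURE_XMM 25	/* placeholder index for a flag whose comment carries "sse" */
#define NCAPINTS 8

static const char * const cap_flags[NCAPINTS * 32] = {
	[DEMO_FEATURE_FPU] = "fpu",	/* macro suffix, lower-cased */
	[DEMO_FEATURE_XMM] = "sse",	/* quoted name in the comment wins */
};

int main(void)
{
	printf("bit %d -> %s\n", DEMO_FEATURE_FPU, cap_flags[DEMO_FEATURE_FPU]);
	printf("bit %d -> %s\n", DEMO_FEATURE_XMM, cap_flags[DEMO_FEATURE_XMM]);
	return 0;
}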
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 509bd3d9eacd..4e8d77f01eeb 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
379 unsigned long *size, mtrr_type *type) 379 unsigned long *size, mtrr_type *type)
380{ 380{
381 unsigned int mask_lo, mask_hi, base_lo, base_hi; 381 unsigned int mask_lo, mask_hi, base_lo, base_hi;
382 unsigned int tmp, hi;
382 383
383 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); 384 rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
384 if ((mask_lo & 0x800) == 0) { 385 if ((mask_lo & 0x800) == 0) {
@@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
392 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); 393 rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi);
393 394
394 /* Work out the shifted address mask. */ 395 /* Work out the shifted address mask. */
395 mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) 396 tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT;
396 | mask_lo >> PAGE_SHIFT; 397 mask_lo = size_or_mask | tmp;
398 /* Expand tmp with high bits to all 1s*/
399 hi = fls(tmp);
400 if (hi > 0) {
401 tmp |= ~((1<<(hi - 1)) - 1);
402
403 if (tmp != mask_lo) {
404 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n");
405 mask_lo = tmp;
406 }
407 }
397 408
398 /* This works correctly if size is a power of two, i.e. a 409 /* This works correctly if size is a power of two, i.e. a
399 contiguous range. */ 410 contiguous range. */
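Note (not part of the patch): the fix-up added to generic_get_mtrr() extends a too-short BIOS-programmed mask so that every address bit above the highest set bit is also masked, warning once if that changed anything. A stand-alone sketch of the same bit manipulation on a made-up value; fls() is emulated here with __builtin_clz.

#include <stdio.h>

static int fls(unsigned int x)
{
	return x ? 32 - __builtin_clz(x) : 0;
}

int main(void)
{
	unsigned int tmp = 0x00ff0000;	/* hypothetical shifted mask */
	int hi = fls(tmp);

	if (hi > 0)
		tmp |= ~((1u << (hi - 1)) - 1);	/* force all higher bits to 1 */

	printf("fixed-up mask: 0x%08x\n", tmp);	/* prints 0xffff0000 */
	return 0;
}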
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 84c480bb3715..4c4214690dd1 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset)
405 } 405 }
406 /* RED-PEN: base can be > 32bit */ 406 /* RED-PEN: base can be > 32bit */
407 len += seq_printf(seq, 407 len += seq_printf(seq,
408 "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", 408 "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n",
409 i, base, base >> (20 - PAGE_SHIFT), size, factor, 409 i, base, base >> (20 - PAGE_SHIFT), size, factor,
410 mtrr_attrib_to_str(type), mtrr_usage_table[i]); 410 mtrr_usage_table[i], mtrr_attrib_to_str(type));
411 } 411 }
412 } 412 }
413 return 0; 413 return 0;
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 6f23969c8faf..c78c04821ea1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -729,7 +729,7 @@ struct var_mtrr_range_state {
729 mtrr_type type; 729 mtrr_type type;
730}; 730};
731 731
732struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 732static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
733static int __initdata debug_print; 733static int __initdata debug_print;
734 734
735static int __init 735static int __init
@@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
759 /* take out UC ranges */ 759 /* take out UC ranges */
760 for (i = 0; i < num_var_ranges; i++) { 760 for (i = 0; i < num_var_ranges; i++) {
761 type = range_state[i].type; 761 type = range_state[i].type;
762 if (type != MTRR_TYPE_UNCACHABLE) 762 if (type != MTRR_TYPE_UNCACHABLE &&
763 type != MTRR_TYPE_WRPROT)
763 continue; 764 continue;
764 size = range_state[i].size_pfn; 765 size = range_state[i].size_pfn;
765 if (!size) 766 if (!size)
@@ -834,7 +835,14 @@ static int __init enable_mtrr_cleanup_setup(char *str)
834 enable_mtrr_cleanup = 1; 835 enable_mtrr_cleanup = 1;
835 return 0; 836 return 0;
836} 837}
837early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); 838early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup);
839
840static int __init mtrr_cleanup_debug_setup(char *str)
841{
842 debug_print = 1;
843 return 0;
844}
845early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup);
838 846
839struct var_mtrr_state { 847struct var_mtrr_state {
840 unsigned long range_startk; 848 unsigned long range_startk;
@@ -898,6 +906,27 @@ set_var_mtrr_all(unsigned int address_bits)
898 } 906 }
899} 907}
900 908
909static unsigned long to_size_factor(unsigned long sizek, char *factorp)
910{
911 char factor;
912 unsigned long base = sizek;
913
914 if (base & ((1<<10) - 1)) {
915 /* not MB alignment */
916 factor = 'K';
917 } else if (base & ((1<<20) - 1)){
918 factor = 'M';
919 base >>= 10;
920 } else {
921 factor = 'G';
922 base >>= 20;
923 }
924
925 *factorp = factor;
926
927 return base;
928}
929
901static unsigned int __init 930static unsigned int __init
902range_to_mtrr(unsigned int reg, unsigned long range_startk, 931range_to_mtrr(unsigned int reg, unsigned long range_startk,
903 unsigned long range_sizek, unsigned char type) 932 unsigned long range_sizek, unsigned char type)
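Note (not part of the patch): the new to_size_factor() picks the largest of K/M/G that evenly divides a size given in KB and scales the value, so the debug printks can report "512K", "2M" or "1G" instead of always rounding to megabytes. A user-space copy of the helper with a few arbitrary test sizes.

#include <stdio.h>

static unsigned long to_size_factor(unsigned long sizek, char *factorp)
{
	unsigned long base = sizek;
	char factor;

	if (base & ((1 << 10) - 1)) {		/* not MB aligned */
		factor = 'K';
	} else if (base & ((1 << 20) - 1)) {	/* not GB aligned */
		factor = 'M';
		base >>= 10;
	} else {
		factor = 'G';
		base >>= 20;
	}
	*factorp = factor;
	return base;
}

int main(void)
{
	unsigned long sizes[] = { 64, 2048, 1048576 };	/* KB */
	int i;

	for (i = 0; i < 3; i++) {
		char f;
		unsigned long v = to_size_factor(sizes[i], &f);
		printf("%luK -> %lu%cB\n", sizes[i], v, f);
	}
	return 0;
}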
@@ -919,13 +948,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk,
919 align = max_align; 948 align = max_align;
920 949
921 sizek = 1 << align; 950 sizek = 1 << align;
922 if (debug_print) 951 if (debug_print) {
952 char start_factor = 'K', size_factor = 'K';
953 unsigned long start_base, size_base;
954
955 start_base = to_size_factor(range_startk, &start_factor),
956 size_base = to_size_factor(sizek, &size_factor),
957
923 printk(KERN_DEBUG "Setting variable MTRR %d, " 958 printk(KERN_DEBUG "Setting variable MTRR %d, "
924 "base: %ldMB, range: %ldMB, type %s\n", 959 "base: %ld%cB, range: %ld%cB, type %s\n",
925 reg, range_startk >> 10, sizek >> 10, 960 reg, start_base, start_factor,
961 size_base, size_factor,
926 (type == MTRR_TYPE_UNCACHABLE)?"UC": 962 (type == MTRR_TYPE_UNCACHABLE)?"UC":
927 ((type == MTRR_TYPE_WRBACK)?"WB":"Other") 963 ((type == MTRR_TYPE_WRBACK)?"WB":"Other")
928 ); 964 );
965 }
929 save_var_mtrr(reg++, range_startk, sizek, type); 966 save_var_mtrr(reg++, range_startk, sizek, type);
930 range_startk += sizek; 967 range_startk += sizek;
931 range_sizek -= sizek; 968 range_sizek -= sizek;
@@ -970,6 +1007,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
970 /* try to append some small hole */ 1007 /* try to append some small hole */
971 range0_basek = state->range_startk; 1008 range0_basek = state->range_startk;
972 range0_sizek = ALIGN(state->range_sizek, chunk_sizek); 1009 range0_sizek = ALIGN(state->range_sizek, chunk_sizek);
1010
1011 /* no increase */
973 if (range0_sizek == state->range_sizek) { 1012 if (range0_sizek == state->range_sizek) {
974 if (debug_print) 1013 if (debug_print)
975 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", 1014 printk(KERN_DEBUG "rangeX: %016lx - %016lx\n",
@@ -980,13 +1019,40 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
980 return 0; 1019 return 0;
981 } 1020 }
982 1021
983 range0_sizek -= chunk_sizek; 1022 /* only cut back, when it is not the last */
984 if (range0_sizek && sizek) { 1023 if (sizek) {
985 while (range0_basek + range0_sizek > (basek + sizek)) { 1024 while (range0_basek + range0_sizek > (basek + sizek)) {
986 range0_sizek -= chunk_sizek; 1025 if (range0_sizek >= chunk_sizek)
987 if (!range0_sizek) 1026 range0_sizek -= chunk_sizek;
988 break; 1027 else
989 } 1028 range0_sizek = 0;
1029
1030 if (!range0_sizek)
1031 break;
1032 }
1033 }
1034
1035second_try:
1036 range_basek = range0_basek + range0_sizek;
1037
1038 /* one hole in the middle */
1039 if (range_basek > basek && range_basek <= (basek + sizek))
1040 second_sizek = range_basek - basek;
1041
1042 if (range0_sizek > state->range_sizek) {
1043
1044 /* one hole in middle or at end */
1045 hole_sizek = range0_sizek - state->range_sizek - second_sizek;
1046
1047 /* hole size should be less than half of range0 size */
1048 if (hole_sizek >= (range0_sizek >> 1) &&
1049 range0_sizek >= chunk_sizek) {
1050 range0_sizek -= chunk_sizek;
1051 second_sizek = 0;
1052 hole_sizek = 0;
1053
1054 goto second_try;
1055 }
990 } 1056 }
991 1057
992 if (range0_sizek) { 1058 if (range0_sizek) {
@@ -996,50 +1062,28 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek,
996 (range0_basek + range0_sizek)<<10); 1062 (range0_basek + range0_sizek)<<10);
997 state->reg = range_to_mtrr(state->reg, range0_basek, 1063 state->reg = range_to_mtrr(state->reg, range0_basek,
998 range0_sizek, MTRR_TYPE_WRBACK); 1064 range0_sizek, MTRR_TYPE_WRBACK);
999
1000 }
1001
1002 range_basek = range0_basek + range0_sizek;
1003 range_sizek = chunk_sizek;
1004
1005 if (range_basek + range_sizek > basek &&
1006 range_basek + range_sizek <= (basek + sizek)) {
1007 /* one hole */
1008 second_basek = basek;
1009 second_sizek = range_basek + range_sizek - basek;
1010 } 1065 }
1011 1066
1012 /* if last piece, only could one hole near end */ 1067 if (range0_sizek < state->range_sizek) {
1013 if ((second_basek || !basek) && 1068 /* need to handle left over */
1014 range_sizek - (state->range_sizek - range0_sizek) - second_sizek <
1015 (chunk_sizek >> 1)) {
1016 /*
1017 * one hole in middle (second_sizek is 0) or at end
1018 * (second_sizek is 0 )
1019 */
1020 hole_sizek = range_sizek - (state->range_sizek - range0_sizek)
1021 - second_sizek;
1022 hole_basek = range_basek + range_sizek - hole_sizek
1023 - second_sizek;
1024 } else {
1025 /* fallback for big hole, or several holes */
1026 range_sizek = state->range_sizek - range0_sizek; 1069 range_sizek = state->range_sizek - range0_sizek;
1027 second_basek = 0; 1070
1028 second_sizek = 0; 1071 if (debug_print)
1072 printk(KERN_DEBUG "range: %016lx - %016lx\n",
1073 range_basek<<10,
1074 (range_basek + range_sizek)<<10);
1075 state->reg = range_to_mtrr(state->reg, range_basek,
1076 range_sizek, MTRR_TYPE_WRBACK);
1029 } 1077 }
1030 1078
1031 if (debug_print)
1032 printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10,
1033 (range_basek + range_sizek)<<10);
1034 state->reg = range_to_mtrr(state->reg, range_basek, range_sizek,
1035 MTRR_TYPE_WRBACK);
1036 if (hole_sizek) { 1079 if (hole_sizek) {
1080 hole_basek = range_basek - hole_sizek - second_sizek;
1037 if (debug_print) 1081 if (debug_print)
1038 printk(KERN_DEBUG "hole: %016lx - %016lx\n", 1082 printk(KERN_DEBUG "hole: %016lx - %016lx\n",
1039 hole_basek<<10, (hole_basek + hole_sizek)<<10); 1083 hole_basek<<10,
1040 state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, 1084 (hole_basek + hole_sizek)<<10);
1041 MTRR_TYPE_UNCACHABLE); 1085 state->reg = range_to_mtrr(state->reg, hole_basek,
1042 1086 hole_sizek, MTRR_TYPE_UNCACHABLE);
1043 } 1087 }
1044 1088
1045 return second_sizek; 1089 return second_sizek;
@@ -1154,11 +1198,11 @@ struct mtrr_cleanup_result {
1154}; 1198};
1155 1199
1156/* 1200/*
1157 * gran_size: 1M, 2M, ..., 2G 1201 * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G
1158 * chunk size: gran_size, ..., 4G 1202 * chunk size: gran_size, ..., 2G
1159 * so we need (2+13)*6 1203 * so we need (1+16)*8
1160 */ 1204 */
1161#define NUM_RESULT 90 1205#define NUM_RESULT 136
1162#define PSHIFT (PAGE_SHIFT - 10) 1206#define PSHIFT (PAGE_SHIFT - 10)
1163 1207
1164static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; 1208static struct mtrr_cleanup_result __initdata result[NUM_RESULT];
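Note (not part of the patch): the updated NUM_RESULT comment counts (1+16)*8 = 136 combinations — gran_size runs over the 16 powers of two from 64K to 2G, and for each one chunk_size runs from gran_size up to 2G. A tiny check of that arithmetic, mirroring the two loops in mtrr_cleanup() further down.

#include <stdio.h>

int main(void)
{
	int gran, chunk, count = 0;

	for (gran = 16; gran < 32; gran++)		/* 2^16 (64K) .. 2^31 (2G) */
		for (chunk = gran; chunk < 32; chunk++)	/* gran_size .. 2G */
			count++;

	printf("NUM_RESULT = %d\n", count);		/* prints 136 */
	return 0;
}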
@@ -1168,13 +1212,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM];
1168static int __init mtrr_cleanup(unsigned address_bits) 1212static int __init mtrr_cleanup(unsigned address_bits)
1169{ 1213{
1170 unsigned long extra_remove_base, extra_remove_size; 1214 unsigned long extra_remove_base, extra_remove_size;
1171 unsigned long i, base, size, def, dummy; 1215 unsigned long base, size, def, dummy;
1172 mtrr_type type; 1216 mtrr_type type;
1173 int nr_range, nr_range_new; 1217 int nr_range, nr_range_new;
1174 u64 chunk_size, gran_size; 1218 u64 chunk_size, gran_size;
1175 unsigned long range_sums, range_sums_new; 1219 unsigned long range_sums, range_sums_new;
1176 int index_good; 1220 int index_good;
1177 int num_reg_good; 1221 int num_reg_good;
1222 int i;
1178 1223
1179 /* extra one for all 0 */ 1224 /* extra one for all 0 */
1180 int num[MTRR_NUM_TYPES + 1]; 1225 int num[MTRR_NUM_TYPES + 1];
@@ -1204,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1204 continue; 1249 continue;
1205 if (!size) 1250 if (!size)
1206 type = MTRR_NUM_TYPES; 1251 type = MTRR_NUM_TYPES;
1252 if (type == MTRR_TYPE_WRPROT)
1253 type = MTRR_TYPE_UNCACHABLE;
1207 num[type]++; 1254 num[type]++;
1208 } 1255 }
1209 1256
@@ -1216,23 +1263,57 @@ static int __init mtrr_cleanup(unsigned address_bits)
1216 num_var_ranges - num[MTRR_NUM_TYPES]) 1263 num_var_ranges - num[MTRR_NUM_TYPES])
1217 return 0; 1264 return 0;
1218 1265
1266 /* print original var MTRRs at first, for debugging: */
1267 printk(KERN_DEBUG "original variable MTRRs\n");
1268 for (i = 0; i < num_var_ranges; i++) {
1269 char start_factor = 'K', size_factor = 'K';
1270 unsigned long start_base, size_base;
1271
1272 size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10);
1273 if (!size_base)
1274 continue;
1275
1276 size_base = to_size_factor(size_base, &size_factor),
1277 start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10);
1278 start_base = to_size_factor(start_base, &start_factor),
1279 type = range_state[i].type;
1280
1281 printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n",
1282 i, start_base, start_factor,
1283 size_base, size_factor,
1284 (type == MTRR_TYPE_UNCACHABLE) ? "UC" :
1285 ((type == MTRR_TYPE_WRPROT) ? "WP" :
1286 ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other"))
1287 );
1288 }
1289
1219 memset(range, 0, sizeof(range)); 1290 memset(range, 0, sizeof(range));
1220 extra_remove_size = 0; 1291 extra_remove_size = 0;
1221 if (mtrr_tom2) { 1292 extra_remove_base = 1 << (32 - PAGE_SHIFT);
1222 extra_remove_base = 1 << (32 - PAGE_SHIFT); 1293 if (mtrr_tom2)
1223 extra_remove_size = 1294 extra_remove_size =
1224 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; 1295 (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base;
1225 }
1226 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, 1296 nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base,
1227 extra_remove_size); 1297 extra_remove_size);
1298 /*
1299 * [0, 1M) should always be coverred by var mtrr with WB
1300 * and fixed mtrrs should take effective before var mtrr for it
1301 */
1302 nr_range = add_range_with_merge(range, nr_range, 0,
1303 (1ULL<<(20 - PAGE_SHIFT)) - 1);
1304 /* sort the ranges */
1305 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL);
1306
1228 range_sums = sum_ranges(range, nr_range); 1307 range_sums = sum_ranges(range, nr_range);
1229 printk(KERN_INFO "total RAM coverred: %ldM\n", 1308 printk(KERN_INFO "total RAM coverred: %ldM\n",
1230 range_sums >> (20 - PAGE_SHIFT)); 1309 range_sums >> (20 - PAGE_SHIFT));
1231 1310
1232 if (mtrr_chunk_size && mtrr_gran_size) { 1311 if (mtrr_chunk_size && mtrr_gran_size) {
1233 int num_reg; 1312 int num_reg;
1313 char gran_factor, chunk_factor, lose_factor;
1314 unsigned long gran_base, chunk_base, lose_base;
1234 1315
1235 debug_print = 1; 1316 debug_print++;
1236 /* convert ranges to var ranges state */ 1317 /* convert ranges to var ranges state */
1237 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, 1318 num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size,
1238 mtrr_gran_size); 1319 mtrr_gran_size);
@@ -1256,34 +1337,48 @@ static int __init mtrr_cleanup(unsigned address_bits)
1256 result[i].lose_cover_sizek = 1337 result[i].lose_cover_sizek =
1257 (range_sums - range_sums_new) << PSHIFT; 1338 (range_sums - range_sums_new) << PSHIFT;
1258 1339
1259 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1340 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1260 result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, 1341 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1261 result[i].chunk_sizek >> 10); 1342 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1262 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", 1343 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1344 result[i].bad?"*BAD*":" ",
1345 gran_base, gran_factor, chunk_base, chunk_factor);
1346 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1263 result[i].num_reg, result[i].bad?"-":"", 1347 result[i].num_reg, result[i].bad?"-":"",
1264 result[i].lose_cover_sizek >> 10); 1348 lose_base, lose_factor);
1265 if (!result[i].bad) { 1349 if (!result[i].bad) {
1266 set_var_mtrr_all(address_bits); 1350 set_var_mtrr_all(address_bits);
1267 return 1; 1351 return 1;
1268 } 1352 }
1269 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " 1353 printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, "
1270 "will find optimal one\n"); 1354 "will find optimal one\n");
1271 debug_print = 0; 1355 debug_print--;
1272 memset(result, 0, sizeof(result[0])); 1356 memset(result, 0, sizeof(result[0]));
1273 } 1357 }
1274 1358
1275 i = 0; 1359 i = 0;
1276 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); 1360 memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn));
1277 memset(result, 0, sizeof(result)); 1361 memset(result, 0, sizeof(result));
1278 for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { 1362 for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) {
1279 for (chunk_size = gran_size; chunk_size < (1ULL<<33); 1363 char gran_factor;
1364 unsigned long gran_base;
1365
1366 if (debug_print)
1367 gran_base = to_size_factor(gran_size >> 10, &gran_factor);
1368
1369 for (chunk_size = gran_size; chunk_size < (1ULL<<32);
1280 chunk_size <<= 1) { 1370 chunk_size <<= 1) {
1281 int num_reg; 1371 int num_reg;
1282 1372
1283 if (debug_print) 1373 if (debug_print) {
1284 printk(KERN_INFO 1374 char chunk_factor;
1285 "\ngran_size: %lldM chunk_size_size: %lldM\n", 1375 unsigned long chunk_base;
1286 gran_size >> 20, chunk_size >> 20); 1376
1377 chunk_base = to_size_factor(chunk_size>>10, &chunk_factor),
1378 printk(KERN_INFO "\n");
1379 printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n",
1380 gran_base, gran_factor, chunk_base, chunk_factor);
1381 }
1287 if (i >= NUM_RESULT) 1382 if (i >= NUM_RESULT)
1288 continue; 1383 continue;
1289 1384
@@ -1326,12 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits)
1326 1421
1327 /* print out all */ 1422 /* print out all */
1328 for (i = 0; i < NUM_RESULT; i++) { 1423 for (i = 0; i < NUM_RESULT; i++) {
1329 printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", 1424 char gran_factor, chunk_factor, lose_factor;
1330 result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, 1425 unsigned long gran_base, chunk_base, lose_base;
1331 result[i].chunk_sizek >> 10); 1426
1332 printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", 1427 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1333 result[i].num_reg, result[i].bad?"-":"", 1428 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1334 result[i].lose_cover_sizek >> 10); 1429 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1430 printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t",
1431 result[i].bad?"*BAD*":" ",
1432 gran_base, gran_factor, chunk_base, chunk_factor);
1433 printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n",
1434 result[i].num_reg, result[i].bad?"-":"",
1435 lose_base, lose_factor);
1335 } 1436 }
1336 1437
1337 /* try to find the optimal index */ 1438 /* try to find the optimal index */
@@ -1339,10 +1440,8 @@ static int __init mtrr_cleanup(unsigned address_bits)
1339 nr_mtrr_spare_reg = num_var_ranges - 1; 1440 nr_mtrr_spare_reg = num_var_ranges - 1;
1340 num_reg_good = -1; 1441 num_reg_good = -1;
1341 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { 1442 for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) {
1342 if (!min_loss_pfn[i]) { 1443 if (!min_loss_pfn[i])
1343 num_reg_good = i; 1444 num_reg_good = i;
1344 break;
1345 }
1346 } 1445 }
1347 1446
1348 index_good = -1; 1447 index_good = -1;
@@ -1358,21 +1457,26 @@ static int __init mtrr_cleanup(unsigned address_bits)
1358 } 1457 }
1359 1458
1360 if (index_good != -1) { 1459 if (index_good != -1) {
1460 char gran_factor, chunk_factor, lose_factor;
1461 unsigned long gran_base, chunk_base, lose_base;
1462
1361 printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); 1463 printk(KERN_INFO "Found optimal setting for mtrr clean up\n");
1362 i = index_good; 1464 i = index_good;
1363 printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", 1465 gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
1364 result[i].gran_sizek >> 10, 1466 chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
1365 result[i].chunk_sizek >> 10); 1467 lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
1366 printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", 1468 printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t",
1367 result[i].num_reg, 1469 gran_base, gran_factor, chunk_base, chunk_factor);
1368 result[i].lose_cover_sizek >> 10); 1470 printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n",
1471 result[i].num_reg, lose_base, lose_factor);
1369 /* convert ranges to var ranges state */ 1472 /* convert ranges to var ranges state */
1370 chunk_size = result[i].chunk_sizek; 1473 chunk_size = result[i].chunk_sizek;
1371 chunk_size <<= 10; 1474 chunk_size <<= 10;
1372 gran_size = result[i].gran_sizek; 1475 gran_size = result[i].gran_sizek;
1373 gran_size <<= 10; 1476 gran_size <<= 10;
1374 debug_print = 1; 1477 debug_print++;
1375 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); 1478 x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size);
1479 debug_print--;
1376 set_var_mtrr_all(address_bits); 1480 set_var_mtrr_all(address_bits);
1377 return 1; 1481 return 1;
1378 } 1482 }
@@ -1496,11 +1600,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1496 1600
1497 /* kvm/qemu doesn't have mtrr set right, don't trim them all */ 1601 /* kvm/qemu doesn't have mtrr set right, don't trim them all */
1498 if (!highest_pfn) { 1602 if (!highest_pfn) {
1499 if (!kvm_para_available()) { 1603 WARN(!kvm_para_available(), KERN_WARNING
1500 printk(KERN_WARNING
1501 "WARNING: strange, CPU MTRRs all blank?\n"); 1604 "WARNING: strange, CPU MTRRs all blank?\n");
1502 WARN_ON(1);
1503 }
1504 return 0; 1605 return 0;
1505 } 1606 }
1506 1607
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index 6d4bdc02388a..9abd48b22674 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -17,6 +17,8 @@
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kprobes.h>
21
20#include <asm/apic.h> 22#include <asm/apic.h>
21#include <asm/intel_arch_perfmon.h> 23#include <asm/intel_arch_perfmon.h>
22 24
@@ -250,7 +252,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr,
250 252
251 do_div(count, nmi_hz); 253 do_div(count, nmi_hz);
252 if(descr) 254 if(descr)
253 Dprintk("setting %s to -0x%08Lx\n", descr, count); 255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
254 wrmsrl(perfctr_msr, 0 - count); 256 wrmsrl(perfctr_msr, 0 - count);
255} 257}
256 258
@@ -261,7 +263,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr,
261 263
262 do_div(count, nmi_hz); 264 do_div(count, nmi_hz);
263 if(descr) 265 if(descr)
264 Dprintk("setting %s to -0x%08Lx\n", descr, count); 266 pr_debug("setting %s to -0x%08Lx\n", descr, count);
265 wrmsr(perfctr_msr, (u32)(-count), 0); 267 wrmsr(perfctr_msr, (u32)(-count), 0);
266} 268}
267 269
@@ -295,13 +297,19 @@ static int setup_k7_watchdog(unsigned nmi_hz)
295 /* setup the timer */ 297 /* setup the timer */
296 wrmsr(evntsel_msr, evntsel, 0); 298 wrmsr(evntsel_msr, evntsel, 0);
297 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); 299 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz);
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301 300
301 /* initialize the wd struct before enabling */
302 wd->perfctr_msr = perfctr_msr; 302 wd->perfctr_msr = perfctr_msr;
303 wd->evntsel_msr = evntsel_msr; 303 wd->evntsel_msr = evntsel_msr;
304 wd->cccr_msr = 0; /* unused */ 304 wd->cccr_msr = 0; /* unused */
305
306 /* ok, everything is initialized, announce that we're set */
307 cpu_nmi_set_wd_enabled();
308
309 apic_write(APIC_LVTPC, APIC_DM_NMI);
310 evntsel |= K7_EVNTSEL_ENABLE;
311 wrmsr(evntsel_msr, evntsel, 0);
312
305 return 1; 313 return 1;
306} 314}
307 315
@@ -330,7 +338,8 @@ static void single_msr_unreserve(void)
330 release_perfctr_nmi(wd_ops->perfctr); 338 release_perfctr_nmi(wd_ops->perfctr);
331} 339}
332 340
333static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 341static void __kprobes
342single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
334{ 343{
335 /* start the cycle over again */ 344 /* start the cycle over again */
336 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); 345 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
@@ -379,17 +388,23 @@ static int setup_p6_watchdog(unsigned nmi_hz)
379 wrmsr(evntsel_msr, evntsel, 0); 388 wrmsr(evntsel_msr, evntsel, 0);
380 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 389 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
381 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); 390 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz);
382 apic_write(APIC_LVTPC, APIC_DM_NMI);
383 evntsel |= P6_EVNTSEL0_ENABLE;
384 wrmsr(evntsel_msr, evntsel, 0);
385 391
392 /* initialize the wd struct before enabling */
386 wd->perfctr_msr = perfctr_msr; 393 wd->perfctr_msr = perfctr_msr;
387 wd->evntsel_msr = evntsel_msr; 394 wd->evntsel_msr = evntsel_msr;
388 wd->cccr_msr = 0; /* unused */ 395 wd->cccr_msr = 0; /* unused */
396
397 /* ok, everything is initialized, announce that we're set */
398 cpu_nmi_set_wd_enabled();
399
400 apic_write(APIC_LVTPC, APIC_DM_NMI);
401 evntsel |= P6_EVNTSEL0_ENABLE;
402 wrmsr(evntsel_msr, evntsel, 0);
403
389 return 1; 404 return 1;
390} 405}
391 406
392static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 407static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
393{ 408{
394 /* 409 /*
395 * P6 based Pentium M need to re-unmask 410 * P6 based Pentium M need to re-unmask
@@ -432,6 +447,27 @@ static const struct wd_ops p6_wd_ops = {
432#define P4_CCCR_ENABLE (1 << 12) 447#define P4_CCCR_ENABLE (1 << 12)
433#define P4_CCCR_OVF (1 << 31) 448#define P4_CCCR_OVF (1 << 31)
434 449
450#define P4_CONTROLS 18
451static unsigned int p4_controls[18] = {
452 MSR_P4_BPU_CCCR0,
453 MSR_P4_BPU_CCCR1,
454 MSR_P4_BPU_CCCR2,
455 MSR_P4_BPU_CCCR3,
456 MSR_P4_MS_CCCR0,
457 MSR_P4_MS_CCCR1,
458 MSR_P4_MS_CCCR2,
459 MSR_P4_MS_CCCR3,
460 MSR_P4_FLAME_CCCR0,
461 MSR_P4_FLAME_CCCR1,
462 MSR_P4_FLAME_CCCR2,
463 MSR_P4_FLAME_CCCR3,
464 MSR_P4_IQ_CCCR0,
465 MSR_P4_IQ_CCCR1,
466 MSR_P4_IQ_CCCR2,
467 MSR_P4_IQ_CCCR3,
468 MSR_P4_IQ_CCCR4,
469 MSR_P4_IQ_CCCR5,
470};
435/* 471/*
436 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter 472 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
437 * CRU_ESCR0 (with any non-null event selector) through a complemented 473 * CRU_ESCR0 (with any non-null event selector) through a complemented
@@ -473,12 +509,38 @@ static int setup_p4_watchdog(unsigned nmi_hz)
473 evntsel_msr = MSR_P4_CRU_ESCR0; 509 evntsel_msr = MSR_P4_CRU_ESCR0;
474 cccr_msr = MSR_P4_IQ_CCCR0; 510 cccr_msr = MSR_P4_IQ_CCCR0;
475 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); 511 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
512
513 /*
514 * If we're on the kdump kernel or other situation, we may
515 * still have other performance counter registers set to
516 * interrupt and they'll keep interrupting forever because
517 * of the P4_CCCR_OVF quirk. So we need to ACK all the
518 * pending interrupts and disable all the registers here,
519 * before reenabling the NMI delivery. Refer to p4_rearm()
520 * about the P4_CCCR_OVF quirk.
521 */
522 if (reset_devices) {
523 unsigned int low, high;
524 int i;
525
526 for (i = 0; i < P4_CONTROLS; i++) {
527 rdmsr(p4_controls[i], low, high);
528 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
529 wrmsr(p4_controls[i], low, high);
530 }
531 }
476 } else { 532 } else {
477 /* logical cpu 1 */ 533 /* logical cpu 1 */
478 perfctr_msr = MSR_P4_IQ_PERFCTR1; 534 perfctr_msr = MSR_P4_IQ_PERFCTR1;
479 evntsel_msr = MSR_P4_CRU_ESCR0; 535 evntsel_msr = MSR_P4_CRU_ESCR0;
480 cccr_msr = MSR_P4_IQ_CCCR1; 536 cccr_msr = MSR_P4_IQ_CCCR1;
481 cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); 537
538 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
539 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
540 cccr_val = P4_CCCR_OVF_PMI0;
541 else
542 cccr_val = P4_CCCR_OVF_PMI1;
543 cccr_val |= P4_CCCR_ESCR_SELECT(4);
482 } 544 }
483 545
484 evntsel = P4_ESCR_EVENT_SELECT(0x3F) 546 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
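The reset_devices branch added above walks the whole p4_controls[] table and clears both the enable and the overflow bit of every CCCR, so counters left running by a crashed kernel stop re-raising NMIs before delivery is re-enabled. The read-modify-write pattern in isolation, with the MSRs modelled by a plain array (an illustrative sketch, not the kernel code):

        #include <stdio.h>

        #define CCCR_ENABLE (1u << 12)
        #define CCCR_OVF    (1u << 31)

        int main(void)
        {
                /* stand-ins for the 18 P4 CCCR control registers */
                unsigned int controls[18] = {
                        [0] = CCCR_ENABLE | CCCR_OVF | 0x3,
                        [5] = CCCR_ENABLE | 0x7,
                };
                int i;

                for (i = 0; i < 18; i++) {
                        unsigned int val = controls[i];         /* rdmsr */
                        val &= ~(CCCR_ENABLE | CCCR_OVF);       /* ack + disable */
                        controls[i] = val;                      /* wrmsr */
                }

                for (i = 0; i < 18; i++)
                        if (controls[i])
                                printf("control %d keeps event bits %#x\n",
                                       i, controls[i]);
                return 0;
        }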
@@ -493,12 +555,17 @@ static int setup_p4_watchdog(unsigned nmi_hz)
493 wrmsr(evntsel_msr, evntsel, 0); 555 wrmsr(evntsel_msr, evntsel, 0);
494 wrmsr(cccr_msr, cccr_val, 0); 556 wrmsr(cccr_msr, cccr_val, 0);
495 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); 557 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
496 apic_write(APIC_LVTPC, APIC_DM_NMI); 558
497 cccr_val |= P4_CCCR_ENABLE;
498 wrmsr(cccr_msr, cccr_val, 0);
499 wd->perfctr_msr = perfctr_msr; 559 wd->perfctr_msr = perfctr_msr;
500 wd->evntsel_msr = evntsel_msr; 560 wd->evntsel_msr = evntsel_msr;
501 wd->cccr_msr = cccr_msr; 561 wd->cccr_msr = cccr_msr;
562
563 /* ok, everything is initialized, announce that we're set */
564 cpu_nmi_set_wd_enabled();
565
566 apic_write(APIC_LVTPC, APIC_DM_NMI);
567 cccr_val |= P4_CCCR_ENABLE;
568 wrmsr(cccr_msr, cccr_val, 0);
502 return 1; 569 return 1;
503} 570}
504 571
@@ -541,7 +608,7 @@ static void p4_unreserve(void)
541 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); 608 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
542} 609}
543 610
544static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) 611static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
545{ 612{
546 unsigned dummy; 613 unsigned dummy;
547 /* 614 /*
@@ -614,13 +681,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
614 wrmsr(evntsel_msr, evntsel, 0); 681 wrmsr(evntsel_msr, evntsel, 0);
615 nmi_hz = adjust_for_32bit_ctr(nmi_hz); 682 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
616 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); 683 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
617 apic_write(APIC_LVTPC, APIC_DM_NMI);
618 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
619 wrmsr(evntsel_msr, evntsel, 0);
620 684
621 wd->perfctr_msr = perfctr_msr; 685 wd->perfctr_msr = perfctr_msr;
622 wd->evntsel_msr = evntsel_msr; 686 wd->evntsel_msr = evntsel_msr;
623 wd->cccr_msr = 0; /* unused */ 687 wd->cccr_msr = 0; /* unused */
688
689 /* ok, everything is initialized, announce that we're set */
690 cpu_nmi_set_wd_enabled();
691
692 apic_write(APIC_LVTPC, APIC_DM_NMI);
693 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE;
694 wrmsr(evntsel_msr, evntsel, 0);
624 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 695 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
625 return 1; 696 return 1;
626} 697}
@@ -716,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz)
716 return hz; 787 return hz;
717} 788}
718 789
719int lapic_wd_event(unsigned nmi_hz) 790int __kprobes lapic_wd_event(unsigned nmi_hz)
720{ 791{
721 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); 792 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
722 u64 ctr; 793 u64 ctr;
diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c
new file mode 100644
index 000000000000..5abbea297e0c
--- /dev/null
+++ b/arch/x86/kernel/cpu/powerflags.c
@@ -0,0 +1,20 @@
1/*
2 * Strings for the various x86 power flags
3 *
4 * This file must not contain any executable code.
5 */
6
7#include <asm/cpufeature.h>
8
9const char *const x86_power_flags[32] = {
10 "ts", /* temperature sensor */
11 "fid", /* frequency id control */
12 "vid", /* voltage id control */
13 "ttp", /* thermal trip */
14 "tm",
15 "stc",
16 "100mhzsteps",
17 "hwpstate",
18 "", /* tsc invariant mapped to constant_tsc */
19 /* nothing */
20};
diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c
index 0d0d9057e7c0..01b1244ef1c0 100644
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos)
160{ 160{
161 if (*pos == 0) /* just in case, cpu 0 is not the first */ 161 if (*pos == 0) /* just in case, cpu 0 is not the first */
162 *pos = first_cpu(cpu_online_map); 162 *pos = first_cpu(cpu_online_map);
163 if ((*pos) < NR_CPUS && cpu_online(*pos)) 163 else
164 *pos = next_cpu_nr(*pos - 1, cpu_online_map);
165 if ((*pos) < nr_cpu_ids)
164 return &cpu_data(*pos); 166 return &cpu_data(*pos);
165 return NULL; 167 return NULL;
166} 168}
167 169
168static void *c_next(struct seq_file *m, void *v, loff_t *pos) 170static void *c_next(struct seq_file *m, void *v, loff_t *pos)
169{ 171{
170 *pos = next_cpu(*pos, cpu_online_map); 172 (*pos)++;
171 return c_start(m, pos); 173 return c_start(m, pos);
172} 174}
173 175
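The /proc/cpuinfo change above moves the "find the next online CPU" step entirely into c_start(): c_next() now only advances the position, and c_start() maps any position to the next online CPU or returns NULL once *pos reaches nr_cpu_ids, which also copes with offline CPUs in the middle of the map. A stand-alone sketch of that iteration, where an int array stands in for cpu_online_map and the helper names are made up for illustration:

        #include <stdio.h>

        #define NR_IDS 8
        static const int online[NR_IDS] = { 0, 1, 0, 1, 1, 0, 0, 1 };

        static long next_online(long from)
        {
                long cpu;
                for (cpu = from; cpu < NR_IDS; cpu++)
                        if (online[cpu])
                                return cpu;
                return NR_IDS;                  /* past the end */
        }

        static int c_start(long *pos)           /* cf. the new c_start() */
        {
                if (*pos == 0)                  /* cpu 0 may be offline  */
                        *pos = next_online(0);
                else
                        *pos = next_online(*pos);
                return (*pos < NR_IDS) ? (int)*pos : -1;
        }

        int main(void)
        {
                long pos = 0;
                int cpu;

                /* c_next() only bumps the position and retries c_start() */
                for (cpu = c_start(&pos); cpu >= 0; pos++, cpu = c_start(&pos))
                        printf("cpu%d\n", cpu);
                return 0;
        }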
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index b911a2c61b8f..52b3fefbd5af 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -5,6 +5,18 @@
5#include <asm/msr.h> 5#include <asm/msr.h>
6#include "cpu.h" 6#include "cpu.h"
7 7
8static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c)
9{
10 u32 xlvl;
11
12 /* Transmeta-defined flags: level 0x80860001 */
13 xlvl = cpuid_eax(0x80860000);
14 if ((xlvl & 0xffff0000) == 0x80860000) {
15 if (xlvl >= 0x80860001)
16 c->x86_capability[2] = cpuid_edx(0x80860001);
17 }
18}
19
8static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) 20static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
9{ 21{
10 unsigned int cap_mask, uk, max, dummy; 22 unsigned int cap_mask, uk, max, dummy;
@@ -12,7 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
12 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; 24 unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev;
13 char cpu_info[65]; 25 char cpu_info[65];
14 26
15 get_model_name(c); /* Same as AMD/Cyrix */ 27 early_init_transmeta(c);
28
16 display_cacheinfo(c); 29 display_cacheinfo(c);
17 30
18 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
@@ -85,23 +98,12 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
85#endif 98#endif
86} 99}
87 100
88static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c)
89{
90 u32 xlvl;
91
92 /* Transmeta-defined flags: level 0x80860001 */
93 xlvl = cpuid_eax(0x80860000);
94 if ((xlvl & 0xffff0000) == 0x80860000) {
95 if (xlvl >= 0x80860001)
96 c->x86_capability[2] = cpuid_edx(0x80860001);
97 }
98}
99
100static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { 101static struct cpu_dev transmeta_cpu_dev __cpuinitdata = {
101 .c_vendor = "Transmeta", 102 .c_vendor = "Transmeta",
102 .c_ident = { "GenuineTMx86", "TransmetaCPU" }, 103 .c_ident = { "GenuineTMx86", "TransmetaCPU" },
104 .c_early_init = early_init_transmeta,
103 .c_init = init_transmeta, 105 .c_init = init_transmeta,
104 .c_identify = transmeta_identify, 106 .c_x86_vendor = X86_VENDOR_TRANSMETA,
105}; 107};
106 108
107cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); 109cpu_dev_register(transmeta_cpu_dev);
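early_init_transmeta(), as moved above, only reads the Transmeta feature leaf after confirming that CPUID level 0x80860000 reports a maximum extended level in the 0x8086xxxx range. The same probe can be written in user space with GCC's <cpuid.h> (a hypothetical demo; on non-Transmeta CPUs the range check fails and nothing is printed):

        #include <stdio.h>
        #include <cpuid.h>

        int main(void)
        {
                unsigned int eax, ebx, ecx, edx, xlvl;

                /* Transmeta-defined flags: level 0x80860001 */
                __cpuid(0x80860000, eax, ebx, ecx, edx);
                xlvl = eax;
                if ((xlvl & 0xffff0000) == 0x80860000 && xlvl >= 0x80860001) {
                        __cpuid(0x80860001, eax, ebx, ecx, edx);
                        printf("max level %#x, capability word %#x\n", xlvl, edx);
                }
                return 0;
        }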
diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c
index b1fc90989d75..e777f79e0960 100644
--- a/arch/x86/kernel/cpu/umc.c
+++ b/arch/x86/kernel/cpu/umc.c
@@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = {
19 } 19 }
20 }, 20 },
21 }, 21 },
22 .c_x86_vendor = X86_VENDOR_UMC,
22}; 23};
23 24
24cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); 25cpu_dev_register(umc_cpu_dev);
25 26
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 2de5fa2bbf77..72cefd1e649b 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -36,7 +36,6 @@
36#include <linux/smp_lock.h> 36#include <linux/smp_lock.h>
37#include <linux/major.h> 37#include <linux/major.h>
38#include <linux/fs.h> 38#include <linux/fs.h>
39#include <linux/smp_lock.h>
40#include <linux/device.h> 39#include <linux/device.h>
41#include <linux/cpu.h> 40#include <linux/cpu.h>
42#include <linux/notifier.h> 41#include <linux/notifier.h>
@@ -89,6 +88,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
89 struct cpuid_regs cmd; 88 struct cpuid_regs cmd;
90 int cpu = iminor(file->f_path.dentry->d_inode); 89 int cpu = iminor(file->f_path.dentry->d_inode);
91 u64 pos = *ppos; 90 u64 pos = *ppos;
91 ssize_t bytes = 0;
92 int err = 0;
92 93
93 if (count % 16) 94 if (count % 16)
94 return -EINVAL; /* Invalid chunk size */ 95 return -EINVAL; /* Invalid chunk size */
@@ -96,14 +97,19 @@ static ssize_t cpuid_read(struct file *file, char __user *buf,
96 for (; count; count -= 16) { 97 for (; count; count -= 16) {
97 cmd.eax = pos; 98 cmd.eax = pos;
98 cmd.ecx = pos >> 32; 99 cmd.ecx = pos >> 32;
99 smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); 100 err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1);
100 if (copy_to_user(tmp, &cmd, 16)) 101 if (err)
101 return -EFAULT; 102 break;
103 if (copy_to_user(tmp, &cmd, 16)) {
104 err = -EFAULT;
105 break;
106 }
102 tmp += 16; 107 tmp += 16;
108 bytes += 16;
103 *ppos = ++pos; 109 *ppos = ++pos;
104 } 110 }
105 111
106 return tmp - buf; 112 return bytes ? bytes : err;
107} 113}
108 114
109static int cpuid_open(struct inode *inode, struct file *file) 115static int cpuid_open(struct inode *inode, struct file *file)
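The cpuid_read() change above makes the loop stop at the first failed cross-CPU call or copy_to_user() and report partial progress: if any chunks were transferred the byte count is returned, otherwise the error code. The return convention in isolation, with hypothetical helper names and plain errno-style constants:

        #include <stdio.h>

        #define CHUNK 16

        /* pretend data source that fails on the third chunk */
        static int produce_chunk(int n, char out[CHUNK])
        {
                int i;
                if (n == 2)
                        return -5;              /* e.g. -EIO */
                for (i = 0; i < CHUNK; i++)
                        out[i] = (char)('a' + n);
                return 0;
        }

        static long chunked_read(char *buf, long count)
        {
                long bytes = 0;
                int err = 0, n = 0;

                if (count % CHUNK)
                        return -22;             /* -EINVAL: invalid chunk size */

                for (; count; count -= CHUNK, n++) {
                        err = produce_chunk(n, buf + bytes);
                        if (err)
                                break;
                        bytes += CHUNK;
                }
                /* partial data wins over the error, as in the patch above */
                return bytes ? bytes : err;
        }

        int main(void)
        {
                char buf[CHUNK * 4];
                printf("read returned %ld\n", chunked_read(buf, sizeof(buf)));
                return 0;
        }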
@@ -141,7 +147,7 @@ static __cpuinit int cpuid_device_create(int cpu)
141{ 147{
142 struct device *dev; 148 struct device *dev;
143 149
144 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), 150 dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL,
145 "cpu%d", cpu); 151 "cpu%d", cpu);
146 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 152 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
147} 153}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 72d0c56c1b48..f7cdb3b457aa 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -13,6 +13,9 @@
13 13
14static void *kdump_buf_page; 14static void *kdump_buf_page;
15 15
16/* Stores the physical address of elf header of crash image. */
17unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
18
16/** 19/**
17 * copy_oldmem_page - copy one page from "oldmem" 20 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 21 * @pfn: page frame number to be copied
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 15e6c6bc4a46..045b36cada65 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -7,9 +7,11 @@
7 7
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/crash_dump.h> 9#include <linux/crash_dump.h>
10#include <linux/uaccess.h>
11#include <linux/io.h>
10 12
11#include <asm/uaccess.h> 13/* Stores the physical address of elf header of crash image. */
12#include <asm/io.h> 14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
13 15
14/** 16/**
15 * copy_oldmem_page - copy one page from "oldmem" 17 * copy_oldmem_page - copy one page from "oldmem"
@@ -25,7 +27,7 @@
25 * in the current kernel. We stitch up a pte, similar to kmap_atomic. 27 * in the current kernel. We stitch up a pte, similar to kmap_atomic.
26 */ 28 */
27ssize_t copy_oldmem_page(unsigned long pfn, char *buf, 29ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
28 size_t csize, unsigned long offset, int userbuf) 30 size_t csize, unsigned long offset, int userbuf)
29{ 31{
30 void *vaddr; 32 void *vaddr;
31 33
@@ -33,14 +35,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
33 return 0; 35 return 0;
34 36
35 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr)
39 return -ENOMEM;
36 40
37 if (userbuf) { 41 if (userbuf) {
38 if (copy_to_user(buf, (vaddr + offset), csize)) { 42 if (copy_to_user(buf, vaddr + offset, csize)) {
39 iounmap(vaddr); 43 iounmap(vaddr);
40 return -EFAULT; 44 return -EFAULT;
41 } 45 }
42 } else 46 } else
43 memcpy(buf, (vaddr + offset), csize); 47 memcpy(buf, vaddr + offset, csize);
44 48
45 iounmap(vaddr); 49 iounmap(vaddr);
46 return csize; 50 return csize;
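Both copy_oldmem_page() variants now check the ioremap() result and unmap on every exit path. In user-space terms, with an anonymous mmap() standing in for ioremap() (an analogue under that assumption, not the kernel function):

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        static long copy_old_page(char *buf, size_t csize, size_t offset)
        {
                void *vaddr;

                if (!csize)
                        return 0;

                /* stand-in for ioremap(pfn << PAGE_SHIFT, PAGE_SIZE) */
                vaddr = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                             MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (vaddr == MAP_FAILED)
                        return -1;              /* cf. returning -ENOMEM above */

                memcpy(buf, (char *)vaddr + offset, csize);
                munmap(vaddr, 4096);            /* unmap on every exit path */
                return (long)csize;
        }

        int main(void)
        {
                char buf[64];
                printf("copied %ld bytes\n", copy_old_page(buf, sizeof(buf), 0));
                return 0;
        }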
diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c
index a47798b59f07..b4f14c6c09d9 100644
--- a/arch/x86/kernel/doublefault_32.c
+++ b/arch/x86/kernel/doublefault_32.c
@@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = {
66 .ds = __USER_DS, 66 .ds = __USER_DS,
67 .fs = __KERNEL_PERCPU, 67 .fs = __KERNEL_PERCPU,
68 68
69 .__cr3 = __pa(swapper_pg_dir) 69 .__cr3 = __pa_nodebug(swapper_pg_dir),
70 } 70 }
71}; 71};
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 11c11b8ec48d..2b69994fd3a8 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -2,26 +2,49 @@
2 * Debug Store support 2 * Debug Store support
3 * 3 *
4 * This provides a low-level interface to the hardware's Debug Store 4 * This provides a low-level interface to the hardware's Debug Store
5 * feature that is used for last branch recording (LBR) and 5 * feature that is used for branch trace store (BTS) and
6 * precise-event based sampling (PEBS). 6 * precise-event based sampling (PEBS).
7 * 7 *
8 * Different architectures use a different DS layout/pointer size. 8 * It manages:
9 * The below functions therefore work on a void*. 9 * - per-thread and per-cpu allocation of BTS and PEBS
10 * - buffer memory allocation (optional)
11 * - buffer overflow handling
12 * - buffer access
10 * 13 *
14 * It assumes:
15 * - get_task_struct on all parameter tasks
16 * - current is allowed to trace parameter tasks
11 * 17 *
12 * Since there is no user for PEBS, yet, only LBR (or branch
13 * trace store, BTS) is supported.
14 * 18 *
15 * 19 * Copyright (C) 2007-2008 Intel Corporation.
16 * Copyright (C) 2007 Intel Corporation. 20 * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
17 * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
18 */ 21 */
19 22
23
24#ifdef CONFIG_X86_DS
25
20#include <asm/ds.h> 26#include <asm/ds.h>
21 27
22#include <linux/errno.h> 28#include <linux/errno.h>
23#include <linux/string.h> 29#include <linux/string.h>
24#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/sched.h>
32#include <linux/mm.h>
33
34
35/*
36 * The configuration for a particular DS hardware implementation.
37 */
38struct ds_configuration {
39 /* the size of the DS structure in bytes */
40 unsigned char sizeof_ds;
41 /* the size of one pointer-typed field in the DS structure in bytes;
42 this covers the first 8 fields related to buffer management. */
43 unsigned char sizeof_field;
44 /* the size of a BTS/PEBS record in bytes */
45 unsigned char sizeof_rec[2];
46};
47static struct ds_configuration ds_cfg;
25 48
26 49
27/* 50/*
@@ -44,378 +67,747 @@
44 * (interrupt occurs when write pointer passes interrupt pointer) 67 * (interrupt occurs when write pointer passes interrupt pointer)
45 * - value to which counter is reset following counter overflow 68 * - value to which counter is reset following counter overflow
46 * 69 *
47 * On later architectures, the last branch recording hardware uses 70 * Later architectures use 64bit pointers throughout, whereas earlier
48 * 64bit pointers even in 32bit mode. 71 * architectures use 32bit pointers in 32bit mode.
49 *
50 *
51 * Branch Trace Store (BTS) records store information about control
52 * flow changes. They at least provide the following information:
53 * - source linear address
54 * - destination linear address
55 * 72 *
56 * Netburst supported a predicated bit that had been dropped in later
57 * architectures. We do not suppor it.
58 * 73 *
74 * We compute the base address for the first 8 fields based on:
75 * - the field size stored in the DS configuration
76 * - the relative field position
77 * - an offset giving the start of the respective region
59 * 78 *
60 * In order to abstract from the actual DS and BTS layout, we describe 79 * This offset is further used to index various arrays holding
61 * the access to the relevant fields. 80 * information for BTS and PEBS at the respective index.
62 * Thanks to Andi Kleen for proposing this design.
63 * 81 *
64 * The implementation, however, is not as general as it might seem. In 82 * On later 32bit processors, we only access the lower 32bit of the
65 * order to stay somewhat simple and efficient, we assume an 83 * 64bit pointer fields. The upper halves will be zeroed out.
66 * underlying unsigned type (mostly a pointer type) and we expect the
67 * field to be at least as big as that type.
68 */ 84 */
69 85
70/* 86enum ds_field {
71 * A special from_ip address to indicate that the BTS record is an 87 ds_buffer_base = 0,
72 * info record that needs to be interpreted or skipped. 88 ds_index,
73 */ 89 ds_absolute_maximum,
74#define BTS_ESCAPE_ADDRESS (-1) 90 ds_interrupt_threshold,
91};
75 92
76/* 93enum ds_qualifier {
77 * A field access descriptor 94 ds_bts = 0,
78 */ 95 ds_pebs
79struct access_desc {
80 unsigned char offset;
81 unsigned char size;
82}; 96};
83 97
98static inline unsigned long ds_get(const unsigned char *base,
99 enum ds_qualifier qual, enum ds_field field)
100{
101 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
102 return *(unsigned long *)base;
103}
104
105static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
106 enum ds_field field, unsigned long value)
107{
108 base += (ds_cfg.sizeof_field * (field + (4 * qual)));
109 (*(unsigned long *)base) = value;
110}
111
112
84/* 113/*
85 * The configuration for a particular DS/BTS hardware implementation. 114 * Locking is done only for allocating BTS or PEBS resources and for
115 * guarding context and buffer memory allocation.
116 *
117 * Most functions require the current task to own the ds context part
118 * they are going to access. All the locking is done when validating
119 * access to the context.
86 */ 120 */
87struct ds_configuration { 121static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock);
88 /* the DS configuration */
89 unsigned char sizeof_ds;
90 struct access_desc bts_buffer_base;
91 struct access_desc bts_index;
92 struct access_desc bts_absolute_maximum;
93 struct access_desc bts_interrupt_threshold;
94 /* the BTS configuration */
95 unsigned char sizeof_bts;
96 struct access_desc from_ip;
97 struct access_desc to_ip;
98 /* BTS variants used to store additional information like
99 timestamps */
100 struct access_desc info_type;
101 struct access_desc info_data;
102 unsigned long debugctl_mask;
103};
104 122
105/* 123/*
106 * The global configuration used by the below accessor functions 124 * Validate that the current task is allowed to access the BTS/PEBS
125 * buffer of the parameter task.
126 *
127 * Returns 0, if access is granted; -Eerrno, otherwise.
107 */ 128 */
108static struct ds_configuration ds_cfg; 129static inline int ds_validate_access(struct ds_context *context,
130 enum ds_qualifier qual)
131{
132 if (!context)
133 return -EPERM;
134
135 if (context->owner[qual] == current)
136 return 0;
137
138 return -EPERM;
139}
140
109 141
110/* 142/*
111 * Accessor functions for some DS and BTS fields using the above 143 * We either support (system-wide) per-cpu or per-thread allocation.
112 * global ptrace_bts_cfg. 144 * We distinguish the two based on the task_struct pointer, where a
145 * NULL pointer indicates per-cpu allocation for the current cpu.
146 *
147 * Allocations are use-counted. As soon as resources are allocated,
148 * further allocations must be of the same type (per-cpu or
149 * per-thread). We model this by counting allocations (i.e. the number
150 * of tracers of a certain type) for one type negatively:
151 * =0 no tracers
152 * >0 number of per-thread tracers
153 * <0 number of per-cpu tracers
154 *
155 * The below functions to get and put tracers and to check the
156 * allocation type require the ds_lock to be held by the caller.
157 *
158 * Tracers essentially gives the number of ds contexts for a certain
159 * type of allocation.
113 */ 160 */
114static inline unsigned long get_bts_buffer_base(char *base) 161static long tracers;
162
163static inline void get_tracer(struct task_struct *task)
115{ 164{
116 return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); 165 tracers += (task ? 1 : -1);
117} 166}
118static inline void set_bts_buffer_base(char *base, unsigned long value) 167
168static inline void put_tracer(struct task_struct *task)
119{ 169{
120 (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; 170 tracers -= (task ? 1 : -1);
121} 171}
122static inline unsigned long get_bts_index(char *base) 172
173static inline int check_tracer(struct task_struct *task)
123{ 174{
124 return *(unsigned long *)(base + ds_cfg.bts_index.offset); 175 return (task ? (tracers >= 0) : (tracers <= 0));
125} 176}
126static inline void set_bts_index(char *base, unsigned long value) 177
178
179/*
180 * The DS context is either attached to a thread or to a cpu:
181 * - in the former case, the thread_struct contains a pointer to the
182 * attached context.
183 * - in the latter case, we use a static array of per-cpu context
184 * pointers.
185 *
186 * Contexts are use-counted. They are allocated on first access and
187 * deallocated when the last user puts the context.
188 *
189 * We distinguish between an allocating and a non-allocating get of a
190 * context:
191 * - the allocating get is used for requesting BTS/PEBS resources. It
192 * requires the caller to hold the global ds_lock.
193 * - the non-allocating get is used for all other cases. A
194 * non-existing context indicates an error. It acquires and releases
195 * the ds_lock itself for obtaining the context.
196 *
197 * A context and its DS configuration are allocated and deallocated
198 * together. A context always has a DS configuration of the
199 * appropriate size.
200 */
201static DEFINE_PER_CPU(struct ds_context *, system_context);
202
203#define this_system_context per_cpu(system_context, smp_processor_id())
204
205/*
206 * Returns the pointer to the parameter task's context or to the
207 * system-wide context, if task is NULL.
208 *
209 * Increases the use count of the returned context, if not NULL.
210 */
211static inline struct ds_context *ds_get_context(struct task_struct *task)
127{ 212{
128 (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; 213 struct ds_context *context;
214
215 spin_lock(&ds_lock);
216
217 context = (task ? task->thread.ds_ctx : this_system_context);
218 if (context)
219 context->count++;
220
221 spin_unlock(&ds_lock);
222
223 return context;
129} 224}
130static inline unsigned long get_bts_absolute_maximum(char *base) 225
226/*
227 * Same as ds_get_context, but allocates the context and it's DS
228 * structure, if necessary; returns NULL; if out of memory.
229 *
230 * pre: requires ds_lock to be held
231 */
232static inline struct ds_context *ds_alloc_context(struct task_struct *task)
131{ 233{
132 return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); 234 struct ds_context **p_context =
235 (task ? &task->thread.ds_ctx : &this_system_context);
236 struct ds_context *context = *p_context;
237
238 if (!context) {
239 context = kzalloc(sizeof(*context), GFP_KERNEL);
240
241 if (!context)
242 return NULL;
243
244 context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL);
245 if (!context->ds) {
246 kfree(context);
247 return NULL;
248 }
249
250 *p_context = context;
251
252 context->this = p_context;
253 context->task = task;
254
255 if (task)
256 set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
257
258 if (!task || (task == current))
259 wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0);
260
261 get_tracer(task);
262 }
263
264 context->count++;
265
266 return context;
133} 267}
134static inline void set_bts_absolute_maximum(char *base, unsigned long value) 268
269/*
270 * Decreases the use count of the parameter context, if not NULL.
271 * Deallocates the context, if the use count reaches zero.
272 */
273static inline void ds_put_context(struct ds_context *context)
135{ 274{
136 (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; 275 if (!context)
276 return;
277
278 spin_lock(&ds_lock);
279
280 if (--context->count)
281 goto out;
282
283 *(context->this) = NULL;
284
285 if (context->task)
286 clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
287
288 if (!context->task || (context->task == current))
289 wrmsrl(MSR_IA32_DS_AREA, 0);
290
291 put_tracer(context->task);
292
293 /* free any leftover buffers from tracers that did not
294 * deallocate them properly. */
295 kfree(context->buffer[ds_bts]);
296 kfree(context->buffer[ds_pebs]);
297 kfree(context->ds);
298 kfree(context);
299 out:
300 spin_unlock(&ds_lock);
137} 301}
138static inline unsigned long get_bts_interrupt_threshold(char *base) 302
303
304/*
305 * Handle a buffer overflow
306 *
307 * task: the task whose buffers are overflowing;
308 * NULL for a buffer overflow on the current cpu
309 * context: the ds context
310 * qual: the buffer type
311 */
312static void ds_overflow(struct task_struct *task, struct ds_context *context,
313 enum ds_qualifier qual)
139{ 314{
140 return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); 315 if (!context)
316 return;
317
318 if (context->callback[qual])
319 (*context->callback[qual])(task);
320
321 /* todo: do some more overflow handling */
141} 322}
142static inline void set_bts_interrupt_threshold(char *base, unsigned long value) 323
324
325/*
326 * Allocate a non-pageable buffer of the parameter size.
327 * Checks the memory and the locked memory rlimit.
328 *
329 * Returns the buffer, if successful;
330 * NULL, if out of memory or rlimit exceeded.
331 *
332 * size: the requested buffer size in bytes
333 * pages (out): if not NULL, contains the number of pages reserved
334 */
335static inline void *ds_allocate_buffer(size_t size, unsigned int *pages)
143{ 336{
144 (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; 337 unsigned long rlim, vm, pgsz;
338 void *buffer;
339
340 pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
341
342 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
343 vm = current->mm->total_vm + pgsz;
344 if (rlim < vm)
345 return NULL;
346
347 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
348 vm = current->mm->locked_vm + pgsz;
349 if (rlim < vm)
350 return NULL;
351
352 buffer = kzalloc(size, GFP_KERNEL);
353 if (!buffer)
354 return NULL;
355
356 current->mm->total_vm += pgsz;
357 current->mm->locked_vm += pgsz;
358
359 if (pages)
360 *pages = pgsz;
361
362 return buffer;
145} 363}
146static inline unsigned long get_from_ip(char *base) 364
365static int ds_request(struct task_struct *task, void *base, size_t size,
366 ds_ovfl_callback_t ovfl, enum ds_qualifier qual)
147{ 367{
148 return *(unsigned long *)(base + ds_cfg.from_ip.offset); 368 struct ds_context *context;
369 unsigned long buffer, adj;
370 const unsigned long alignment = (1 << 3);
371 int error = 0;
372
373 if (!ds_cfg.sizeof_ds)
374 return -EOPNOTSUPP;
375
376 /* we require some space to do alignment adjustments below */
377 if (size < (alignment + ds_cfg.sizeof_rec[qual]))
378 return -EINVAL;
379
380 /* buffer overflow notification is not yet implemented */
381 if (ovfl)
382 return -EOPNOTSUPP;
383
384
385 spin_lock(&ds_lock);
386
387 if (!check_tracer(task))
388 return -EPERM;
389
390 error = -ENOMEM;
391 context = ds_alloc_context(task);
392 if (!context)
393 goto out_unlock;
394
395 error = -EALREADY;
396 if (context->owner[qual] == current)
397 goto out_unlock;
398 error = -EPERM;
399 if (context->owner[qual] != NULL)
400 goto out_unlock;
401 context->owner[qual] = current;
402
403 spin_unlock(&ds_lock);
404
405
406 error = -ENOMEM;
407 if (!base) {
408 base = ds_allocate_buffer(size, &context->pages[qual]);
409 if (!base)
410 goto out_release;
411
412 context->buffer[qual] = base;
413 }
414 error = 0;
415
416 context->callback[qual] = ovfl;
417
418 /* adjust the buffer address and size to meet alignment
419 * constraints:
420 * - buffer is double-word aligned
421 * - size is multiple of record size
422 *
423 * We checked the size at the very beginning; we have enough
424 * space to do the adjustment.
425 */
426 buffer = (unsigned long)base;
427
428 adj = ALIGN(buffer, alignment) - buffer;
429 buffer += adj;
430 size -= adj;
431
432 size /= ds_cfg.sizeof_rec[qual];
433 size *= ds_cfg.sizeof_rec[qual];
434
435 ds_set(context->ds, qual, ds_buffer_base, buffer);
436 ds_set(context->ds, qual, ds_index, buffer);
437 ds_set(context->ds, qual, ds_absolute_maximum, buffer + size);
438
439 if (ovfl) {
440 /* todo: select a suitable interrupt threshold */
441 } else
442 ds_set(context->ds, qual,
443 ds_interrupt_threshold, buffer + size + 1);
444
445 /* we keep the context until ds_release */
446 return error;
447
448 out_release:
449 context->owner[qual] = NULL;
450 ds_put_context(context);
451 return error;
452
453 out_unlock:
454 spin_unlock(&ds_lock);
455 ds_put_context(context);
456 return error;
149} 457}
150static inline void set_from_ip(char *base, unsigned long value) 458
459int ds_request_bts(struct task_struct *task, void *base, size_t size,
460 ds_ovfl_callback_t ovfl)
151{ 461{
152 (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; 462 return ds_request(task, base, size, ovfl, ds_bts);
153} 463}
154static inline unsigned long get_to_ip(char *base) 464
465int ds_request_pebs(struct task_struct *task, void *base, size_t size,
466 ds_ovfl_callback_t ovfl)
155{ 467{
156 return *(unsigned long *)(base + ds_cfg.to_ip.offset); 468 return ds_request(task, base, size, ovfl, ds_pebs);
157} 469}
158static inline void set_to_ip(char *base, unsigned long value) 470
471static int ds_release(struct task_struct *task, enum ds_qualifier qual)
159{ 472{
160 (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; 473 struct ds_context *context;
474 int error;
475
476 context = ds_get_context(task);
477 error = ds_validate_access(context, qual);
478 if (error < 0)
479 goto out;
480
481 kfree(context->buffer[qual]);
482 context->buffer[qual] = NULL;
483
484 current->mm->total_vm -= context->pages[qual];
485 current->mm->locked_vm -= context->pages[qual];
486 context->pages[qual] = 0;
487 context->owner[qual] = NULL;
488
489 /*
490 * we put the context twice:
491 * once for the ds_get_context
492 * once for the corresponding ds_request
493 */
494 ds_put_context(context);
495 out:
496 ds_put_context(context);
497 return error;
161} 498}
162static inline unsigned char get_info_type(char *base) 499
500int ds_release_bts(struct task_struct *task)
163{ 501{
164 return *(unsigned char *)(base + ds_cfg.info_type.offset); 502 return ds_release(task, ds_bts);
165} 503}
166static inline void set_info_type(char *base, unsigned char value) 504
505int ds_release_pebs(struct task_struct *task)
167{ 506{
168 (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; 507 return ds_release(task, ds_pebs);
169} 508}
170static inline unsigned long get_info_data(char *base) 509
510static int ds_get_index(struct task_struct *task, size_t *pos,
511 enum ds_qualifier qual)
171{ 512{
172 return *(unsigned long *)(base + ds_cfg.info_data.offset); 513 struct ds_context *context;
514 unsigned long base, index;
515 int error;
516
517 context = ds_get_context(task);
518 error = ds_validate_access(context, qual);
519 if (error < 0)
520 goto out;
521
522 base = ds_get(context->ds, qual, ds_buffer_base);
523 index = ds_get(context->ds, qual, ds_index);
524
525 error = ((index - base) / ds_cfg.sizeof_rec[qual]);
526 if (pos)
527 *pos = error;
528 out:
529 ds_put_context(context);
530 return error;
173} 531}
174static inline void set_info_data(char *base, unsigned long value) 532
533int ds_get_bts_index(struct task_struct *task, size_t *pos)
175{ 534{
176 (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; 535 return ds_get_index(task, pos, ds_bts);
177} 536}
178 537
538int ds_get_pebs_index(struct task_struct *task, size_t *pos)
539{
540 return ds_get_index(task, pos, ds_pebs);
541}
179 542
180int ds_allocate(void **dsp, size_t bts_size_in_bytes) 543static int ds_get_end(struct task_struct *task, size_t *pos,
544 enum ds_qualifier qual)
181{ 545{
182 size_t bts_size_in_records; 546 struct ds_context *context;
183 unsigned long bts; 547 unsigned long base, end;
184 void *ds; 548 int error;
549
550 context = ds_get_context(task);
551 error = ds_validate_access(context, qual);
552 if (error < 0)
553 goto out;
554
555 base = ds_get(context->ds, qual, ds_buffer_base);
556 end = ds_get(context->ds, qual, ds_absolute_maximum);
557
558 error = ((end - base) / ds_cfg.sizeof_rec[qual]);
559 if (pos)
560 *pos = error;
561 out:
562 ds_put_context(context);
563 return error;
564}
185 565
186 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 566int ds_get_bts_end(struct task_struct *task, size_t *pos)
187 return -EOPNOTSUPP; 567{
568 return ds_get_end(task, pos, ds_bts);
569}
188 570
189 if (bts_size_in_bytes < 0) 571int ds_get_pebs_end(struct task_struct *task, size_t *pos)
190 return -EINVAL; 572{
573 return ds_get_end(task, pos, ds_pebs);
574}
191 575
192 bts_size_in_records = 576static int ds_access(struct task_struct *task, size_t index,
193 bts_size_in_bytes / ds_cfg.sizeof_bts; 577 const void **record, enum ds_qualifier qual)
194 bts_size_in_bytes = 578{
195 bts_size_in_records * ds_cfg.sizeof_bts; 579 struct ds_context *context;
580 unsigned long base, idx;
581 int error;
196 582
197 if (bts_size_in_bytes <= 0) 583 if (!record)
198 return -EINVAL; 584 return -EINVAL;
199 585
200 bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); 586 context = ds_get_context(task);
201 587 error = ds_validate_access(context, qual);
202 if (!bts) 588 if (error < 0)
203 return -ENOMEM; 589 goto out;
204 590
205 ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); 591 base = ds_get(context->ds, qual, ds_buffer_base);
592 idx = base + (index * ds_cfg.sizeof_rec[qual]);
206 593
207 if (!ds) { 594 error = -EINVAL;
208 kfree((void *)bts); 595 if (idx > ds_get(context->ds, qual, ds_absolute_maximum))
209 return -ENOMEM; 596 goto out;
210 }
211
212 set_bts_buffer_base(ds, bts);
213 set_bts_index(ds, bts);
214 set_bts_absolute_maximum(ds, bts + bts_size_in_bytes);
215 set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1);
216 597
217 *dsp = ds; 598 *record = (const void *)idx;
218 return 0; 599 error = ds_cfg.sizeof_rec[qual];
600 out:
601 ds_put_context(context);
602 return error;
219} 603}
220 604
221int ds_free(void **dsp) 605int ds_access_bts(struct task_struct *task, size_t index, const void **record)
222{ 606{
223 if (*dsp) { 607 return ds_access(task, index, record, ds_bts);
224 kfree((void *)get_bts_buffer_base(*dsp));
225 kfree(*dsp);
226 *dsp = NULL;
227 }
228 return 0;
229} 608}
230 609
231int ds_get_bts_size(void *ds) 610int ds_access_pebs(struct task_struct *task, size_t index, const void **record)
232{ 611{
233 int size_in_bytes; 612 return ds_access(task, index, record, ds_pebs);
234
235 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
236 return -EOPNOTSUPP;
237
238 if (!ds)
239 return 0;
240
241 size_in_bytes =
242 get_bts_absolute_maximum(ds) -
243 get_bts_buffer_base(ds);
244 return size_in_bytes;
245} 613}
246 614
247int ds_get_bts_end(void *ds) 615static int ds_write(struct task_struct *task, const void *record, size_t size,
616 enum ds_qualifier qual, int force)
248{ 617{
249 int size_in_bytes = ds_get_bts_size(ds); 618 struct ds_context *context;
250 619 int error;
251 if (size_in_bytes <= 0)
252 return size_in_bytes;
253 620
254 return size_in_bytes / ds_cfg.sizeof_bts; 621 if (!record)
255} 622 return -EINVAL;
256 623
257int ds_get_bts_index(void *ds) 624 error = -EPERM;
258{ 625 context = ds_get_context(task);
259 int index_offset_in_bytes; 626 if (!context)
627 goto out;
260 628
261 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 629 if (!force) {
262 return -EOPNOTSUPP; 630 error = ds_validate_access(context, qual);
631 if (error < 0)
632 goto out;
633 }
263 634
264 index_offset_in_bytes = 635 error = 0;
265 get_bts_index(ds) - 636 while (size) {
266 get_bts_buffer_base(ds); 637 unsigned long base, index, end, write_end, int_th;
638 unsigned long write_size, adj_write_size;
639
640 /*
641 * write as much as possible without producing an
642 * overflow interrupt.
643 *
644 * interrupt_threshold must either be
645 * - bigger than absolute_maximum or
646 * - point to a record between buffer_base and absolute_maximum
647 *
648 * index points to a valid record.
649 */
650 base = ds_get(context->ds, qual, ds_buffer_base);
651 index = ds_get(context->ds, qual, ds_index);
652 end = ds_get(context->ds, qual, ds_absolute_maximum);
653 int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
654
655 write_end = min(end, int_th);
656
657 /* if we are already beyond the interrupt threshold,
658 * we fill the entire buffer */
659 if (write_end <= index)
660 write_end = end;
661
662 if (write_end <= index)
663 goto out;
664
665 write_size = min((unsigned long) size, write_end - index);
666 memcpy((void *)index, record, write_size);
667
668 record = (const char *)record + write_size;
669 size -= write_size;
670 error += write_size;
671
672 adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
673 adj_write_size *= ds_cfg.sizeof_rec[qual];
674
675 /* zero out trailing bytes */
676 memset((char *)index + write_size, 0,
677 adj_write_size - write_size);
678 index += adj_write_size;
679
680 if (index >= end)
681 index = base;
682 ds_set(context->ds, qual, ds_index, index);
683
684 if (index >= int_th)
685 ds_overflow(task, context, qual);
686 }
267 687
268 return index_offset_in_bytes / ds_cfg.sizeof_bts; 688 out:
689 ds_put_context(context);
690 return error;
269} 691}
270 692
271int ds_set_overflow(void *ds, int method) 693int ds_write_bts(struct task_struct *task, const void *record, size_t size)
272{ 694{
273 switch (method) { 695 return ds_write(task, record, size, ds_bts, /* force = */ 0);
274 case DS_O_SIGNAL:
275 return -EOPNOTSUPP;
276 case DS_O_WRAP:
277 return 0;
278 default:
279 return -EINVAL;
280 }
281} 696}
282 697
283int ds_get_overflow(void *ds) 698int ds_write_pebs(struct task_struct *task, const void *record, size_t size)
284{ 699{
285 return DS_O_WRAP; 700 return ds_write(task, record, size, ds_pebs, /* force = */ 0);
286} 701}
287 702
288int ds_clear(void *ds) 703int ds_unchecked_write_bts(struct task_struct *task,
704 const void *record, size_t size)
289{ 705{
290 int bts_size = ds_get_bts_size(ds); 706 return ds_write(task, record, size, ds_bts, /* force = */ 1);
291 unsigned long bts_base;
292
293 if (bts_size <= 0)
294 return bts_size;
295
296 bts_base = get_bts_buffer_base(ds);
297 memset((void *)bts_base, 0, bts_size);
298
299 set_bts_index(ds, bts_base);
300 return 0;
301} 707}
302 708
303int ds_read_bts(void *ds, int index, struct bts_struct *out) 709int ds_unchecked_write_pebs(struct task_struct *task,
710 const void *record, size_t size)
304{ 711{
305 void *bts; 712 return ds_write(task, record, size, ds_pebs, /* force = */ 1);
713}
306 714
307 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) 715static int ds_reset_or_clear(struct task_struct *task,
308 return -EOPNOTSUPP; 716 enum ds_qualifier qual, int clear)
717{
718 struct ds_context *context;
719 unsigned long base, end;
720 int error;
309 721
310 if (index < 0) 722 context = ds_get_context(task);
311 return -EINVAL; 723 error = ds_validate_access(context, qual);
724 if (error < 0)
725 goto out;
312 726
313 if (index >= ds_get_bts_size(ds)) 727 base = ds_get(context->ds, qual, ds_buffer_base);
314 return -EINVAL; 728 end = ds_get(context->ds, qual, ds_absolute_maximum);
315 729
316 bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); 730 if (clear)
731 memset((void *)base, 0, end - base);
317 732
318 memset(out, 0, sizeof(*out)); 733 ds_set(context->ds, qual, ds_index, base);
319 if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) {
320 out->qualifier = get_info_type(bts);
321 out->variant.jiffies = get_info_data(bts);
322 } else {
323 out->qualifier = BTS_BRANCH;
324 out->variant.lbr.from_ip = get_from_ip(bts);
325 out->variant.lbr.to_ip = get_to_ip(bts);
326 }
327 734
328 return sizeof(*out);; 735 error = 0;
736 out:
737 ds_put_context(context);
738 return error;
329} 739}
330 740
331int ds_write_bts(void *ds, const struct bts_struct *in) 741int ds_reset_bts(struct task_struct *task)
332{ 742{
333 unsigned long bts; 743 return ds_reset_or_clear(task, ds_bts, /* clear = */ 0);
334 744}
335 if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts)
336 return -EOPNOTSUPP;
337
338 if (ds_get_bts_size(ds) <= 0)
339 return -ENXIO;
340 745
341 bts = get_bts_index(ds); 746int ds_reset_pebs(struct task_struct *task)
747{
748 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0);
749}
342 750
343 memset((void *)bts, 0, ds_cfg.sizeof_bts); 751int ds_clear_bts(struct task_struct *task)
344 switch (in->qualifier) { 752{
345 case BTS_INVALID: 753 return ds_reset_or_clear(task, ds_bts, /* clear = */ 1);
346 break; 754}
347 755
348 case BTS_BRANCH: 756int ds_clear_pebs(struct task_struct *task)
349 set_from_ip((void *)bts, in->variant.lbr.from_ip); 757{
350 set_to_ip((void *)bts, in->variant.lbr.to_ip); 758 return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1);
351 break; 759}
352 760
353 case BTS_TASK_ARRIVES: 761int ds_get_pebs_reset(struct task_struct *task, u64 *value)
354 case BTS_TASK_DEPARTS: 762{
355 set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); 763 struct ds_context *context;
356 set_info_type((void *)bts, in->qualifier); 764 int error;
357 set_info_data((void *)bts, in->variant.jiffies);
358 break;
359 765
360 default: 766 if (!value)
361 return -EINVAL; 767 return -EINVAL;
362 }
363 768
364 bts = bts + ds_cfg.sizeof_bts; 769 context = ds_get_context(task);
365 if (bts >= get_bts_absolute_maximum(ds)) 770 error = ds_validate_access(context, ds_pebs);
366 bts = get_bts_buffer_base(ds); 771 if (error < 0)
367 set_bts_index(ds, bts); 772 goto out;
368 773
369 return ds_cfg.sizeof_bts; 774 *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8));
775
776 error = 0;
777 out:
778 ds_put_context(context);
779 return error;
370} 780}
371 781
372unsigned long ds_debugctl_mask(void) 782int ds_set_pebs_reset(struct task_struct *task, u64 value)
373{ 783{
374 return ds_cfg.debugctl_mask; 784 struct ds_context *context;
375} 785 int error;
376 786
377#ifdef __i386__ 787 context = ds_get_context(task);
378static const struct ds_configuration ds_cfg_netburst = { 788 error = ds_validate_access(context, ds_pebs);
379 .sizeof_ds = 9 * 4, 789 if (error < 0)
380 .bts_buffer_base = { 0, 4 }, 790 goto out;
381 .bts_index = { 4, 4 },
382 .bts_absolute_maximum = { 8, 4 },
383 .bts_interrupt_threshold = { 12, 4 },
384 .sizeof_bts = 3 * 4,
385 .from_ip = { 0, 4 },
386 .to_ip = { 4, 4 },
387 .info_type = { 4, 1 },
388 .info_data = { 8, 4 },
389 .debugctl_mask = (1<<2)|(1<<3)
390};
391 791
392static const struct ds_configuration ds_cfg_pentium_m = { 792 *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value;
393 .sizeof_ds = 9 * 4, 793
394 .bts_buffer_base = { 0, 4 }, 794 error = 0;
395 .bts_index = { 4, 4 }, 795 out:
396 .bts_absolute_maximum = { 8, 4 }, 796 ds_put_context(context);
397 .bts_interrupt_threshold = { 12, 4 }, 797 return error;
398 .sizeof_bts = 3 * 4, 798}
399 .from_ip = { 0, 4 }, 799
400 .to_ip = { 4, 4 }, 800static const struct ds_configuration ds_cfg_var = {
401 .info_type = { 4, 1 }, 801 .sizeof_ds = sizeof(long) * 12,
402 .info_data = { 8, 4 }, 802 .sizeof_field = sizeof(long),
403 .debugctl_mask = (1<<6)|(1<<7) 803 .sizeof_rec[ds_bts] = sizeof(long) * 3,
804 .sizeof_rec[ds_pebs] = sizeof(long) * 10
404}; 805};
405#endif /* _i386_ */ 806static const struct ds_configuration ds_cfg_64 = {
406 807 .sizeof_ds = 8 * 12,
407static const struct ds_configuration ds_cfg_core2 = { 808 .sizeof_field = 8,
408 .sizeof_ds = 9 * 8, 809 .sizeof_rec[ds_bts] = 8 * 3,
409 .bts_buffer_base = { 0, 8 }, 810 .sizeof_rec[ds_pebs] = 8 * 10
410 .bts_index = { 8, 8 },
411 .bts_absolute_maximum = { 16, 8 },
412 .bts_interrupt_threshold = { 24, 8 },
413 .sizeof_bts = 3 * 8,
414 .from_ip = { 0, 8 },
415 .to_ip = { 8, 8 },
416 .info_type = { 8, 1 },
417 .info_data = { 16, 8 },
418 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
419}; 811};
420 812
421static inline void 813static inline void
@@ -429,14 +821,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
429 switch (c->x86) { 821 switch (c->x86) {
430 case 0x6: 822 case 0x6:
431 switch (c->x86_model) { 823 switch (c->x86_model) {
432#ifdef __i386__
433 case 0xD: 824 case 0xD:
434 case 0xE: /* Pentium M */ 825 case 0xE: /* Pentium M */
435 ds_configure(&ds_cfg_pentium_m); 826 ds_configure(&ds_cfg_var);
436 break; 827 break;
437#endif /* _i386_ */
438 case 0xF: /* Core2 */ 828 case 0xF: /* Core2 */
439 ds_configure(&ds_cfg_core2); 829 case 0x1C: /* Atom */
830 ds_configure(&ds_cfg_64);
440 break; 831 break;
441 default: 832 default:
442 /* sorry, don't know about them */ 833 /* sorry, don't know about them */
@@ -445,13 +836,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
445 break; 836 break;
446 case 0xF: 837 case 0xF:
447 switch (c->x86_model) { 838 switch (c->x86_model) {
448#ifdef __i386__
449 case 0x0: 839 case 0x0:
450 case 0x1: 840 case 0x1:
451 case 0x2: /* Netburst */ 841 case 0x2: /* Netburst */
452 ds_configure(&ds_cfg_netburst); 842 ds_configure(&ds_cfg_var);
453 break; 843 break;
454#endif /* _i386_ */
455 default: 844 default:
456 /* sorry, don't know about them */ 845 /* sorry, don't know about them */
457 break; 846 break;
@@ -462,3 +851,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
462 break; 851 break;
463 } 852 }
464} 853}
854
855void ds_free(struct ds_context *context)
856{
857 /* This is called when the task owning the parameter context
858 * is dying. There should not be any user of that context left
859 * to disturb us, anymore. */
860 unsigned long leftovers = context->count;
861 while (leftovers--)
862 ds_put_context(context);
863}
864#endif /* CONFIG_X86_DS */
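The rewritten ds.c above replaces the per-field access_desc tables with a single computed offset: the first eight pointer-sized DS fields come in two groups of four (BTS first, then PEBS), so ds_get()/ds_set() locate a field at sizeof_field * (field + 4 * qual). A stand-alone sketch of that addressing, using memcpy instead of the kernel's raw cast purely for illustration:

        #include <stdio.h>
        #include <string.h>

        enum ds_field { ds_buffer_base, ds_index, ds_absolute_maximum,
                        ds_interrupt_threshold };
        enum ds_qualifier { ds_bts, ds_pebs };

        static const unsigned int sizeof_field = sizeof(unsigned long);

        static unsigned long ds_get(const unsigned char *base,
                                    enum ds_qualifier qual, enum ds_field field)
        {
                unsigned long val;

                base += sizeof_field * (field + 4 * qual);
                memcpy(&val, base, sizeof(val));
                return val;
        }

        static void ds_set(unsigned char *base, enum ds_qualifier qual,
                           enum ds_field field, unsigned long value)
        {
                base += sizeof_field * (field + 4 * qual);
                memcpy(base, &value, sizeof(value));
        }

        int main(void)
        {
                /* 12 pointer-sized fields, as in ds_cfg_var/ds_cfg_64 above */
                unsigned char ds[12 * sizeof(unsigned long)] = { 0 };

                ds_set(ds, ds_bts, ds_buffer_base, 0x1000);
                ds_set(ds, ds_pebs, ds_index, 0x2000);

                printf("bts base   at offset %u = %#lx\n",
                       sizeof_field * (ds_buffer_base + 4 * ds_bts),
                       ds_get(ds, ds_bts, ds_buffer_base));
                printf("pebs index at offset %u = %#lx\n",
                       sizeof_field * (ds_index + 4 * ds_pebs),
                       ds_get(ds, ds_pebs, ds_index));
                return 0;
        }

The same offset arithmetic is what lets one ds_request()/ds_write() path serve both BTS and PEBS with only the qualifier changing.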
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
new file mode 100644
index 000000000000..b3614752197b
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -0,0 +1,449 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 8
21#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static inline int valid_stack_ptr(struct thread_info *tinfo,
35 void *p, unsigned int size, void *end)
36{
37 void *t = tinfo;
38 if (end) {
39 if (p < end && p >= (end-THREAD_SIZE))
40 return 1;
41 else
42 return 0;
43 }
44 return p > t && p < t + THREAD_SIZE - size;
45}
46
47/* The form of the top of the frame on the stack */
48struct stack_frame {
49 struct stack_frame *next_frame;
50 unsigned long return_address;
51};
52
53static inline unsigned long
54print_context_stack(struct thread_info *tinfo,
55 unsigned long *stack, unsigned long bp,
56 const struct stacktrace_ops *ops, void *data,
57 unsigned long *end)
58{
59 struct stack_frame *frame = (struct stack_frame *)bp;
60
61 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
62 unsigned long addr;
63
64 addr = *stack;
65 if (__kernel_text_address(addr)) {
66 if ((unsigned long) stack == bp + sizeof(long)) {
67 ops->address(data, addr, 1);
68 frame = frame->next_frame;
69 bp = (unsigned long) frame;
70 } else {
71 ops->address(data, addr, bp == 0);
72 }
73 }
74 stack++;
75 }
76 return bp;
77}
78
79void dump_trace(struct task_struct *task, struct pt_regs *regs,
80 unsigned long *stack, unsigned long bp,
81 const struct stacktrace_ops *ops, void *data)
82{
83 if (!task)
84 task = current;
85
86 if (!stack) {
87 unsigned long dummy;
88 stack = &dummy;
89 if (task && task != current)
90 stack = (unsigned long *)task->thread.sp;
91 }
92
93#ifdef CONFIG_FRAME_POINTER
94 if (!bp) {
95 if (task == current) {
96 /* Grab bp right from our regs */
97 get_bp(bp);
98 } else {
99 /* bp is the last reg pushed by switch_to */
100 bp = *(unsigned long *) task->thread.sp;
101 }
102 }
103#endif
104
105 for (;;) {
106 struct thread_info *context;
107
108 context = (struct thread_info *)
109 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
110 bp = print_context_stack(context, stack, bp, ops, data, NULL);
111
112 stack = (unsigned long *)context->previous_esp;
113 if (!stack)
114 break;
115 if (ops->stack(data, "IRQ") < 0)
116 break;
117 touch_nmi_watchdog();
118 }
119}
120EXPORT_SYMBOL(dump_trace);
121
122static void
123print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
124{
125 printk(data);
126 print_symbol(msg, symbol);
127 printk("\n");
128}
129
130static void print_trace_warning(void *data, char *msg)
131{
132 printk("%s%s\n", (char *)data, msg);
133}
134
135static int print_trace_stack(void *data, char *name)
136{
137 printk("%s <%s> ", (char *)data, name);
138 return 0;
139}
140
141/*
142 * Print one address/symbol entry per line.

143 */
144static void print_trace_address(void *data, unsigned long addr, int reliable)
145{
146 touch_nmi_watchdog();
147 printk(data);
148 printk_address(addr, reliable);
149}
150
151static const struct stacktrace_ops print_trace_ops = {
152 .warning = print_trace_warning,
153 .warning_symbol = print_trace_warning_symbol,
154 .stack = print_trace_stack,
155 .address = print_trace_address,
156};
157
158static void
159show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
160 unsigned long *stack, unsigned long bp, char *log_lvl)
161{
162 printk("%sCall Trace:\n", log_lvl);
163 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
164}
165
166void show_trace(struct task_struct *task, struct pt_regs *regs,
167 unsigned long *stack, unsigned long bp)
168{
169 show_trace_log_lvl(task, regs, stack, bp, "");
170}
171
172static void
173show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *sp, unsigned long bp, char *log_lvl)
175{
176 unsigned long *stack;
177 int i;
178
179 if (sp == NULL) {
180 if (task)
181 sp = (unsigned long *)task->thread.sp;
182 else
183 sp = (unsigned long *)&sp;
184 }
185
186 stack = sp;
187 for (i = 0; i < kstack_depth_to_print; i++) {
188 if (kstack_end(stack))
189 break;
190 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
191 printk("\n%s", log_lvl);
192 printk(" %08lx", *stack++);
193 touch_nmi_watchdog();
194 }
195 printk("\n");
196 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
197}
198
199void show_stack(struct task_struct *task, unsigned long *sp)
200{
201 show_stack_log_lvl(task, NULL, sp, 0, "");
202}
203
204/*
205 * The architecture-independent dump_stack generator
206 */
207void dump_stack(void)
208{
209 unsigned long bp = 0;
210 unsigned long stack;
211
212#ifdef CONFIG_FRAME_POINTER
213 if (!bp)
214 get_bp(bp);
215#endif
216
217 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
218 current->pid, current->comm, print_tainted(),
219 init_utsname()->release,
220 (int)strcspn(init_utsname()->version, " "),
221 init_utsname()->version);
222 show_trace(NULL, NULL, &stack, bp);
223}
224
225EXPORT_SYMBOL(dump_stack);
226
227void show_registers(struct pt_regs *regs)
228{
229 int i;
230
231 print_modules();
232 __show_regs(regs, 0);
233
234 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n",
235 TASK_COMM_LEN, current->comm, task_pid_nr(current),
236 current_thread_info(), current, task_thread_info(current));
237 /*
238 * When in-kernel, we also print out the stack and code at the
239 * time of the fault..
240 */
241 if (!user_mode_vm(regs)) {
242 unsigned int code_prologue = code_bytes * 43 / 64;
243 unsigned int code_len = code_bytes;
244 unsigned char c;
245 u8 *ip;
246
247 printk(KERN_EMERG "Stack:\n");
248 show_stack_log_lvl(NULL, regs, &regs->sp,
249 0, KERN_EMERG);
250
251 printk(KERN_EMERG "Code: ");
252
253 ip = (u8 *)regs->ip - code_prologue;
254 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
255 /* try starting at IP */
256 ip = (u8 *)regs->ip;
257 code_len = code_len - code_prologue + 1;
258 }
259 for (i = 0; i < code_len; i++, ip++) {
260 if (ip < (u8 *)PAGE_OFFSET ||
261 probe_kernel_address(ip, c)) {
262 printk(" Bad EIP value.");
263 break;
264 }
265 if (ip == (u8 *)regs->ip)
266 printk("<%02x> ", c);
267 else
268 printk("%02x ", c);
269 }
270 }
271 printk("\n");
272}
273
274int is_valid_bugaddr(unsigned long ip)
275{
276 unsigned short ud2;
277
278 if (ip < PAGE_OFFSET)
279 return 0;
280 if (probe_kernel_address((unsigned short *)ip, ud2))
281 return 0;
282
283 return ud2 == 0x0b0f;
284}
285
286static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
287static int die_owner = -1;
288static unsigned int die_nest_count;
289
290unsigned __kprobes long oops_begin(void)
291{
292 unsigned long flags;
293
294 oops_enter();
295
296 if (die_owner != raw_smp_processor_id()) {
297 console_verbose();
298 raw_local_irq_save(flags);
299 __raw_spin_lock(&die_lock);
300 die_owner = smp_processor_id();
301 die_nest_count = 0;
302 bust_spinlocks(1);
303 } else {
304 raw_local_irq_save(flags);
305 }
306 die_nest_count++;
307 return flags;
308}
309
310void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
311{
312 bust_spinlocks(0);
313 die_owner = -1;
314 add_taint(TAINT_DIE);
315 __raw_spin_unlock(&die_lock);
316 raw_local_irq_restore(flags);
317
318 if (!regs)
319 return;
320
321 if (kexec_should_crash(current))
322 crash_kexec(regs);
323 if (in_interrupt())
324 panic("Fatal exception in interrupt");
325 if (panic_on_oops)
326 panic("Fatal exception");
327 oops_exit();
328 do_exit(signr);
329}
330
331int __kprobes __die(const char *str, struct pt_regs *regs, long err)
332{
333 unsigned short ss;
334 unsigned long sp;
335
336 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
337#ifdef CONFIG_PREEMPT
338 printk("PREEMPT ");
339#endif
340#ifdef CONFIG_SMP
341 printk("SMP ");
342#endif
343#ifdef CONFIG_DEBUG_PAGEALLOC
344 printk("DEBUG_PAGEALLOC");
345#endif
346 printk("\n");
347 sysfs_printk_last_file();
348 if (notify_die(DIE_OOPS, str, regs, err,
349 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
350 return 1;
351
352 show_registers(regs);
353 /* Executive summary in case the oops scrolled away */
354 sp = (unsigned long) (&regs->sp);
355 savesegment(ss, ss);
356 if (user_mode(regs)) {
357 sp = regs->sp;
358 ss = regs->ss & 0xffff;
359 }
360 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
361 print_symbol("%s", regs->ip);
362 printk(" SS:ESP %04x:%08lx\n", ss, sp);
363 return 0;
364}
365
366/*
367 * This is gone through when something in the kernel has done something bad
368 * and is about to be terminated:
369 */
370void die(const char *str, struct pt_regs *regs, long err)
371{
372 unsigned long flags = oops_begin();
373
374 if (die_nest_count < 3) {
375 report_bug(regs->ip, regs);
376
377 if (__die(str, regs, err))
378 regs = NULL;
379 } else {
380 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
381 }
382
383 oops_end(flags, regs, SIGSEGV);
384}
385
386static DEFINE_SPINLOCK(nmi_print_lock);
387
388void notrace __kprobes
389die_nmi(char *str, struct pt_regs *regs, int do_panic)
390{
391 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
392 return;
393
394 spin_lock(&nmi_print_lock);
395 /*
396 * We are in trouble anyway, lets at least try
397 * to get a message out:
398 */
399 bust_spinlocks(1);
400 printk(KERN_EMERG "%s", str);
401 printk(" on CPU%d, ip %08lx, registers:\n",
402 smp_processor_id(), regs->ip);
403 show_registers(regs);
404 if (do_panic)
405 panic("Non maskable interrupt");
406 console_silent();
407 spin_unlock(&nmi_print_lock);
408
409 /*
410 * If we are in kernel we are probably nested up pretty bad
411	 * and might as well get out now while we still can:
412 */
413 if (!user_mode_vm(regs)) {
414 current->thread.trap_no = 2;
415 crash_kexec(regs);
416 }
417
418 bust_spinlocks(0);
419 do_exit(SIGSEGV);
420}
421
422static int __init oops_setup(char *s)
423{
424 if (!s)
425 return -EINVAL;
426 if (!strcmp(s, "panic"))
427 panic_on_oops = 1;
428 return 0;
429}
430early_param("oops", oops_setup);
431
432static int __init kstack_setup(char *s)
433{
434 if (!s)
435 return -EINVAL;
436 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
437 return 0;
438}
439early_param("kstack", kstack_setup);
440
441static int __init code_bytes_setup(char *s)
442{
443 code_bytes = simple_strtoul(s, NULL, 0);
444 if (code_bytes > 8192)
445 code_bytes = 8192;
446
447 return 1;
448}
449__setup("code_bytes=", code_bytes_setup);
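As a rough note on the code_bytes= parameter handled above: show_registers() prints code_bytes opcode bytes around the faulting instruction, taking code_prologue = code_bytes * 43 / 64 of them from before EIP and the rest from EIP onward. A minimal user-space sketch of that split, assuming nothing beyond the arithmetic shown above:

	#include <stdio.h>

	int main(void)
	{
		unsigned int code_bytes = 64;	/* default; clamped to 8192 by code_bytes_setup() */
		unsigned int code_prologue = code_bytes * 43 / 64;

		printf("%u bytes before EIP, %u bytes at/after EIP\n",
		       code_prologue, code_bytes - code_prologue);
		return 0;
	}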
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
new file mode 100644
index 000000000000..96a5db7da8a7
--- /dev/null
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -0,0 +1,575 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 */
5#include <linux/kallsyms.h>
6#include <linux/kprobes.h>
7#include <linux/uaccess.h>
8#include <linux/utsname.h>
9#include <linux/hardirq.h>
10#include <linux/kdebug.h>
11#include <linux/module.h>
12#include <linux/ptrace.h>
13#include <linux/kexec.h>
14#include <linux/bug.h>
15#include <linux/nmi.h>
16#include <linux/sysfs.h>
17
18#include <asm/stacktrace.h>
19
20#define STACKSLOTS_PER_LINE 4
21#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
22
23int panic_on_unrecovered_nmi;
24int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
25static unsigned int code_bytes = 64;
26static int die_counter;
27
28void printk_address(unsigned long address, int reliable)
29{
30 printk(" [<%p>] %s%pS\n", (void *) address,
31 reliable ? "" : "? ", (void *) address);
32}
33
34static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
35 unsigned *usedp, char **idp)
36{
37 static char ids[][8] = {
38 [DEBUG_STACK - 1] = "#DB",
39 [NMI_STACK - 1] = "NMI",
40 [DOUBLEFAULT_STACK - 1] = "#DF",
41 [STACKFAULT_STACK - 1] = "#SS",
42 [MCE_STACK - 1] = "#MC",
43#if DEBUG_STKSZ > EXCEPTION_STKSZ
44 [N_EXCEPTION_STACKS ...
45 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
46#endif
47 };
48 unsigned k;
49
50 /*
51 * Iterate over all exception stacks, and figure out whether
52 * 'stack' is in one of them:
53 */
54 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
55 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
56 /*
57 * Is 'stack' above this exception frame's end?
58 * If yes then skip to the next frame.
59 */
60 if (stack >= end)
61 continue;
62 /*
63 * Is 'stack' above this exception frame's start address?
64 * If yes then we found the right frame.
65 */
66 if (stack >= end - EXCEPTION_STKSZ) {
67 /*
68 * Make sure we only iterate through an exception
69 * stack once. If it comes up for the second time
70 * then there's something wrong going on - just
71 * break out and return NULL:
72 */
73 if (*usedp & (1U << k))
74 break;
75 *usedp |= 1U << k;
76 *idp = ids[k];
77 return (unsigned long *)end;
78 }
79 /*
80 * If this is a debug stack, and if it has a larger size than
81 * the usual exception stacks, then 'stack' might still
82 * be within the lower portion of the debug stack:
83 */
84#if DEBUG_STKSZ > EXCEPTION_STKSZ
85 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
86 unsigned j = N_EXCEPTION_STACKS - 1;
87
88 /*
89 * Black magic. A large debug stack is composed of
90 * multiple exception stack entries, which we
91		 * iterate through now. Don't look:
92 */
93 do {
94 ++j;
95 end -= EXCEPTION_STKSZ;
96 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
97 } while (stack < end - EXCEPTION_STKSZ);
98 if (*usedp & (1U << j))
99 break;
100 *usedp |= 1U << j;
101 *idp = ids[j];
102 return (unsigned long *)end;
103 }
104#endif
105 }
106 return NULL;
107}
108
109/*
110 * x86-64 can have up to three kernel stacks:
111 * process stack
112 * interrupt stack
113 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
114 */
115
116static inline int valid_stack_ptr(struct thread_info *tinfo,
117 void *p, unsigned int size, void *end)
118{
119 void *t = tinfo;
120 if (end) {
121 if (p < end && p >= (end-THREAD_SIZE))
122 return 1;
123 else
124 return 0;
125 }
126 return p > t && p < t + THREAD_SIZE - size;
127}
128
129/* The form of the top of the frame on the stack */
130struct stack_frame {
131 struct stack_frame *next_frame;
132 unsigned long return_address;
133};
134
135static inline unsigned long
136print_context_stack(struct thread_info *tinfo,
137 unsigned long *stack, unsigned long bp,
138 const struct stacktrace_ops *ops, void *data,
139 unsigned long *end)
140{
141 struct stack_frame *frame = (struct stack_frame *)bp;
142
143 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
144 unsigned long addr;
145
146 addr = *stack;
147 if (__kernel_text_address(addr)) {
148 if ((unsigned long) stack == bp + sizeof(long)) {
149 ops->address(data, addr, 1);
150 frame = frame->next_frame;
151 bp = (unsigned long) frame;
152 } else {
153 ops->address(data, addr, bp == 0);
154 }
155 }
156 stack++;
157 }
158 return bp;
159}
160
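The bp chain that print_context_stack() follows above is plain frame-pointer linkage: each saved frame pointer points at the caller's struct stack_frame, whose second word is the return address. A minimal user-space sketch of the same walk, assuming x86-64 with frame pointers kept (build with -O0 or -fno-omit-frame-pointer); this is an illustration, not kernel code:

	#include <stdio.h>

	struct stack_frame {
		struct stack_frame *next_frame;
		unsigned long return_address;
	};

	static void walk(void)
	{
		struct stack_frame *frame = __builtin_frame_address(0);
		int depth;

		for (depth = 0; depth < 8 && frame; depth++, frame = frame->next_frame) {
			printf("frame %d: return address %p\n",
			       depth, (void *)frame->return_address);
			if (frame->next_frame <= frame)	/* stack grows down; stop on a bogus link */
				break;
		}
	}

	static void leaf(void)  { walk(); }
	static void inner(void) { leaf(); }

	int main(void)
	{
		inner();
		return 0;
	}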
161void dump_trace(struct task_struct *task, struct pt_regs *regs,
162 unsigned long *stack, unsigned long bp,
163 const struct stacktrace_ops *ops, void *data)
164{
165 const unsigned cpu = get_cpu();
166 unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr;
167 unsigned used = 0;
168 struct thread_info *tinfo;
169
170 if (!task)
171 task = current;
172
173 if (!stack) {
174 unsigned long dummy;
175 stack = &dummy;
176 if (task && task != current)
177 stack = (unsigned long *)task->thread.sp;
178 }
179
180#ifdef CONFIG_FRAME_POINTER
181 if (!bp) {
182 if (task == current) {
183 /* Grab bp right from our regs */
184 get_bp(bp);
185 } else {
186 /* bp is the last reg pushed by switch_to */
187 bp = *(unsigned long *) task->thread.sp;
188 }
189 }
190#endif
191
192 /*
193 * Print function call entries in all stacks, starting at the
194 * current stack address. If the stacks consist of nested
195	 * exceptions, each one is handled in turn:
196 */
197 tinfo = task_thread_info(task);
198 for (;;) {
199 char *id;
200 unsigned long *estack_end;
201 estack_end = in_exception_stack(cpu, (unsigned long)stack,
202 &used, &id);
203
204 if (estack_end) {
205 if (ops->stack(data, id) < 0)
206 break;
207
208 bp = print_context_stack(tinfo, stack, bp, ops,
209 data, estack_end);
210 ops->stack(data, "<EOE>");
211 /*
212 * We link to the next stack via the
213 * second-to-last pointer (index -2 to end) in the
214 * exception stack:
215 */
216 stack = (unsigned long *) estack_end[-2];
217 continue;
218 }
219 if (irqstack_end) {
220 unsigned long *irqstack;
221 irqstack = irqstack_end -
222 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
223
224 if (stack >= irqstack && stack < irqstack_end) {
225 if (ops->stack(data, "IRQ") < 0)
226 break;
227 bp = print_context_stack(tinfo, stack, bp,
228 ops, data, irqstack_end);
229 /*
230 * We link to the next stack (which would be
231				 * the process stack normally) via the last
232 * pointer (index -1 to end) in the IRQ stack:
233 */
234 stack = (unsigned long *) (irqstack_end[-1]);
235 irqstack_end = NULL;
236 ops->stack(data, "EOI");
237 continue;
238 }
239 }
240 break;
241 }
242
243 /*
244 * This handles the process stack:
245 */
246 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
247 put_cpu();
248}
249EXPORT_SYMBOL(dump_trace);
250
251static void
252print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
253{
254 printk(data);
255 print_symbol(msg, symbol);
256 printk("\n");
257}
258
259static void print_trace_warning(void *data, char *msg)
260{
261 printk("%s%s\n", (char *)data, msg);
262}
263
264static int print_trace_stack(void *data, char *name)
265{
266 printk("%s <%s> ", (char *)data, name);
267 return 0;
268}
269
270/*
271 * Print one address/symbol entry per line.
272 */
273static void print_trace_address(void *data, unsigned long addr, int reliable)
274{
275 touch_nmi_watchdog();
276 printk(data);
277 printk_address(addr, reliable);
278}
279
280static const struct stacktrace_ops print_trace_ops = {
281 .warning = print_trace_warning,
282 .warning_symbol = print_trace_warning_symbol,
283 .stack = print_trace_stack,
284 .address = print_trace_address,
285};
286
287static void
288show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
289 unsigned long *stack, unsigned long bp, char *log_lvl)
290{
291 printk("%sCall Trace:\n", log_lvl);
292 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl);
293}
294
295void show_trace(struct task_struct *task, struct pt_regs *regs,
296 unsigned long *stack, unsigned long bp)
297{
298 show_trace_log_lvl(task, regs, stack, bp, "");
299}
300
301static void
302show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
303 unsigned long *sp, unsigned long bp, char *log_lvl)
304{
305 unsigned long *stack;
306 int i;
307 const int cpu = smp_processor_id();
308 unsigned long *irqstack_end =
309 (unsigned long *) (cpu_pda(cpu)->irqstackptr);
310 unsigned long *irqstack =
311 (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
312
313 /*
314 * debugging aid: "show_stack(NULL, NULL);" prints the
315 * back trace for this cpu.
316 */
317
318 if (sp == NULL) {
319 if (task)
320 sp = (unsigned long *)task->thread.sp;
321 else
322 sp = (unsigned long *)&sp;
323 }
324
325 stack = sp;
326 for (i = 0; i < kstack_depth_to_print; i++) {
327 if (stack >= irqstack && stack <= irqstack_end) {
328 if (stack == irqstack_end) {
329 stack = (unsigned long *) (irqstack_end[-1]);
330 printk(" <EOI> ");
331 }
332 } else {
333 if (((long) stack & (THREAD_SIZE-1)) == 0)
334 break;
335 }
336 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
337 printk("\n%s", log_lvl);
338 printk(" %016lx", *stack++);
339 touch_nmi_watchdog();
340 }
341 printk("\n");
342 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
343}
344
345void show_stack(struct task_struct *task, unsigned long *sp)
346{
347 show_stack_log_lvl(task, NULL, sp, 0, "");
348}
349
350/*
351 * The architecture-independent dump_stack generator
352 */
353void dump_stack(void)
354{
355 unsigned long bp = 0;
356 unsigned long stack;
357
358#ifdef CONFIG_FRAME_POINTER
359 if (!bp)
360 get_bp(bp);
361#endif
362
363 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
364 current->pid, current->comm, print_tainted(),
365 init_utsname()->release,
366 (int)strcspn(init_utsname()->version, " "),
367 init_utsname()->version);
368 show_trace(NULL, NULL, &stack, bp);
369}
370EXPORT_SYMBOL(dump_stack);
371
372void show_registers(struct pt_regs *regs)
373{
374 int i;
375 unsigned long sp;
376 const int cpu = smp_processor_id();
377 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
378
379 sp = regs->sp;
380 printk("CPU %d ", cpu);
381 __show_regs(regs, 1);
382 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
383 cur->comm, cur->pid, task_thread_info(cur), cur);
384
385 /*
386 * When in-kernel, we also print out the stack and code at the
387 * time of the fault..
388 */
389 if (!user_mode(regs)) {
390 unsigned int code_prologue = code_bytes * 43 / 64;
391 unsigned int code_len = code_bytes;
392 unsigned char c;
393 u8 *ip;
394
395 printk(KERN_EMERG "Stack:\n");
396 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
397 regs->bp, KERN_EMERG);
398
399 printk(KERN_EMERG "Code: ");
400
401 ip = (u8 *)regs->ip - code_prologue;
402 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
403 /* try starting at IP */
404 ip = (u8 *)regs->ip;
405 code_len = code_len - code_prologue + 1;
406 }
407 for (i = 0; i < code_len; i++, ip++) {
408 if (ip < (u8 *)PAGE_OFFSET ||
409 probe_kernel_address(ip, c)) {
410 printk(" Bad RIP value.");
411 break;
412 }
413 if (ip == (u8 *)regs->ip)
414 printk("<%02x> ", c);
415 else
416 printk("%02x ", c);
417 }
418 }
419 printk("\n");
420}
421
422int is_valid_bugaddr(unsigned long ip)
423{
424 unsigned short ud2;
425
426 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
427 return 0;
428
429 return ud2 == 0x0b0f;
430}
431
432static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
433static int die_owner = -1;
434static unsigned int die_nest_count;
435
436unsigned __kprobes long oops_begin(void)
437{
438 int cpu;
439 unsigned long flags;
440
441 oops_enter();
442
443 /* racy, but better than risking deadlock. */
444 raw_local_irq_save(flags);
445 cpu = smp_processor_id();
446 if (!__raw_spin_trylock(&die_lock)) {
447 if (cpu == die_owner)
448 /* nested oops. should stop eventually */;
449 else
450 __raw_spin_lock(&die_lock);
451 }
452 die_nest_count++;
453 die_owner = cpu;
454 console_verbose();
455 bust_spinlocks(1);
456 return flags;
457}
458
459void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
460{
461 die_owner = -1;
462 bust_spinlocks(0);
463 die_nest_count--;
464 if (!die_nest_count)
465 /* Nest count reaches zero, release the lock. */
466 __raw_spin_unlock(&die_lock);
467 raw_local_irq_restore(flags);
468 if (!regs) {
469 oops_exit();
470 return;
471 }
472 if (in_interrupt())
473 panic("Fatal exception in interrupt");
474 if (panic_on_oops)
475 panic("Fatal exception");
476 oops_exit();
477 do_exit(signr);
478}
479
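oops_begin()/oops_end() above form a recursion-tolerant lock: the CPU that already owns die_lock may re-enter on a nested oops without deadlocking, and the lock is released only when die_nest_count returns to zero. A single-threaded user-space sketch of the same pattern (illustrative names, not kernel API):

	#include <stdatomic.h>
	#include <stdio.h>

	static atomic_flag die_lock = ATOMIC_FLAG_INIT;
	static int die_owner = -1;
	static unsigned int die_nest_count;

	static void begin(int cpu)
	{
		/* trylock; only spin if some other cpu holds the lock */
		if (atomic_flag_test_and_set(&die_lock) && cpu != die_owner)
			while (atomic_flag_test_and_set(&die_lock))
				;
		die_owner = cpu;
		die_nest_count++;
	}

	static void end(void)
	{
		die_owner = -1;
		if (--die_nest_count == 0)
			atomic_flag_clear(&die_lock);	/* last exit drops the lock */
	}

	int main(void)
	{
		begin(0);
		begin(0);		/* nested oops on the same cpu */
		end();
		end();
		printf("nest count back to %u\n", die_nest_count);
		return 0;
	}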
480int __kprobes __die(const char *str, struct pt_regs *regs, long err)
481{
482 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
483#ifdef CONFIG_PREEMPT
484 printk("PREEMPT ");
485#endif
486#ifdef CONFIG_SMP
487 printk("SMP ");
488#endif
489#ifdef CONFIG_DEBUG_PAGEALLOC
490 printk("DEBUG_PAGEALLOC");
491#endif
492 printk("\n");
493 sysfs_printk_last_file();
494 if (notify_die(DIE_OOPS, str, regs, err,
495 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
496 return 1;
497
498 show_registers(regs);
499 add_taint(TAINT_DIE);
500 /* Executive summary in case the oops scrolled away */
501 printk(KERN_ALERT "RIP ");
502 printk_address(regs->ip, 1);
503 printk(" RSP <%016lx>\n", regs->sp);
504 if (kexec_should_crash(current))
505 crash_kexec(regs);
506 return 0;
507}
508
509void die(const char *str, struct pt_regs *regs, long err)
510{
511 unsigned long flags = oops_begin();
512
513 if (!user_mode(regs))
514 report_bug(regs->ip, regs);
515
516 if (__die(str, regs, err))
517 regs = NULL;
518 oops_end(flags, regs, SIGSEGV);
519}
520
521notrace __kprobes void
522die_nmi(char *str, struct pt_regs *regs, int do_panic)
523{
524 unsigned long flags;
525
526 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
527 return;
528
529 flags = oops_begin();
530 /*
531	 * We are in trouble anyway, let's at least try
532 * to get a message out.
533 */
534 printk(KERN_EMERG "%s", str);
535 printk(" on CPU%d, ip %08lx, registers:\n",
536 smp_processor_id(), regs->ip);
537 show_registers(regs);
538 if (kexec_should_crash(current))
539 crash_kexec(regs);
540 if (do_panic || panic_on_oops)
541 panic("Non maskable interrupt");
542 oops_end(flags, NULL, SIGBUS);
543 nmi_exit();
544 local_irq_enable();
545 do_exit(SIGBUS);
546}
547
548static int __init oops_setup(char *s)
549{
550 if (!s)
551 return -EINVAL;
552 if (!strcmp(s, "panic"))
553 panic_on_oops = 1;
554 return 0;
555}
556early_param("oops", oops_setup);
557
558static int __init kstack_setup(char *s)
559{
560 if (!s)
561 return -EINVAL;
562 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
563 return 0;
564}
565early_param("kstack", kstack_setup);
566
567static int __init code_bytes_setup(char *s)
568{
569 code_bytes = simple_strtoul(s, NULL, 0);
570 if (code_bytes > 8192)
571 code_bytes = 8192;
572
573 return 1;
574}
575__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 28c29180b380..ce97bf3bed12 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -148,6 +148,9 @@ void __init e820_print_map(char *who)
148 case E820_NVS: 148 case E820_NVS:
149 printk(KERN_CONT "(ACPI NVS)\n"); 149 printk(KERN_CONT "(ACPI NVS)\n");
150 break; 150 break;
151 case E820_UNUSABLE:
152 printk("(unusable)\n");
153 break;
151 default: 154 default:
152 printk(KERN_CONT "type %u\n", e820.map[i].type); 155 printk(KERN_CONT "type %u\n", e820.map[i].type);
153 break; 156 break;
@@ -877,7 +880,8 @@ void __init early_res_to_bootmem(u64 start, u64 end)
877 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) 880 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
878 count++; 881 count++;
879 882
880 printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); 883 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
884 count, start, end);
881 for (i = 0; i < count; i++) { 885 for (i = 0; i < count; i++) {
882 struct early_res *r = &early_res[i]; 886 struct early_res *r = &early_res[i];
883 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, 887 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
@@ -1202,7 +1206,7 @@ static int __init parse_memmap_opt(char *p)
1202 if (!p) 1206 if (!p)
1203 return -EINVAL; 1207 return -EINVAL;
1204 1208
1205 if (!strcmp(p, "exactmap")) { 1209 if (!strncmp(p, "exactmap", 8)) {
1206#ifdef CONFIG_CRASH_DUMP 1210#ifdef CONFIG_CRASH_DUMP
1207 /* 1211 /*
1208 * If we are doing a crash dump, we still need to know 1212 * If we are doing a crash dump, we still need to know
@@ -1259,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type)
1259 case E820_RAM: return "System RAM"; 1263 case E820_RAM: return "System RAM";
1260 case E820_ACPI: return "ACPI Tables"; 1264 case E820_ACPI: return "ACPI Tables";
1261 case E820_NVS: return "ACPI Non-volatile Storage"; 1265 case E820_NVS: return "ACPI Non-volatile Storage";
1266 case E820_UNUSABLE: return "Unusable memory";
1262 default: return "reserved"; 1267 default: return "reserved";
1263 } 1268 }
1264} 1269}
@@ -1266,6 +1271,7 @@ static inline const char *e820_type_to_string(int e820_type)
1266/* 1271/*
1267 * Mark e820 reserved areas as busy for the resource manager. 1272 * Mark e820 reserved areas as busy for the resource manager.
1268 */ 1273 */
1274static struct resource __initdata *e820_res;
1269void __init e820_reserve_resources(void) 1275void __init e820_reserve_resources(void)
1270{ 1276{
1271 int i; 1277 int i;
@@ -1273,20 +1279,26 @@ void __init e820_reserve_resources(void)
1273 u64 end; 1279 u64 end;
1274 1280
1275 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); 1281 res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map);
1282 e820_res = res;
1276 for (i = 0; i < e820.nr_map; i++) { 1283 for (i = 0; i < e820.nr_map; i++) {
1277 end = e820.map[i].addr + e820.map[i].size - 1; 1284 end = e820.map[i].addr + e820.map[i].size - 1;
1278#ifndef CONFIG_RESOURCES_64BIT 1285 if (end != (resource_size_t)end) {
1279 if (end > 0x100000000ULL) {
1280 res++; 1286 res++;
1281 continue; 1287 continue;
1282 } 1288 }
1283#endif
1284 res->name = e820_type_to_string(e820.map[i].type); 1289 res->name = e820_type_to_string(e820.map[i].type);
1285 res->start = e820.map[i].addr; 1290 res->start = e820.map[i].addr;
1286 res->end = end; 1291 res->end = end;
1287 1292
1288 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 1293 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
1289 insert_resource(&iomem_resource, res); 1294
1295 /*
1296 * don't register the region that could be conflicted with
1297 * pci device BAR resource and insert them later in
1298 * pcibios_resource_survey()
1299 */
1300 if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20))
1301 insert_resource(&iomem_resource, res);
1290 res++; 1302 res++;
1291 } 1303 }
1292 1304
@@ -1298,10 +1310,18 @@ void __init e820_reserve_resources(void)
1298 } 1310 }
1299} 1311}
1300 1312
1301/* 1313void __init e820_reserve_resources_late(void)
1302 * Non-standard memory setup can be specified via this quirk: 1314{
1303 */ 1315 int i;
1304char * (*arch_memory_setup_quirk)(void); 1316 struct resource *res;
1317
1318 res = e820_res;
1319 for (i = 0; i < e820.nr_map; i++) {
1320 if (!res->parent && res->end)
1321 reserve_region_with_split(&iomem_resource, res->start, res->end, res->name);
1322 res++;
1323 }
1324}
1305 1325
1306char *__init default_machine_specific_memory_setup(void) 1326char *__init default_machine_specific_memory_setup(void)
1307{ 1327{
@@ -1343,8 +1363,8 @@ char *__init default_machine_specific_memory_setup(void)
1343 1363
1344char *__init __attribute__((weak)) machine_specific_memory_setup(void) 1364char *__init __attribute__((weak)) machine_specific_memory_setup(void)
1345{ 1365{
1346 if (arch_memory_setup_quirk) { 1366 if (x86_quirks->arch_memory_setup) {
1347 char *who = arch_memory_setup_quirk(); 1367 char *who = x86_quirks->arch_memory_setup();
1348 1368
1349 if (who) 1369 if (who)
1350 return who; 1370 return who;
@@ -1367,24 +1387,3 @@ void __init setup_memory_map(void)
1367 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1387 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1368 e820_print_map(who); 1388 e820_print_map(who);
1369} 1389}
1370
1371#ifdef CONFIG_X86_64
1372int __init arch_get_ram_range(int slot, u64 *addr, u64 *size)
1373{
1374 int i;
1375
1376 if (slot < 0 || slot >= e820.nr_map)
1377 return -1;
1378 for (i = slot; i < e820.nr_map; i++) {
1379 if (e820.map[i].type != E820_RAM)
1380 continue;
1381 break;
1382 }
1383 if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT))
1384 return -1;
1385 *addr = e820.map[i].addr;
1386 *size = min_t(u64, e820.map[i].size + e820.map[i].addr,
1387 max_pfn << PAGE_SHIFT) - *addr;
1388 return i + 1;
1389}
1390#endif
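The new end != (resource_size_t)end test in e820_reserve_resources() above replaces the old CONFIG_RESOURCES_64BIT #ifdef: when resource_size_t is 32 bits wide, any region ending at or above 4 GiB fails the round-trip cast and is skipped. A stand-alone illustration, with resource32_t standing in for a 32-bit resource_size_t:

	#include <stdint.h>
	#include <stdio.h>

	typedef uint32_t resource32_t;

	static int fits(uint64_t end)
	{
		return end == (resource32_t)end;
	}

	int main(void)
	{
		printf("0xffffffff fits: %d\n", fits(0xffffffffULL));	/* 1 */
		printf("0x100000000 fits: %d\n", fits(0x100000000ULL));	/* 0 */
		return 0;
	}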
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index a0e11c0cc872..3ce029ffaa55 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -16,10 +16,7 @@
16#include <asm/dma.h> 16#include <asm/dma.h>
17#include <asm/io_apic.h> 17#include <asm/io_apic.h>
18#include <asm/apic.h> 18#include <asm/apic.h>
19 19#include <asm/iommu.h>
20#ifdef CONFIG_GART_IOMMU
21#include <asm/gart.h>
22#endif
23 20
24static void __init fix_hypertransport_config(int num, int slot, int func) 21static void __init fix_hypertransport_config(int num, int slot, int func)
25{ 22{
@@ -98,6 +95,113 @@ static void __init nvidia_bugs(int num, int slot, int func)
98 95
99} 96}
100 97
98#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
99static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
100{
101 u32 d;
102 u8 b;
103
104 b = read_pci_config_byte(num, slot, func, 0xac);
105 b &= ~(1<<5);
106 write_pci_config_byte(num, slot, func, 0xac, b);
107
108 d = read_pci_config(num, slot, func, 0x70);
109 d |= 1<<8;
110 write_pci_config(num, slot, func, 0x70, d);
111
112 d = read_pci_config(num, slot, func, 0x8);
113 d &= 0xff;
114 return d;
115}
116
117static void __init ati_bugs(int num, int slot, int func)
118{
119 u32 d;
120 u8 b;
121
122 if (acpi_use_timer_override)
123 return;
124
125 d = ati_ixp4x0_rev(num, slot, func);
126 if (d < 0x82)
127 acpi_skip_timer_override = 1;
128 else {
129 /* check for IRQ0 interrupt swap */
130 outb(0x72, 0xcd6); b = inb(0xcd7);
131 if (!(b & 0x2))
132 acpi_skip_timer_override = 1;
133 }
134
135 if (acpi_skip_timer_override) {
136 printk(KERN_INFO "SB4X0 revision 0x%x\n", d);
137 printk(KERN_INFO "Ignoring ACPI timer override.\n");
138 printk(KERN_INFO "If you get timer trouble, "
139 "try acpi_use_timer_override\n");
140 }
141}
142
143static u32 __init ati_sbx00_rev(int num, int slot, int func)
144{
145 u32 old, d;
146
147 d = read_pci_config(num, slot, func, 0x70);
148 old = d;
149 d &= ~(1<<8);
150 write_pci_config(num, slot, func, 0x70, d);
151 d = read_pci_config(num, slot, func, 0x8);
152 d &= 0xff;
153 write_pci_config(num, slot, func, 0x70, old);
154
155 return d;
156}
157
158static void __init ati_bugs_contd(int num, int slot, int func)
159{
160 u32 d, rev;
161
162 if (acpi_use_timer_override)
163 return;
164
165 rev = ati_sbx00_rev(num, slot, func);
166 if (rev > 0x13)
167 return;
168
169 /* check for IRQ0 interrupt swap */
170 d = read_pci_config(num, slot, func, 0x64);
171 if (!(d & (1<<14)))
172 acpi_skip_timer_override = 1;
173
174 if (acpi_skip_timer_override) {
175 printk(KERN_INFO "SB600 revision 0x%x\n", rev);
176 printk(KERN_INFO "Ignoring ACPI timer override.\n");
177 printk(KERN_INFO "If you get timer trouble, "
178 "try acpi_use_timer_override\n");
179 }
180}
181#else
182static void __init ati_bugs(int num, int slot, int func)
183{
184}
185
186static void __init ati_bugs_contd(int num, int slot, int func)
187{
188}
189#endif
190
191#ifdef CONFIG_DMAR
192static void __init intel_g33_dmar(int num, int slot, int func)
193{
194 struct acpi_table_header *dmar_tbl;
195 acpi_status status;
196
197 status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl);
198 if (ACPI_SUCCESS(status)) {
199 printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n");
200 dmar_disabled = 1;
201 }
202}
203#endif
204
101#define QFLAG_APPLY_ONCE 0x1 205#define QFLAG_APPLY_ONCE 0x1
102#define QFLAG_APPLIED 0x2 206#define QFLAG_APPLIED 0x2
103#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) 207#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -117,6 +221,14 @@ static struct chipset early_qrk[] __initdata = {
117 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, 221 PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs },
118 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, 222 { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
119 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, 223 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config },
224 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
225 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
226 { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
227 PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
228#ifdef CONFIG_DMAR
229 { PCI_VENDOR_ID_INTEL, 0x29c0,
230 PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar },
231#endif
120 {} 232 {}
121}; 233};
122 234
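For context, the entries added to early_qrk[] above are matched by the scan loop elsewhere in this file (not part of this hunk): an entry applies when its vendor, device, and class match the probed function, with PCI_ANY_ID acting as a wildcard. A minimal stand-alone sketch of that kind of matching; the scan code and device IDs below are illustrative, not the kernel's:

	#include <stdint.h>
	#include <stdio.h>

	#define ANY_ID 0xffffffffu		/* stands in for PCI_ANY_ID */

	struct quirk {
		uint32_t vendor, device, class;
		void (*hook)(void);
	};

	static void ati_hook(void) { puts("ATI SMBus quirk applied"); }

	static const struct quirk table[] = {
		{ 0x1002, 0x4385, 0x0c05, ati_hook },	/* illustrative IDs */
		{ 0 }
	};

	static void check_device(uint32_t vendor, uint32_t device, uint32_t class)
	{
		const struct quirk *q;

		for (q = table; q->vendor; q++) {
			if (q->vendor != ANY_ID && q->vendor != vendor)
				continue;
			if (q->device != ANY_ID && q->device != device)
				continue;
			if (q->class != ANY_ID && q->class != class)
				continue;
			q->hook();
		}
	}

	int main(void)
	{
		check_device(0x1002, 0x4385, 0x0c05);
		return 0;
	}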
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index ff9e7350da54..34ad997d3834 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -3,11 +3,19 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/string.h> 4#include <linux/string.h>
5#include <linux/screen_info.h> 5#include <linux/screen_info.h>
6#include <linux/usb/ch9.h>
7#include <linux/pci_regs.h>
8#include <linux/pci_ids.h>
9#include <linux/errno.h>
6#include <asm/io.h> 10#include <asm/io.h>
7#include <asm/processor.h> 11#include <asm/processor.h>
8#include <asm/fcntl.h> 12#include <asm/fcntl.h>
9#include <asm/setup.h> 13#include <asm/setup.h>
10#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h>
16#include <asm/pgtable.h>
17#include <asm/fixmap.h>
18#include <linux/usb/ehci_def.h>
11 19
12/* Simple VGA output */ 20/* Simple VGA output */
13#define VGABASE (__ISA_IO_base + 0xb8000) 21#define VGABASE (__ISA_IO_base + 0xb8000)
@@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */
78static int early_serial_putc(unsigned char ch) 86static int early_serial_putc(unsigned char ch)
79{ 87{
80 unsigned timeout = 0xffff; 88 unsigned timeout = 0xffff;
89
81 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) 90 while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout)
82 cpu_relax(); 91 cpu_relax();
83 outb(ch, early_serial_base + TXR); 92 outb(ch, early_serial_base + TXR);
@@ -111,7 +120,7 @@ static __init void early_serial_init(char *s)
111 if (!strncmp(s, "0x", 2)) { 120 if (!strncmp(s, "0x", 2)) {
112 early_serial_base = simple_strtoul(s, &e, 16); 121 early_serial_base = simple_strtoul(s, &e, 16);
113 } else { 122 } else {
114 static int bases[] = { 0x3f8, 0x2f8 }; 123 static const int __initconst bases[] = { 0x3f8, 0x2f8 };
115 124
116 if (!strncmp(s, "ttyS", 4)) 125 if (!strncmp(s, "ttyS", 4))
117 s += 4; 126 s += 4;
@@ -151,6 +160,721 @@ static struct console early_serial_console = {
151 .index = -1, 160 .index = -1,
152}; 161};
153 162
163#ifdef CONFIG_EARLY_PRINTK_DBGP
164
165static struct ehci_caps __iomem *ehci_caps;
166static struct ehci_regs __iomem *ehci_regs;
167static struct ehci_dbg_port __iomem *ehci_debug;
168static unsigned int dbgp_endpoint_out;
169
170struct ehci_dev {
171 u32 bus;
172 u32 slot;
173 u32 func;
174};
175
176static struct ehci_dev ehci_dev;
177
178#define USB_DEBUG_DEVNUM 127
179
180#define DBGP_DATA_TOGGLE 0x8800
181
182static inline u32 dbgp_pid_update(u32 x, u32 tok)
183{
184 return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
185}
186
187static inline u32 dbgp_len_update(u32 x, u32 len)
188{
189 return (x & ~0x0f) | (len & 0x0f);
190}
191
192/*
193 * USB Packet IDs (PIDs)
194 */
195
196/* token */
197#define USB_PID_OUT 0xe1
198#define USB_PID_IN 0x69
199#define USB_PID_SOF 0xa5
200#define USB_PID_SETUP 0x2d
201/* handshake */
202#define USB_PID_ACK 0xd2
203#define USB_PID_NAK 0x5a
204#define USB_PID_STALL 0x1e
205#define USB_PID_NYET 0x96
206/* data */
207#define USB_PID_DATA0 0xc3
208#define USB_PID_DATA1 0x4b
209#define USB_PID_DATA2 0x87
210#define USB_PID_MDATA 0x0f
211/* Special */
212#define USB_PID_PREAMBLE 0x3c
213#define USB_PID_ERR 0x3c
214#define USB_PID_SPLIT 0x78
215#define USB_PID_PING 0xb4
216#define USB_PID_UNDEF_0 0xf0
217
218#define USB_PID_DATA_TOGGLE 0x88
219#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE)
220
221#define PCI_CAP_ID_EHCI_DEBUG 0xa
222
223#define HUB_ROOT_RESET_TIME 50 /* times are in msec */
224#define HUB_SHORT_RESET_TIME 10
225#define HUB_LONG_RESET_TIME 200
226#define HUB_RESET_TIMEOUT 500
227
228#define DBGP_MAX_PACKET 8
229
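A worked example of the dbgp_pid_update() helper above: the data PID occupies bits 8-15 of the PIDS register, so XOR-ing with DBGP_DATA_TOGGLE (0x8800) flips it between DATA0 (0xc3) and DATA1 (0x4b), while the token PID in the low byte is simply replaced. A stand-alone check of that arithmetic:

	#include <stdint.h>
	#include <stdio.h>

	#define DBGP_DATA_TOGGLE	0x8800
	#define USB_PID_DATA0		0xc3	/* 0xc3 ^ 0x88 == 0x4b, i.e. DATA1 */
	#define USB_PID_OUT		0xe1

	static uint32_t dbgp_pid_update(uint32_t x, uint32_t tok)
	{
		return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff);
	}

	int main(void)
	{
		uint32_t pids = (USB_PID_DATA0 << 8) | USB_PID_OUT;

		pids = dbgp_pid_update(pids, USB_PID_OUT);
		printf("data PID is now %#x\n", (pids >> 8) & 0xff);	/* prints 0x4b */
		return 0;
	}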
230static int dbgp_wait_until_complete(void)
231{
232 u32 ctrl;
233 int loop = 0x100000;
234
235 do {
236 ctrl = readl(&ehci_debug->control);
237 /* Stop when the transaction is finished */
238 if (ctrl & DBGP_DONE)
239 break;
240 } while (--loop > 0);
241
242 if (!loop)
243 return -1;
244
245 /*
246 * Now that we have observed the completed transaction,
247 * clear the done bit.
248 */
249 writel(ctrl | DBGP_DONE, &ehci_debug->control);
250 return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl);
251}
252
253static void dbgp_mdelay(int ms)
254{
255 int i;
256
257 while (ms--) {
258 for (i = 0; i < 1000; i++)
259 outb(0x1, 0x80);
260 }
261}
262
263static void dbgp_breath(void)
264{
265 /* Sleep to give the debug port a chance to breathe */
266}
267
268static int dbgp_wait_until_done(unsigned ctrl)
269{
270 u32 pids, lpid;
271 int ret;
272 int loop = 3;
273
274retry:
275 writel(ctrl | DBGP_GO, &ehci_debug->control);
276 ret = dbgp_wait_until_complete();
277 pids = readl(&ehci_debug->pids);
278 lpid = DBGP_PID_GET(pids);
279
280 if (ret < 0)
281 return ret;
282
283 /*
284 * If the port is getting full or it has dropped data
285	 * start pacing ourselves; not necessary, but it's friendly.
286 */
287 if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET))
288 dbgp_breath();
289
290	/* If we get a NAK, reissue the transmission */
291 if (lpid == USB_PID_NAK) {
292 if (--loop > 0)
293 goto retry;
294 }
295
296 return ret;
297}
298
299static void dbgp_set_data(const void *buf, int size)
300{
301 const unsigned char *bytes = buf;
302 u32 lo, hi;
303 int i;
304
305 lo = hi = 0;
306 for (i = 0; i < 4 && i < size; i++)
307 lo |= bytes[i] << (8*i);
308 for (; i < 8 && i < size; i++)
309 hi |= bytes[i] << (8*(i - 4));
310 writel(lo, &ehci_debug->data03);
311 writel(hi, &ehci_debug->data47);
312}
313
314static void dbgp_get_data(void *buf, int size)
315{
316 unsigned char *bytes = buf;
317 u32 lo, hi;
318 int i;
319
320 lo = readl(&ehci_debug->data03);
321 hi = readl(&ehci_debug->data47);
322 for (i = 0; i < 4 && i < size; i++)
323 bytes[i] = (lo >> (8*i)) & 0xff;
324 for (; i < 8 && i < size; i++)
325 bytes[i] = (hi >> (8*(i - 4))) & 0xff;
326}
327
328static int dbgp_bulk_write(unsigned devnum, unsigned endpoint,
329 const char *bytes, int size)
330{
331 u32 pids, addr, ctrl;
332 int ret;
333
334 if (size > DBGP_MAX_PACKET)
335 return -1;
336
337 addr = DBGP_EPADDR(devnum, endpoint);
338
339 pids = readl(&ehci_debug->pids);
340 pids = dbgp_pid_update(pids, USB_PID_OUT);
341
342 ctrl = readl(&ehci_debug->control);
343 ctrl = dbgp_len_update(ctrl, size);
344 ctrl |= DBGP_OUT;
345 ctrl |= DBGP_GO;
346
347 dbgp_set_data(bytes, size);
348 writel(addr, &ehci_debug->address);
349 writel(pids, &ehci_debug->pids);
350
351 ret = dbgp_wait_until_done(ctrl);
352 if (ret < 0)
353 return ret;
354
355 return ret;
356}
357
358static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data,
359 int size)
360{
361 u32 pids, addr, ctrl;
362 int ret;
363
364 if (size > DBGP_MAX_PACKET)
365 return -1;
366
367 addr = DBGP_EPADDR(devnum, endpoint);
368
369 pids = readl(&ehci_debug->pids);
370 pids = dbgp_pid_update(pids, USB_PID_IN);
371
372 ctrl = readl(&ehci_debug->control);
373 ctrl = dbgp_len_update(ctrl, size);
374 ctrl &= ~DBGP_OUT;
375 ctrl |= DBGP_GO;
376
377 writel(addr, &ehci_debug->address);
378 writel(pids, &ehci_debug->pids);
379 ret = dbgp_wait_until_done(ctrl);
380 if (ret < 0)
381 return ret;
382
383 if (size > ret)
384 size = ret;
385 dbgp_get_data(data, size);
386 return ret;
387}
388
389static int dbgp_control_msg(unsigned devnum, int requesttype, int request,
390 int value, int index, void *data, int size)
391{
392 u32 pids, addr, ctrl;
393 struct usb_ctrlrequest req;
394 int read;
395 int ret;
396
397 read = (requesttype & USB_DIR_IN) != 0;
398 if (size > (read ? DBGP_MAX_PACKET:0))
399 return -1;
400
401 /* Compute the control message */
402 req.bRequestType = requesttype;
403 req.bRequest = request;
404 req.wValue = cpu_to_le16(value);
405 req.wIndex = cpu_to_le16(index);
406 req.wLength = cpu_to_le16(size);
407
408 pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP);
409 addr = DBGP_EPADDR(devnum, 0);
410
411 ctrl = readl(&ehci_debug->control);
412 ctrl = dbgp_len_update(ctrl, sizeof(req));
413 ctrl |= DBGP_OUT;
414 ctrl |= DBGP_GO;
415
416 /* Send the setup message */
417 dbgp_set_data(&req, sizeof(req));
418 writel(addr, &ehci_debug->address);
419 writel(pids, &ehci_debug->pids);
420 ret = dbgp_wait_until_done(ctrl);
421 if (ret < 0)
422 return ret;
423
424 /* Read the result */
425 return dbgp_bulk_read(devnum, 0, data, size);
426}
427
428
429/* Find a PCI capability */
430static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap)
431{
432 u8 pos;
433 int bytes;
434
435 if (!(read_pci_config_16(num, slot, func, PCI_STATUS) &
436 PCI_STATUS_CAP_LIST))
437 return 0;
438
439 pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST);
440 for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) {
441 u8 id;
442
443 pos &= ~3;
444 id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID);
445 if (id == 0xff)
446 break;
447 if (id == cap)
448 return pos;
449
450 pos = read_pci_config_byte(num, slot, func,
451 pos+PCI_CAP_LIST_NEXT);
452 }
453 return 0;
454}
455
456static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func)
457{
458 u32 class;
459
460 class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION);
461 if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI)
462 return 0;
463
464 return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG);
465}
466
467static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc)
468{
469 u32 bus, slot, func;
470
471 for (bus = 0; bus < 256; bus++) {
472 for (slot = 0; slot < 32; slot++) {
473 for (func = 0; func < 8; func++) {
474 unsigned cap;
475
476 cap = __find_dbgp(bus, slot, func);
477
478 if (!cap)
479 continue;
480 if (ehci_num-- != 0)
481 continue;
482 *rbus = bus;
483 *rslot = slot;
484 *rfunc = func;
485 return cap;
486 }
487 }
488 }
489 return 0;
490}
491
492static int ehci_reset_port(int port)
493{
494 u32 portsc;
495 u32 delay_time, delay;
496 int loop;
497
498 /* Reset the usb debug port */
499 portsc = readl(&ehci_regs->port_status[port - 1]);
500 portsc &= ~PORT_PE;
501 portsc |= PORT_RESET;
502 writel(portsc, &ehci_regs->port_status[port - 1]);
503
504 delay = HUB_ROOT_RESET_TIME;
505 for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT;
506 delay_time += delay) {
507 dbgp_mdelay(delay);
508
509 portsc = readl(&ehci_regs->port_status[port - 1]);
510 if (portsc & PORT_RESET) {
511 /* force reset to complete */
512 loop = 2;
513 writel(portsc & ~(PORT_RWC_BITS | PORT_RESET),
514 &ehci_regs->port_status[port - 1]);
515 do {
516 portsc = readl(&ehci_regs->port_status[port-1]);
517 } while ((portsc & PORT_RESET) && (--loop > 0));
518 }
519
520 /* Device went away? */
521 if (!(portsc & PORT_CONNECT))
522 return -ENOTCONN;
523
524		/* bomb out completely if something weird happened */
525 if ((portsc & PORT_CSC))
526 return -EINVAL;
527
528 /* If we've finished resetting, then break out of the loop */
529 if (!(portsc & PORT_RESET) && (portsc & PORT_PE))
530 return 0;
531 }
532 return -EBUSY;
533}
534
535static int ehci_wait_for_port(int port)
536{
537 u32 status;
538 int ret, reps;
539
540 for (reps = 0; reps < 3; reps++) {
541 dbgp_mdelay(100);
542 status = readl(&ehci_regs->status);
543 if (status & STS_PCD) {
544 ret = ehci_reset_port(port);
545 if (ret == 0)
546 return 0;
547 }
548 }
549 return -ENOTCONN;
550}
551
552#ifdef DBGP_DEBUG
553# define dbgp_printk early_printk
554#else
555static inline void dbgp_printk(const char *fmt, ...) { }
556#endif
557
558typedef void (*set_debug_port_t)(int port);
559
560static void default_set_debug_port(int port)
561{
562}
563
564static set_debug_port_t set_debug_port = default_set_debug_port;
565
566static void nvidia_set_debug_port(int port)
567{
568 u32 dword;
569 dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
570 0x74);
571 dword &= ~(0x0f<<12);
572 dword |= ((port & 0x0f)<<12);
573 write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74,
574 dword);
575 dbgp_printk("set debug port to %d\n", port);
576}
577
578static void __init detect_set_debug_port(void)
579{
580 u32 vendorid;
581
582 vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func,
583 0x00);
584
585 if ((vendorid & 0xffff) == 0x10de) {
586 dbgp_printk("using nvidia set_debug_port\n");
587 set_debug_port = nvidia_set_debug_port;
588 }
589}
590
591static int __init ehci_setup(void)
592{
593 struct usb_debug_descriptor dbgp_desc;
594 u32 cmd, ctrl, status, portsc, hcs_params;
595 u32 debug_port, new_debug_port = 0, n_ports;
596 u32 devnum;
597 int ret, i;
598 int loop;
599 int port_map_tried;
600 int playtimes = 3;
601
602try_next_time:
603 port_map_tried = 0;
604
605try_next_port:
606
607 hcs_params = readl(&ehci_caps->hcs_params);
608 debug_port = HCS_DEBUG_PORT(hcs_params);
609 n_ports = HCS_N_PORTS(hcs_params);
610
611 dbgp_printk("debug_port: %d\n", debug_port);
612 dbgp_printk("n_ports: %d\n", n_ports);
613
614 for (i = 1; i <= n_ports; i++) {
615 portsc = readl(&ehci_regs->port_status[i-1]);
616 dbgp_printk("portstatus%d: %08x\n", i, portsc);
617 }
618
619 if (port_map_tried && (new_debug_port != debug_port)) {
620 if (--playtimes) {
621 set_debug_port(new_debug_port);
622 goto try_next_time;
623 }
624 return -1;
625 }
626
627 loop = 10;
628 /* Reset the EHCI controller */
629 cmd = readl(&ehci_regs->command);
630 cmd |= CMD_RESET;
631 writel(cmd, &ehci_regs->command);
632 do {
633 cmd = readl(&ehci_regs->command);
634 } while ((cmd & CMD_RESET) && (--loop > 0));
635
636 if (!loop) {
637 dbgp_printk("can not reset ehci\n");
638 return -1;
639 }
640 dbgp_printk("ehci reset done\n");
641
642 /* Claim ownership, but do not enable yet */
643 ctrl = readl(&ehci_debug->control);
644 ctrl |= DBGP_OWNER;
645 ctrl &= ~(DBGP_ENABLED | DBGP_INUSE);
646 writel(ctrl, &ehci_debug->control);
647
648 /* Start the ehci running */
649 cmd = readl(&ehci_regs->command);
650 cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET);
651 cmd |= CMD_RUN;
652 writel(cmd, &ehci_regs->command);
653
654 /* Ensure everything is routed to the EHCI */
655 writel(FLAG_CF, &ehci_regs->configured_flag);
656
657 /* Wait until the controller is no longer halted */
658 loop = 10;
659 do {
660 status = readl(&ehci_regs->status);
661 } while ((status & STS_HALT) && (--loop > 0));
662
663 if (!loop) {
664		dbgp_printk("ehci can not be started\n");
665 return -1;
666 }
667 dbgp_printk("ehci started\n");
668
669 /* Wait for a device to show up in the debug port */
670 ret = ehci_wait_for_port(debug_port);
671 if (ret < 0) {
672 dbgp_printk("No device found in debug port\n");
673 goto next_debug_port;
674 }
675 dbgp_printk("ehci wait for port done\n");
676
677 /* Enable the debug port */
678 ctrl = readl(&ehci_debug->control);
679 ctrl |= DBGP_CLAIM;
680 writel(ctrl, &ehci_debug->control);
681 ctrl = readl(&ehci_debug->control);
682 if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) {
683 dbgp_printk("No device in debug port\n");
684 writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control);
685 goto err;
686 }
687	dbgp_printk("debug port enabled\n");
688
689 /* Completely transfer the debug device to the debug controller */
690 portsc = readl(&ehci_regs->port_status[debug_port - 1]);
691 portsc &= ~PORT_PE;
692 writel(portsc, &ehci_regs->port_status[debug_port - 1]);
693
694 dbgp_mdelay(100);
695
696 /* Find the debug device and make it device number 127 */
697 for (devnum = 0; devnum <= 127; devnum++) {
698 ret = dbgp_control_msg(devnum,
699 USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
700 USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0,
701 &dbgp_desc, sizeof(dbgp_desc));
702 if (ret > 0)
703 break;
704 }
705 if (devnum > 127) {
706 dbgp_printk("Could not find attached debug device\n");
707 goto err;
708 }
709 if (ret < 0) {
710 dbgp_printk("Attached device is not a debug device\n");
711 goto err;
712 }
713 dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint;
714
715 /* Move the device to 127 if it isn't already there */
716 if (devnum != USB_DEBUG_DEVNUM) {
717 ret = dbgp_control_msg(devnum,
718 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
719 USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0);
720 if (ret < 0) {
721 dbgp_printk("Could not move attached device to %d\n",
722 USB_DEBUG_DEVNUM);
723 goto err;
724 }
725 devnum = USB_DEBUG_DEVNUM;
726 dbgp_printk("debug device renamed to 127\n");
727 }
728
729 /* Enable the debug interface */
730 ret = dbgp_control_msg(USB_DEBUG_DEVNUM,
731 USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE,
732 USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0);
733 if (ret < 0) {
734 dbgp_printk(" Could not enable the debug device\n");
735 goto err;
736 }
737 dbgp_printk("debug interface enabled\n");
738
739 /* Perform a small write to get the even/odd data state in sync
740 */
741 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1);
742 if (ret < 0) {
743 dbgp_printk("dbgp_bulk_write failed: %d\n", ret);
744 goto err;
745 }
746	dbgp_printk("small write done\n");
747
748 return 0;
749err:
750 /* Things didn't work so remove my claim */
751 ctrl = readl(&ehci_debug->control);
752 ctrl &= ~(DBGP_CLAIM | DBGP_OUT);
753 writel(ctrl, &ehci_debug->control);
754 return -1;
755
756next_debug_port:
757 port_map_tried |= (1<<(debug_port - 1));
758 new_debug_port = ((debug_port-1+1)%n_ports) + 1;
759 if (port_map_tried != ((1<<n_ports) - 1)) {
760 set_debug_port(new_debug_port);
761 goto try_next_port;
762 }
763 if (--playtimes) {
764 set_debug_port(new_debug_port);
765 goto try_next_time;
766 }
767
768 return -1;
769}
770
771static int __init early_dbgp_init(char *s)
772{
773 u32 debug_port, bar, offset;
774 u32 bus, slot, func, cap;
775 void __iomem *ehci_bar;
776 u32 dbgp_num;
777 u32 bar_val;
778 char *e;
779 int ret;
780 u8 byte;
781
782 if (!early_pci_allowed())
783 return -1;
784
785 dbgp_num = 0;
786 if (*s)
787 dbgp_num = simple_strtoul(s, &e, 10);
788 dbgp_printk("dbgp_num: %d\n", dbgp_num);
789
790 cap = find_dbgp(dbgp_num, &bus, &slot, &func);
791 if (!cap)
792 return -1;
793
794 dbgp_printk("Found EHCI debug port on %02x:%02x.%1x\n", bus, slot,
795 func);
796
797 debug_port = read_pci_config(bus, slot, func, cap);
798 bar = (debug_port >> 29) & 0x7;
799 bar = (bar * 4) + 0xc;
800 offset = (debug_port >> 16) & 0xfff;
801 dbgp_printk("bar: %02x offset: %03x\n", bar, offset);
802 if (bar != PCI_BASE_ADDRESS_0) {
803 dbgp_printk("only debug ports on bar 1 handled.\n");
804
805 return -1;
806 }
807
808 bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0);
809 dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset);
810 if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) {
811 dbgp_printk("only simple 32bit mmio bars supported\n");
812
813 return -1;
814 }
815
816 /* double check if the mem space is enabled */
817 byte = read_pci_config_byte(bus, slot, func, 0x04);
818 if (!(byte & 0x2)) {
819 byte |= 0x02;
820 write_pci_config_byte(bus, slot, func, 0x04, byte);
821 dbgp_printk("mmio for ehci enabled\n");
822 }
823
824 /*
825 * FIXME I don't have the bar size so just guess PAGE_SIZE is more
826 * than enough. 1K is the biggest I have seen.
827 */
828 set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK);
829 ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE);
830 ehci_bar += bar_val & ~PAGE_MASK;
831 dbgp_printk("ehci_bar: %p\n", ehci_bar);
832
833 ehci_caps = ehci_bar;
834 ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase));
835 ehci_debug = ehci_bar + offset;
836 ehci_dev.bus = bus;
837 ehci_dev.slot = slot;
838 ehci_dev.func = func;
839
840 detect_set_debug_port();
841
842 ret = ehci_setup();
843 if (ret < 0) {
844 dbgp_printk("ehci_setup failed\n");
845 ehci_debug = NULL;
846
847 return -1;
848 }
849
850 return 0;
851}
852
853static void early_dbgp_write(struct console *con, const char *str, u32 n)
854{
855 int chunk, ret;
856
857 if (!ehci_debug)
858 return;
859 while (n > 0) {
860 chunk = n;
861 if (chunk > DBGP_MAX_PACKET)
862 chunk = DBGP_MAX_PACKET;
863 ret = dbgp_bulk_write(USB_DEBUG_DEVNUM,
864 dbgp_endpoint_out, str, chunk);
865 str += chunk;
866 n -= chunk;
867 }
868}
869
870static struct console early_dbgp_console = {
871 .name = "earlydbg",
872 .write = early_dbgp_write,
873 .flags = CON_PRINTBUFFER,
874 .index = -1,
875};
876#endif
877
154/* Console interface to a host file on AMD's SimNow! */ 878/* Console interface to a host file on AMD's SimNow! */
155 879
156static int simnow_fd; 880static int simnow_fd;
@@ -165,6 +889,7 @@ enum {
165static noinline long simnow(long cmd, long a, long b, long c) 889static noinline long simnow(long cmd, long a, long b, long c)
166{ 890{
167 long ret; 891 long ret;
892
168 asm volatile("cpuid" : 893 asm volatile("cpuid" :
169 "=a" (ret) : 894 "=a" (ret) :
170 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); 895 "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2));
@@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c)
174static void __init simnow_init(char *str) 899static void __init simnow_init(char *str)
175{ 900{
176 char *fn = "klog"; 901 char *fn = "klog";
902
177 if (*str == '=') 903 if (*str == '=')
178 fn = ++str; 904 fn = ++str;
179 /* error ignored */ 905 /* error ignored */
@@ -194,7 +920,7 @@ static struct console simnow_console = {
194 920
195/* Direct interface for emergencies */ 921/* Direct interface for emergencies */
196static struct console *early_console = &early_vga_console; 922static struct console *early_console = &early_vga_console;
197static int early_console_initialized; 923static int __initdata early_console_initialized;
198 924
199asmlinkage void early_printk(const char *fmt, ...) 925asmlinkage void early_printk(const char *fmt, ...)
200{ 926{
@@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...)
208 va_end(ap); 934 va_end(ap);
209} 935}
210 936
211static int __initdata keep_early;
212 937
213static int __init setup_early_printk(char *buf) 938static int __init setup_early_printk(char *buf)
214{ 939{
940 int keep_early;
941
215 if (!buf) 942 if (!buf)
216 return 0; 943 return 0;
217 944
@@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf)
219 return 0; 946 return 0;
220 early_console_initialized = 1; 947 early_console_initialized = 1;
221 948
222 if (strstr(buf, "keep")) 949 keep_early = (strstr(buf, "keep") != NULL);
223 keep_early = 1;
224 950
225 if (!strncmp(buf, "serial", 6)) { 951 if (!strncmp(buf, "serial", 6)) {
226 early_serial_init(buf + 6); 952 early_serial_init(buf + 6);
@@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf)
238 simnow_init(buf + 6); 964 simnow_init(buf + 6);
239 early_console = &simnow_console; 965 early_console = &simnow_console;
240 keep_early = 1; 966 keep_early = 1;
967#ifdef CONFIG_EARLY_PRINTK_DBGP
968 } else if (!strncmp(buf, "dbgp", 4)) {
969 if (early_dbgp_init(buf+4) < 0)
970 return 0;
971 early_console = &early_dbgp_console;
972 /*
973 * the USB subsystem will reset the EHCI controller, so don't keep
974 * this early console
975 */
976 keep_early = 0;
977#endif
241#ifdef CONFIG_HVC_XEN 978#ifdef CONFIG_HVC_XEN
242 } else if (!strncmp(buf, "xen", 3)) { 979 } else if (!strncmp(buf, "xen", 3)) {
243 early_console = &xenboot_console; 980 early_console = &xenboot_console;
@@ -251,4 +988,5 @@ static int __init setup_early_printk(char *buf)
251 register_console(early_console); 988 register_console(early_console);
252 return 0; 989 return 0;
253} 990}
991
254early_param("earlyprintk", setup_early_printk); 992early_param("earlyprintk", setup_early_printk);
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index 06cc8d4254b1..1119d247fe11 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -367,6 +367,10 @@ void __init efi_init(void)
367 efi.smbios = config_tables[i].table; 367 efi.smbios = config_tables[i].table;
368 printk(" SMBIOS=0x%lx ", config_tables[i].table); 368 printk(" SMBIOS=0x%lx ", config_tables[i].table);
369 } else if (!efi_guidcmp(config_tables[i].guid, 369 } else if (!efi_guidcmp(config_tables[i].guid,
370 UV_SYSTEM_TABLE_GUID)) {
371 efi.uv_systab = config_tables[i].table;
372 printk(" UVsystab=0x%lx ", config_tables[i].table);
373 } else if (!efi_guidcmp(config_tables[i].guid,
370 HCDP_TABLE_GUID)) { 374 HCDP_TABLE_GUID)) {
371 efi.hcdp = config_tables[i].table; 375 efi.hcdp = config_tables[i].table;
372 printk(" HCDP=0x%lx ", config_tables[i].table); 376 printk(" HCDP=0x%lx ", config_tables[i].table);
@@ -414,9 +418,11 @@ void __init efi_init(void)
414 if (memmap.map == NULL) 418 if (memmap.map == NULL)
415 printk(KERN_ERR "Could not map the EFI memory map!\n"); 419 printk(KERN_ERR "Could not map the EFI memory map!\n");
416 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); 420 memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size);
421
417 if (memmap.desc_size != sizeof(efi_memory_desc_t)) 422 if (memmap.desc_size != sizeof(efi_memory_desc_t))
418 printk(KERN_WARNING "Kernel-defined memdesc" 423 printk(KERN_WARNING
419 "doesn't match the one from EFI!\n"); 424 "Kernel-defined memdesc doesn't match the one from EFI!\n");
425
420 if (add_efi_memmap) 426 if (add_efi_memmap)
421 do_add_efi_memmap(); 427 do_add_efi_memmap();
422 428
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c
index 4b63c8e1f13b..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/kernel/efi_32.c
@@ -53,7 +53,7 @@ void efi_call_phys_prelog(void)
53 * directory. If I have PAE, I just need to duplicate one entry in 53 * directory. If I have PAE, I just need to duplicate one entry in
54 * page directory. 54 * page directory.
55 */ 55 */
56 cr4 = read_cr4(); 56 cr4 = read_cr4_safe();
57 57
58 if (cr4 & X86_CR4_PAE) { 58 if (cr4 & X86_CR4_PAE) {
59 efi_bak_pg_dir_pointer[0].pgd = 59 efi_bak_pg_dir_pointer[0].pgd =
@@ -91,7 +91,7 @@ void efi_call_phys_epilog(void)
91 gdt_descr.size = GDT_SIZE - 1; 91 gdt_descr.size = GDT_SIZE - 1;
92 load_gdt(&gdt_descr); 92 load_gdt(&gdt_descr);
93 93
94 cr4 = read_cr4(); 94 cr4 = read_cr4_safe();
95 95
96 if (cr4 & X86_CR4_PAE) { 96 if (cr4 & X86_CR4_PAE) {
97 swapper_pg_dir[pgd_index(0)].pgd = 97 swapper_pg_dir[pgd_index(0)].pgd =
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 6bc07f0f1202..dd65143941a8 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -54,6 +54,16 @@
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55#include <asm/irq_vectors.h> 55#include <asm/irq_vectors.h>
56 56
57/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
58#include <linux/elf-em.h>
59#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE)
60#define __AUDIT_ARCH_LE 0x40000000
61
62#ifndef CONFIG_AUDITSYSCALL
63#define sysenter_audit syscall_trace_entry
64#define sysexit_audit syscall_exit_work
65#endif
66
57/* 67/*
58 * We use macros for low-level operations which need to be overridden 68 * We use macros for low-level operations which need to be overridden
59 * for paravirtualization. The following will never clobber any registers: 69 * for paravirtualization. The following will never clobber any registers:
@@ -332,8 +342,9 @@ sysenter_past_esp:
332 GET_THREAD_INFO(%ebp) 342 GET_THREAD_INFO(%ebp)
333 343
334 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 344 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
335 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 345 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
336 jnz syscall_trace_entry 346 jnz sysenter_audit
347sysenter_do_call:
337 cmpl $(nr_syscalls), %eax 348 cmpl $(nr_syscalls), %eax
338 jae syscall_badsys 349 jae syscall_badsys
339 call *sys_call_table(,%eax,4) 350 call *sys_call_table(,%eax,4)
@@ -343,7 +354,8 @@ sysenter_past_esp:
343 TRACE_IRQS_OFF 354 TRACE_IRQS_OFF
344 movl TI_flags(%ebp), %ecx 355 movl TI_flags(%ebp), %ecx
345 testw $_TIF_ALLWORK_MASK, %cx 356 testw $_TIF_ALLWORK_MASK, %cx
346 jne syscall_exit_work 357 jne sysexit_audit
358sysenter_exit:
347/* if something modifies registers it must also disable sysexit */ 359/* if something modifies registers it must also disable sysexit */
348 movl PT_EIP(%esp), %edx 360 movl PT_EIP(%esp), %edx
349 movl PT_OLDESP(%esp), %ecx 361 movl PT_OLDESP(%esp), %ecx
@@ -351,6 +363,45 @@ sysenter_past_esp:
351 TRACE_IRQS_ON 363 TRACE_IRQS_ON
3521: mov PT_FS(%esp), %fs 3641: mov PT_FS(%esp), %fs
353 ENABLE_INTERRUPTS_SYSEXIT 365 ENABLE_INTERRUPTS_SYSEXIT
366
367#ifdef CONFIG_AUDITSYSCALL
368sysenter_audit:
369 testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp)
370 jnz syscall_trace_entry
371 addl $4,%esp
372 CFI_ADJUST_CFA_OFFSET -4
373 /* %esi already in 8(%esp) 6th arg: 4th syscall arg */
374 /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */
375 /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */
376 movl %ebx,%ecx /* 3rd arg: 1st syscall arg */
377 movl %eax,%edx /* 2nd arg: syscall number */
378 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
379 call audit_syscall_entry
380 pushl %ebx
381 CFI_ADJUST_CFA_OFFSET 4
382 movl PT_EAX(%esp),%eax /* reload syscall number */
383 jmp sysenter_do_call
384
385sysexit_audit:
386 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
387 jne syscall_exit_work
388 TRACE_IRQS_ON
389 ENABLE_INTERRUPTS(CLBR_ANY)
390 movl %eax,%edx /* second arg, syscall return value */
391 cmpl $0,%eax /* is it < 0? */
392 setl %al /* 1 if so, 0 if not */
393 movzbl %al,%eax /* zero-extend that */
394 inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
395 call audit_syscall_exit
396 DISABLE_INTERRUPTS(CLBR_ANY)
397 TRACE_IRQS_OFF
398 movl TI_flags(%ebp), %ecx
399 testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx
400 jne syscall_exit_work
401 movl PT_EAX(%esp),%eax /* reload syscall return value */
402 jmp sysenter_exit
403#endif
404
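The setl/movzbl/inc sequence in sysexit_audit above maps the syscall return value to the audit result code before calling audit_syscall_exit: per the comment in the patch, a negative return becomes 2 (AUDITSC_FAILURE) and anything else becomes 1 (AUDITSC_SUCCESS). The same mapping in C, as a quick sanity check:

	#include <stdio.h>

	static int audit_result(long ret)
	{
		return (ret < 0) + 1;	/* 1 = AUDITSC_SUCCESS, 2 = AUDITSC_FAILURE */
	}

	int main(void)
	{
		printf("ret=-14 -> %d, ret=0 -> %d, ret=3 -> %d\n",
		       audit_result(-14), audit_result(0), audit_result(3));
		return 0;
	}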
354 CFI_ENDPROC 405 CFI_ENDPROC
355.pushsection .fixup,"ax" 406.pushsection .fixup,"ax"
3562: movl $0,PT_FS(%esp) 4072: movl $0,PT_FS(%esp)
@@ -370,7 +421,7 @@ ENTRY(system_call)
370 GET_THREAD_INFO(%ebp) 421 GET_THREAD_INFO(%ebp)
371 # system call tracing in operation / emulation 422 # system call tracing in operation / emulation
372 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ 423 /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
373 testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) 424 testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp)
374 jnz syscall_trace_entry 425 jnz syscall_trace_entry
375 cmpl $(nr_syscalls), %eax 426 cmpl $(nr_syscalls), %eax
376 jae syscall_badsys 427 jae syscall_badsys
@@ -383,10 +434,6 @@ syscall_exit:
383 # setting need_resched or sigpending 434 # setting need_resched or sigpending
384 # between sampling and the iret 435 # between sampling and the iret
385 TRACE_IRQS_OFF 436 TRACE_IRQS_OFF
386 testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit
387 jz no_singlestep
388 orl $_TIF_SINGLESTEP,TI_flags(%ebp)
389no_singlestep:
390 movl TI_flags(%ebp), %ecx 437 movl TI_flags(%ebp), %ecx
391 testw $_TIF_ALLWORK_MASK, %cx # current->work 438 testw $_TIF_ALLWORK_MASK, %cx # current->work
392 jne syscall_exit_work 439 jne syscall_exit_work
@@ -514,12 +561,8 @@ END(work_pending)
514syscall_trace_entry: 561syscall_trace_entry:
515 movl $-ENOSYS,PT_EAX(%esp) 562 movl $-ENOSYS,PT_EAX(%esp)
516 movl %esp, %eax 563 movl %esp, %eax
517 xorl %edx,%edx 564 call syscall_trace_enter
518 call do_syscall_trace 565 /* What it returned is what we'll actually use. */
519 cmpl $0, %eax
520 jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU,
521 # so must skip actual syscall
522 movl PT_ORIG_EAX(%esp), %eax
523 cmpl $(nr_syscalls), %eax 566 cmpl $(nr_syscalls), %eax
524 jnae syscall_call 567 jnae syscall_call
525 jmp syscall_exit 568 jmp syscall_exit
@@ -528,14 +571,13 @@ END(syscall_trace_entry)
528 # perform syscall exit tracing 571 # perform syscall exit tracing
529 ALIGN 572 ALIGN
530syscall_exit_work: 573syscall_exit_work:
531 testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl 574 testb $_TIF_WORK_SYSCALL_EXIT, %cl
532 jz work_pending 575 jz work_pending
533 TRACE_IRQS_ON 576 TRACE_IRQS_ON
534 ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call 577 ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call
535 # schedule() instead 578 # schedule() instead
536 movl %esp, %eax 579 movl %esp, %eax
537 movl $1, %edx 580 call syscall_trace_leave
538 call do_syscall_trace
539 jmp resume_userspace 581 jmp resume_userspace
540END(syscall_exit_work) 582END(syscall_exit_work)
541 CFI_ENDPROC 583 CFI_ENDPROC
@@ -587,7 +629,7 @@ ENTRY(interrupt)
587ENTRY(irq_entries_start) 629ENTRY(irq_entries_start)
588 RING0_INT_FRAME 630 RING0_INT_FRAME
589vector=0 631vector=0
590.rept NR_IRQS 632.rept NR_VECTORS
591 ALIGN 633 ALIGN
592 .if vector 634 .if vector
593 CFI_ADJUST_CFA_OFFSET -4 635 CFI_ADJUST_CFA_OFFSET -4
@@ -688,6 +730,7 @@ error_code:
688 movl $(__USER_DS), %ecx 730 movl $(__USER_DS), %ecx
689 movl %ecx, %ds 731 movl %ecx, %ds
690 movl %ecx, %es 732 movl %ecx, %es
733 TRACE_IRQS_OFF
691 movl %esp,%eax # pt_regs pointer 734 movl %esp,%eax # pt_regs pointer
692 call *%edi 735 call *%edi
693 jmp ret_from_exception 736 jmp ret_from_exception
@@ -718,20 +761,9 @@ ENTRY(device_not_available)
718 RING0_INT_FRAME 761 RING0_INT_FRAME
719 pushl $-1 # mark this as an int 762 pushl $-1 # mark this as an int
720 CFI_ADJUST_CFA_OFFSET 4 763 CFI_ADJUST_CFA_OFFSET 4
721 SAVE_ALL 764 pushl $do_device_not_available
722 GET_CR0_INTO_EAX
723 testl $0x4, %eax # EM (math emulation bit)
724 jne device_not_available_emulate
725 preempt_stop(CLBR_ANY)
726 call math_state_restore
727 jmp ret_from_exception
728device_not_available_emulate:
729 pushl $0 # temporary storage for ORIG_EIP
730 CFI_ADJUST_CFA_OFFSET 4 765 CFI_ADJUST_CFA_OFFSET 4
731 call math_emulate 766 jmp error_code
732 addl $4, %esp
733 CFI_ADJUST_CFA_OFFSET -4
734 jmp ret_from_exception
735 CFI_ENDPROC 767 CFI_ENDPROC
736END(device_not_available) 768END(device_not_available)
737 769
@@ -772,6 +804,7 @@ debug_stack_correct:
772 pushl $-1 # mark this as an int 804 pushl $-1 # mark this as an int
773 CFI_ADJUST_CFA_OFFSET 4 805 CFI_ADJUST_CFA_OFFSET 4
774 SAVE_ALL 806 SAVE_ALL
807 TRACE_IRQS_OFF
775 xorl %edx,%edx # error code 0 808 xorl %edx,%edx # error code 0
776 movl %esp,%eax # pt_regs pointer 809 movl %esp,%eax # pt_regs pointer
777 call do_debug 810 call do_debug
@@ -816,6 +849,7 @@ nmi_stack_correct:
816 pushl %eax 849 pushl %eax
817 CFI_ADJUST_CFA_OFFSET 4 850 CFI_ADJUST_CFA_OFFSET 4
818 SAVE_ALL 851 SAVE_ALL
852 TRACE_IRQS_OFF
819 xorl %edx,%edx # zero error code 853 xorl %edx,%edx # zero error code
820 movl %esp,%eax # pt_regs pointer 854 movl %esp,%eax # pt_regs pointer
821 call do_nmi 855 call do_nmi
@@ -856,6 +890,7 @@ nmi_espfix_stack:
856 pushl %eax 890 pushl %eax
857 CFI_ADJUST_CFA_OFFSET 4 891 CFI_ADJUST_CFA_OFFSET 4
858 SAVE_ALL 892 SAVE_ALL
893 TRACE_IRQS_OFF
859 FIXUP_ESPFIX_STACK # %eax == %esp 894 FIXUP_ESPFIX_STACK # %eax == %esp
860 xorl %edx,%edx # zero error code 895 xorl %edx,%edx # zero error code
861 call do_nmi 896 call do_nmi
@@ -886,6 +921,7 @@ KPROBE_ENTRY(int3)
886 pushl $-1 # mark this as an int 921 pushl $-1 # mark this as an int
887 CFI_ADJUST_CFA_OFFSET 4 922 CFI_ADJUST_CFA_OFFSET 4
888 SAVE_ALL 923 SAVE_ALL
924 TRACE_IRQS_OFF
889 xorl %edx,%edx # zero error code 925 xorl %edx,%edx # zero error code
890 movl %esp,%eax # pt_regs pointer 926 movl %esp,%eax # pt_regs pointer
891 call do_int3 927 call do_int3
@@ -1024,6 +1060,7 @@ ENDPROC(kernel_thread_helper)
1024ENTRY(xen_sysenter_target) 1060ENTRY(xen_sysenter_target)
1025 RING0_INT_FRAME 1061 RING0_INT_FRAME
1026 addl $5*4, %esp /* remove xen-provided frame */ 1062 addl $5*4, %esp /* remove xen-provided frame */
1063 CFI_ADJUST_CFA_OFFSET -5*4
1027 jmp sysenter_past_esp 1064 jmp sysenter_past_esp
1028 CFI_ENDPROC 1065 CFI_ENDPROC
1029 1066
@@ -1116,20 +1153,6 @@ ENDPROC(xen_failsafe_callback)
1116#ifdef CONFIG_DYNAMIC_FTRACE 1153#ifdef CONFIG_DYNAMIC_FTRACE
1117 1154
1118ENTRY(mcount) 1155ENTRY(mcount)
1119 pushl %eax
1120 pushl %ecx
1121 pushl %edx
1122 movl 0xc(%esp), %eax
1123 subl $MCOUNT_INSN_SIZE, %eax
1124
1125.globl mcount_call
1126mcount_call:
1127 call ftrace_stub
1128
1129 popl %edx
1130 popl %ecx
1131 popl %eax
1132
1133 ret 1156 ret
1134END(mcount) 1157END(mcount)
1135 1158
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index ae63e584c340..09e7145484c5 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -53,37 +53,17 @@
53#include <asm/paravirt.h> 53#include <asm/paravirt.h>
54#include <asm/ftrace.h> 54#include <asm/ftrace.h>
55 55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE 0x40000000
61
56 .code64 62 .code64
57 63
58#ifdef CONFIG_FTRACE 64#ifdef CONFIG_FTRACE
59#ifdef CONFIG_DYNAMIC_FTRACE 65#ifdef CONFIG_DYNAMIC_FTRACE
60ENTRY(mcount) 66ENTRY(mcount)
61
62 subq $0x38, %rsp
63 movq %rax, (%rsp)
64 movq %rcx, 8(%rsp)
65 movq %rdx, 16(%rsp)
66 movq %rsi, 24(%rsp)
67 movq %rdi, 32(%rsp)
68 movq %r8, 40(%rsp)
69 movq %r9, 48(%rsp)
70
71 movq 0x38(%rsp), %rdi
72 subq $MCOUNT_INSN_SIZE, %rdi
73
74.globl mcount_call
75mcount_call:
76 call ftrace_stub
77
78 movq 48(%rsp), %r9
79 movq 40(%rsp), %r8
80 movq 32(%rsp), %rdi
81 movq 24(%rsp), %rsi
82 movq 16(%rsp), %rdx
83 movq 8(%rsp), %rcx
84 movq (%rsp), %rax
85 addq $0x38, %rsp
86
87 retq 67 retq
88END(mcount) 68END(mcount)
89 69
@@ -269,9 +249,9 @@ ENTRY(native_usergs_sysret64)
269ENTRY(ret_from_fork) 249ENTRY(ret_from_fork)
270 CFI_DEFAULT_STACK 250 CFI_DEFAULT_STACK
271 push kernel_eflags(%rip) 251 push kernel_eflags(%rip)
272 CFI_ADJUST_CFA_OFFSET 4 252 CFI_ADJUST_CFA_OFFSET 8
273 popf # reset kernel eflags 253 popf # reset kernel eflags
274 CFI_ADJUST_CFA_OFFSET -4 254 CFI_ADJUST_CFA_OFFSET -8
275 call schedule_tail 255 call schedule_tail
276 GET_THREAD_INFO(%rcx) 256 GET_THREAD_INFO(%rcx)
277 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) 257 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
@@ -349,9 +329,9 @@ ENTRY(system_call_after_swapgs)
349 movq %rcx,RIP-ARGOFFSET(%rsp) 329 movq %rcx,RIP-ARGOFFSET(%rsp)
350 CFI_REL_OFFSET rip,RIP-ARGOFFSET 330 CFI_REL_OFFSET rip,RIP-ARGOFFSET
351 GET_THREAD_INFO(%rcx) 331 GET_THREAD_INFO(%rcx)
352 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ 332 testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
353 TI_flags(%rcx)
354 jnz tracesys 333 jnz tracesys
334system_call_fastpath:
355 cmpq $__NR_syscall_max,%rax 335 cmpq $__NR_syscall_max,%rax
356 ja badsys 336 ja badsys
357 movq %r10,%rcx 337 movq %r10,%rcx
@@ -403,16 +383,16 @@ sysret_careful:
403sysret_signal: 383sysret_signal:
404 TRACE_IRQS_ON 384 TRACE_IRQS_ON
405 ENABLE_INTERRUPTS(CLBR_NONE) 385 ENABLE_INTERRUPTS(CLBR_NONE)
406 testl $_TIF_DO_NOTIFY_MASK,%edx 386#ifdef CONFIG_AUDITSYSCALL
407 jz 1f 387 bt $TIF_SYSCALL_AUDIT,%edx
408 388 jc sysret_audit
409 /* Really a signal */ 389#endif
410 /* edx: work flags (arg3) */ 390 /* edx: work flags (arg3) */
411 leaq do_notify_resume(%rip),%rax 391 leaq do_notify_resume(%rip),%rax
412 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 392 leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
413 xorl %esi,%esi # oldset -> arg2 393 xorl %esi,%esi # oldset -> arg2
414 call ptregscall_common 394 call ptregscall_common
4151: movl $_TIF_WORK_MASK,%edi 395 movl $_TIF_WORK_MASK,%edi
416 /* Use IRET because user could have changed frame. This 396 /* Use IRET because user could have changed frame. This
417 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ 397 works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
418 DISABLE_INTERRUPTS(CLBR_NONE) 398 DISABLE_INTERRUPTS(CLBR_NONE)
@@ -423,14 +403,56 @@ badsys:
423 movq $-ENOSYS,RAX-ARGOFFSET(%rsp) 403 movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
424 jmp ret_from_sys_call 404 jmp ret_from_sys_call
425 405
406#ifdef CONFIG_AUDITSYSCALL
407 /*
408 * Fast path for syscall audit without full syscall trace.
409 * We just call audit_syscall_entry() directly, and then
410 * jump back to the normal fast path.
411 */
412auditsys:
413 movq %r10,%r9 /* 6th arg: 4th syscall arg */
414 movq %rdx,%r8 /* 5th arg: 3rd syscall arg */
415 movq %rsi,%rcx /* 4th arg: 2nd syscall arg */
416 movq %rdi,%rdx /* 3rd arg: 1st syscall arg */
417 movq %rax,%rsi /* 2nd arg: syscall number */
418 movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */
419 call audit_syscall_entry
420 LOAD_ARGS 0 /* reload call-clobbered registers */
421 jmp system_call_fastpath
422
423 /*
424 * Return fast path for syscall audit. Call audit_syscall_exit()
425 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
426 * masked off.
427 */
428sysret_audit:
429 movq %rax,%rsi /* second arg, syscall return value */
430 cmpq $0,%rax /* is it < 0? */
431 setl %al /* 1 if so, 0 if not */
432 movzbl %al,%edi /* zero-extend that into %edi */
433 inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
434 call audit_syscall_exit
435 movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
436 jmp sysret_check
437#endif /* CONFIG_AUDITSYSCALL */
438
426 /* Do syscall tracing */ 439 /* Do syscall tracing */
427tracesys: 440tracesys:
441#ifdef CONFIG_AUDITSYSCALL
442 testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
443 jz auditsys
444#endif
428 SAVE_REST 445 SAVE_REST
429 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ 446 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
430 FIXUP_TOP_OF_STACK %rdi 447 FIXUP_TOP_OF_STACK %rdi
431 movq %rsp,%rdi 448 movq %rsp,%rdi
432 call syscall_trace_enter 449 call syscall_trace_enter
433 LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ 450 /*
451 * Reload arg registers from stack in case ptrace changed them.
452 * We don't reload %rax because syscall_trace_enter() returned
453 * the value it wants us to use in the table lookup.
454 */
455 LOAD_ARGS ARGOFFSET, 1
434 RESTORE_REST 456 RESTORE_REST
435 cmpq $__NR_syscall_max,%rax 457 cmpq $__NR_syscall_max,%rax
436 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ 458 ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */
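In the new tracesys prologue above, the audit-only fast path (auditsys) is taken when all entry-work bits other than TIF_SYSCALL_AUDIT are clear. A small sketch of that test; the bit positions below are placeholders, not the kernel's actual _TIF_* values, and the real _TIF_WORK_SYSCALL_ENTRY mask contains additional bits:

#include <stdbool.h>
#include <stdio.h>

/* Placeholder bit positions, for illustration only. */
#define TIF_SYSCALL_TRACE	(1u << 0)
#define TIF_SYSCALL_AUDIT	(1u << 1)
#define TIF_SECCOMP		(1u << 2)
#define TIF_WORK_SYSCALL_ENTRY	(TIF_SYSCALL_TRACE | TIF_SYSCALL_AUDIT | TIF_SECCOMP)

/* Take the lightweight auditsys path only if auditing is the sole entry work. */
static bool use_audit_fast_path(unsigned int ti_flags)
{
	return (ti_flags & (TIF_WORK_SYSCALL_ENTRY & ~TIF_SYSCALL_AUDIT)) == 0;
}

int main(void)
{
	printf("%d\n", use_audit_fast_path(TIF_SYSCALL_AUDIT));			/* 1 */
	printf("%d\n", use_audit_fast_path(TIF_SYSCALL_AUDIT | TIF_SECCOMP));	/* 0 */
	return 0;
}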
@@ -444,6 +466,7 @@ tracesys:
444 * Has correct top of stack, but partial stack frame. 466 * Has correct top of stack, but partial stack frame.
445 */ 467 */
446 .globl int_ret_from_sys_call 468 .globl int_ret_from_sys_call
469 .globl int_with_check
447int_ret_from_sys_call: 470int_ret_from_sys_call:
448 DISABLE_INTERRUPTS(CLBR_NONE) 471 DISABLE_INTERRUPTS(CLBR_NONE)
449 TRACE_IRQS_OFF 472 TRACE_IRQS_OFF
@@ -483,7 +506,7 @@ int_very_careful:
483 ENABLE_INTERRUPTS(CLBR_NONE) 506 ENABLE_INTERRUPTS(CLBR_NONE)
484 SAVE_REST 507 SAVE_REST
485 /* Check for syscall exit trace */ 508 /* Check for syscall exit trace */
486 testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx 509 testl $_TIF_WORK_SYSCALL_EXIT,%edx
487 jz int_signal 510 jz int_signal
488 pushq %rdi 511 pushq %rdi
489 CFI_ADJUST_CFA_OFFSET 8 512 CFI_ADJUST_CFA_OFFSET 8
@@ -491,7 +514,7 @@ int_very_careful:
491 call syscall_trace_leave 514 call syscall_trace_leave
492 popq %rdi 515 popq %rdi
493 CFI_ADJUST_CFA_OFFSET -8 516 CFI_ADJUST_CFA_OFFSET -8
494 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi 517 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
495 jmp int_restore_rest 518 jmp int_restore_rest
496 519
497int_signal: 520int_signal:
@@ -618,6 +641,13 @@ END(stub_rt_sigreturn)
618 SAVE_ARGS 641 SAVE_ARGS
619 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler 642 leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler
620 pushq %rbp 643 pushq %rbp
644 /*
645 * Save rbp twice: One is for marking the stack frame, as usual, and the
646 * other, to fill pt_regs properly. This is because bx comes right
647 * before the last saved register in that structure, and not bp. If the
648 * base pointer were in the place bx is today, this would not be needed.
649 */
650 movq %rbp, -8(%rsp)
621 CFI_ADJUST_CFA_OFFSET 8 651 CFI_ADJUST_CFA_OFFSET 8
622 CFI_REL_OFFSET rbp, 0 652 CFI_REL_OFFSET rbp, 0
623 movq %rsp,%rbp 653 movq %rsp,%rbp
@@ -883,6 +913,9 @@ END(spurious_interrupt)
883 .if \ist 913 .if \ist
884 movq %gs:pda_data_offset, %rbp 914 movq %gs:pda_data_offset, %rbp
885 .endif 915 .endif
916 .if \irqtrace
917 TRACE_IRQS_OFF
918 .endif
886 movq %rsp,%rdi 919 movq %rsp,%rdi
887 movq ORIG_RAX(%rsp),%rsi 920 movq ORIG_RAX(%rsp),%rsi
888 movq $-1,ORIG_RAX(%rsp) 921 movq $-1,ORIG_RAX(%rsp)
@@ -1009,7 +1042,8 @@ KPROBE_ENTRY(error_entry)
1009 je error_kernelspace 1042 je error_kernelspace
1010error_swapgs: 1043error_swapgs:
1011 SWAPGS 1044 SWAPGS
1012error_sti: 1045error_sti:
1046 TRACE_IRQS_OFF
1013 movq %rdi,RDI(%rsp) 1047 movq %rdi,RDI(%rsp)
1014 CFI_REL_OFFSET rdi,RDI 1048 CFI_REL_OFFSET rdi,RDI
1015 movq %rsp,%rdi 1049 movq %rsp,%rdi
@@ -1183,12 +1217,13 @@ ENTRY(simd_coprocessor_error)
1183END(simd_coprocessor_error) 1217END(simd_coprocessor_error)
1184 1218
1185ENTRY(device_not_available) 1219ENTRY(device_not_available)
1186 zeroentry math_state_restore 1220 zeroentry do_device_not_available
1187END(device_not_available) 1221END(device_not_available)
1188 1222
1189 /* runs on exception stack */ 1223 /* runs on exception stack */
1190KPROBE_ENTRY(debug) 1224KPROBE_ENTRY(debug)
1191 INTR_FRAME 1225 INTR_FRAME
1226 PARAVIRT_ADJUST_EXCEPTION_FRAME
1192 pushq $0 1227 pushq $0
1193 CFI_ADJUST_CFA_OFFSET 8 1228 CFI_ADJUST_CFA_OFFSET 8
1194 paranoidentry do_debug, DEBUG_STACK 1229 paranoidentry do_debug, DEBUG_STACK
@@ -1198,6 +1233,7 @@ KPROBE_END(debug)
1198 /* runs on exception stack */ 1233 /* runs on exception stack */
1199KPROBE_ENTRY(nmi) 1234KPROBE_ENTRY(nmi)
1200 INTR_FRAME 1235 INTR_FRAME
1236 PARAVIRT_ADJUST_EXCEPTION_FRAME
1201 pushq $-1 1237 pushq $-1
1202 CFI_ADJUST_CFA_OFFSET 8 1238 CFI_ADJUST_CFA_OFFSET 8
1203 paranoidentry do_nmi, 0, 0 1239 paranoidentry do_nmi, 0, 0
@@ -1211,6 +1247,7 @@ KPROBE_END(nmi)
1211 1247
1212KPROBE_ENTRY(int3) 1248KPROBE_ENTRY(int3)
1213 INTR_FRAME 1249 INTR_FRAME
1250 PARAVIRT_ADJUST_EXCEPTION_FRAME
1214 pushq $0 1251 pushq $0
1215 CFI_ADJUST_CFA_OFFSET 8 1252 CFI_ADJUST_CFA_OFFSET 8
1216 paranoidentry do_int3, DEBUG_STACK 1253 paranoidentry do_int3, DEBUG_STACK
@@ -1237,6 +1274,7 @@ END(coprocessor_segment_overrun)
1237 /* runs on exception stack */ 1274 /* runs on exception stack */
1238ENTRY(double_fault) 1275ENTRY(double_fault)
1239 XCPT_FRAME 1276 XCPT_FRAME
1277 PARAVIRT_ADJUST_EXCEPTION_FRAME
1240 paranoidentry do_double_fault 1278 paranoidentry do_double_fault
1241 jmp paranoid_exit1 1279 jmp paranoid_exit1
1242 CFI_ENDPROC 1280 CFI_ENDPROC
@@ -1253,6 +1291,7 @@ END(segment_not_present)
1253 /* runs on exception stack */ 1291 /* runs on exception stack */
1254ENTRY(stack_segment) 1292ENTRY(stack_segment)
1255 XCPT_FRAME 1293 XCPT_FRAME
1294 PARAVIRT_ADJUST_EXCEPTION_FRAME
1256 paranoidentry do_stack_segment 1295 paranoidentry do_stack_segment
1257 jmp paranoid_exit1 1296 jmp paranoid_exit1
1258 CFI_ENDPROC 1297 CFI_ENDPROC
@@ -1278,6 +1317,7 @@ END(spurious_interrupt_bug)
1278 /* runs on exception stack */ 1317 /* runs on exception stack */
1279ENTRY(machine_check) 1318ENTRY(machine_check)
1280 INTR_FRAME 1319 INTR_FRAME
1320 PARAVIRT_ADJUST_EXCEPTION_FRAME
1281 pushq $0 1321 pushq $0
1282 CFI_ADJUST_CFA_OFFSET 8 1322 CFI_ADJUST_CFA_OFFSET 8
1283 paranoidentry do_machine_check 1323 paranoidentry do_machine_check
@@ -1312,3 +1352,103 @@ KPROBE_ENTRY(ignore_sysret)
1312 sysret 1352 sysret
1313 CFI_ENDPROC 1353 CFI_ENDPROC
1314ENDPROC(ignore_sysret) 1354ENDPROC(ignore_sysret)
1355
1356#ifdef CONFIG_XEN
1357ENTRY(xen_hypervisor_callback)
1358 zeroentry xen_do_hypervisor_callback
1359END(xen_hypervisor_callback)
1360
1361/*
1362# A note on the "critical region" in our callback handler.
1363# We want to avoid stacking callback handlers due to events occurring
1364# during handling of the last event. To do this, we keep events disabled
1365# until we've done all processing. HOWEVER, we must enable events before
1366# popping the stack frame (can't be done atomically) and so it would still
1367# be possible to get enough handler activations to overflow the stack.
1368# Although unlikely, bugs of that kind are hard to track down, so we'd
1369# like to avoid the possibility.
1370# So, on entry to the handler we detect whether we interrupted an
1371# existing activation in its critical region -- if so, we pop the current
1372# activation and restart the handler using the previous one.
1373*/
1374ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1375 CFI_STARTPROC
1376/* Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1377 see the correct pointer to the pt_regs */
1378 movq %rdi, %rsp # we don't return, adjust the stack frame
1379 CFI_ENDPROC
1380 CFI_DEFAULT_STACK
138111: incl %gs:pda_irqcount
1382 movq %rsp,%rbp
1383 CFI_DEF_CFA_REGISTER rbp
1384 cmovzq %gs:pda_irqstackptr,%rsp
1385 pushq %rbp # backlink for old unwinder
1386 call xen_evtchn_do_upcall
1387 popq %rsp
1388 CFI_DEF_CFA_REGISTER rsp
1389 decl %gs:pda_irqcount
1390 jmp error_exit
1391 CFI_ENDPROC
1392END(do_hypervisor_callback)
1393
1394/*
1395# Hypervisor uses this for application faults while it executes.
1396# We get here for two reasons:
1397# 1. Fault while reloading DS, ES, FS or GS
1398# 2. Fault while executing IRET
1399# Category 1 we do not need to fix up as Xen has already reloaded all segment
1400# registers that could be reloaded and zeroed the others.
1401# Category 2 we fix up by killing the current process. We cannot use the
1402# normal Linux return path in this case because if we use the IRET hypercall
1403# to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1404# We distinguish between categories by comparing each saved segment register
1405# with its current contents: any discrepancy means we are in category 1.
1406*/
1407ENTRY(xen_failsafe_callback)
1408 framesz = (RIP-0x30) /* workaround buggy gas */
1409 _frame framesz
1410 CFI_REL_OFFSET rcx, 0
1411 CFI_REL_OFFSET r11, 8
1412 movw %ds,%cx
1413 cmpw %cx,0x10(%rsp)
1414 CFI_REMEMBER_STATE
1415 jne 1f
1416 movw %es,%cx
1417 cmpw %cx,0x18(%rsp)
1418 jne 1f
1419 movw %fs,%cx
1420 cmpw %cx,0x20(%rsp)
1421 jne 1f
1422 movw %gs,%cx
1423 cmpw %cx,0x28(%rsp)
1424 jne 1f
1425 /* All segments match their saved values => Category 2 (Bad IRET). */
1426 movq (%rsp),%rcx
1427 CFI_RESTORE rcx
1428 movq 8(%rsp),%r11
1429 CFI_RESTORE r11
1430 addq $0x30,%rsp
1431 CFI_ADJUST_CFA_OFFSET -0x30
1432 pushq $0
1433 CFI_ADJUST_CFA_OFFSET 8
1434 pushq %r11
1435 CFI_ADJUST_CFA_OFFSET 8
1436 pushq %rcx
1437 CFI_ADJUST_CFA_OFFSET 8
1438 jmp general_protection
1439 CFI_RESTORE_STATE
14401: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1441 movq (%rsp),%rcx
1442 CFI_RESTORE rcx
1443 movq 8(%rsp),%r11
1444 CFI_RESTORE r11
1445 addq $0x30,%rsp
1446 CFI_ADJUST_CFA_OFFSET -0x30
1447 pushq $0
1448 CFI_ADJUST_CFA_OFFSET 8
1449 SAVE_ALL
1450 jmp error_exit
1451 CFI_ENDPROC
1452END(xen_failsafe_callback)
1453
1454#endif /* CONFIG_XEN */
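xen_failsafe_callback above distinguishes the two fault categories by comparing each segment selector saved by the hypervisor with the live one. A hedged C sketch of that comparison; the struct layout is an assumption standing in for the values at 0x10..0x28(%rsp):

#include <stdbool.h>

/* Assumed layout standing in for the selectors Xen saved on the stack. */
struct saved_segs {
	unsigned short ds, es, fs, gs;
};

/*
 * If every saved selector still matches the live one, nothing was being
 * reloaded, so the fault must have hit IRET itself (category 2, "bad IRET").
 * Any mismatch means a segment reload faulted (category 1).
 */
bool is_bad_iret(const struct saved_segs *saved, const struct saved_segs *live)
{
	return saved->ds == live->ds && saved->es == live->es &&
	       saved->fs == live->fs && saved->gs == live->gs;
}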
diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/kernel/es7000_32.c
index 4354ce804889..f454c78fcef6 100644
--- a/arch/x86/mach-es7000/es7000plat.c
+++ b/arch/x86/kernel/es7000_32.c
@@ -39,10 +39,94 @@
39#include <asm/nmi.h> 39#include <asm/nmi.h>
40#include <asm/smp.h> 40#include <asm/smp.h>
41#include <asm/apicdef.h> 41#include <asm/apicdef.h>
42#include "es7000.h"
43#include <mach_mpparse.h> 42#include <mach_mpparse.h>
44 43
45/* 44/*
45 * ES7000 chipsets
46 */
47
48#define NON_UNISYS 0
49#define ES7000_CLASSIC 1
50#define ES7000_ZORRO 2
51
52
53#define MIP_REG 1
54#define MIP_PSAI_REG 4
55
56#define MIP_BUSY 1
57#define MIP_SPIN 0xf0000
58#define MIP_VALID 0x0100000000000000ULL
59#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
60
61#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
62
63struct mip_reg_info {
64 unsigned long long mip_info;
65 unsigned long long delivery_info;
66 unsigned long long host_reg;
67 unsigned long long mip_reg;
68};
69
70struct part_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char part_id;
74 unsigned char apic_mode;
75 unsigned long snum;
76 char ptype[16];
77 char sname[64];
78 char pname[64];
79};
80
81struct psai {
82 unsigned long long entry_type;
83 unsigned long long addr;
84 unsigned long long bep_addr;
85};
86
87struct es7000_mem_info {
88 unsigned char type;
89 unsigned char length;
90 unsigned char resv[6];
91 unsigned long long start;
92 unsigned long long size;
93};
94
95struct es7000_oem_table {
96 unsigned long long hdr;
97 struct mip_reg_info mip;
98 struct part_info pif;
99 struct es7000_mem_info shm;
100 struct psai psai;
101};
102
103#ifdef CONFIG_ACPI
104
105struct oem_table {
106 struct acpi_table_header Header;
107 u32 OEMTableAddr;
108 u32 OEMTableSize;
109};
110
111extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
112extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr);
113#endif
114
115struct mip_reg {
116 unsigned long long off_0;
117 unsigned long long off_8;
118 unsigned long long off_10;
119 unsigned long long off_18;
120 unsigned long long off_20;
121 unsigned long long off_28;
122 unsigned long long off_30;
123 unsigned long long off_38;
124};
125
126#define MIP_SW_APIC 0x1020b
127#define MIP_FUNC(VALUE) (VALUE & 0xff)
128
129/*
46 * ES7000 Globals 130 * ES7000 Globals
47 */ 131 */
48 132
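The MIP_PORT and MIP_RD_LO helpers moved into this file above unpack fields from a 64-bit MIP register value: the destination I/O port from bits 47:32 and the low 32 bits of data. A short illustrative equivalent in plain C (function names are only for the sketch):

#include <stdint.h>
#include <stdio.h>

static inline uint16_t mip_port(uint64_t v)  { return (v >> 32) & 0xffff; }	/* MIP_PORT()  */
static inline uint32_t mip_rd_lo(uint64_t v) { return v & 0xffffffff; }	/* MIP_RD_LO() */

int main(void)
{
	uint64_t v = 0x0001234500abcdefULL;
	printf("port=0x%x lo=0x%x\n", mip_port(v), (unsigned)mip_rd_lo(v));
	return 0;
}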
@@ -72,7 +156,7 @@ es7000_rename_gsi(int ioapic, int gsi)
72 base += nr_ioapic_registers[i]; 156 base += nr_ioapic_registers[i];
73 } 157 }
74 158
75 if (!ioapic && (gsi < 16)) 159 if (!ioapic && (gsi < 16))
76 gsi += base; 160 gsi += base;
77 return gsi; 161 return gsi;
78} 162}
@@ -130,10 +214,10 @@ parse_unisys_oem (char *oemptr)
130 mip_addr = val; 214 mip_addr = val;
131 mip = (struct mip_reg *)val; 215 mip = (struct mip_reg *)val;
132 mip_reg = __va(mip); 216 mip_reg = __va(mip);
133 Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", 217 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n",
134 (unsigned long)host_reg); 218 (unsigned long)host_reg);
135 Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", 219 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n",
136 (unsigned long)mip_reg); 220 (unsigned long)mip_reg);
137 success++; 221 success++;
138 break; 222 break;
139 case MIP_PSAI_REG: 223 case MIP_PSAI_REG:
@@ -160,21 +244,38 @@ parse_unisys_oem (char *oemptr)
160} 244}
161 245
162#ifdef CONFIG_ACPI 246#ifdef CONFIG_ACPI
163int __init 247static unsigned long oem_addrX;
164find_unisys_acpi_oem_table(unsigned long *oem_addr) 248static unsigned long oem_size;
249int __init find_unisys_acpi_oem_table(unsigned long *oem_addr)
165{ 250{
166 struct acpi_table_header *header = NULL; 251 struct acpi_table_header *header = NULL;
167 int i = 0; 252 int i = 0;
168 while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { 253 acpi_size tbl_size;
254
255 while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) {
169 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { 256 if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) {
170 struct oem_table *t = (struct oem_table *)header; 257 struct oem_table *t = (struct oem_table *)header;
171 *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, 258
172 t->OEMTableSize); 259 oem_addrX = t->OEMTableAddr;
260 oem_size = t->OEMTableSize;
261 early_acpi_os_unmap_memory(header, tbl_size);
262
263 *oem_addr = (unsigned long)__acpi_map_table(oem_addrX,
264 oem_size);
173 return 0; 265 return 0;
174 } 266 }
267 early_acpi_os_unmap_memory(header, tbl_size);
175 } 268 }
176 return -1; 269 return -1;
177} 270}
271
272void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr)
273{
274 if (!oem_addr)
275 return;
276
277 __acpi_unmap_table((char *)oem_addr, oem_size);
278}
178#endif 279#endif
179 280
180static void 281static void
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index ab115cd15fdf..d073d981a730 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -11,17 +11,18 @@
11 11
12#include <linux/spinlock.h> 12#include <linux/spinlock.h>
13#include <linux/hardirq.h> 13#include <linux/hardirq.h>
14#include <linux/uaccess.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
15#include <linux/percpu.h> 16#include <linux/percpu.h>
16#include <linux/init.h> 17#include <linux/init.h>
17#include <linux/list.h> 18#include <linux/list.h>
18 19
19#include <asm/alternative.h>
20#include <asm/ftrace.h> 20#include <asm/ftrace.h>
21#include <asm/nops.h>
21 22
22 23
23/* Long is fine, even if it is only 4 bytes ;-) */ 24/* Long is fine, even if it is only 4 bytes ;-) */
24static long *ftrace_nop; 25static unsigned long *ftrace_nop;
25 26
26union ftrace_code_union { 27union ftrace_code_union {
27 char code[MCOUNT_INSN_SIZE]; 28 char code[MCOUNT_INSN_SIZE];
@@ -60,11 +61,7 @@ notrace int
60ftrace_modify_code(unsigned long ip, unsigned char *old_code, 61ftrace_modify_code(unsigned long ip, unsigned char *old_code,
61 unsigned char *new_code) 62 unsigned char *new_code)
62{ 63{
63 unsigned replaced; 64 unsigned char replaced[MCOUNT_INSN_SIZE];
64 unsigned old = *(unsigned *)old_code; /* 4 bytes */
65 unsigned new = *(unsigned *)new_code; /* 4 bytes */
66 unsigned char newch = new_code[4];
67 int faulted = 0;
68 65
69 /* 66 /*
70 * Note: Due to modules and __init, code can 67 * Note: Due to modules and __init, code can
@@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
72 * as well as code changing. 69 * as well as code changing.
73 * 70 *
74 * No real locking needed, this code is run through 71 * No real locking needed, this code is run through
75 * kstop_machine. 72 * kstop_machine, or before SMP starts.
76 */ 73 */
77 asm volatile ( 74 if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE))
78 "1: lock\n" 75 return 1;
79 " cmpxchg %3, (%2)\n" 76
80 " jnz 2f\n" 77 if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0)
81 " movb %b4, 4(%2)\n" 78 return 2;
82 "2:\n"
83 ".section .fixup, \"ax\"\n"
84 "3: movl $1, %0\n"
85 " jmp 2b\n"
86 ".previous\n"
87 _ASM_EXTABLE(1b, 3b)
88 : "=r"(faulted), "=a"(replaced)
89 : "r"(ip), "r"(new), "c"(newch),
90 "0"(faulted), "a"(old)
91 : "memory");
92 sync_core();
93 79
94 if (replaced != old && replaced != new) 80 WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code,
95 faulted = 2; 81 MCOUNT_INSN_SIZE));
96 82
97 return faulted; 83 sync_core();
84
85 return 0;
98} 86}
99 87
100notrace int ftrace_update_ftrace_func(ftrace_func_t func) 88notrace int ftrace_update_ftrace_func(ftrace_func_t func)
@@ -112,30 +100,76 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func)
112 100
113notrace int ftrace_mcount_set(unsigned long *data) 101notrace int ftrace_mcount_set(unsigned long *data)
114{ 102{
115 unsigned long ip = (long)(&mcount_call); 103 /* mcount is initialized as a nop */
116 unsigned long *addr = data; 104 *data = 0;
117 unsigned char old[MCOUNT_INSN_SIZE], *new;
118
119 /*
120 * Replace the mcount stub with a pointer to the
121 * ip recorder function.
122 */
123 memcpy(old, &mcount_call, MCOUNT_INSN_SIZE);
124 new = ftrace_call_replace(ip, *addr);
125 *addr = ftrace_modify_code(ip, old, new);
126
127 return 0; 105 return 0;
128} 106}
129 107
130int __init ftrace_dyn_arch_init(void *data) 108int __init ftrace_dyn_arch_init(void *data)
131{ 109{
132 const unsigned char *const *noptable = find_nop_table(); 110 extern const unsigned char ftrace_test_p6nop[];
133 111 extern const unsigned char ftrace_test_nop5[];
134 /* This is running in kstop_machine */ 112 extern const unsigned char ftrace_test_jmp[];
135 113 int faulted = 0;
136 ftrace_mcount_set(data);
137 114
138 ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; 115 /*
116 * There is no good nop for all x86 archs.
117 * We will default to using the P6_NOP5, but first we
118 * will test to make sure that the nop will actually
119 * work on this CPU. If it faults, we will then
 120 * go to a less efficient 5-byte nop. If that fails
 121 * we then just use a jmp as our nop. This isn't the most
 122 * efficient nop, but we cannot use a multi-part nop
123 * since we would then risk being preempted in the middle
124 * of that nop, and if we enabled tracing then, it might
125 * cause a system crash.
126 *
127 * TODO: check the cpuid to determine the best nop.
128 */
129 asm volatile (
130 "jmp ftrace_test_jmp\n"
131 /* This code needs to stay around */
132 ".section .text, \"ax\"\n"
133 "ftrace_test_jmp:"
134 "jmp ftrace_test_p6nop\n"
135 "nop\n"
136 "nop\n"
137 "nop\n" /* 2 byte jmp + 3 bytes */
138 "ftrace_test_p6nop:"
139 P6_NOP5
140 "jmp 1f\n"
141 "ftrace_test_nop5:"
142 ".byte 0x66,0x66,0x66,0x66,0x90\n"
143 "jmp 1f\n"
144 ".previous\n"
145 "1:"
146 ".section .fixup, \"ax\"\n"
147 "2: movl $1, %0\n"
148 " jmp ftrace_test_nop5\n"
149 "3: movl $2, %0\n"
150 " jmp 1b\n"
151 ".previous\n"
152 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
153 _ASM_EXTABLE(ftrace_test_nop5, 3b)
154 : "=r"(faulted) : "0" (faulted));
155
156 switch (faulted) {
157 case 0:
158 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n");
159 ftrace_nop = (unsigned long *)ftrace_test_p6nop;
160 break;
161 case 1:
162 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n");
163 ftrace_nop = (unsigned long *)ftrace_test_nop5;
164 break;
165 case 2:
166 pr_info("ftrace: converting mcount calls to jmp . + 5\n");
167 ftrace_nop = (unsigned long *)ftrace_test_jmp;
168 break;
169 }
170
 171 /* The return code is returned via data */
172 *(unsigned long *)data = 0;
139 173
140 return 0; 174 return 0;
141} 175}
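ftrace_dyn_arch_init() above probes three 5-byte NOP candidates at boot and keeps the first one that executes without faulting. Their byte encodings, as printed by the pr_info() messages; the jmp form is inferred from the "2 byte jmp + 3 bytes" sequence in the inline asm, so treat that row as an assumption:

#include <stdint.h>
#include <stdio.h>

static const uint8_t p6_nop5[5]  = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };	/* preferred: P6 nopl        */
static const uint8_t osp_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };	/* fallback: prefixed nop    */
static const uint8_t jmp_nop5[5] = { 0xeb, 0x03, 0x90, 0x90, 0x90 };	/* last resort: jmp . + 5    */

int main(void)
{
	const uint8_t *nops[] = { p6_nop5, osp_nop5, jmp_nop5 };
	for (int i = 0; i < 3; i++) {
		for (int j = 0; j < 5; j++)
			printf("%02x ", nops[i][j]);
		printf("\n");
	}
	return 0;
}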
diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c
index 1fa8be5bd217..6c9bfc9e1e95 100644
--- a/arch/x86/kernel/genapic_64.c
+++ b/arch/x86/kernel/genapic_64.c
@@ -16,86 +16,63 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/dmar.h>
19 20
20#include <asm/smp.h> 21#include <asm/smp.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
22#include <asm/genapic.h> 23#include <asm/genapic.h>
23 24
24#ifdef CONFIG_ACPI 25extern struct genapic apic_flat;
25#include <acpi/acpi_bus.h> 26extern struct genapic apic_physflat;
26#endif 27extern struct genapic apic_x2xpic_uv_x;
27 28extern struct genapic apic_x2apic_phys;
28DEFINE_PER_CPU(int, x2apic_extra_bits); 29extern struct genapic apic_x2apic_cluster;
29 30
30struct genapic __read_mostly *genapic = &apic_flat; 31struct genapic __read_mostly *genapic = &apic_flat;
31 32
32static enum uv_system_type uv_system_type; 33static struct genapic *apic_probe[] __initdata = {
34 &apic_x2apic_uv_x,
35 &apic_x2apic_phys,
36 &apic_x2apic_cluster,
37 &apic_physflat,
38 NULL,
39};
33 40
34/* 41/*
35 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. 42 * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode.
36 */ 43 */
37void __init setup_apic_routing(void) 44void __init setup_apic_routing(void)
38{ 45{
39 if (uv_system_type == UV_NON_UNIQUE_APIC) 46 if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) {
40 genapic = &apic_x2apic_uv_x; 47 if (!intr_remapping_enabled)
41 else 48 genapic = &apic_flat;
42#ifdef CONFIG_ACPI 49 }
43 /*
44 * Quirk: some x86_64 machines can only use physical APIC mode
45 * regardless of how many processors are present (x86_64 ES7000
46 * is an example).
47 */
48 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
49 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL))
50 genapic = &apic_physflat;
51 else
52#endif
53
54 if (max_physical_apicid < 8)
55 genapic = &apic_flat;
56 else
57 genapic = &apic_physflat;
58 50
59 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); 51 if (genapic == &apic_flat) {
52 if (max_physical_apicid >= 8)
53 genapic = &apic_physflat;
54 printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name);
55 }
60} 56}
61 57
62/* Same for both flat and physical. */ 58/* Same for both flat and physical. */
63 59
64void send_IPI_self(int vector) 60void apic_send_IPI_self(int vector)
65{ 61{
66 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); 62 __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL);
67} 63}
68 64
69int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 65int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
70{ 66{
71 if (!strcmp(oem_id, "SGI")) { 67 int i;
72 if (!strcmp(oem_table_id, "UVL")) 68
73 uv_system_type = UV_LEGACY_APIC; 69 for (i = 0; apic_probe[i]; ++i) {
74 else if (!strcmp(oem_table_id, "UVX")) 70 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) {
75 uv_system_type = UV_X2APIC; 71 genapic = apic_probe[i];
76 else if (!strcmp(oem_table_id, "UVH")) 72 printk(KERN_INFO "Setting APIC routing to %s.\n",
77 uv_system_type = UV_NON_UNIQUE_APIC; 73 genapic->name);
74 return 1;
75 }
78 } 76 }
79 return 0; 77 return 0;
80} 78}
81
82unsigned int read_apic_id(void)
83{
84 unsigned int id;
85
86 WARN_ON(preemptible() && num_online_cpus() > 1);
87 id = apic_read(APIC_ID);
88 if (uv_system_type >= UV_X2APIC)
89 id |= __get_cpu_var(x2apic_extra_bits);
90 return id;
91}
92
93enum uv_system_type get_uv_system_type(void)
94{
95 return uv_system_type;
96}
97
98int is_uv_system(void)
99{
100 return uv_system_type != UV_NONE;
101}
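The 64-bit APIC model selection above now follows a probe-table pattern: acpi_madt_oem_check() walks a NULL-terminated array and adopts the first model whose check claims the platform. A minimal sketch with a simplified struct (only the fields needed here are modelled; the stub checks are invented for the example):

#include <stdio.h>

struct genapic {
	const char *name;
	int (*acpi_madt_oem_check)(char *oem_id, char *oem_table_id);
};

static int check_never(char *oem_id, char *oem_table_id)  { (void)oem_id; (void)oem_table_id; return 0; }
static int check_always(char *oem_id, char *oem_table_id) { (void)oem_id; (void)oem_table_id; return 1; }

static struct genapic apic_a = { "uv-like",   check_never  };
static struct genapic apic_b = { "phys-like", check_always };
static struct genapic apic_flat_stub = { "flat", NULL };

static struct genapic *apic_probe[] = { &apic_a, &apic_b, NULL };

int main(void)
{
	struct genapic *genapic = &apic_flat_stub;	/* default, as in the patch */
	int i;

	for (i = 0; apic_probe[i]; ++i) {
		if (apic_probe[i]->acpi_madt_oem_check("SGI", "UVH")) {
			genapic = apic_probe[i];
			printf("Setting APIC routing to %s.\n", genapic->name);
			break;
		}
	}
	return 0;
}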
diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c
index 1a9c68845ee8..c0262791bda4 100644
--- a/arch/x86/kernel/genapic_flat_64.c
+++ b/arch/x86/kernel/genapic_flat_64.c
@@ -15,9 +15,20 @@
15#include <linux/kernel.h> 15#include <linux/kernel.h>
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h>
18#include <asm/smp.h> 19#include <asm/smp.h>
19#include <asm/ipi.h> 20#include <asm/ipi.h>
20#include <asm/genapic.h> 21#include <asm/genapic.h>
22#include <mach_apicdef.h>
23
24#ifdef CONFIG_ACPI
25#include <acpi/acpi_bus.h>
26#endif
27
28static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
29{
30 return 1;
31}
21 32
22static cpumask_t flat_target_cpus(void) 33static cpumask_t flat_target_cpus(void)
23{ 34{
@@ -95,9 +106,33 @@ static void flat_send_IPI_all(int vector)
95 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); 106 __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL);
96} 107}
97 108
109static unsigned int get_apic_id(unsigned long x)
110{
111 unsigned int id;
112
113 id = (((x)>>24) & 0xFFu);
114 return id;
115}
116
117static unsigned long set_apic_id(unsigned int id)
118{
119 unsigned long x;
120
121 x = ((id & 0xFFu)<<24);
122 return x;
123}
124
125static unsigned int read_xapic_id(void)
126{
127 unsigned int id;
128
129 id = get_apic_id(apic_read(APIC_ID));
130 return id;
131}
132
98static int flat_apic_id_registered(void) 133static int flat_apic_id_registered(void)
99{ 134{
100 return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); 135 return physid_isset(read_xapic_id(), phys_cpu_present_map);
101} 136}
102 137
103static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) 138static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask)
@@ -112,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb)
112 147
113struct genapic apic_flat = { 148struct genapic apic_flat = {
114 .name = "flat", 149 .name = "flat",
150 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
115 .int_delivery_mode = dest_LowestPrio, 151 .int_delivery_mode = dest_LowestPrio,
116 .int_dest_mode = (APIC_DEST_LOGICAL != 0), 152 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
117 .target_cpus = flat_target_cpus, 153 .target_cpus = flat_target_cpus,
@@ -121,8 +157,12 @@ struct genapic apic_flat = {
121 .send_IPI_all = flat_send_IPI_all, 157 .send_IPI_all = flat_send_IPI_all,
122 .send_IPI_allbutself = flat_send_IPI_allbutself, 158 .send_IPI_allbutself = flat_send_IPI_allbutself,
123 .send_IPI_mask = flat_send_IPI_mask, 159 .send_IPI_mask = flat_send_IPI_mask,
160 .send_IPI_self = apic_send_IPI_self,
124 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, 161 .cpu_mask_to_apicid = flat_cpu_mask_to_apicid,
125 .phys_pkg_id = phys_pkg_id, 162 .phys_pkg_id = phys_pkg_id,
163 .get_apic_id = get_apic_id,
164 .set_apic_id = set_apic_id,
165 .apic_id_mask = (0xFFu<<24),
126}; 166};
127 167
128/* 168/*
@@ -130,6 +170,23 @@ struct genapic apic_flat = {
130 * We cannot use logical delivery in this case because the mask 170 * We cannot use logical delivery in this case because the mask
131 * overflows, so use physical mode. 171 * overflows, so use physical mode.
132 */ 172 */
173static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
174{
175#ifdef CONFIG_ACPI
176 /*
177 * Quirk: some x86_64 machines can only use physical APIC mode
178 * regardless of how many processors are present (x86_64 ES7000
179 * is an example).
180 */
181 if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
182 (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
183 printk(KERN_DEBUG "system APIC only can use physical flat");
184 return 1;
185 }
186#endif
187
188 return 0;
189}
133 190
134static cpumask_t physflat_target_cpus(void) 191static cpumask_t physflat_target_cpus(void)
135{ 192{
@@ -168,7 +225,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
168 * May as well be the first. 225 * May as well be the first.
169 */ 226 */
170 cpu = first_cpu(cpumask); 227 cpu = first_cpu(cpumask);
171 if ((unsigned)cpu < NR_CPUS) 228 if ((unsigned)cpu < nr_cpu_ids)
172 return per_cpu(x86_cpu_to_apicid, cpu); 229 return per_cpu(x86_cpu_to_apicid, cpu);
173 else 230 else
174 return BAD_APICID; 231 return BAD_APICID;
@@ -176,6 +233,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask)
176 233
177struct genapic apic_physflat = { 234struct genapic apic_physflat = {
178 .name = "physical flat", 235 .name = "physical flat",
236 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
179 .int_delivery_mode = dest_Fixed, 237 .int_delivery_mode = dest_Fixed,
180 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 238 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
181 .target_cpus = physflat_target_cpus, 239 .target_cpus = physflat_target_cpus,
@@ -185,6 +243,10 @@ struct genapic apic_physflat = {
185 .send_IPI_all = physflat_send_IPI_all, 243 .send_IPI_all = physflat_send_IPI_all,
186 .send_IPI_allbutself = physflat_send_IPI_allbutself, 244 .send_IPI_allbutself = physflat_send_IPI_allbutself,
187 .send_IPI_mask = physflat_send_IPI_mask, 245 .send_IPI_mask = physflat_send_IPI_mask,
246 .send_IPI_self = apic_send_IPI_self,
188 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, 247 .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid,
189 .phys_pkg_id = phys_pkg_id, 248 .phys_pkg_id = phys_pkg_id,
249 .get_apic_id = get_apic_id,
250 .set_apic_id = set_apic_id,
251 .apic_id_mask = (0xFFu<<24),
190}; 252};
diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c
new file mode 100644
index 000000000000..f6a2c8eb48a6
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_cluster.c
@@ -0,0 +1,159 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{
17 if (cpu_has_x2apic)
18 return 1;
19
20 return 0;
21}
22
23/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
24
25static cpumask_t x2apic_target_cpus(void)
26{
27 return cpumask_of_cpu(0);
28}
29
30/*
31 * for now each logical cpu is in its own vector allocation domain.
32 */
33static cpumask_t x2apic_vector_allocation_domain(int cpu)
34{
35 cpumask_t domain = CPU_MASK_NONE;
36 cpu_set(cpu, domain);
37 return domain;
38}
39
40static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
41 unsigned int dest)
42{
43 unsigned long cfg;
44
45 cfg = __prepare_ICR(0, vector, dest);
46
47 /*
48 * send the IPI.
49 */
50 x2apic_icr_write(cfg, apicid);
51}
52
53/*
 54 * for now, we send the IPIs one by one in the cpumask.
 55 * TBD: Based on the cpu mask, we can send the IPIs to the cluster group
 56 * at once. We have 16 CPUs in a cluster. This will minimize IPI register
57 * writes.
58 */
59static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
60{
61 unsigned long flags;
62 unsigned long query_cpu;
63
64 local_irq_save(flags);
65 for_each_cpu_mask(query_cpu, mask) {
66 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu),
67 vector, APIC_DEST_LOGICAL);
68 }
69 local_irq_restore(flags);
70}
71
72static void x2apic_send_IPI_allbutself(int vector)
73{
74 cpumask_t mask = cpu_online_map;
75
76 cpu_clear(smp_processor_id(), mask);
77
78 if (!cpus_empty(mask))
79 x2apic_send_IPI_mask(mask, vector);
80}
81
82static void x2apic_send_IPI_all(int vector)
83{
84 x2apic_send_IPI_mask(cpu_online_map, vector);
85}
86
87static int x2apic_apic_id_registered(void)
88{
89 return 1;
90}
91
92static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
93{
94 int cpu;
95
96 /*
97 * We're using fixed IRQ delivery, can only return one phys APIC ID.
98 * May as well be the first.
99 */
100 cpu = first_cpu(cpumask);
101 if ((unsigned)cpu < NR_CPUS)
102 return per_cpu(x86_cpu_to_logical_apicid, cpu);
103 else
104 return BAD_APICID;
105}
106
107static unsigned int get_apic_id(unsigned long x)
108{
109 unsigned int id;
110
111 id = x;
112 return id;
113}
114
115static unsigned long set_apic_id(unsigned int id)
116{
117 unsigned long x;
118
119 x = id;
120 return x;
121}
122
123static unsigned int phys_pkg_id(int index_msb)
124{
125 return current_cpu_data.initial_apicid >> index_msb;
126}
127
128static void x2apic_send_IPI_self(int vector)
129{
130 apic_write(APIC_SELF_IPI, vector);
131}
132
133static void init_x2apic_ldr(void)
134{
135 int cpu = smp_processor_id();
136
137 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR);
138 return;
139}
140
141struct genapic apic_x2apic_cluster = {
142 .name = "cluster x2apic",
143 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
144 .int_delivery_mode = dest_LowestPrio,
145 .int_dest_mode = (APIC_DEST_LOGICAL != 0),
146 .target_cpus = x2apic_target_cpus,
147 .vector_allocation_domain = x2apic_vector_allocation_domain,
148 .apic_id_registered = x2apic_apic_id_registered,
149 .init_apic_ldr = init_x2apic_ldr,
150 .send_IPI_all = x2apic_send_IPI_all,
151 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
152 .send_IPI_mask = x2apic_send_IPI_mask,
153 .send_IPI_self = x2apic_send_IPI_self,
154 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
155 .phys_pkg_id = phys_pkg_id,
156 .get_apic_id = get_apic_id,
157 .set_apic_id = set_apic_id,
158 .apic_id_mask = (0xFFFFFFFFu),
159};
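In the cluster driver above, __x2apic_send_IPI_dest() builds the low ICR word with __prepare_ICR() and passes the destination to x2apic_icr_write() separately; in x2APIC mode the two are combined into a single 64-bit MSR write with the destination in the upper 32 bits. That layout is a general description of x2APIC rather than something shown in this patch, so treat the sketch as an assumption:

#include <stdint.h>

/* Assemble the 64-bit value an x2APIC ICR MSR write would carry:
 * destination ID in bits 63:32, vector/delivery bits in bits 31:0. */
uint64_t x2apic_icr_value(uint32_t dest, uint32_t cfg_low)
{
	return ((uint64_t)dest << 32) | cfg_low;
}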
diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c
new file mode 100644
index 000000000000..d042211768b7
--- /dev/null
+++ b/arch/x86/kernel/genx2apic_phys.c
@@ -0,0 +1,154 @@
1#include <linux/threads.h>
2#include <linux/cpumask.h>
3#include <linux/string.h>
4#include <linux/kernel.h>
5#include <linux/ctype.h>
6#include <linux/init.h>
7#include <linux/dmar.h>
8
9#include <asm/smp.h>
10#include <asm/ipi.h>
11#include <asm/genapic.h>
12
13static int x2apic_phys;
14
15static int set_x2apic_phys_mode(char *arg)
16{
17 x2apic_phys = 1;
18 return 0;
19}
20early_param("x2apic_phys", set_x2apic_phys_mode);
21
22static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
23{
24 if (cpu_has_x2apic && x2apic_phys)
25 return 1;
26
27 return 0;
28}
29
30/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
31
32static cpumask_t x2apic_target_cpus(void)
33{
34 return cpumask_of_cpu(0);
35}
36
37static cpumask_t x2apic_vector_allocation_domain(int cpu)
38{
39 cpumask_t domain = CPU_MASK_NONE;
40 cpu_set(cpu, domain);
41 return domain;
42}
43
44static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
45 unsigned int dest)
46{
47 unsigned long cfg;
48
49 cfg = __prepare_ICR(0, vector, dest);
50
51 /*
52 * send the IPI.
53 */
54 x2apic_icr_write(cfg, apicid);
55}
56
57static void x2apic_send_IPI_mask(cpumask_t mask, int vector)
58{
59 unsigned long flags;
60 unsigned long query_cpu;
61
62 local_irq_save(flags);
63 for_each_cpu_mask(query_cpu, mask) {
64 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
65 vector, APIC_DEST_PHYSICAL);
66 }
67 local_irq_restore(flags);
68}
69
70static void x2apic_send_IPI_allbutself(int vector)
71{
72 cpumask_t mask = cpu_online_map;
73
74 cpu_clear(smp_processor_id(), mask);
75
76 if (!cpus_empty(mask))
77 x2apic_send_IPI_mask(mask, vector);
78}
79
80static void x2apic_send_IPI_all(int vector)
81{
82 x2apic_send_IPI_mask(cpu_online_map, vector);
83}
84
85static int x2apic_apic_id_registered(void)
86{
87 return 1;
88}
89
90static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask)
91{
92 int cpu;
93
94 /*
95 * We're using fixed IRQ delivery, can only return one phys APIC ID.
96 * May as well be the first.
97 */
98 cpu = first_cpu(cpumask);
99 if ((unsigned)cpu < NR_CPUS)
100 return per_cpu(x86_cpu_to_apicid, cpu);
101 else
102 return BAD_APICID;
103}
104
105static unsigned int get_apic_id(unsigned long x)
106{
107 unsigned int id;
108
109 id = x;
110 return id;
111}
112
113static unsigned long set_apic_id(unsigned int id)
114{
115 unsigned long x;
116
117 x = id;
118 return x;
119}
120
121static unsigned int phys_pkg_id(int index_msb)
122{
123 return current_cpu_data.initial_apicid >> index_msb;
124}
125
126void x2apic_send_IPI_self(int vector)
127{
128 apic_write(APIC_SELF_IPI, vector);
129}
130
131void init_x2apic_ldr(void)
132{
133 return;
134}
135
136struct genapic apic_x2apic_phys = {
137 .name = "physical x2apic",
138 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
139 .int_delivery_mode = dest_Fixed,
140 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
141 .target_cpus = x2apic_target_cpus,
142 .vector_allocation_domain = x2apic_vector_allocation_domain,
143 .apic_id_registered = x2apic_apic_id_registered,
144 .init_apic_ldr = init_x2apic_ldr,
145 .send_IPI_all = x2apic_send_IPI_all,
146 .send_IPI_allbutself = x2apic_send_IPI_allbutself,
147 .send_IPI_mask = x2apic_send_IPI_mask,
148 .send_IPI_self = x2apic_send_IPI_self,
149 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
150 .phys_pkg_id = phys_pkg_id,
151 .get_apic_id = get_apic_id,
152 .set_apic_id = set_apic_id,
153 .apic_id_mask = (0xFFFFFFFFu),
154};
diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c
index 711f11c30b06..680a06557c5e 100644
--- a/arch/x86/kernel/genx2apic_uv_x.c
+++ b/arch/x86/kernel/genx2apic_uv_x.c
@@ -12,18 +12,49 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/cpumask.h> 13#include <linux/cpumask.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/kernel.h>
16#include <linux/ctype.h> 15#include <linux/ctype.h>
17#include <linux/init.h> 16#include <linux/init.h>
18#include <linux/sched.h> 17#include <linux/sched.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
20#include <linux/module.h> 19#include <linux/module.h>
20#include <linux/hardirq.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22#include <asm/ipi.h> 22#include <asm/ipi.h>
23#include <asm/genapic.h> 23#include <asm/genapic.h>
24#include <asm/pgtable.h> 24#include <asm/pgtable.h>
25#include <asm/uv/uv_mmrs.h> 25#include <asm/uv/uv_mmrs.h>
26#include <asm/uv/uv_hub.h> 26#include <asm/uv/uv_hub.h>
27#include <asm/uv/bios.h>
28
29DEFINE_PER_CPU(int, x2apic_extra_bits);
30
31static enum uv_system_type uv_system_type;
32
33static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
34{
35 if (!strcmp(oem_id, "SGI")) {
36 if (!strcmp(oem_table_id, "UVL"))
37 uv_system_type = UV_LEGACY_APIC;
38 else if (!strcmp(oem_table_id, "UVX"))
39 uv_system_type = UV_X2APIC;
40 else if (!strcmp(oem_table_id, "UVH")) {
41 uv_system_type = UV_NON_UNIQUE_APIC;
42 return 1;
43 }
44 }
45 return 0;
46}
47
48enum uv_system_type get_uv_system_type(void)
49{
50 return uv_system_type;
51}
52
53int is_uv_system(void)
54{
55 return uv_system_type != UV_NONE;
56}
57EXPORT_SYMBOL_GPL(is_uv_system);
27 58
28DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 59DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
29EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); 60EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info);
@@ -40,6 +71,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade);
40short uv_possible_blades; 71short uv_possible_blades;
41EXPORT_SYMBOL_GPL(uv_possible_blades); 72EXPORT_SYMBOL_GPL(uv_possible_blades);
42 73
74unsigned long sn_rtc_cycles_per_second;
75EXPORT_SYMBOL(sn_rtc_cycles_per_second);
76
43/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ 77/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
44 78
45static cpumask_t uv_target_cpus(void) 79static cpumask_t uv_target_cpus(void)
@@ -80,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector)
80 unsigned long val, apicid, lapicid; 114 unsigned long val, apicid, lapicid;
81 int pnode; 115 int pnode;
82 116
83 apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ 117 apicid = per_cpu(x86_cpu_to_apicid, cpu);
84 lapicid = apicid & 0x3f; /* ZZZ macro needed */ 118 lapicid = apicid & 0x3f; /* ZZZ macro needed */
85 pnode = uv_apicid_to_pnode(apicid); 119 pnode = uv_apicid_to_pnode(apicid);
86 val = 120 val =
@@ -94,7 +128,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector)
94{ 128{
95 unsigned int cpu; 129 unsigned int cpu;
96 130
97 for (cpu = 0; cpu < NR_CPUS; ++cpu) 131 for_each_possible_cpu(cpu)
98 if (cpu_isset(cpu, mask)) 132 if (cpu_isset(cpu, mask))
99 uv_send_IPI_one(cpu, vector); 133 uv_send_IPI_one(cpu, vector);
100} 134}
@@ -119,6 +153,10 @@ static int uv_apic_id_registered(void)
119 return 1; 153 return 1;
120} 154}
121 155
156static void uv_init_apic_ldr(void)
157{
158}
159
122static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) 160static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
123{ 161{
124 int cpu; 162 int cpu;
@@ -128,37 +166,65 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask)
128 * May as well be the first. 166 * May as well be the first.
129 */ 167 */
130 cpu = first_cpu(cpumask); 168 cpu = first_cpu(cpumask);
131 if ((unsigned)cpu < NR_CPUS) 169 if ((unsigned)cpu < nr_cpu_ids)
132 return per_cpu(x86_cpu_to_apicid, cpu); 170 return per_cpu(x86_cpu_to_apicid, cpu);
133 else 171 else
134 return BAD_APICID; 172 return BAD_APICID;
135} 173}
136 174
175static unsigned int get_apic_id(unsigned long x)
176{
177 unsigned int id;
178
179 WARN_ON(preemptible() && num_online_cpus() > 1);
180 id = x | __get_cpu_var(x2apic_extra_bits);
181
182 return id;
183}
184
185static unsigned long set_apic_id(unsigned int id)
186{
187 unsigned long x;
188
189 /* maskout x2apic_extra_bits ? */
190 x = id;
191 return x;
192}
193
194static unsigned int uv_read_apic_id(void)
195{
196
197 return get_apic_id(apic_read(APIC_ID));
198}
199
137static unsigned int phys_pkg_id(int index_msb) 200static unsigned int phys_pkg_id(int index_msb)
138{ 201{
139 return GET_APIC_ID(read_apic_id()) >> index_msb; 202 return uv_read_apic_id() >> index_msb;
140} 203}
141 204
142#ifdef ZZZ /* Needs x2apic patch */
143static void uv_send_IPI_self(int vector) 205static void uv_send_IPI_self(int vector)
144{ 206{
145 apic_write(APIC_SELF_IPI, vector); 207 apic_write(APIC_SELF_IPI, vector);
146} 208}
147#endif
148 209
149struct genapic apic_x2apic_uv_x = { 210struct genapic apic_x2apic_uv_x = {
150 .name = "UV large system", 211 .name = "UV large system",
212 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
151 .int_delivery_mode = dest_Fixed, 213 .int_delivery_mode = dest_Fixed,
152 .int_dest_mode = (APIC_DEST_PHYSICAL != 0), 214 .int_dest_mode = (APIC_DEST_PHYSICAL != 0),
153 .target_cpus = uv_target_cpus, 215 .target_cpus = uv_target_cpus,
154 .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ 216 .vector_allocation_domain = uv_vector_allocation_domain,
155 .apic_id_registered = uv_apic_id_registered, 217 .apic_id_registered = uv_apic_id_registered,
218 .init_apic_ldr = uv_init_apic_ldr,
156 .send_IPI_all = uv_send_IPI_all, 219 .send_IPI_all = uv_send_IPI_all,
157 .send_IPI_allbutself = uv_send_IPI_allbutself, 220 .send_IPI_allbutself = uv_send_IPI_allbutself,
158 .send_IPI_mask = uv_send_IPI_mask, 221 .send_IPI_mask = uv_send_IPI_mask,
159 /* ZZZ.send_IPI_self = uv_send_IPI_self, */ 222 .send_IPI_self = uv_send_IPI_self,
160 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, 223 .cpu_mask_to_apicid = uv_cpu_mask_to_apicid,
161 .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ 224 .phys_pkg_id = phys_pkg_id,
225 .get_apic_id = get_apic_id,
226 .set_apic_id = set_apic_id,
227 .apic_id_mask = (0xFFFFFFFFu),
162}; 228};
163 229
164static __cpuinit void set_x2apic_extra_bits(int pnode) 230static __cpuinit void set_x2apic_extra_bits(int pnode)
@@ -218,12 +284,13 @@ static __init void map_low_mmrs(void)
218 284
219enum map_type {map_wb, map_uc}; 285enum map_type {map_wb, map_uc};
220 286
221static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) 287static __init void map_high(char *id, unsigned long base, int shift,
288 int max_pnode, enum map_type map_type)
222{ 289{
223 unsigned long bytes, paddr; 290 unsigned long bytes, paddr;
224 291
225 paddr = base << shift; 292 paddr = base << shift;
226 bytes = (1UL << shift); 293 bytes = (1UL << shift) * (max_pnode + 1);
227 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 294 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
228 paddr + bytes); 295 paddr + bytes);
229 if (map_type == map_uc) 296 if (map_type == map_uc)
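map_high() now takes max_pnode, so the mapped window scales with the number of pnodes instead of covering a single aperture: bytes = (1UL << shift) * (max_pnode + 1). A tiny illustration with made-up values (the base, shift, and pnode count below are not taken from real hardware):

#include <stdio.h>

int main(void)
{
	unsigned long long base = 0x3ULL, shift = 28, max_pnode = 15;	/* illustrative only */
	unsigned long long paddr = base << shift;
	unsigned long long bytes = (1ULL << shift) * (max_pnode + 1);

	printf("UV: Map GRU_HI 0x%llx - 0x%llx\n", paddr, paddr + bytes);
	return 0;
}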
@@ -239,7 +306,7 @@ static __init void map_gru_high(int max_pnode)
239 306
240 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 307 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
241 if (gru.s.enable) 308 if (gru.s.enable)
242 map_high("GRU", gru.s.base, shift, map_wb); 309 map_high("GRU", gru.s.base, shift, max_pnode, map_wb);
243} 310}
244 311
245static __init void map_config_high(int max_pnode) 312static __init void map_config_high(int max_pnode)
@@ -249,7 +316,7 @@ static __init void map_config_high(int max_pnode)
249 316
250 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); 317 cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR);
251 if (cfg.s.enable) 318 if (cfg.s.enable)
252 map_high("CONFIG", cfg.s.base, shift, map_uc); 319 map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc);
253} 320}
254 321
255static __init void map_mmr_high(int max_pnode) 322static __init void map_mmr_high(int max_pnode)
@@ -259,7 +326,7 @@ static __init void map_mmr_high(int max_pnode)
259 326
260 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 327 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
261 if (mmr.s.enable) 328 if (mmr.s.enable)
262 map_high("MMR", mmr.s.base, shift, map_uc); 329 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc);
263} 330}
264 331
265static __init void map_mmioh_high(int max_pnode) 332static __init void map_mmioh_high(int max_pnode)
@@ -269,10 +336,44 @@ static __init void map_mmioh_high(int max_pnode)
269 336
270 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 337 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
271 if (mmioh.s.enable) 338 if (mmioh.s.enable)
272 map_high("MMIOH", mmioh.s.base, shift, map_uc); 339 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc);
340}
341
342static __init void uv_rtc_init(void)
343{
344 long status;
345 u64 ticks_per_sec;
346
347 status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK,
348 &ticks_per_sec);
349 if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) {
350 printk(KERN_WARNING
351 "unable to determine platform RTC clock frequency, "
352 "guessing.\n");
353 /* BIOS gives wrong value for clock freq. so guess */
354 sn_rtc_cycles_per_second = 1000000000000UL / 30000UL;
355 } else
356 sn_rtc_cycles_per_second = ticks_per_sec;
273} 357}
274 358
275static __init void uv_system_init(void) 359/*
360 * Called on each cpu to initialize the per_cpu UV data area.
361 * ZZZ hotplug not supported yet
362 */
363void __cpuinit uv_cpu_init(void)
364{
365 /* CPU 0 initialization will be done via uv_system_init. */
366 if (!uv_blade_info)
367 return;
368
369 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
370
371 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
372 set_x2apic_extra_bits(uv_hub_info->pnode);
373}
374
375
376void __init uv_system_init(void)
276{ 377{
277 union uvh_si_addr_map_config_u m_n_config; 378 union uvh_si_addr_map_config_u m_n_config;
278 union uvh_node_id_u node_id; 379 union uvh_node_id_u node_id;
@@ -326,6 +427,11 @@ static __init void uv_system_init(void)
326 gnode_upper = (((unsigned long)node_id.s.node_id) & 427 gnode_upper = (((unsigned long)node_id.s.node_id) &
327 ~((1 << n_val) - 1)) << m_val; 428 ~((1 << n_val) - 1)) << m_val;
328 429
430 uv_bios_init();
431 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
432 &uv_coherency_id, &uv_region_size);
433 uv_rtc_init();
434
329 for_each_present_cpu(cpu) { 435 for_each_present_cpu(cpu) {
330 nid = cpu_to_node(cpu); 436 nid = cpu_to_node(cpu);
331 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 437 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu));
@@ -345,7 +451,7 @@ static __init void uv_system_init(void)
345 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; 451 uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
346 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 452 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
347 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 453 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
348 uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ 454 uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id;
349 uv_node_to_blade[nid] = blade; 455 uv_node_to_blade[nid] = blade;
350 uv_cpu_to_blade[cpu] = blade; 456 uv_cpu_to_blade[cpu] = blade;
351 max_pnode = max(pnode, max_pnode); 457 max_pnode = max(pnode, max_pnode);
@@ -360,19 +466,6 @@ static __init void uv_system_init(void)
360 map_mmr_high(max_pnode); 466 map_mmr_high(max_pnode);
361 map_config_high(max_pnode); 467 map_config_high(max_pnode);
362 map_mmioh_high(max_pnode); 468 map_mmioh_high(max_pnode);
363}
364 469
365/* 470 uv_cpu_init();
366 * Called on each cpu to initialize the per_cpu UV data area.
367 * ZZZ hotplug not supported yet
368 */
369void __cpuinit uv_cpu_init(void)
370{
371 if (!uv_node_to_blade)
372 uv_system_init();
373
374 uv_blade_info[uv_numa_blade_id()].nr_online_cpus++;
375
376 if (get_uv_system_type() == UV_NON_UNIQUE_APIC)
377 set_x2apic_extra_bits(uv_hub_info->pnode);
378} 471}
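When the BIOS query fails or returns an implausibly small value, uv_rtc_init() above falls back to a hard-coded guess; worked out, 1000000000000UL / 30000UL is 33,333,333 cycles per second, i.e. an assumed RTC rate of roughly 33.3 MHz. A trivial stand-alone check of that arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long long guess = 1000000000000ULL / 30000ULL;

        printf("%llu cycles/sec (~33.3 MHz)\n", guess); /* prints 33333333 */
        return 0;
}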
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..1dcb0f13897e 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -35,6 +35,7 @@ void __init reserve_ebda_region(void)
35 35
36 /* start of EBDA area */ 36 /* start of EBDA area */
37 ebda_addr = get_bios_ebda(); 37 ebda_addr = get_bios_ebda();
38 printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem);
38 39
39 /* Fixup: bios puts an EBDA in the top 64K segment */ 40 /* Fixup: bios puts an EBDA in the top 64K segment */
40 /* of conventional memory, but does not adjust lowmem. */ 41 /* of conventional memory, but does not adjust lowmem. */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c97819829146..d16084f90649 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata;
39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; 39static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly;
40#endif 40#endif
41 41
42void __init x86_64_init_pda(void)
43{
44 _cpu_pda = __cpu_pda;
45 cpu_pda(0) = &_boot_cpu_pda;
46 pda_init(0);
47}
48
42static void __init zap_identity_mappings(void) 49static void __init zap_identity_mappings(void)
43{ 50{
44 pgd_t *pgd = pgd_offset_k(0UL); 51 pgd_t *pgd = pgd_offset_k(0UL);
@@ -81,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
81 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); 88 BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL));
82 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == 89 BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) ==
83 (__START_KERNEL & PGDIR_MASK))); 90 (__START_KERNEL & PGDIR_MASK)));
91 BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
84 92
85 /* clear bss before set_intr_gate with early_idt_handler */ 93 /* clear bss before set_intr_gate with early_idt_handler */
86 clear_bss(); 94 clear_bss();
@@ -100,13 +108,10 @@ void __init x86_64_start_kernel(char * real_mode_data)
100 } 108 }
101 load_idt((const struct desc_ptr *)&idt_descr); 109 load_idt((const struct desc_ptr *)&idt_descr);
102 110
103 early_printk("Kernel alive\n"); 111 if (console_loglevel == 10)
104 112 early_printk("Kernel alive\n");
105 _cpu_pda = __cpu_pda;
106 cpu_pda(0) = &_boot_cpu_pda;
107 pda_init(0);
108 113
109 early_printk("Kernel really alive\n"); 114 x86_64_init_pda();
110 115
111 x86_64_start_reservations(real_mode_data); 116 x86_64_start_reservations(real_mode_data);
112} 117}
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f67e93441caf..e835b4eea70b 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4
172 * 172 *
173 * Note that the stack is not yet set up! 173 * Note that the stack is not yet set up!
174 */ 174 */
175#define PTE_ATTR 0x007 /* PRESENT+RW+USER */
176#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */
177#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */
178
179default_entry: 175default_entry:
180#ifdef CONFIG_X86_PAE 176#ifdef CONFIG_X86_PAE
181 177
@@ -196,9 +192,9 @@ default_entry:
196 movl $pa(pg0), %edi 192 movl $pa(pg0), %edi
197 movl %edi, pa(init_pg_tables_start) 193 movl %edi, pa(init_pg_tables_start)
198 movl $pa(swapper_pg_pmd), %edx 194 movl $pa(swapper_pg_pmd), %edx
199 movl $PTE_ATTR, %eax 195 movl $PTE_IDENT_ATTR, %eax
20010: 19610:
201 leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ 197 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
202 movl %ecx,(%edx) /* Store PMD entry */ 198 movl %ecx,(%edx) /* Store PMD entry */
203 /* Upper half already zero */ 199 /* Upper half already zero */
204 addl $8,%edx 200 addl $8,%edx
@@ -215,7 +211,7 @@ default_entry:
215 * End condition: we must map up to and including INIT_MAP_BEYOND_END 211 * End condition: we must map up to and including INIT_MAP_BEYOND_END
216 * bytes beyond the end of our own page tables. 212 * bytes beyond the end of our own page tables.
217 */ 213 */
218 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 214 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
219 cmpl %ebp,%eax 215 cmpl %ebp,%eax
220 jb 10b 216 jb 10b
2211: 2171:
@@ -224,7 +220,7 @@ default_entry:
224 movl %eax, pa(max_pfn_mapped) 220 movl %eax, pa(max_pfn_mapped)
225 221
226 /* Do early initialization of the fixmap area */ 222 /* Do early initialization of the fixmap area */
227 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 223 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
228 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 224 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8)
229#else /* Not PAE */ 225#else /* Not PAE */
230 226
@@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
233 movl $pa(pg0), %edi 229 movl $pa(pg0), %edi
234 movl %edi, pa(init_pg_tables_start) 230 movl %edi, pa(init_pg_tables_start)
235 movl $pa(swapper_pg_dir), %edx 231 movl $pa(swapper_pg_dir), %edx
236 movl $PTE_ATTR, %eax 232 movl $PTE_IDENT_ATTR, %eax
23710: 23310:
238 leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ 234 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
239 movl %ecx,(%edx) /* Store identity PDE entry */ 235 movl %ecx,(%edx) /* Store identity PDE entry */
240 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ 236 movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */
241 addl $4,%edx 237 addl $4,%edx
@@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
249 * bytes beyond the end of our own page tables; the +0x007 is 245 * bytes beyond the end of our own page tables; the +0x007 is
250 * the attribute bits 246 * the attribute bits
251 */ 247 */
252 leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp 248 leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp
253 cmpl %ebp,%eax 249 cmpl %ebp,%eax
254 jb 10b 250 jb 10b
255 movl %edi,pa(init_pg_tables_end) 251 movl %edi,pa(init_pg_tables_end)
@@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 253 movl %eax, pa(max_pfn_mapped)
258 254
259 /* Do early initialization of the fixmap area */ 255 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax 256 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 257 movl %eax,pa(swapper_pg_dir+0xffc)
262#endif 258#endif
263 jmp 3f 259 jmp 3f
@@ -456,9 +452,6 @@ is386: movl $2,%ecx # set MP
4561: 4521:
457#endif /* CONFIG_SMP */ 453#endif /* CONFIG_SMP */
458 jmp *(initial_code) 454 jmp *(initial_code)
459.align 4
460ENTRY(initial_code)
461 .long i386_start_kernel
462 455
463/* 456/*
464 * We depend on ET to be correct. This checks for 287/387. 457 * We depend on ET to be correct. This checks for 287/387.
@@ -601,6 +594,11 @@ ignore_int:
601#endif 594#endif
602 iret 595 iret
603 596
597.section .cpuinit.data,"wa"
598.align 4
599ENTRY(initial_code)
600 .long i386_start_kernel
601
604.section .text 602.section .text
605/* 603/*
606 * Real beginning of normal "text" segment 604 * Real beginning of normal "text" segment
@@ -632,19 +630,19 @@ ENTRY(empty_zero_page)
632 /* Page-aligned for the benefit of paravirt? */ 630 /* Page-aligned for the benefit of paravirt? */
633 .align PAGE_SIZE_asm 631 .align PAGE_SIZE_asm
634ENTRY(swapper_pg_dir) 632ENTRY(swapper_pg_dir)
635 .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ 633 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
636# if KPMDS == 3 634# if KPMDS == 3
637 .long pa(swapper_pg_pmd+PGD_ATTR),0 635 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
638 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 636 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
639 .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 637 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0
640# elif KPMDS == 2 638# elif KPMDS == 2
641 .long 0,0 639 .long 0,0
642 .long pa(swapper_pg_pmd+PGD_ATTR),0 640 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
643 .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 641 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0
644# elif KPMDS == 1 642# elif KPMDS == 1
645 .long 0,0 643 .long 0,0
646 .long 0,0 644 .long 0,0
647 .long pa(swapper_pg_pmd+PGD_ATTR),0 645 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0
648# else 646# else
649# error "Kernel PMDs should be 1, 2 or 3" 647# error "Kernel PMDs should be 1, 2 or 3"
650# endif 648# endif
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b07ac7b217cb..26cfdc1d7c7f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -110,7 +110,7 @@ startup_64:
110 movq %rdi, %rax 110 movq %rdi, %rax
111 shrq $PMD_SHIFT, %rax 111 shrq $PMD_SHIFT, %rax
112 andq $(PTRS_PER_PMD - 1), %rax 112 andq $(PTRS_PER_PMD - 1), %rax
113 leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx 113 leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
114 leaq level2_spare_pgt(%rip), %rbx 114 leaq level2_spare_pgt(%rip), %rbx
115 movq %rdx, 0(%rbx, %rax, 8) 115 movq %rdx, 0(%rbx, %rax, 8)
116ident_complete: 116ident_complete:
@@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt)
374 /* Since I easily can, map the first 1G. 374 /* Since I easily can, map the first 1G.
375 * Don't set NX because code runs from these pages. 375 * Don't set NX because code runs from these pages.
376 */ 376 */
377 PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) 377 PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
378 378
379NEXT_PAGE(level2_kernel_pgt) 379NEXT_PAGE(level2_kernel_pgt)
380 /* 380 /*
@@ -407,6 +407,7 @@ ENTRY(phys_base)
407 /* This must match the first entry in level2_kernel_pgt */ 407 /* This must match the first entry in level2_kernel_pgt */
408 .quad 0x0000000000000000 408 .quad 0x0000000000000000
409 409
410#include "../../x86/xen/xen-head.S"
410 411
411 .section .bss, "aw", @nobits 412 .section .bss, "aw", @nobits
412 .align L1_CACHE_BYTES 413 .align L1_CACHE_BYTES
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 0ea6a19bfdfe..77017e834cf7 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1,29 +1,49 @@
1#include <linux/clocksource.h> 1#include <linux/clocksource.h>
2#include <linux/clockchips.h> 2#include <linux/clockchips.h>
3#include <linux/interrupt.h>
4#include <linux/sysdev.h>
3#include <linux/delay.h> 5#include <linux/delay.h>
4#include <linux/errno.h> 6#include <linux/errno.h>
5#include <linux/hpet.h> 7#include <linux/hpet.h>
6#include <linux/init.h> 8#include <linux/init.h>
7#include <linux/sysdev.h> 9#include <linux/cpu.h>
8#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/io.h>
9 12
10#include <asm/fixmap.h> 13#include <asm/fixmap.h>
11#include <asm/hpet.h>
12#include <asm/i8253.h> 14#include <asm/i8253.h>
13#include <asm/io.h> 15#include <asm/hpet.h>
14 16
15#define HPET_MASK CLOCKSOURCE_MASK(32) 17#define HPET_MASK CLOCKSOURCE_MASK(32)
16#define HPET_SHIFT 22 18#define HPET_SHIFT 22
17 19
18/* FSEC = 10^-15 20/* FSEC = 10^-15
19 NSEC = 10^-9 */ 21 NSEC = 10^-9 */
20#define FSEC_PER_NSEC 1000000L 22#define FSEC_PER_NSEC 1000000L
23
24#define HPET_DEV_USED_BIT 2
25#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT)
26#define HPET_DEV_VALID 0x8
27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000
29
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
21 31
22/* 32/*
23 * HPET address is set in acpi/boot.c, when an ACPI entry exists 33 * HPET address is set in acpi/boot.c, when an ACPI entry exists
24 */ 34 */
25unsigned long hpet_address; 35unsigned long hpet_address;
26static void __iomem *hpet_virt_address; 36unsigned long hpet_num_timers;
37static void __iomem *hpet_virt_address;
38
39struct hpet_dev {
40 struct clock_event_device evt;
41 unsigned int num;
42 int cpu;
43 unsigned int irq;
44 unsigned int flags;
45 char name[10];
46};
27 47
28unsigned long hpet_readl(unsigned long a) 48unsigned long hpet_readl(unsigned long a)
29{ 49{
@@ -59,7 +79,7 @@ static inline void hpet_clear_mapping(void)
59static int boot_hpet_disable; 79static int boot_hpet_disable;
60int hpet_force_user; 80int hpet_force_user;
61 81
62static int __init hpet_setup(char* str) 82static int __init hpet_setup(char *str)
63{ 83{
64 if (str) { 84 if (str) {
65 if (!strncmp("disable", str, 7)) 85 if (!strncmp("disable", str, 7))
@@ -80,7 +100,7 @@ __setup("nohpet", disable_hpet);
80 100
81static inline int is_hpet_capable(void) 101static inline int is_hpet_capable(void)
82{ 102{
83 return (!boot_hpet_disable && hpet_address); 103 return !boot_hpet_disable && hpet_address;
84} 104}
85 105
86/* 106/*
@@ -102,6 +122,9 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled);
102 * timer 0 and timer 1 in case of RTC emulation. 122 * timer 0 and timer 1 in case of RTC emulation.
103 */ 123 */
104#ifdef CONFIG_HPET 124#ifdef CONFIG_HPET
125
126static void hpet_reserve_msi_timers(struct hpet_data *hd);
127
105static void hpet_reserve_platform_timers(unsigned long id) 128static void hpet_reserve_platform_timers(unsigned long id)
106{ 129{
107 struct hpet __iomem *hpet = hpet_virt_address; 130 struct hpet __iomem *hpet = hpet_virt_address;
@@ -111,25 +134,31 @@ static void hpet_reserve_platform_timers(unsigned long id)
111 134
112 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; 135 nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1;
113 136
114 memset(&hd, 0, sizeof (hd)); 137 memset(&hd, 0, sizeof(hd));
115 hd.hd_phys_address = hpet_address; 138 hd.hd_phys_address = hpet_address;
116 hd.hd_address = hpet; 139 hd.hd_address = hpet;
117 hd.hd_nirqs = nrtimers; 140 hd.hd_nirqs = nrtimers;
118 hd.hd_flags = HPET_DATA_PLATFORM;
119 hpet_reserve_timer(&hd, 0); 141 hpet_reserve_timer(&hd, 0);
120 142
121#ifdef CONFIG_HPET_EMULATE_RTC 143#ifdef CONFIG_HPET_EMULATE_RTC
122 hpet_reserve_timer(&hd, 1); 144 hpet_reserve_timer(&hd, 1);
123#endif 145#endif
124 146
147 /*
148 * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254
149 * is wrong for i8259!) not the output IRQ. Many BIOS writers
150 * don't bother configuring *any* comparator interrupts.
151 */
125 hd.hd_irq[0] = HPET_LEGACY_8254; 152 hd.hd_irq[0] = HPET_LEGACY_8254;
126 hd.hd_irq[1] = HPET_LEGACY_RTC; 153 hd.hd_irq[1] = HPET_LEGACY_RTC;
127 154
128 for (i = 2; i < nrtimers; timer++, i++) { 155 for (i = 2; i < nrtimers; timer++, i++) {
129 hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> 156 hd.hd_irq[i] = (readl(&timer->hpet_config) &
130 Tn_INT_ROUTE_CNF_SHIFT; 157 Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT;
131 } 158 }
132 159
160 hpet_reserve_msi_timers(&hd);
161
133 hpet_alloc(&hd); 162 hpet_alloc(&hd);
134 163
135} 164}
@@ -210,8 +239,8 @@ static void hpet_legacy_clockevent_register(void)
210 /* Calculate the min / max delta */ 239 /* Calculate the min / max delta */
211 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, 240 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
212 &hpet_clockevent); 241 &hpet_clockevent);
213 hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, 242 /* 5 usec minimum reprogramming delta. */
214 &hpet_clockevent); 243 hpet_clockevent.min_delta_ns = 5000;
215 244
216 /* 245 /*
217 * Start hpet with the boot cpu mask and make it 246 * Start hpet with the boot cpu mask and make it
@@ -223,63 +252,421 @@ static void hpet_legacy_clockevent_register(void)
223 printk(KERN_DEBUG "hpet clockevent registered\n"); 252 printk(KERN_DEBUG "hpet clockevent registered\n");
224} 253}
225 254
226static void hpet_legacy_set_mode(enum clock_event_mode mode, 255static int hpet_setup_msi_irq(unsigned int irq);
227 struct clock_event_device *evt) 256
257static void hpet_set_mode(enum clock_event_mode mode,
258 struct clock_event_device *evt, int timer)
228{ 259{
229 unsigned long cfg, cmp, now; 260 unsigned long cfg, cmp, now;
230 uint64_t delta; 261 uint64_t delta;
231 262
232 switch(mode) { 263 switch (mode) {
233 case CLOCK_EVT_MODE_PERIODIC: 264 case CLOCK_EVT_MODE_PERIODIC:
234 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; 265 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
235 delta >>= hpet_clockevent.shift; 266 delta >>= evt->shift;
236 now = hpet_readl(HPET_COUNTER); 267 now = hpet_readl(HPET_COUNTER);
237 cmp = now + (unsigned long) delta; 268 cmp = now + (unsigned long) delta;
238 cfg = hpet_readl(HPET_T0_CFG); 269 cfg = hpet_readl(HPET_Tn_CFG(timer));
239 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | 270 cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC |
240 HPET_TN_SETVAL | HPET_TN_32BIT; 271 HPET_TN_SETVAL | HPET_TN_32BIT;
241 hpet_writel(cfg, HPET_T0_CFG); 272 hpet_writel(cfg, HPET_Tn_CFG(timer));
242 /* 273 /*
243 * The first write after writing TN_SETVAL to the 274 * The first write after writing TN_SETVAL to the
244 * config register sets the counter value, the second 275 * config register sets the counter value, the second
245 * write sets the period. 276 * write sets the period.
246 */ 277 */
247 hpet_writel(cmp, HPET_T0_CMP); 278 hpet_writel(cmp, HPET_Tn_CMP(timer));
248 udelay(1); 279 udelay(1);
249 hpet_writel((unsigned long) delta, HPET_T0_CMP); 280 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer));
250 break; 281 break;
251 282
252 case CLOCK_EVT_MODE_ONESHOT: 283 case CLOCK_EVT_MODE_ONESHOT:
253 cfg = hpet_readl(HPET_T0_CFG); 284 cfg = hpet_readl(HPET_Tn_CFG(timer));
254 cfg &= ~HPET_TN_PERIODIC; 285 cfg &= ~HPET_TN_PERIODIC;
255 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; 286 cfg |= HPET_TN_ENABLE | HPET_TN_32BIT;
256 hpet_writel(cfg, HPET_T0_CFG); 287 hpet_writel(cfg, HPET_Tn_CFG(timer));
257 break; 288 break;
258 289
259 case CLOCK_EVT_MODE_UNUSED: 290 case CLOCK_EVT_MODE_UNUSED:
260 case CLOCK_EVT_MODE_SHUTDOWN: 291 case CLOCK_EVT_MODE_SHUTDOWN:
261 cfg = hpet_readl(HPET_T0_CFG); 292 cfg = hpet_readl(HPET_Tn_CFG(timer));
262 cfg &= ~HPET_TN_ENABLE; 293 cfg &= ~HPET_TN_ENABLE;
263 hpet_writel(cfg, HPET_T0_CFG); 294 hpet_writel(cfg, HPET_Tn_CFG(timer));
264 break; 295 break;
265 296
266 case CLOCK_EVT_MODE_RESUME: 297 case CLOCK_EVT_MODE_RESUME:
267 hpet_enable_legacy_int(); 298 if (timer == 0) {
299 hpet_enable_legacy_int();
300 } else {
301 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
302 hpet_setup_msi_irq(hdev->irq);
303 disable_irq(hdev->irq);
304 irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu));
305 enable_irq(hdev->irq);
306 }
268 break; 307 break;
269 } 308 }
270} 309}
271 310
272static int hpet_legacy_next_event(unsigned long delta, 311static int hpet_next_event(unsigned long delta,
273 struct clock_event_device *evt) 312 struct clock_event_device *evt, int timer)
274{ 313{
275 unsigned long cnt; 314 u32 cnt;
276 315
277 cnt = hpet_readl(HPET_COUNTER); 316 cnt = hpet_readl(HPET_COUNTER);
278 cnt += delta; 317 cnt += (u32) delta;
279 hpet_writel(cnt, HPET_T0_CMP); 318 hpet_writel(cnt, HPET_Tn_CMP(timer));
319
320 /*
321 * We need to read back the CMP register to make sure that
322 * what we wrote hit the chip before we compare it to the
323 * counter.
324 */
325 WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt);
326
327 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
328}
329
330static void hpet_legacy_set_mode(enum clock_event_mode mode,
331 struct clock_event_device *evt)
332{
333 hpet_set_mode(mode, evt, 0);
334}
335
336static int hpet_legacy_next_event(unsigned long delta,
337 struct clock_event_device *evt)
338{
339 return hpet_next_event(delta, evt, 0);
340}
341
342/*
343 * HPET MSI Support
344 */
345#ifdef CONFIG_PCI_MSI
346
347static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
348static struct hpet_dev *hpet_devs;
349
350void hpet_msi_unmask(unsigned int irq)
351{
352 struct hpet_dev *hdev = get_irq_data(irq);
353 unsigned long cfg;
354
355 /* unmask it */
356 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
357 cfg |= HPET_TN_FSB;
358 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
359}
360
361void hpet_msi_mask(unsigned int irq)
362{
363 unsigned long cfg;
364 struct hpet_dev *hdev = get_irq_data(irq);
365
366 /* mask it */
367 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
368 cfg &= ~HPET_TN_FSB;
369 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
370}
371
372void hpet_msi_write(unsigned int irq, struct msi_msg *msg)
373{
374 struct hpet_dev *hdev = get_irq_data(irq);
375
376 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
377 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
378}
379
380void hpet_msi_read(unsigned int irq, struct msi_msg *msg)
381{
382 struct hpet_dev *hdev = get_irq_data(irq);
383
384 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
385 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
386 msg->address_hi = 0;
387}
388
389static void hpet_msi_set_mode(enum clock_event_mode mode,
390 struct clock_event_device *evt)
391{
392 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
393 hpet_set_mode(mode, evt, hdev->num);
394}
395
396static int hpet_msi_next_event(unsigned long delta,
397 struct clock_event_device *evt)
398{
399 struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt);
400 return hpet_next_event(delta, evt, hdev->num);
401}
402
403static int hpet_setup_msi_irq(unsigned int irq)
404{
405 if (arch_setup_hpet_msi(irq)) {
406 destroy_irq(irq);
407 return -EINVAL;
408 }
409 return 0;
410}
411
412static int hpet_assign_irq(struct hpet_dev *dev)
413{
414 unsigned int irq;
415
416 irq = create_irq();
417 if (!irq)
418 return -EINVAL;
419
420 set_irq_data(irq, dev);
421
422 if (hpet_setup_msi_irq(irq))
423 return -EINVAL;
424
425 dev->irq = irq;
426 return 0;
427}
428
429static irqreturn_t hpet_interrupt_handler(int irq, void *data)
430{
431 struct hpet_dev *dev = (struct hpet_dev *)data;
432 struct clock_event_device *hevt = &dev->evt;
433
434 if (!hevt->event_handler) {
435 printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n",
436 dev->num);
437 return IRQ_HANDLED;
438 }
439
440 hevt->event_handler(hevt);
441 return IRQ_HANDLED;
442}
443
444static int hpet_setup_irq(struct hpet_dev *dev)
445{
446
447 if (request_irq(dev->irq, hpet_interrupt_handler,
448 IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev))
449 return -1;
450
451 disable_irq(dev->irq);
452 irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu));
453 enable_irq(dev->irq);
454
455 printk(KERN_DEBUG "hpet: %s irq %d for MSI\n",
456 dev->name, dev->irq);
457
458 return 0;
459}
460
461/* This should be called on the target @cpu */
462static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
463{
464 struct clock_event_device *evt = &hdev->evt;
465 uint64_t hpet_freq;
466
467 WARN_ON(cpu != smp_processor_id());
468 if (!(hdev->flags & HPET_DEV_VALID))
469 return;
280 470
281 return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; 471 if (hpet_setup_msi_irq(hdev->irq))
472 return;
473
474 hdev->cpu = cpu;
475 per_cpu(cpu_hpet_dev, cpu) = hdev;
476 evt->name = hdev->name;
477 hpet_setup_irq(hdev);
478 evt->irq = hdev->irq;
479
480 evt->rating = 110;
481 evt->features = CLOCK_EVT_FEAT_ONESHOT;
482 if (hdev->flags & HPET_DEV_PERI_CAP)
483 evt->features |= CLOCK_EVT_FEAT_PERIODIC;
484
485 evt->set_mode = hpet_msi_set_mode;
486 evt->set_next_event = hpet_msi_next_event;
487 evt->shift = 32;
488
489 /*
490 * The period is a femtosecond value. We need to calculate the
491 * scaled math multiplication factor for nanosecond to hpet tick
492 * conversion.
493 */
494 hpet_freq = 1000000000000000ULL;
495 do_div(hpet_freq, hpet_period);
496 evt->mult = div_sc((unsigned long) hpet_freq,
497 NSEC_PER_SEC, evt->shift);
498 /* Calculate the max delta */
499 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
500 /* 5 usec minimum reprogramming delta. */
501 evt->min_delta_ns = 5000;
502
503 evt->cpumask = cpumask_of_cpu(hdev->cpu);
504 clockevents_register_device(evt);
505}
506
507#ifdef CONFIG_HPET
508/* Reserve at least one timer for userspace (/dev/hpet) */
509#define RESERVE_TIMERS 1
510#else
511#define RESERVE_TIMERS 0
512#endif
513
514static void hpet_msi_capability_lookup(unsigned int start_timer)
515{
516 unsigned int id;
517 unsigned int num_timers;
518 unsigned int num_timers_used = 0;
519 int i;
520
521 id = hpet_readl(HPET_ID);
522
523 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
524 num_timers++; /* Value read out starts from 0 */
525
526 hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL);
527 if (!hpet_devs)
528 return;
529
530 hpet_num_timers = num_timers;
531
532 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
533 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
534 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i));
535
536 /* Only consider HPET timer with MSI support */
537 if (!(cfg & HPET_TN_FSB_CAP))
538 continue;
539
540 hdev->flags = 0;
541 if (cfg & HPET_TN_PERIODIC_CAP)
542 hdev->flags |= HPET_DEV_PERI_CAP;
543 hdev->num = i;
544
545 sprintf(hdev->name, "hpet%d", i);
546 if (hpet_assign_irq(hdev))
547 continue;
548
549 hdev->flags |= HPET_DEV_FSB_CAP;
550 hdev->flags |= HPET_DEV_VALID;
551 num_timers_used++;
552 if (num_timers_used == num_possible_cpus())
553 break;
554 }
555
556 printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n",
557 num_timers, num_timers_used);
558}
559
560#ifdef CONFIG_HPET
561static void hpet_reserve_msi_timers(struct hpet_data *hd)
562{
563 int i;
564
565 if (!hpet_devs)
566 return;
567
568 for (i = 0; i < hpet_num_timers; i++) {
569 struct hpet_dev *hdev = &hpet_devs[i];
570
571 if (!(hdev->flags & HPET_DEV_VALID))
572 continue;
573
574 hd->hd_irq[hdev->num] = hdev->irq;
575 hpet_reserve_timer(hd, hdev->num);
576 }
577}
578#endif
579
580static struct hpet_dev *hpet_get_unused_timer(void)
581{
582 int i;
583
584 if (!hpet_devs)
585 return NULL;
586
587 for (i = 0; i < hpet_num_timers; i++) {
588 struct hpet_dev *hdev = &hpet_devs[i];
589
590 if (!(hdev->flags & HPET_DEV_VALID))
591 continue;
592 if (test_and_set_bit(HPET_DEV_USED_BIT,
593 (unsigned long *)&hdev->flags))
594 continue;
595 return hdev;
596 }
597 return NULL;
598}
599
600struct hpet_work_struct {
601 struct delayed_work work;
602 struct completion complete;
603};
604
605static void hpet_work(struct work_struct *w)
606{
607 struct hpet_dev *hdev;
608 int cpu = smp_processor_id();
609 struct hpet_work_struct *hpet_work;
610
611 hpet_work = container_of(w, struct hpet_work_struct, work.work);
612
613 hdev = hpet_get_unused_timer();
614 if (hdev)
615 init_one_hpet_msi_clockevent(hdev, cpu);
616
617 complete(&hpet_work->complete);
618}
619
620static int hpet_cpuhp_notify(struct notifier_block *n,
621 unsigned long action, void *hcpu)
622{
623 unsigned long cpu = (unsigned long)hcpu;
624 struct hpet_work_struct work;
625 struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu);
626
627 switch (action & 0xf) {
628 case CPU_ONLINE:
629 INIT_DELAYED_WORK(&work.work, hpet_work);
630 init_completion(&work.complete);
631 /* FIXME: add schedule_work_on() */
632 schedule_delayed_work_on(cpu, &work.work, 0);
633 wait_for_completion(&work.complete);
634 break;
635 case CPU_DEAD:
636 if (hdev) {
637 free_irq(hdev->irq, hdev);
638 hdev->flags &= ~HPET_DEV_USED;
639 per_cpu(cpu_hpet_dev, cpu) = NULL;
640 }
641 break;
642 }
643 return NOTIFY_OK;
644}
645#else
646
647static int hpet_setup_msi_irq(unsigned int irq)
648{
649 return 0;
650}
651static void hpet_msi_capability_lookup(unsigned int start_timer)
652{
653 return;
654}
655
656#ifdef CONFIG_HPET
657static void hpet_reserve_msi_timers(struct hpet_data *hd)
658{
659 return;
282} 660}
661#endif
662
663static int hpet_cpuhp_notify(struct notifier_block *n,
664 unsigned long action, void *hcpu)
665{
666 return NOTIFY_OK;
667}
668
669#endif
283 670
284/* 671/*
285 * Clock source related code 672 * Clock source related code
@@ -359,6 +746,7 @@ static int hpet_clocksource_register(void)
359int __init hpet_enable(void) 746int __init hpet_enable(void)
360{ 747{
361 unsigned long id; 748 unsigned long id;
749 int i;
362 750
363 if (!is_hpet_capable()) 751 if (!is_hpet_capable())
364 return 0; 752 return 0;
@@ -369,6 +757,29 @@ int __init hpet_enable(void)
369 * Read the period and check for a sane value: 757 * Read the period and check for a sane value:
370 */ 758 */
371 hpet_period = hpet_readl(HPET_PERIOD); 759 hpet_period = hpet_readl(HPET_PERIOD);
760
761 /*
762 * AMD SB700 based systems with spread spectrum enabled use a
763 * SMM based HPET emulation to provide proper frequency
764 * setting. The SMM code is initialized with the first HPET
765 * register access and takes some time to complete. During
766 * this time the config register reads 0xffffffff. We check
767 * for max. 1000 loops whether the config register reads a non
768 * 0xffffffff value to make sure that HPET is up and running
769 * before we go further. A counting loop is safe, as the HPET
770 * access takes thousands of CPU cycles. On non SB700 based
771 * machines this check is only done once and has no side
772 * effects.
773 */
774 for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) {
775 if (i == 1000) {
776 printk(KERN_WARNING
777 "HPET config register value = 0xFFFFFFFF. "
778 "Disabling HPET\n");
779 goto out_nohpet;
780 }
781 }
782
372 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) 783 if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD)
373 goto out_nohpet; 784 goto out_nohpet;
374 785
@@ -392,8 +803,10 @@ int __init hpet_enable(void)
392 803
393 if (id & HPET_ID_LEGSUP) { 804 if (id & HPET_ID_LEGSUP) {
394 hpet_legacy_clockevent_register(); 805 hpet_legacy_clockevent_register();
806 hpet_msi_capability_lookup(2);
395 return 1; 807 return 1;
396 } 808 }
809 hpet_msi_capability_lookup(0);
397 return 0; 810 return 0;
398 811
399out_nohpet: 812out_nohpet:
@@ -410,6 +823,8 @@ out_nohpet:
410 */ 823 */
411static __init int hpet_late_init(void) 824static __init int hpet_late_init(void)
412{ 825{
826 int cpu;
827
413 if (boot_hpet_disable) 828 if (boot_hpet_disable)
414 return -ENODEV; 829 return -ENODEV;
415 830
@@ -425,6 +840,13 @@ static __init int hpet_late_init(void)
425 840
426 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 841 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
427 842
843 for_each_online_cpu(cpu) {
844 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
845 }
846
847 /* This notifier should be called after workqueue is ready */
848 hotcpu_notifier(hpet_cpuhp_notify, -20);
849
428 return 0; 850 return 0;
429} 851}
430fs_initcall(hpet_late_init); 852fs_initcall(hpet_late_init);
@@ -468,7 +890,7 @@ void hpet_disable(void)
468#define RTC_NUM_INTS 1 890#define RTC_NUM_INTS 1
469 891
470static unsigned long hpet_rtc_flags; 892static unsigned long hpet_rtc_flags;
471static unsigned long hpet_prev_update_sec; 893static int hpet_prev_update_sec;
472static struct rtc_time hpet_alarm_time; 894static struct rtc_time hpet_alarm_time;
473static unsigned long hpet_pie_count; 895static unsigned long hpet_pie_count;
474static unsigned long hpet_t1_cmp; 896static unsigned long hpet_t1_cmp;
@@ -575,6 +997,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask)
575 997
576 hpet_rtc_flags |= bit_mask; 998 hpet_rtc_flags |= bit_mask;
577 999
1000 if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE))
1001 hpet_prev_update_sec = -1;
1002
578 if (!oldbits) 1003 if (!oldbits)
579 hpet_rtc_timer_init(); 1004 hpet_rtc_timer_init();
580 1005
@@ -652,7 +1077,7 @@ static void hpet_rtc_timer_reinit(void)
652 if (hpet_rtc_flags & RTC_PIE) 1077 if (hpet_rtc_flags & RTC_PIE)
653 hpet_pie_count += lost_ints; 1078 hpet_pie_count += lost_ints;
654 if (printk_ratelimit()) 1079 if (printk_ratelimit())
655 printk(KERN_WARNING "rtc: lost %d interrupts\n", 1080 printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n",
656 lost_ints); 1081 lost_ints);
657 } 1082 }
658} 1083}
@@ -670,7 +1095,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id)
670 1095
671 if (hpet_rtc_flags & RTC_UIE && 1096 if (hpet_rtc_flags & RTC_UIE &&
672 curr_time.tm_sec != hpet_prev_update_sec) { 1097 curr_time.tm_sec != hpet_prev_update_sec) {
673 rtc_int_flag = RTC_UF; 1098 if (hpet_prev_update_sec >= 0)
1099 rtc_int_flag = RTC_UF;
674 hpet_prev_update_sec = curr_time.tm_sec; 1100 hpet_prev_update_sec = curr_time.tm_sec;
675 } 1101 }
676 1102
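The reworked hpet_next_event() above decides whether the freshly programmed comparator is already in the past using signed 32-bit arithmetic on the difference, which stays correct across a counter wrap. A self-contained sketch of that check (illustrative, not kernel code):

#include <stdint.h>
#include <stdio.h>

/* The HPET main counter and comparators are free-running 32-bit values;
 * the signed difference says whether the comparator has already been
 * reached, even when the counter wraps through zero. */
static int event_missed(uint32_t counter, uint32_t cmp)
{
        return (int32_t)(counter - cmp) >= 0;   /* >= 0: comparator already passed */
}

int main(void)
{
        printf("%d\n", event_missed(0x00000010u, 0xfffffff0u)); /* 1: counter wrapped past cmp */
        printf("%d\n", event_missed(0x00000010u, 0x00000100u)); /* 0: cmp still in the future */
        return 0;
}

The real function returns -ETIME in the first case and 0 otherwise.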
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index eb9ddd8efb82..1f20608d4ca8 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -21,9 +21,12 @@
21# include <asm/sigcontext32.h> 21# include <asm/sigcontext32.h>
22# include <asm/user32.h> 22# include <asm/user32.h>
23#else 23#else
24# define save_i387_ia32 save_i387 24# define save_i387_xstate_ia32 save_i387_xstate
25# define restore_i387_ia32 restore_i387 25# define restore_i387_xstate_ia32 restore_i387_xstate
26# define _fpstate_ia32 _fpstate 26# define _fpstate_ia32 _fpstate
27# define _xstate_ia32 _xstate
28# define sig_xstate_ia32_size sig_xstate_size
29# define fx_sw_reserved_ia32 fx_sw_reserved
27# define user_i387_ia32_struct user_i387_struct 30# define user_i387_ia32_struct user_i387_struct
28# define user32_fxsr_struct user_fxsr_struct 31# define user32_fxsr_struct user_fxsr_struct
29#endif 32#endif
@@ -36,6 +39,7 @@
36 39
37static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; 40static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;
38unsigned int xstate_size; 41unsigned int xstate_size;
42unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);
39static struct i387_fxsave_struct fx_scratch __cpuinitdata; 43static struct i387_fxsave_struct fx_scratch __cpuinitdata;
40 44
41void __cpuinit mxcsr_feature_mask_init(void) 45void __cpuinit mxcsr_feature_mask_init(void)
@@ -61,6 +65,11 @@ void __init init_thread_xstate(void)
61 return; 65 return;
62 } 66 }
63 67
68 if (cpu_has_xsave) {
69 xsave_cntxt_init();
70 return;
71 }
72
64 if (cpu_has_fxsr) 73 if (cpu_has_fxsr)
65 xstate_size = sizeof(struct i387_fxsave_struct); 74 xstate_size = sizeof(struct i387_fxsave_struct);
66#ifdef CONFIG_X86_32 75#ifdef CONFIG_X86_32
@@ -83,9 +92,19 @@ void __cpuinit fpu_init(void)
83 92
84 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 93 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
85 94
95 /*
96 * Boot processor to setup the FP and extended state context info.
97 */
98 if (!smp_processor_id())
99 init_thread_xstate();
100 xsave_init();
101
86 mxcsr_feature_mask_init(); 102 mxcsr_feature_mask_init();
87 /* clean state in init */ 103 /* clean state in init */
88 current_thread_info()->status = 0; 104 if (cpu_has_xsave)
105 current_thread_info()->status = TS_XSAVE;
106 else
107 current_thread_info()->status = 0;
89 clear_used_math(); 108 clear_used_math();
90} 109}
91#endif /* CONFIG_X86_64 */ 110#endif /* CONFIG_X86_64 */
@@ -195,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
195 */ 214 */
196 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 215 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
197 216
217 /*
218 * update the header bits in the xsave header, indicating the
219 * presence of FP and SSE state.
220 */
221 if (cpu_has_xsave)
222 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
223
198 return ret; 224 return ret;
199} 225}
200 226
@@ -395,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
395 if (!ret) 421 if (!ret)
396 convert_to_fxsr(target, &env); 422 convert_to_fxsr(target, &env);
397 423
424 /*
425 * update the header bit in the xsave header, indicating the
426 * presence of FP.
427 */
428 if (cpu_has_xsave)
429 target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
398 return ret; 430 return ret;
399} 431}
400 432
@@ -407,7 +439,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
407 struct task_struct *tsk = current; 439 struct task_struct *tsk = current;
408 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; 440 struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
409 441
410 unlazy_fpu(tsk);
411 fp->status = fp->swd; 442 fp->status = fp->swd;
412 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) 443 if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
413 return -1; 444 return -1;
@@ -421,8 +452,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
421 struct user_i387_ia32_struct env; 452 struct user_i387_ia32_struct env;
422 int err = 0; 453 int err = 0;
423 454
424 unlazy_fpu(tsk);
425
426 convert_from_fxsr(&env, tsk); 455 convert_from_fxsr(&env, tsk);
427 if (__copy_to_user(buf, &env, sizeof(env))) 456 if (__copy_to_user(buf, &env, sizeof(env)))
428 return -1; 457 return -1;
@@ -432,16 +461,54 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
432 if (err) 461 if (err)
433 return -1; 462 return -1;
434 463
435 if (__copy_to_user(&buf->_fxsr_env[0], fx, 464 if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size))
436 sizeof(struct i387_fxsave_struct))) 465 return -1;
466 return 1;
467}
468
469static int save_i387_xsave(void __user *buf)
470{
471 struct task_struct *tsk = current;
472 struct _fpstate_ia32 __user *fx = buf;
473 int err = 0;
474
475 /*
476 * For legacy compatibility, we always set FP/SSE bits in the bit
477 * vector while saving the state to the user context.
478 * This will let us capture any changes (during sigreturn) to
479 * the FP/SSE bits by the legacy applications which don't touch
480 * xstate_bv in the xsave header.
481 *
482 * xsave aware applications can change the xstate_bv in the xsave
483 * header as well as change any contents in the memory layout.
484 * xrestore as part of sigreturn will capture all the changes.
485 */
486 tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
487
488 if (save_i387_fxsave(fx) < 0)
489 return -1;
490
491 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32,
492 sizeof(struct _fpx_sw_bytes));
493 err |= __put_user(FP_XSTATE_MAGIC2,
494 (__u32 __user *) (buf + sig_xstate_ia32_size
495 - FP_XSTATE_MAGIC2_SIZE));
496 if (err)
437 return -1; 497 return -1;
498
438 return 1; 499 return 1;
439} 500}
440 501
441int save_i387_ia32(struct _fpstate_ia32 __user *buf) 502int save_i387_xstate_ia32(void __user *buf)
442{ 503{
504 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
505 struct task_struct *tsk = current;
506
443 if (!used_math()) 507 if (!used_math())
444 return 0; 508 return 0;
509
510 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size))
511 return -EACCES;
445 /* 512 /*
446 * This will cause a "finit" to be triggered by the next 513 * This will cause a "finit" to be triggered by the next
447 * attempted FPU operation by the 'current' process. 514 * attempted FPU operation by the 'current' process.
@@ -451,13 +518,17 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf)
451 if (!HAVE_HWFP) { 518 if (!HAVE_HWFP) {
452 return fpregs_soft_get(current, NULL, 519 return fpregs_soft_get(current, NULL,
453 0, sizeof(struct user_i387_ia32_struct), 520 0, sizeof(struct user_i387_ia32_struct),
454 NULL, buf) ? -1 : 1; 521 NULL, fp) ? -1 : 1;
455 } 522 }
456 523
524 unlazy_fpu(tsk);
525
526 if (cpu_has_xsave)
527 return save_i387_xsave(fp);
457 if (cpu_has_fxsr) 528 if (cpu_has_fxsr)
458 return save_i387_fxsave(buf); 529 return save_i387_fxsave(fp);
459 else 530 else
460 return save_i387_fsave(buf); 531 return save_i387_fsave(fp);
461} 532}
462 533
463static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) 534static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
@@ -468,14 +539,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
468 sizeof(struct i387_fsave_struct)); 539 sizeof(struct i387_fsave_struct));
469} 540}
470 541
471static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) 542static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
543 unsigned int size)
472{ 544{
473 struct task_struct *tsk = current; 545 struct task_struct *tsk = current;
474 struct user_i387_ia32_struct env; 546 struct user_i387_ia32_struct env;
475 int err; 547 int err;
476 548
477 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], 549 err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
478 sizeof(struct i387_fxsave_struct)); 550 size);
479 /* mxcsr reserved bits must be masked to zero for security reasons */ 551 /* mxcsr reserved bits must be masked to zero for security reasons */
480 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; 552 tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
481 if (err || __copy_from_user(&env, buf, sizeof(env))) 553 if (err || __copy_from_user(&env, buf, sizeof(env)))
@@ -485,14 +557,69 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf)
485 return 0; 557 return 0;
486} 558}
487 559
488int restore_i387_ia32(struct _fpstate_ia32 __user *buf) 560static int restore_i387_xsave(void __user *buf)
561{
562 struct _fpx_sw_bytes fx_sw_user;
563 struct _fpstate_ia32 __user *fx_user =
564 ((struct _fpstate_ia32 __user *) buf);
565 struct i387_fxsave_struct __user *fx =
566 (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
567 struct xsave_hdr_struct *xsave_hdr =
568 &current->thread.xstate->xsave.xsave_hdr;
569 u64 mask;
570 int err;
571
572 if (check_for_xstate(fx, buf, &fx_sw_user))
573 goto fx_only;
574
575 mask = fx_sw_user.xstate_bv;
576
577 err = restore_i387_fxsave(buf, fx_sw_user.xstate_size);
578
579 xsave_hdr->xstate_bv &= pcntxt_mask;
580 /*
581 * These bits must be zero.
582 */
583 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
584
585 /*
586 * Init the state that is not present in the memory layout
587 * and enabled by the OS.
588 */
589 mask = ~(pcntxt_mask & ~mask);
590 xsave_hdr->xstate_bv &= mask;
591
592 return err;
593fx_only:
594 /*
595 * Couldn't find the extended state information in the memory
596 * layout. Restore the FP/SSE and init the other extended state
597 * enabled by the OS.
598 */
599 xsave_hdr->xstate_bv = XSTATE_FPSSE;
600 return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct));
601}
602
603int restore_i387_xstate_ia32(void __user *buf)
489{ 604{
490 int err; 605 int err;
491 struct task_struct *tsk = current; 606 struct task_struct *tsk = current;
607 struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf;
492 608
493 if (HAVE_HWFP) 609 if (HAVE_HWFP)
494 clear_fpu(tsk); 610 clear_fpu(tsk);
495 611
612 if (!buf) {
613 if (used_math()) {
614 clear_fpu(tsk);
615 clear_used_math();
616 }
617
618 return 0;
619 } else
620 if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size))
621 return -EACCES;
622
496 if (!used_math()) { 623 if (!used_math()) {
497 err = init_fpu(tsk); 624 err = init_fpu(tsk);
498 if (err) 625 if (err)
@@ -500,14 +627,17 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf)
500 } 627 }
501 628
502 if (HAVE_HWFP) { 629 if (HAVE_HWFP) {
503 if (cpu_has_fxsr) 630 if (cpu_has_xsave)
504 err = restore_i387_fxsave(buf); 631 err = restore_i387_xsave(buf);
632 else if (cpu_has_fxsr)
633 err = restore_i387_fxsave(fp, sizeof(struct
634 i387_fxsave_struct));
505 else 635 else
506 err = restore_i387_fsave(buf); 636 err = restore_i387_fsave(fp);
507 } else { 637 } else {
508 err = fpregs_soft_set(current, NULL, 638 err = fpregs_soft_set(current, NULL,
509 0, sizeof(struct user_i387_ia32_struct), 639 0, sizeof(struct user_i387_ia32_struct),
510 NULL, buf) != 0; 640 NULL, fp) != 0;
511 } 641 }
512 set_used_math(); 642 set_used_math();
513 643
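The bit twiddling in the new restore_i387_xsave() is compact: any state the OS supports (pcntxt_mask) but the signal frame did not carry is cleared from the task's xsave header, so it gets re-initialized instead of inheriting stale contents. A stand-alone sketch of that masking with assumed bit values (not the kernel code):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t pcntxt_mask = 0x7;     /* assume OS supports FP | SSE | YMM */
        uint64_t user_bv     = 0x3;     /* signal frame carries FP | SSE only */
        uint64_t xstate_bv   = 0x7;     /* task's current xsave header */

        xstate_bv &= pcntxt_mask;                /* drop bits the OS does not enable */
        xstate_bv &= ~(pcntxt_mask & ~user_bv);  /* drop bits missing from the frame */

        printf("xstate_bv = %#llx\n", (unsigned long long)xstate_bv);  /* 0x3 */
        return 0;
}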
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index dc92b49d9204..4b8a53d841f7 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void)
282 282
283device_initcall(i8259A_init_sysfs); 283device_initcall(i8259A_init_sysfs);
284 284
285void mask_8259A(void)
286{
287 unsigned long flags;
288
289 spin_lock_irqsave(&i8259A_lock, flags);
290
291 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
292 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
293
294 spin_unlock_irqrestore(&i8259A_lock, flags);
295}
296
297void unmask_8259A(void)
298{
299 unsigned long flags;
300
301 spin_lock_irqsave(&i8259A_lock, flags);
302
303 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
304 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
305
306 spin_unlock_irqrestore(&i8259A_lock, flags);
307}
308
285void init_8259A(int auto_eoi) 309void init_8259A(int auto_eoi)
286{ 310{
287 unsigned long flags; 311 unsigned long flags;
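At the hardware level the two new helpers simply program the Interrupt Mask Register of each 8259A: writing 0xff masks every line, and unmask_8259A() restores the cached per-chip masks, both under i8259A_lock. A hypothetical userspace illustration of the same port writes (x86 Linux only, needs root; the port numbers are the standard PC values, everything else here is illustrative):

#include <sys/io.h>

#define PIC_MASTER_IMR  0x21
#define PIC_SLAVE_IMR   0xa1

int main(void)
{
        /* Gain access to the two IMR ports, then mask every line on both PICs. */
        if (ioperm(PIC_MASTER_IMR, 1, 1) || ioperm(PIC_SLAVE_IMR, 1, 1))
                return 1;

        outb(0xff, PIC_MASTER_IMR);     /* mask all of 8259A-1 */
        outb(0xff, PIC_SLAVE_IMR);      /* mask all of 8259A-2 */
        return 0;
}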
diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic.c
index 6510cde36b35..b764d7429c61 100644
--- a/arch/x86/kernel/io_apic_64.c
+++ b/arch/x86/kernel/io_apic.c
@@ -27,16 +27,21 @@
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/pci.h> 28#include <linux/pci.h>
29#include <linux/mc146818rtc.h> 29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
30#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h>
31#include <linux/sysdev.h> 33#include <linux/sysdev.h>
32#include <linux/msi.h> 34#include <linux/msi.h>
33#include <linux/htirq.h> 35#include <linux/htirq.h>
34#include <linux/dmar.h> 36#include <linux/freezer.h>
35#include <linux/jiffies.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */
36#ifdef CONFIG_ACPI 39#ifdef CONFIG_ACPI
37#include <acpi/acpi_bus.h> 40#include <acpi/acpi_bus.h>
38#endif 41#endif
39#include <linux/bootmem.h> 42#include <linux/bootmem.h>
43#include <linux/dmar.h>
44#include <linux/hpet.h>
40 45
41#include <asm/idle.h> 46#include <asm/idle.h>
42#include <asm/io.h> 47#include <asm/io.h>
@@ -45,62 +50,31 @@
45#include <asm/proto.h> 50#include <asm/proto.h>
46#include <asm/acpi.h> 51#include <asm/acpi.h>
47#include <asm/dma.h> 52#include <asm/dma.h>
53#include <asm/timer.h>
54#include <asm/i8259.h>
48#include <asm/nmi.h> 55#include <asm/nmi.h>
49#include <asm/msidef.h> 56#include <asm/msidef.h>
50#include <asm/hypertransport.h> 57#include <asm/hypertransport.h>
58#include <asm/setup.h>
59#include <asm/irq_remapping.h>
60#include <asm/hpet.h>
61#include <asm/uv/uv_hub.h>
62#include <asm/uv/uv_irq.h>
51 63
52#include <mach_ipi.h> 64#include <mach_ipi.h>
53#include <mach_apic.h> 65#include <mach_apic.h>
66#include <mach_apicdef.h>
54 67
55struct irq_cfg { 68#define __apicdebuginit(type) static type __init
56 cpumask_t domain;
57 cpumask_t old_domain;
58 unsigned move_cleanup_count;
59 u8 vector;
60 u8 move_in_progress : 1;
61};
62
63/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
64static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = {
65 [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
66 [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
67 [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
68 [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
69 [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
70 [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
71 [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
72 [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
73 [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
74 [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
75 [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
76 [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
77 [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
78 [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
79 [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
80 [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
81};
82
83static int assign_irq_vector(int irq, cpumask_t mask);
84 69
85int first_system_vector = 0xfe; 70/*
86 71 * Is the SiS APIC rmw bug present ?
87char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; 72 * -1 = don't know, 0 = no, 1 = yes
88 73 */
89#define __apicdebuginit __init 74int sis_apic_bug = -1;
90
91int sis_apic_bug; /* not actually supported, dummy for compile */
92
93static int no_timer_check;
94
95static int disable_timer_pin_1 __initdata;
96
97int timer_through_8259 __initdata;
98
99/* Where if anywhere is the i8259 connect in external int mode */
100static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
101 75
102static DEFINE_SPINLOCK(ioapic_lock); 76static DEFINE_SPINLOCK(ioapic_lock);
103DEFINE_SPINLOCK(vector_lock); 77static DEFINE_SPINLOCK(vector_lock);
104 78
105/* 79/*
106 * # of IRQ routing registers 80 * # of IRQ routing registers
@@ -117,11 +91,69 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
117/* # of MP IRQ source entries */ 91/* # of MP IRQ source entries */
118int mp_irq_entries; 92int mp_irq_entries;
119 93
94#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
95int mp_bus_id_to_type[MAX_MP_BUSSES];
96#endif
97
120DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); 98DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
121 99
100int skip_ioapic_setup;
101
102static int __init parse_noapic(char *str)
103{
104 /* disable IO-APIC */
105 disable_ioapic_setup();
106 return 0;
107}
108early_param("noapic", parse_noapic);
109
110struct irq_pin_list;
111struct irq_cfg {
112 unsigned int irq;
113 struct irq_pin_list *irq_2_pin;
114 cpumask_t domain;
115 cpumask_t old_domain;
116 unsigned move_cleanup_count;
117 u8 vector;
118 u8 move_in_progress : 1;
119};
120
121/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
122static struct irq_cfg irq_cfgx[NR_IRQS] = {
123 [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, },
124 [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, },
125 [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, },
126 [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, },
127 [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, },
128 [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, },
129 [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, },
130 [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, },
131 [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, },
132 [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, },
133 [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, },
134 [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, },
135 [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, },
136 [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, },
137 [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, },
138 [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, },
139};
140
141#define for_each_irq_cfg(irq, cfg) \
142 for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++)
143
144static struct irq_cfg *irq_cfg(unsigned int irq)
145{
146 return irq < nr_irqs ? irq_cfgx + irq : NULL;
147}
148
149static struct irq_cfg *irq_cfg_alloc(unsigned int irq)
150{
151 return irq_cfg(irq);
152}
153
122/* 154/*
123 * Rough estimation of how many shared IRQs there are, can 155 * Rough estimation of how many shared IRQs there are, can be changed
124 * be changed anytime. 156 * anytime.
125 */ 157 */
126#define MAX_PLUS_SHARED_IRQS NR_IRQS 158#define MAX_PLUS_SHARED_IRQS NR_IRQS
127#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) 159#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
@@ -133,9 +165,36 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
133 * between pins and IRQs. 165 * between pins and IRQs.
134 */ 166 */
135 167
136static struct irq_pin_list { 168struct irq_pin_list {
137 short apic, pin, next; 169 int apic, pin;
138} irq_2_pin[PIN_MAP_SIZE]; 170 struct irq_pin_list *next;
171};
172
173static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE];
174static struct irq_pin_list *irq_2_pin_ptr;
175
176static void __init irq_2_pin_init(void)
177{
178 struct irq_pin_list *pin = irq_2_pin_head;
179 int i;
180
181 for (i = 1; i < PIN_MAP_SIZE; i++)
182 pin[i-1].next = &pin[i];
183
184 irq_2_pin_ptr = &pin[0];
185}
186
187static struct irq_pin_list *get_one_free_irq_2_pin(void)
188{
189 struct irq_pin_list *pin = irq_2_pin_ptr;
190
191 if (!pin)
192 panic("can not get more irq_2_pin\n");
193
194 irq_2_pin_ptr = pin->next;
195 pin->next = NULL;
196 return pin;
197}
139 198
140struct io_apic { 199struct io_apic {
141 unsigned int index; 200 unsigned int index;
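The irq_2_pin rework above replaces index-chained entries in a fixed array with a pointer-chained free list: the static pool is linked together once at init time, and get_one_free_irq_2_pin() just pops the head. A minimal stand-alone model of that allocator pattern (not the kernel code):

#include <stdio.h>

#define POOL_SIZE 8

struct pin_entry {
        int apic, pin;
        struct pin_entry *next;
};

static struct pin_entry pool[POOL_SIZE];
static struct pin_entry *free_head;

static void pool_init(void)
{
        int i;

        for (i = 1; i < POOL_SIZE; i++)
                pool[i - 1].next = &pool[i];
        free_head = &pool[0];
}

static struct pin_entry *pool_get(void)
{
        struct pin_entry *e = free_head;

        if (!e)
                return NULL;            /* the kernel version panics instead */
        free_head = e->next;
        e->next = NULL;
        return e;
}

int main(void)
{
        pool_init();
        printf("first free entry: %p\n", (void *)pool_get());
        return 0;
}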
@@ -166,10 +225,15 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i
166/* 225/*
167 * Re-write a value: to be used for read-modify-write 226 * Re-write a value: to be used for read-modify-write
168 * cycles where the read already set up the index register. 227 * cycles where the read already set up the index register.
228 *
229 * Older SiS APIC requires we rewrite the index register
169 */ 230 */
170static inline void io_apic_modify(unsigned int apic, unsigned int value) 231static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
171{ 232{
172 struct io_apic __iomem *io_apic = io_apic_base(apic); 233 struct io_apic __iomem *io_apic = io_apic_base(apic);
234
235 if (sis_apic_bug)
236 writel(reg, &io_apic->index);
173 writel(value, &io_apic->data); 237 writel(value, &io_apic->data);
174} 238}
175 239
@@ -177,16 +241,17 @@ static bool io_apic_level_ack_pending(unsigned int irq)
177{ 241{
178 struct irq_pin_list *entry; 242 struct irq_pin_list *entry;
179 unsigned long flags; 243 unsigned long flags;
244 struct irq_cfg *cfg = irq_cfg(irq);
180 245
181 spin_lock_irqsave(&ioapic_lock, flags); 246 spin_lock_irqsave(&ioapic_lock, flags);
182 entry = irq_2_pin + irq; 247 entry = cfg->irq_2_pin;
183 for (;;) { 248 for (;;) {
184 unsigned int reg; 249 unsigned int reg;
185 int pin; 250 int pin;
186 251
187 pin = entry->pin; 252 if (!entry)
188 if (pin == -1)
189 break; 253 break;
254 pin = entry->pin;
190 reg = io_apic_read(entry->apic, 0x10 + pin*2); 255 reg = io_apic_read(entry->apic, 0x10 + pin*2);
191 /* Is the remote IRR bit set? */ 256 /* Is the remote IRR bit set? */
192 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 257 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
@@ -195,45 +260,13 @@ static bool io_apic_level_ack_pending(unsigned int irq)
195 } 260 }
196 if (!entry->next) 261 if (!entry->next)
197 break; 262 break;
198 entry = irq_2_pin + entry->next; 263 entry = entry->next;
199 } 264 }
200 spin_unlock_irqrestore(&ioapic_lock, flags); 265 spin_unlock_irqrestore(&ioapic_lock, flags);
201 266
202 return false; 267 return false;
203} 268}
204 269
205/*
206 * Synchronize the IO-APIC and the CPU by doing
207 * a dummy read from the IO-APIC
208 */
209static inline void io_apic_sync(unsigned int apic)
210{
211 struct io_apic __iomem *io_apic = io_apic_base(apic);
212 readl(&io_apic->data);
213}
214
215#define __DO_ACTION(R, ACTION, FINAL) \
216 \
217{ \
218 int pin; \
219 struct irq_pin_list *entry = irq_2_pin + irq; \
220 \
221 BUG_ON(irq >= NR_IRQS); \
222 for (;;) { \
223 unsigned int reg; \
224 pin = entry->pin; \
225 if (pin == -1) \
226 break; \
227 reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \
228 reg ACTION; \
229 io_apic_modify(entry->apic, reg); \
230 FINAL; \
231 if (!entry->next) \
232 break; \
233 entry = irq_2_pin + entry->next; \
234 } \
235}
236
237union entry_union { 270union entry_union {
238 struct { u32 w1, w2; }; 271 struct { u32 w1, w2; };
239 struct IO_APIC_route_entry entry; 272 struct IO_APIC_route_entry entry;
@@ -293,54 +326,71 @@ static void ioapic_mask_entry(int apic, int pin)
293static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) 326static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector)
294{ 327{
295 int apic, pin; 328 int apic, pin;
296 struct irq_pin_list *entry = irq_2_pin + irq; 329 struct irq_cfg *cfg;
330 struct irq_pin_list *entry;
297 331
298 BUG_ON(irq >= NR_IRQS); 332 cfg = irq_cfg(irq);
333 entry = cfg->irq_2_pin;
299 for (;;) { 334 for (;;) {
300 unsigned int reg; 335 unsigned int reg;
336
337 if (!entry)
338 break;
339
301 apic = entry->apic; 340 apic = entry->apic;
302 pin = entry->pin; 341 pin = entry->pin;
303 if (pin == -1) 342#ifdef CONFIG_INTR_REMAP
304 break; 343 /*
344 * With interrupt-remapping, destination information comes
345 * from interrupt-remapping table entry.
346 */
347 if (!irq_remapped(irq))
348 io_apic_write(apic, 0x11 + pin*2, dest);
349#else
305 io_apic_write(apic, 0x11 + pin*2, dest); 350 io_apic_write(apic, 0x11 + pin*2, dest);
351#endif
306 reg = io_apic_read(apic, 0x10 + pin*2); 352 reg = io_apic_read(apic, 0x10 + pin*2);
307 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 353 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
308 reg |= vector; 354 reg |= vector;
309 io_apic_modify(apic, reg); 355 io_apic_modify(apic, 0x10 + pin*2, reg);
310 if (!entry->next) 356 if (!entry->next)
311 break; 357 break;
312 entry = irq_2_pin + entry->next; 358 entry = entry->next;
313 } 359 }
314} 360}
315 361
362static int assign_irq_vector(int irq, cpumask_t mask);
363
316static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) 364static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
317{ 365{
318 struct irq_cfg *cfg = irq_cfg + irq; 366 struct irq_cfg *cfg;
319 unsigned long flags; 367 unsigned long flags;
320 unsigned int dest; 368 unsigned int dest;
321 cpumask_t tmp; 369 cpumask_t tmp;
370 struct irq_desc *desc;
322 371
323 cpus_and(tmp, mask, cpu_online_map); 372 cpus_and(tmp, mask, cpu_online_map);
324 if (cpus_empty(tmp)) 373 if (cpus_empty(tmp))
325 return; 374 return;
326 375
376 cfg = irq_cfg(irq);
327 if (assign_irq_vector(irq, mask)) 377 if (assign_irq_vector(irq, mask))
328 return; 378 return;
329 379
330 cpus_and(tmp, cfg->domain, mask); 380 cpus_and(tmp, cfg->domain, mask);
331 dest = cpu_mask_to_apicid(tmp); 381 dest = cpu_mask_to_apicid(tmp);
332
333 /* 382 /*
334 * Only the high 8 bits are valid. 383 * Only the high 8 bits are valid.
335 */ 384 */
336 dest = SET_APIC_LOGICAL_ID(dest); 385 dest = SET_APIC_LOGICAL_ID(dest);
337 386
387 desc = irq_to_desc(irq);
338 spin_lock_irqsave(&ioapic_lock, flags); 388 spin_lock_irqsave(&ioapic_lock, flags);
339 __target_IO_APIC_irq(irq, dest, cfg->vector); 389 __target_IO_APIC_irq(irq, dest, cfg->vector);
340 irq_desc[irq].affinity = mask; 390 desc->affinity = mask;
341 spin_unlock_irqrestore(&ioapic_lock, flags); 391 spin_unlock_irqrestore(&ioapic_lock, flags);
342} 392}
343#endif 393#endif /* CONFIG_SMP */
344 394
345/* 395/*
346 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are 396 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
@@ -349,19 +399,30 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
349 */ 399 */
350static void add_pin_to_irq(unsigned int irq, int apic, int pin) 400static void add_pin_to_irq(unsigned int irq, int apic, int pin)
351{ 401{
352 static int first_free_entry = NR_IRQS; 402 struct irq_cfg *cfg;
353 struct irq_pin_list *entry = irq_2_pin + irq; 403 struct irq_pin_list *entry;
404
 405	/* first time this irq's irq_cfg is referenced, so use the allocating lookup */
406 cfg = irq_cfg_alloc(irq);
407 entry = cfg->irq_2_pin;
408 if (!entry) {
409 entry = get_one_free_irq_2_pin();
410 cfg->irq_2_pin = entry;
411 entry->apic = apic;
412 entry->pin = pin;
413 return;
414 }
354 415
355 BUG_ON(irq >= NR_IRQS); 416 while (entry->next) {
356 while (entry->next) 417 /* not again, please */
357 entry = irq_2_pin + entry->next; 418 if (entry->apic == apic && entry->pin == pin)
419 return;
358 420
359 if (entry->pin != -1) { 421 entry = entry->next;
360 entry->next = first_free_entry;
361 entry = irq_2_pin + entry->next;
362 if (++first_free_entry >= PIN_MAP_SIZE)
363 panic("io_apic.c: ran out of irq_2_pin entries!");
364 } 422 }
423
424 entry->next = get_one_free_irq_2_pin();
425 entry = entry->next;
365 entry->apic = apic; 426 entry->apic = apic;
366 entry->pin = pin; 427 entry->pin = pin;
367} 428}
@@ -373,30 +434,86 @@ static void __init replace_pin_at_irq(unsigned int irq,
373 int oldapic, int oldpin, 434 int oldapic, int oldpin,
374 int newapic, int newpin) 435 int newapic, int newpin)
375{ 436{
376 struct irq_pin_list *entry = irq_2_pin + irq; 437 struct irq_cfg *cfg = irq_cfg(irq);
438 struct irq_pin_list *entry = cfg->irq_2_pin;
439 int replaced = 0;
377 440
378 while (1) { 441 while (entry) {
379 if (entry->apic == oldapic && entry->pin == oldpin) { 442 if (entry->apic == oldapic && entry->pin == oldpin) {
380 entry->apic = newapic; 443 entry->apic = newapic;
381 entry->pin = newpin; 444 entry->pin = newpin;
382 } 445 replaced = 1;
383 if (!entry->next) 446 /* every one is different, right? */
384 break; 447 break;
385 entry = irq_2_pin + entry->next; 448 }
449 entry = entry->next;
386 } 450 }
451
452 /* why? call replace before add? */
453 if (!replaced)
454 add_pin_to_irq(irq, newapic, newpin);
387} 455}
388 456
457static inline void io_apic_modify_irq(unsigned int irq,
458 int mask_and, int mask_or,
459 void (*final)(struct irq_pin_list *entry))
460{
461 int pin;
462 struct irq_cfg *cfg;
463 struct irq_pin_list *entry;
389 464
390#define DO_ACTION(name,R,ACTION, FINAL) \ 465 cfg = irq_cfg(irq);
391 \ 466 for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) {
392 static void name##_IO_APIC_irq (unsigned int irq) \ 467 unsigned int reg;
393 __DO_ACTION(R, ACTION, FINAL) 468 pin = entry->pin;
469 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
470 reg &= mask_and;
471 reg |= mask_or;
472 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
473 if (final)
474 final(entry);
475 }
476}
394 477
395/* mask = 1 */ 478static void __unmask_IO_APIC_irq(unsigned int irq)
396DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) 479{
480 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL);
481}
397 482
398/* mask = 0 */ 483#ifdef CONFIG_X86_64
399DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) 484void io_apic_sync(struct irq_pin_list *entry)
485{
486 /*
487 * Synchronize the IO-APIC and the CPU by doing
488 * a dummy read from the IO-APIC
489 */
490 struct io_apic __iomem *io_apic;
491 io_apic = io_apic_base(entry->apic);
492 readl(&io_apic->data);
493}
494
495static void __mask_IO_APIC_irq(unsigned int irq)
496{
497 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
498}
499#else /* CONFIG_X86_32 */
500static void __mask_IO_APIC_irq(unsigned int irq)
501{
502 io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL);
503}
504
505static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
506{
507 io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER,
508 IO_APIC_REDIR_MASKED, NULL);
509}
510
511static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
512{
513 io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED,
514 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
515}
516#endif /* CONFIG_X86_32 */
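A standalone sketch of the (reg & mask_and) | mask_or update that io_apic_modify_irq() applies to each pin; the two bit values are illustrative stand-ins for IO_APIC_REDIR_MASKED and IO_APIC_REDIR_LEVEL_TRIGGER, not taken from the headers:

#include <stdio.h>

/* Illustrative stand-ins for IO_APIC_REDIR_MASKED / IO_APIC_REDIR_LEVEL_TRIGGER. */
#define REDIR_MASKED		(1u << 16)
#define REDIR_LEVEL_TRIGGER	(1u << 15)

static unsigned int rmw(unsigned int reg, unsigned int mask_and, unsigned int mask_or)
{
	return (reg & mask_and) | mask_or;
}

int main(void)
{
	unsigned int reg = REDIR_LEVEL_TRIGGER;	/* unmasked, level triggered */

	reg = rmw(reg, ~REDIR_LEVEL_TRIGGER, REDIR_MASKED);	/* as __mask_and_edge_IO_APIC_irq() */
	reg = rmw(reg, ~REDIR_MASKED, REDIR_LEVEL_TRIGGER);	/* as __unmask_and_level_IO_APIC_irq() */
	printf("final RTE bits: %#x\n", reg);			/* back to level triggered, unmasked */
	return 0;
}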
400 517
401static void mask_IO_APIC_irq (unsigned int irq) 518static void mask_IO_APIC_irq (unsigned int irq)
402{ 519{
@@ -439,24 +556,145 @@ static void clear_IO_APIC (void)
439 clear_IO_APIC_pin(apic, pin); 556 clear_IO_APIC_pin(apic, pin);
440} 557}
441 558
442int skip_ioapic_setup; 559#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32)
443int ioapic_force; 560void send_IPI_self(int vector)
561{
562 unsigned int cfg;
444 563
445static int __init parse_noapic(char *str) 564 /*
565 * Wait for idle.
566 */
567 apic_wait_icr_idle();
568 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
569 /*
570 * Send the IPI. The write to APIC_ICR fires this off.
571 */
572 apic_write(APIC_ICR, cfg);
573}
 574#endif /* !CONFIG_SMP && CONFIG_X86_32 */
575
576#ifdef CONFIG_X86_32
577/*
 578 * support for broken MP BIOSes, enables hand-redirection of PIRQ0-7 to
579 * specific CPU-side IRQs.
580 */
581
582#define MAX_PIRQS 8
583static int pirq_entries [MAX_PIRQS];
584static int pirqs_enabled;
585
586static int __init ioapic_pirq_setup(char *str)
446{ 587{
447 disable_ioapic_setup(); 588 int i, max;
589 int ints[MAX_PIRQS+1];
590
591 get_options(str, ARRAY_SIZE(ints), ints);
592
593 for (i = 0; i < MAX_PIRQS; i++)
594 pirq_entries[i] = -1;
595
596 pirqs_enabled = 1;
597 apic_printk(APIC_VERBOSE, KERN_INFO
598 "PIRQ redirection, working around broken MP-BIOS.\n");
599 max = MAX_PIRQS;
600 if (ints[0] < MAX_PIRQS)
601 max = ints[0];
602
603 for (i = 0; i < max; i++) {
604 apic_printk(APIC_VERBOSE, KERN_DEBUG
605 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
606 /*
607 * PIRQs are mapped upside down, usually.
608 */
609 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
610 }
611 return 1;
612}
613
614__setup("pirq=", ioapic_pirq_setup);
615#endif /* CONFIG_X86_32 */
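A standalone model of the upside-down mapping done by ioapic_pirq_setup(); the command line value pirq=5,11 is a made-up example, and ints[0] carries the element count the way get_options() fills it:

#include <stdio.h>

#define MAX_PIRQS 8

int main(void)
{
	int pirq_entries[MAX_PIRQS];
	/* ints[0] holds the count, as get_options() would fill it for "pirq=5,11" */
	int ints[] = { 2, 5, 11 };
	int i;

	for (i = 0; i < MAX_PIRQS; i++)
		pirq_entries[i] = -1;
	for (i = 0; i < ints[0] && i < MAX_PIRQS; i++)
		pirq_entries[MAX_PIRQS - i - 1] = ints[i + 1];	/* mapped upside down */

	/* pin_2_irq() later consults pirq_entries[pin - 16] for pins 16..23 */
	for (i = 0; i < MAX_PIRQS; i++)
		printf("pirq_entries[%d] = %d\n", i, pirq_entries[i]);
	return 0;
}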
616
617#ifdef CONFIG_INTR_REMAP
618/* I/O APIC RTE contents at the OS boot up */
619static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS];
620
621/*
622 * Saves and masks all the unmasked IO-APIC RTE's
623 */
624int save_mask_IO_APIC_setup(void)
625{
626 union IO_APIC_reg_01 reg_01;
627 unsigned long flags;
628 int apic, pin;
629
630 /*
631 * The number of IO-APIC IRQ registers (== #pins):
632 */
633 for (apic = 0; apic < nr_ioapics; apic++) {
634 spin_lock_irqsave(&ioapic_lock, flags);
635 reg_01.raw = io_apic_read(apic, 1);
636 spin_unlock_irqrestore(&ioapic_lock, flags);
637 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
638 }
639
640 for (apic = 0; apic < nr_ioapics; apic++) {
641 early_ioapic_entries[apic] =
642 kzalloc(sizeof(struct IO_APIC_route_entry) *
643 nr_ioapic_registers[apic], GFP_KERNEL);
644 if (!early_ioapic_entries[apic])
645 goto nomem;
646 }
647
648 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
650 struct IO_APIC_route_entry entry;
651
652 entry = early_ioapic_entries[apic][pin] =
653 ioapic_read_entry(apic, pin);
654 if (!entry.mask) {
655 entry.mask = 1;
656 ioapic_write_entry(apic, pin, entry);
657 }
658 }
659
448 return 0; 660 return 0;
661
662nomem:
663 while (apic >= 0)
664 kfree(early_ioapic_entries[apic--]);
 665 memset(early_ioapic_entries, 0,
 666 sizeof(early_ioapic_entries));
667
668 return -ENOMEM;
449} 669}
450early_param("noapic", parse_noapic);
451 670
452/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ 671void restore_IO_APIC_setup(void)
453static int __init disable_timer_pin_setup(char *arg)
454{ 672{
455 disable_timer_pin_1 = 1; 673 int apic, pin;
456 return 1; 674
675 for (apic = 0; apic < nr_ioapics; apic++) {
676 if (!early_ioapic_entries[apic])
677 break;
678 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
679 ioapic_write_entry(apic, pin,
680 early_ioapic_entries[apic][pin]);
681 kfree(early_ioapic_entries[apic]);
682 early_ioapic_entries[apic] = NULL;
683 }
457} 684}
458__setup("disable_timer_pin_1", disable_timer_pin_setup);
459 685
686void reinit_intr_remapped_IO_APIC(int intr_remapping)
687{
688 /*
 689 * For now this is a plain restore of the previous settings.
 690 * TBD: in the case of the OS enabling interrupt-remapping,
 691 * IO-APIC RTEs need to be set up to point to interrupt-remapping
 692 * table entries; until then, do a plain restore and wait for
 693 * setup_IO_APIC_irqs() to do the proper initialization.
694 */
695 restore_IO_APIC_setup();
696}
697#endif
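A sketch of the expected pairing, assuming a caller in the interrupt-remapping enable path; only save_mask_IO_APIC_setup() and restore_IO_APIC_setup() come from this patch, everything else here is hypothetical:

	if (save_mask_IO_APIC_setup())
		return;				/* -ENOMEM: nothing was saved or masked */

	if (try_to_enable_remapping() < 0)	/* placeholder for the real enable step */
		restore_IO_APIC_setup();	/* rewrite the saved RTEs on failure */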
460 698
461/* 699/*
462 * Find the IRQ entry number of a certain pin. 700 * Find the IRQ entry number of a certain pin.
@@ -560,22 +798,54 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
560 best_guess = irq; 798 best_guess = irq;
561 } 799 }
562 } 800 }
563 BUG_ON(best_guess >= NR_IRQS);
564 return best_guess; 801 return best_guess;
565} 802}
566 803
804EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
805
806#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
807/*
808 * EISA Edge/Level control register, ELCR
809 */
810static int EISA_ELCR(unsigned int irq)
811{
812 if (irq < 16) {
813 unsigned int port = 0x4d0 + (irq >> 3);
814 return (inb(port) >> (irq & 7)) & 1;
815 }
816 apic_printk(APIC_VERBOSE, KERN_INFO
817 "Broken MPtable reports ISA irq %d\n", irq);
818 return 0;
819}
820
821#endif
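A standalone rework of the ELCR lookup above: for IRQ 10 the trigger bit lives in port 0x4d1, bit 2, and a set bit means level triggered (the IRQ value is an arbitrary example):

#include <stdio.h>

int main(void)
{
	unsigned int irq = 10;				/* arbitrary example IRQ */
	unsigned int port = 0x4d0 + (irq >> 3);		/* 0x4d0 covers IRQ 0-7, 0x4d1 covers 8-15 */
	unsigned int bit = irq & 7;			/* bit position within that port */

	printf("IRQ %u: ELCR port %#x, bit %u (1 = level, 0 = edge)\n", irq, port, bit);
	return 0;
}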
822
567/* ISA interrupts are always polarity zero edge triggered, 823/* ISA interrupts are always polarity zero edge triggered,
568 * when listed as conforming in the MP table. */ 824 * when listed as conforming in the MP table. */
569 825
570#define default_ISA_trigger(idx) (0) 826#define default_ISA_trigger(idx) (0)
571#define default_ISA_polarity(idx) (0) 827#define default_ISA_polarity(idx) (0)
572 828
829/* EISA interrupts are always polarity zero and can be edge or level
830 * trigger depending on the ELCR value. If an interrupt is listed as
831 * EISA conforming in the MP table, that means its trigger type must
832 * be read in from the ELCR */
833
834#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
835#define default_EISA_polarity(idx) default_ISA_polarity(idx)
836
573/* PCI interrupts are always polarity one level triggered, 837/* PCI interrupts are always polarity one level triggered,
574 * when listed as conforming in the MP table. */ 838 * when listed as conforming in the MP table. */
575 839
576#define default_PCI_trigger(idx) (1) 840#define default_PCI_trigger(idx) (1)
577#define default_PCI_polarity(idx) (1) 841#define default_PCI_polarity(idx) (1)
578 842
843/* MCA interrupts are always polarity zero level triggered,
844 * when listed as conforming in the MP table. */
845
846#define default_MCA_trigger(idx) (1)
847#define default_MCA_polarity(idx) default_ISA_polarity(idx)
848
579static int MPBIOS_polarity(int idx) 849static int MPBIOS_polarity(int idx)
580{ 850{
581 int bus = mp_irqs[idx].mp_srcbus; 851 int bus = mp_irqs[idx].mp_srcbus;
@@ -633,6 +903,36 @@ static int MPBIOS_trigger(int idx)
633 trigger = default_ISA_trigger(idx); 903 trigger = default_ISA_trigger(idx);
634 else 904 else
635 trigger = default_PCI_trigger(idx); 905 trigger = default_PCI_trigger(idx);
906#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
907 switch (mp_bus_id_to_type[bus]) {
908 case MP_BUS_ISA: /* ISA pin */
909 {
910 /* set before the switch */
911 break;
912 }
913 case MP_BUS_EISA: /* EISA pin */
914 {
915 trigger = default_EISA_trigger(idx);
916 break;
917 }
918 case MP_BUS_PCI: /* PCI pin */
919 {
920 /* set before the switch */
921 break;
922 }
923 case MP_BUS_MCA: /* MCA pin */
924 {
925 trigger = default_MCA_trigger(idx);
926 break;
927 }
928 default:
929 {
930 printk(KERN_WARNING "broken BIOS!!\n");
931 trigger = 1;
932 break;
933 }
934 }
935#endif
636 break; 936 break;
637 case 1: /* edge */ 937 case 1: /* edge */
638 { 938 {
@@ -670,6 +970,7 @@ static inline int irq_trigger(int idx)
670 return MPBIOS_trigger(idx); 970 return MPBIOS_trigger(idx);
671} 971}
672 972
973int (*ioapic_renumber_irq)(int ioapic, int irq);
673static int pin_2_irq(int idx, int apic, int pin) 974static int pin_2_irq(int idx, int apic, int pin)
674{ 975{
675 int irq, i; 976 int irq, i;
@@ -691,11 +992,48 @@ static int pin_2_irq(int idx, int apic, int pin)
691 while (i < apic) 992 while (i < apic)
692 irq += nr_ioapic_registers[i++]; 993 irq += nr_ioapic_registers[i++];
693 irq += pin; 994 irq += pin;
995 /*
996 * For MPS mode, so far only needed by ES7000 platform
997 */
998 if (ioapic_renumber_irq)
999 irq = ioapic_renumber_irq(apic, irq);
694 } 1000 }
695 BUG_ON(irq >= NR_IRQS); 1001
1002#ifdef CONFIG_X86_32
1003 /*
1004 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1005 */
1006 if ((pin >= 16) && (pin <= 23)) {
1007 if (pirq_entries[pin-16] != -1) {
1008 if (!pirq_entries[pin-16]) {
1009 apic_printk(APIC_VERBOSE, KERN_DEBUG
1010 "disabling PIRQ%d\n", pin-16);
1011 } else {
1012 irq = pirq_entries[pin-16];
1013 apic_printk(APIC_VERBOSE, KERN_DEBUG
1014 "using PIRQ%d -> IRQ %d\n",
1015 pin-16, irq);
1016 }
1017 }
1018 }
1019#endif
1020
696 return irq; 1021 return irq;
697} 1022}
698 1023
1024void lock_vector_lock(void)
1025{
 1026 /* Used so that the online set of cpus does not change
 1027 * during assign_irq_vector().
1028 */
1029 spin_lock(&vector_lock);
1030}
1031
1032void unlock_vector_lock(void)
1033{
1034 spin_unlock(&vector_lock);
1035}
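A sketch of the intended use, presumably from the secondary-CPU bring-up path (the call site is an assumption; only the three functions named here exist in this file):

	lock_vector_lock();
	__setup_vector_irq(smp_processor_id());	/* fill this cpu's vector_irq[] table */
	/* ...mark the cpu online while the vector assignments cannot change... */
	unlock_vector_lock();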
1036
699static int __assign_irq_vector(int irq, cpumask_t mask) 1037static int __assign_irq_vector(int irq, cpumask_t mask)
700{ 1038{
701 /* 1039 /*
@@ -714,8 +1052,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
714 int cpu; 1052 int cpu;
715 struct irq_cfg *cfg; 1053 struct irq_cfg *cfg;
716 1054
717 BUG_ON((unsigned)irq >= NR_IRQS); 1055 cfg = irq_cfg(irq);
718 cfg = &irq_cfg[irq];
719 1056
720 /* Only try and allocate irqs on cpus that are present */ 1057 /* Only try and allocate irqs on cpus that are present */
721 cpus_and(mask, mask, cpu_online_map); 1058 cpus_and(mask, mask, cpu_online_map);
@@ -731,7 +1068,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask)
731 return 0; 1068 return 0;
732 } 1069 }
733 1070
734 for_each_cpu_mask(cpu, mask) { 1071 for_each_cpu_mask_nr(cpu, mask) {
735 cpumask_t domain, new_mask; 1072 cpumask_t domain, new_mask;
736 int new_cpu; 1073 int new_cpu;
737 int vector, offset; 1074 int vector, offset;
@@ -750,9 +1087,14 @@ next:
750 } 1087 }
751 if (unlikely(current_vector == vector)) 1088 if (unlikely(current_vector == vector))
752 continue; 1089 continue;
1090#ifdef CONFIG_X86_64
753 if (vector == IA32_SYSCALL_VECTOR) 1091 if (vector == IA32_SYSCALL_VECTOR)
754 goto next; 1092 goto next;
755 for_each_cpu_mask(new_cpu, new_mask) 1093#else
1094 if (vector == SYSCALL_VECTOR)
1095 goto next;
1096#endif
1097 for_each_cpu_mask_nr(new_cpu, new_mask)
756 if (per_cpu(vector_irq, new_cpu)[vector] != -1) 1098 if (per_cpu(vector_irq, new_cpu)[vector] != -1)
757 goto next; 1099 goto next;
758 /* Found one! */ 1100 /* Found one! */
@@ -762,7 +1104,7 @@ next:
762 cfg->move_in_progress = 1; 1104 cfg->move_in_progress = 1;
763 cfg->old_domain = cfg->domain; 1105 cfg->old_domain = cfg->domain;
764 } 1106 }
765 for_each_cpu_mask(new_cpu, new_mask) 1107 for_each_cpu_mask_nr(new_cpu, new_mask)
766 per_cpu(vector_irq, new_cpu)[vector] = irq; 1108 per_cpu(vector_irq, new_cpu)[vector] = irq;
767 cfg->vector = vector; 1109 cfg->vector = vector;
768 cfg->domain = domain; 1110 cfg->domain = domain;
@@ -788,30 +1130,30 @@ static void __clear_irq_vector(int irq)
788 cpumask_t mask; 1130 cpumask_t mask;
789 int cpu, vector; 1131 int cpu, vector;
790 1132
791 BUG_ON((unsigned)irq >= NR_IRQS); 1133 cfg = irq_cfg(irq);
792 cfg = &irq_cfg[irq];
793 BUG_ON(!cfg->vector); 1134 BUG_ON(!cfg->vector);
794 1135
795 vector = cfg->vector; 1136 vector = cfg->vector;
796 cpus_and(mask, cfg->domain, cpu_online_map); 1137 cpus_and(mask, cfg->domain, cpu_online_map);
797 for_each_cpu_mask(cpu, mask) 1138 for_each_cpu_mask_nr(cpu, mask)
798 per_cpu(vector_irq, cpu)[vector] = -1; 1139 per_cpu(vector_irq, cpu)[vector] = -1;
799 1140
800 cfg->vector = 0; 1141 cfg->vector = 0;
801 cpus_clear(cfg->domain); 1142 cpus_clear(cfg->domain);
802} 1143}
803 1144
804static void __setup_vector_irq(int cpu) 1145void __setup_vector_irq(int cpu)
805{ 1146{
806 /* Initialize vector_irq on a new cpu */ 1147 /* Initialize vector_irq on a new cpu */
807 /* This function must be called with vector_lock held */ 1148 /* This function must be called with vector_lock held */
808 int irq, vector; 1149 int irq, vector;
1150 struct irq_cfg *cfg;
809 1151
810 /* Mark the inuse vectors */ 1152 /* Mark the inuse vectors */
811 for (irq = 0; irq < NR_IRQS; ++irq) { 1153 for_each_irq_cfg(irq, cfg) {
812 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1154 if (!cpu_isset(cpu, cfg->domain))
813 continue; 1155 continue;
814 vector = irq_cfg[irq].vector; 1156 vector = cfg->vector;
815 per_cpu(vector_irq, cpu)[vector] = irq; 1157 per_cpu(vector_irq, cpu)[vector] = irq;
816 } 1158 }
817 /* Mark the free vectors */ 1159 /* Mark the free vectors */
@@ -819,44 +1161,154 @@ static void __setup_vector_irq(int cpu)
819 irq = per_cpu(vector_irq, cpu)[vector]; 1161 irq = per_cpu(vector_irq, cpu)[vector];
820 if (irq < 0) 1162 if (irq < 0)
821 continue; 1163 continue;
822 if (!cpu_isset(cpu, irq_cfg[irq].domain)) 1164
1165 cfg = irq_cfg(irq);
1166 if (!cpu_isset(cpu, cfg->domain))
823 per_cpu(vector_irq, cpu)[vector] = -1; 1167 per_cpu(vector_irq, cpu)[vector] = -1;
824 } 1168 }
825} 1169}
826 1170
827void setup_vector_irq(int cpu) 1171static struct irq_chip ioapic_chip;
828{ 1172#ifdef CONFIG_INTR_REMAP
829 spin_lock(&vector_lock); 1173static struct irq_chip ir_ioapic_chip;
830 __setup_vector_irq(smp_processor_id()); 1174#endif
831 spin_unlock(&vector_lock);
832}
833 1175
1176#define IOAPIC_AUTO -1
1177#define IOAPIC_EDGE 0
1178#define IOAPIC_LEVEL 1
834 1179
835static struct irq_chip ioapic_chip; 1180#ifdef CONFIG_X86_32
1181static inline int IO_APIC_irq_trigger(int irq)
1182{
1183 int apic, idx, pin;
1184
1185 for (apic = 0; apic < nr_ioapics; apic++) {
1186 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1187 idx = find_irq_entry(apic, pin, mp_INT);
1188 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1189 return irq_trigger(idx);
1190 }
1191 }
1192 /*
 1193 * nonexistent IRQs default to edge triggering
1194 */
1195 return 0;
1196}
1197#else
1198static inline int IO_APIC_irq_trigger(int irq)
1199{
1200 return 1;
1201}
1202#endif
836 1203
837static void ioapic_register_intr(int irq, unsigned long trigger) 1204static void ioapic_register_intr(int irq, unsigned long trigger)
838{ 1205{
839 if (trigger) { 1206 struct irq_desc *desc;
840 irq_desc[irq].status |= IRQ_LEVEL; 1207
1208 desc = irq_to_desc(irq);
1209
1210 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1211 trigger == IOAPIC_LEVEL)
1212 desc->status |= IRQ_LEVEL;
1213 else
1214 desc->status &= ~IRQ_LEVEL;
1215
1216#ifdef CONFIG_INTR_REMAP
1217 if (irq_remapped(irq)) {
1218 desc->status |= IRQ_MOVE_PCNTXT;
1219 if (trigger)
1220 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1221 handle_fasteoi_irq,
1222 "fasteoi");
1223 else
1224 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1225 handle_edge_irq, "edge");
1226 return;
1227 }
1228#endif
1229 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1230 trigger == IOAPIC_LEVEL)
841 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
842 handle_fasteoi_irq, "fasteoi"); 1232 handle_fasteoi_irq,
843 } else { 1233 "fasteoi");
844 irq_desc[irq].status &= ~IRQ_LEVEL; 1234 else
845 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1235 set_irq_chip_and_handler_name(irq, &ioapic_chip,
846 handle_edge_irq, "edge"); 1236 handle_edge_irq, "edge");
1237}
1238
1239static int setup_ioapic_entry(int apic, int irq,
1240 struct IO_APIC_route_entry *entry,
1241 unsigned int destination, int trigger,
1242 int polarity, int vector)
1243{
1244 /*
1245 * add it to the IO-APIC irq-routing table:
1246 */
 1247 memset(entry, 0, sizeof(*entry));
1248
1249#ifdef CONFIG_INTR_REMAP
1250 if (intr_remapping_enabled) {
1251 struct intel_iommu *iommu = map_ioapic_to_ir(apic);
1252 struct irte irte;
1253 struct IR_IO_APIC_route_entry *ir_entry =
1254 (struct IR_IO_APIC_route_entry *) entry;
1255 int index;
1256
1257 if (!iommu)
1258 panic("No mapping iommu for ioapic %d\n", apic);
1259
1260 index = alloc_irte(iommu, irq, 1);
1261 if (index < 0)
1262 panic("Failed to allocate IRTE for ioapic %d\n", apic);
1263
1264 memset(&irte, 0, sizeof(irte));
1265
1266 irte.present = 1;
1267 irte.dst_mode = INT_DEST_MODE;
1268 irte.trigger_mode = trigger;
1269 irte.dlvry_mode = INT_DELIVERY_MODE;
1270 irte.vector = vector;
1271 irte.dest_id = IRTE_DEST(destination);
1272
1273 modify_irte(irq, &irte);
1274
1275 ir_entry->index2 = (index >> 15) & 0x1;
1276 ir_entry->zero = 0;
1277 ir_entry->format = 1;
1278 ir_entry->index = (index & 0x7fff);
1279 } else
1280#endif
1281 {
1282 entry->delivery_mode = INT_DELIVERY_MODE;
1283 entry->dest_mode = INT_DEST_MODE;
1284 entry->dest = destination;
847 } 1285 }
1286
1287 entry->mask = 0; /* enable IRQ */
1288 entry->trigger = trigger;
1289 entry->polarity = polarity;
1290 entry->vector = vector;
1291
1292 /* Mask level triggered irqs.
1293 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
1294 */
1295 if (trigger)
1296 entry->mask = 1;
1297 return 0;
848} 1298}
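For reference, a non-remapped, level-triggered PCI pin ends up with roughly the following fields after setup_ioapic_entry(); the values follow the defaults defined earlier in this file and are shown as a comment, not as added code:

	/*
	 *	entry->delivery_mode = INT_DELIVERY_MODE;
	 *	entry->dest_mode     = INT_DEST_MODE;
	 *	entry->dest          = destination;
	 *	entry->trigger       = 1;	level, per default_PCI_trigger()
	 *	entry->polarity      = 1;	active low, per default_PCI_polarity()
	 *	entry->vector        = vector;
	 *	entry->mask          = 1;	level irqs start masked
	 */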
849 1299
850static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, 1300static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
851 int trigger, int polarity) 1301 int trigger, int polarity)
852{ 1302{
853 struct irq_cfg *cfg = irq_cfg + irq; 1303 struct irq_cfg *cfg;
854 struct IO_APIC_route_entry entry; 1304 struct IO_APIC_route_entry entry;
855 cpumask_t mask; 1305 cpumask_t mask;
856 1306
857 if (!IO_APIC_IRQ(irq)) 1307 if (!IO_APIC_IRQ(irq))
858 return; 1308 return;
859 1309
1310 cfg = irq_cfg(irq);
1311
860 mask = TARGET_CPUS; 1312 mask = TARGET_CPUS;
861 if (assign_irq_vector(irq, mask)) 1313 if (assign_irq_vector(irq, mask))
862 return; 1314 return;
@@ -869,24 +1321,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
869 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, 1321 apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector,
870 irq, trigger, polarity); 1322 irq, trigger, polarity);
871 1323
872 /*
873 * add it to the IO-APIC irq-routing table:
874 */
875 memset(&entry,0,sizeof(entry));
876 1324
877 entry.delivery_mode = INT_DELIVERY_MODE; 1325 if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry,
878 entry.dest_mode = INT_DEST_MODE; 1326 cpu_mask_to_apicid(mask), trigger, polarity,
879 entry.dest = cpu_mask_to_apicid(mask); 1327 cfg->vector)) {
880 entry.mask = 0; /* enable IRQ */ 1328 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
881 entry.trigger = trigger; 1329 mp_ioapics[apic].mp_apicid, pin);
882 entry.polarity = polarity; 1330 __clear_irq_vector(irq);
883 entry.vector = cfg->vector; 1331 return;
884 1332 }
885 /* Mask level triggered irqs.
886 * Use IRQ_DELAYED_DISABLE for edge triggered irqs.
887 */
888 if (trigger)
889 entry.mask = 1;
890 1333
891 ioapic_register_intr(irq, trigger); 1334 ioapic_register_intr(irq, trigger);
892 if (irq < 16) 1335 if (irq < 16)
@@ -897,37 +1340,49 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq,
897 1340
898static void __init setup_IO_APIC_irqs(void) 1341static void __init setup_IO_APIC_irqs(void)
899{ 1342{
900 int apic, pin, idx, irq, first_notcon = 1; 1343 int apic, pin, idx, irq;
1344 int notcon = 0;
901 1345
902 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1346 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
903 1347
904 for (apic = 0; apic < nr_ioapics; apic++) { 1348 for (apic = 0; apic < nr_ioapics; apic++) {
905 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1349 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
906 1350
907 idx = find_irq_entry(apic,pin,mp_INT); 1351 idx = find_irq_entry(apic, pin, mp_INT);
908 if (idx == -1) { 1352 if (idx == -1) {
909 if (first_notcon) { 1353 if (!notcon) {
910 apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); 1354 notcon = 1;
911 first_notcon = 0; 1355 apic_printk(APIC_VERBOSE,
912 } else 1356 KERN_DEBUG " %d-%d",
913 apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); 1357 mp_ioapics[apic].mp_apicid,
914 continue; 1358 pin);
915 } 1359 } else
916 if (!first_notcon) { 1360 apic_printk(APIC_VERBOSE, " %d-%d",
917 apic_printk(APIC_VERBOSE, " not connected.\n"); 1361 mp_ioapics[apic].mp_apicid,
918 first_notcon = 1; 1362 pin);
919 } 1363 continue;
1364 }
1365 if (notcon) {
1366 apic_printk(APIC_VERBOSE,
1367 " (apicid-pin) not connected\n");
1368 notcon = 0;
1369 }
920 1370
921 irq = pin_2_irq(idx, apic, pin); 1371 irq = pin_2_irq(idx, apic, pin);
922 add_pin_to_irq(irq, apic, pin); 1372#ifdef CONFIG_X86_32
1373 if (multi_timer_check(apic, irq))
1374 continue;
1375#endif
1376 add_pin_to_irq(irq, apic, pin);
923 1377
924 setup_IO_APIC_irq(apic, pin, irq, 1378 setup_IO_APIC_irq(apic, pin, irq,
925 irq_trigger(idx), irq_polarity(idx)); 1379 irq_trigger(idx), irq_polarity(idx));
926 } 1380 }
927 } 1381 }
928 1382
929 if (!first_notcon) 1383 if (notcon)
930 apic_printk(APIC_VERBOSE, " not connected.\n"); 1384 apic_printk(APIC_VERBOSE,
1385 " (apicid-pin) not connected\n");
931} 1386}
932 1387
933/* 1388/*
@@ -938,6 +1393,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
938{ 1393{
939 struct IO_APIC_route_entry entry; 1394 struct IO_APIC_route_entry entry;
940 1395
1396#ifdef CONFIG_INTR_REMAP
1397 if (intr_remapping_enabled)
1398 return;
1399#endif
1400
941 memset(&entry, 0, sizeof(entry)); 1401 memset(&entry, 0, sizeof(entry));
942 1402
943 /* 1403 /*
@@ -964,13 +1424,17 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
964 ioapic_write_entry(apic, pin, entry); 1424 ioapic_write_entry(apic, pin, entry);
965} 1425}
966 1426
967void __apicdebuginit print_IO_APIC(void) 1427
1428__apicdebuginit(void) print_IO_APIC(void)
968{ 1429{
969 int apic, i; 1430 int apic, i;
970 union IO_APIC_reg_00 reg_00; 1431 union IO_APIC_reg_00 reg_00;
971 union IO_APIC_reg_01 reg_01; 1432 union IO_APIC_reg_01 reg_01;
972 union IO_APIC_reg_02 reg_02; 1433 union IO_APIC_reg_02 reg_02;
1434 union IO_APIC_reg_03 reg_03;
973 unsigned long flags; 1435 unsigned long flags;
1436 struct irq_cfg *cfg;
1437 unsigned int irq;
974 1438
975 if (apic_verbosity == APIC_QUIET) 1439 if (apic_verbosity == APIC_QUIET)
976 return; 1440 return;
@@ -993,12 +1457,16 @@ void __apicdebuginit print_IO_APIC(void)
993 reg_01.raw = io_apic_read(apic, 1); 1457 reg_01.raw = io_apic_read(apic, 1);
994 if (reg_01.bits.version >= 0x10) 1458 if (reg_01.bits.version >= 0x10)
995 reg_02.raw = io_apic_read(apic, 2); 1459 reg_02.raw = io_apic_read(apic, 2);
1460 if (reg_01.bits.version >= 0x20)
1461 reg_03.raw = io_apic_read(apic, 3);
996 spin_unlock_irqrestore(&ioapic_lock, flags); 1462 spin_unlock_irqrestore(&ioapic_lock, flags);
997 1463
998 printk("\n"); 1464 printk("\n");
999 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); 1465 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1000 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1466 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1001 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1467 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1468 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1469 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1002 1470
1003 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01); 1471 printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)&reg_01);
1004 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); 1472 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
@@ -1006,11 +1474,27 @@ void __apicdebuginit print_IO_APIC(void)
1006 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); 1474 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1007 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); 1475 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1008 1476
1009 if (reg_01.bits.version >= 0x10) { 1477 /*
1478 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02,
1479 * but the value of reg_02 is read as the previous read register
1480 * value, so ignore it if reg_02 == reg_01.
1481 */
1482 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1010 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); 1483 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1011 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); 1484 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1012 } 1485 }
1013 1486
1487 /*
1488 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1489 * or reg_03, but the value of reg_0[23] is read as the previous read
1490 * register value, so ignore it if reg_03 == reg_0[12].
1491 */
1492 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1493 reg_03.raw != reg_01.raw) {
1494 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1495 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1496 }
1497
1014 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1498 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1015 1499
1016 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1500 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
@@ -1039,16 +1523,16 @@ void __apicdebuginit print_IO_APIC(void)
1039 } 1523 }
1040 } 1524 }
1041 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1525 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1042 for (i = 0; i < NR_IRQS; i++) { 1526 for_each_irq_cfg(irq, cfg) {
1043 struct irq_pin_list *entry = irq_2_pin + i; 1527 struct irq_pin_list *entry = cfg->irq_2_pin;
1044 if (entry->pin < 0) 1528 if (!entry)
1045 continue; 1529 continue;
1046 printk(KERN_DEBUG "IRQ%d ", i); 1530 printk(KERN_DEBUG "IRQ%d ", irq);
1047 for (;;) { 1531 for (;;) {
1048 printk("-> %d:%d", entry->apic, entry->pin); 1532 printk("-> %d:%d", entry->apic, entry->pin);
1049 if (!entry->next) 1533 if (!entry->next)
1050 break; 1534 break;
1051 entry = irq_2_pin + entry->next; 1535 entry = entry->next;
1052 } 1536 }
1053 printk("\n"); 1537 printk("\n");
1054 } 1538 }
@@ -1058,9 +1542,7 @@ void __apicdebuginit print_IO_APIC(void)
1058 return; 1542 return;
1059} 1543}
1060 1544
1061#if 0 1545__apicdebuginit(void) print_APIC_bitfield(int base)
1062
1063static __apicdebuginit void print_APIC_bitfield (int base)
1064{ 1546{
1065 unsigned int v; 1547 unsigned int v;
1066 int i, j; 1548 int i, j;
@@ -1081,9 +1563,10 @@ static __apicdebuginit void print_APIC_bitfield (int base)
1081 } 1563 }
1082} 1564}
1083 1565
1084void __apicdebuginit print_local_APIC(void * dummy) 1566__apicdebuginit(void) print_local_APIC(void *dummy)
1085{ 1567{
1086 unsigned int v, ver, maxlvt; 1568 unsigned int v, ver, maxlvt;
1569 u64 icr;
1087 1570
1088 if (apic_verbosity == APIC_QUIET) 1571 if (apic_verbosity == APIC_QUIET)
1089 return; 1572 return;
@@ -1091,7 +1574,7 @@ void __apicdebuginit print_local_APIC(void * dummy)
1091 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1574 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1092 smp_processor_id(), hard_smp_processor_id()); 1575 smp_processor_id(), hard_smp_processor_id());
1093 v = apic_read(APIC_ID); 1576 v = apic_read(APIC_ID);
1094 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); 1577 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id());
1095 v = apic_read(APIC_LVR); 1578 v = apic_read(APIC_LVR);
1096 printk(KERN_INFO "... APIC VERSION: %08x\n", v); 1579 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1097 ver = GET_APIC_VERSION(v); 1580 ver = GET_APIC_VERSION(v);
@@ -1100,20 +1583,31 @@ void __apicdebuginit print_local_APIC(void * dummy)
1100 v = apic_read(APIC_TASKPRI); 1583 v = apic_read(APIC_TASKPRI);
1101 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); 1584 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1102 1585
1103 v = apic_read(APIC_ARBPRI); 1586 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1104 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, 1587 if (!APIC_XAPIC(ver)) {
1105 v & APIC_ARBPRI_MASK); 1588 v = apic_read(APIC_ARBPRI);
1106 v = apic_read(APIC_PROCPRI); 1589 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1107 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); 1590 v & APIC_ARBPRI_MASK);
1591 }
1592 v = apic_read(APIC_PROCPRI);
1593 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1594 }
1595
1596 /*
1597 * Remote read supported only in the 82489DX and local APIC for
1598 * Pentium processors.
1599 */
1600 if (!APIC_INTEGRATED(ver) || maxlvt == 3) {
1601 v = apic_read(APIC_RRR);
1602 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1603 }
1108 1604
1109 v = apic_read(APIC_EOI);
1110 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1111 v = apic_read(APIC_RRR);
1112 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1113 v = apic_read(APIC_LDR); 1605 v = apic_read(APIC_LDR);
1114 printk(KERN_DEBUG "... APIC LDR: %08x\n", v); 1606 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1115 v = apic_read(APIC_DFR); 1607 if (!x2apic_enabled()) {
1116 printk(KERN_DEBUG "... APIC DFR: %08x\n", v); 1608 v = apic_read(APIC_DFR);
1609 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1610 }
1117 v = apic_read(APIC_SPIV); 1611 v = apic_read(APIC_SPIV);
1118 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); 1612 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1119 1613
@@ -1124,13 +1618,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1124 printk(KERN_DEBUG "... APIC IRR field:\n"); 1618 printk(KERN_DEBUG "... APIC IRR field:\n");
1125 print_APIC_bitfield(APIC_IRR); 1619 print_APIC_bitfield(APIC_IRR);
1126 1620
1127 v = apic_read(APIC_ESR); 1621 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1128 printk(KERN_DEBUG "... APIC ESR: %08x\n", v); 1622 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1623 apic_write(APIC_ESR, 0);
1129 1624
1130 v = apic_read(APIC_ICR); 1625 v = apic_read(APIC_ESR);
1131 printk(KERN_DEBUG "... APIC ICR: %08x\n", v); 1626 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1132 v = apic_read(APIC_ICR2); 1627 }
1133 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); 1628
1629 icr = apic_icr_read();
1630 printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr);
1631 printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32));
1134 1632
1135 v = apic_read(APIC_LVTT); 1633 v = apic_read(APIC_LVTT);
1136 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); 1634 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
@@ -1158,12 +1656,17 @@ void __apicdebuginit print_local_APIC(void * dummy)
1158 printk("\n"); 1656 printk("\n");
1159} 1657}
1160 1658
1161void print_all_local_APICs (void) 1659__apicdebuginit(void) print_all_local_APICs(void)
1162{ 1660{
1163 on_each_cpu(print_local_APIC, NULL, 1); 1661 int cpu;
1662
1663 preempt_disable();
1664 for_each_online_cpu(cpu)
1665 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1666 preempt_enable();
1164} 1667}
1165 1668
1166void __apicdebuginit print_PIC(void) 1669__apicdebuginit(void) print_PIC(void)
1167{ 1670{
1168 unsigned int v; 1671 unsigned int v;
1169 unsigned long flags; 1672 unsigned long flags;
@@ -1195,19 +1698,34 @@ void __apicdebuginit print_PIC(void)
1195 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1698 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1196} 1699}
1197 1700
1198#endif /* 0 */ 1701__apicdebuginit(int) print_all_ICs(void)
1702{
1703 print_PIC();
1704 print_all_local_APICs();
1705 print_IO_APIC();
1706
1707 return 0;
1708}
1709
1710fs_initcall(print_all_ICs);
1711
1712
1713/* Where if anywhere is the i8259 connect in external int mode */
1714static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
1199 1715
1200void __init enable_IO_APIC(void) 1716void __init enable_IO_APIC(void)
1201{ 1717{
1202 union IO_APIC_reg_01 reg_01; 1718 union IO_APIC_reg_01 reg_01;
1203 int i8259_apic, i8259_pin; 1719 int i8259_apic, i8259_pin;
1204 int i, apic; 1720 int apic;
1205 unsigned long flags; 1721 unsigned long flags;
1206 1722
1207 for (i = 0; i < PIN_MAP_SIZE; i++) { 1723#ifdef CONFIG_X86_32
1208 irq_2_pin[i].pin = -1; 1724 int i;
1209 irq_2_pin[i].next = 0; 1725 if (!pirqs_enabled)
1210 } 1726 for (i = 0; i < MAX_PIRQS; i++)
1727 pirq_entries[i] = -1;
1728#endif
1211 1729
1212 /* 1730 /*
1213 * The number of IO-APIC IRQ registers (== #pins): 1731 * The number of IO-APIC IRQ registers (== #pins):
@@ -1237,6 +1755,10 @@ void __init enable_IO_APIC(void)
1237 } 1755 }
1238 found_i8259: 1756 found_i8259:
1239 /* Look to see what if the MP table has reported the ExtINT */ 1757 /* Look to see what if the MP table has reported the ExtINT */
1758 /* If we could not find the appropriate pin by looking at the ioapic
1759 * the i8259 probably is not connected the ioapic but give the
1760 * mptable a chance anyway.
1761 */
1240 i8259_pin = find_isa_irq_pin(0, mp_ExtINT); 1762 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1241 i8259_apic = find_isa_irq_apic(0, mp_ExtINT); 1763 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1242 /* Trust the MP table if nothing is setup in the hardware */ 1764 /* Trust the MP table if nothing is setup in the hardware */
@@ -1285,7 +1807,7 @@ void disable_IO_APIC(void)
1285 entry.dest_mode = 0; /* Physical */ 1807 entry.dest_mode = 0; /* Physical */
1286 entry.delivery_mode = dest_ExtINT; /* ExtInt */ 1808 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1287 entry.vector = 0; 1809 entry.vector = 0;
1288 entry.dest = GET_APIC_ID(read_apic_id()); 1810 entry.dest = read_apic_id();
1289 1811
1290 /* 1812 /*
1291 * Add it to the IO-APIC irq-routing table: 1813 * Add it to the IO-APIC irq-routing table:
@@ -1296,6 +1818,133 @@ void disable_IO_APIC(void)
1296 disconnect_bsp_APIC(ioapic_i8259.pin != -1); 1818 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1297} 1819}
1298 1820
1821#ifdef CONFIG_X86_32
1822/*
1823 * function to set the IO-APIC physical IDs based on the
1824 * values stored in the MPC table.
1825 *
1826 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1827 */
1828
1829static void __init setup_ioapic_ids_from_mpc(void)
1830{
1831 union IO_APIC_reg_00 reg_00;
1832 physid_mask_t phys_id_present_map;
1833 int apic;
1834 int i;
1835 unsigned char old_id;
1836 unsigned long flags;
1837
1838 if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids())
1839 return;
1840
1841 /*
1842 * Don't check I/O APIC IDs for xAPIC systems. They have
1843 * no meaning without the serial APIC bus.
1844 */
1845 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1846 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1847 return;
1848 /*
1849 * This is broken; anything with a real cpu count has to
1850 * circumvent this idiocy regardless.
1851 */
1852 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1853
1854 /*
1855 * Set the IOAPIC ID to the value stored in the MPC table.
1856 */
1857 for (apic = 0; apic < nr_ioapics; apic++) {
1858
1859 /* Read the register 0 value */
1860 spin_lock_irqsave(&ioapic_lock, flags);
1861 reg_00.raw = io_apic_read(apic, 0);
1862 spin_unlock_irqrestore(&ioapic_lock, flags);
1863
1864 old_id = mp_ioapics[apic].mp_apicid;
1865
1866 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1867 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1868 apic, mp_ioapics[apic].mp_apicid);
1869 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1870 reg_00.bits.ID);
1871 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1872 }
1873
1874 /*
1875 * Sanity check, is the ID really free? Every APIC in a
1876 * system must have a unique ID or we get lots of nice
1877 * 'stuck on smp_invalidate_needed IPI wait' messages.
1878 */
1879 if (check_apicid_used(phys_id_present_map,
1880 mp_ioapics[apic].mp_apicid)) {
1881 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1882 apic, mp_ioapics[apic].mp_apicid);
1883 for (i = 0; i < get_physical_broadcast(); i++)
1884 if (!physid_isset(i, phys_id_present_map))
1885 break;
1886 if (i >= get_physical_broadcast())
1887 panic("Max APIC ID exceeded!\n");
1888 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1889 i);
1890 physid_set(i, phys_id_present_map);
1891 mp_ioapics[apic].mp_apicid = i;
1892 } else {
1893 physid_mask_t tmp;
1894 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1895 apic_printk(APIC_VERBOSE, "Setting %d in the "
1896 "phys_id_present_map\n",
1897 mp_ioapics[apic].mp_apicid);
1898 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1899 }
1900
1901
1902 /*
1903 * We need to adjust the IRQ routing table
1904 * if the ID changed.
1905 */
1906 if (old_id != mp_ioapics[apic].mp_apicid)
1907 for (i = 0; i < mp_irq_entries; i++)
1908 if (mp_irqs[i].mp_dstapic == old_id)
1909 mp_irqs[i].mp_dstapic
1910 = mp_ioapics[apic].mp_apicid;
1911
1912 /*
1913 * Read the right value from the MPC table and
1914 * write it into the ID register.
1915 */
1916 apic_printk(APIC_VERBOSE, KERN_INFO
1917 "...changing IO-APIC physical APIC ID to %d ...",
1918 mp_ioapics[apic].mp_apicid);
1919
1920 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1921 spin_lock_irqsave(&ioapic_lock, flags);
1922 io_apic_write(apic, 0, reg_00.raw);
1923 spin_unlock_irqrestore(&ioapic_lock, flags);
1924
1925 /*
1926 * Sanity check
1927 */
1928 spin_lock_irqsave(&ioapic_lock, flags);
1929 reg_00.raw = io_apic_read(apic, 0);
1930 spin_unlock_irqrestore(&ioapic_lock, flags);
1931 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1932 printk("could not set ID!\n");
1933 else
1934 apic_printk(APIC_VERBOSE, " ok.\n");
1935 }
1936}
1937#endif
1938
1939int no_timer_check __initdata;
1940
1941static int __init notimercheck(char *s)
1942{
1943 no_timer_check = 1;
1944 return 1;
1945}
1946__setup("no_timer_check", notimercheck);
1947
1299/* 1948/*
1300 * There is a nasty bug in some older SMP boards, their mptable lies 1949 * There is a nasty bug in some older SMP boards, their mptable lies
1301 * about the timer IRQ. We do the following to work around the situation: 1950 * about the timer IRQ. We do the following to work around the situation:
@@ -1309,6 +1958,9 @@ static int __init timer_irq_works(void)
1309 unsigned long t1 = jiffies; 1958 unsigned long t1 = jiffies;
1310 unsigned long flags; 1959 unsigned long flags;
1311 1960
1961 if (no_timer_check)
1962 return 1;
1963
1312 local_save_flags(flags); 1964 local_save_flags(flags);
1313 local_irq_enable(); 1965 local_irq_enable();
1314 /* Let ten ticks pass... */ 1966 /* Let ten ticks pass... */
@@ -1369,19 +2021,27 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
1369 return was_pending; 2021 return was_pending;
1370} 2022}
1371 2023
2024#ifdef CONFIG_X86_64
1372static int ioapic_retrigger_irq(unsigned int irq) 2025static int ioapic_retrigger_irq(unsigned int irq)
1373{ 2026{
1374 struct irq_cfg *cfg = &irq_cfg[irq]; 2027
1375 cpumask_t mask; 2028 struct irq_cfg *cfg = irq_cfg(irq);
1376 unsigned long flags; 2029 unsigned long flags;
1377 2030
1378 spin_lock_irqsave(&vector_lock, flags); 2031 spin_lock_irqsave(&vector_lock, flags);
1379 mask = cpumask_of_cpu(first_cpu(cfg->domain)); 2032 send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector);
1380 send_IPI_mask(mask, cfg->vector);
1381 spin_unlock_irqrestore(&vector_lock, flags); 2033 spin_unlock_irqrestore(&vector_lock, flags);
1382 2034
1383 return 1; 2035 return 1;
1384} 2036}
2037#else
2038static int ioapic_retrigger_irq(unsigned int irq)
2039{
2040 send_IPI_self(irq_cfg(irq)->vector);
2041
2042 return 1;
2043}
2044#endif
1385 2045
1386/* 2046/*
1387 * Level and edge triggered IO-APIC interrupts need different handling, 2047 * Level and edge triggered IO-APIC interrupts need different handling,
@@ -1393,11 +2053,159 @@ static int ioapic_retrigger_irq(unsigned int irq)
1393 */ 2053 */
1394 2054
1395#ifdef CONFIG_SMP 2055#ifdef CONFIG_SMP
2056
2057#ifdef CONFIG_INTR_REMAP
2058static void ir_irq_migration(struct work_struct *work);
2059
2060static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration);
2061
2062/*
2063 * Migrate the IO-APIC irq in the presence of intr-remapping.
2064 *
2065 * For edge triggered, irq migration is a simple atomic update(of vector
2066 * and cpu destination) of IRTE and flush the hardware cache.
2067 *
 2068 * For level triggered, we need to modify the io-apic RTE as well with the updated
 2069 * vector information, along with modifying the IRTE with vector and destination.
 2070 * So irq migration for level triggered is a little more complex compared to
 2071 * edge triggered migration. The good news is that we use the same algorithm
 2072 * for level triggered migration as today, the only difference being that
2073 * we now initiate the irq migration from process context instead of the
2074 * interrupt context.
2075 *
2076 * In future, when we do a directed EOI (combined with cpu EOI broadcast
2077 * suppression) to the IO-APIC, level triggered irq migration will also be
2078 * as simple as edge triggered migration and we can do the irq migration
2079 * with a simple atomic update to IO-APIC RTE.
2080 */
2081static void migrate_ioapic_irq(int irq, cpumask_t mask)
2082{
2083 struct irq_cfg *cfg;
2084 struct irq_desc *desc;
2085 cpumask_t tmp, cleanup_mask;
2086 struct irte irte;
2087 int modify_ioapic_rte;
2088 unsigned int dest;
2089 unsigned long flags;
2090
2091 cpus_and(tmp, mask, cpu_online_map);
2092 if (cpus_empty(tmp))
2093 return;
2094
2095 if (get_irte(irq, &irte))
2096 return;
2097
2098 if (assign_irq_vector(irq, mask))
2099 return;
2100
2101 cfg = irq_cfg(irq);
2102 cpus_and(tmp, cfg->domain, mask);
2103 dest = cpu_mask_to_apicid(tmp);
2104
2105 desc = irq_to_desc(irq);
2106 modify_ioapic_rte = desc->status & IRQ_LEVEL;
2107 if (modify_ioapic_rte) {
2108 spin_lock_irqsave(&ioapic_lock, flags);
2109 __target_IO_APIC_irq(irq, dest, cfg->vector);
2110 spin_unlock_irqrestore(&ioapic_lock, flags);
2111 }
2112
2113 irte.vector = cfg->vector;
2114 irte.dest_id = IRTE_DEST(dest);
2115
2116 /*
 2117 * Modify the IRTE and flush the interrupt entry cache.
2118 */
2119 modify_irte(irq, &irte);
2120
2121 if (cfg->move_in_progress) {
2122 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
2123 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
2124 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2125 cfg->move_in_progress = 0;
2126 }
2127
2128 desc->affinity = mask;
2129}
2130
2131static int migrate_irq_remapped_level(int irq)
2132{
2133 int ret = -1;
2134 struct irq_desc *desc = irq_to_desc(irq);
2135
2136 mask_IO_APIC_irq(irq);
2137
2138 if (io_apic_level_ack_pending(irq)) {
2139 /*
2140 * Interrupt in progress. Migrating irq now will change the
2141 * vector information in the IO-APIC RTE and that will confuse
2142 * the EOI broadcast performed by cpu.
2143 * So, delay the irq migration to the next instance.
2144 */
2145 schedule_delayed_work(&ir_migration_work, 1);
2146 goto unmask;
2147 }
2148
 2149 /* everything is clear, we have the right of way */
2150 migrate_ioapic_irq(irq, desc->pending_mask);
2151
2152 ret = 0;
2153 desc->status &= ~IRQ_MOVE_PENDING;
2154 cpus_clear(desc->pending_mask);
2155
2156unmask:
2157 unmask_IO_APIC_irq(irq);
2158 return ret;
2159}
2160
2161static void ir_irq_migration(struct work_struct *work)
2162{
2163 unsigned int irq;
2164 struct irq_desc *desc;
2165
2166 for_each_irq_desc(irq, desc) {
2167 if (desc->status & IRQ_MOVE_PENDING) {
2168 unsigned long flags;
2169
2170 spin_lock_irqsave(&desc->lock, flags);
2171 if (!desc->chip->set_affinity ||
2172 !(desc->status & IRQ_MOVE_PENDING)) {
2173 desc->status &= ~IRQ_MOVE_PENDING;
2174 spin_unlock_irqrestore(&desc->lock, flags);
2175 continue;
2176 }
2177
2178 desc->chip->set_affinity(irq, desc->pending_mask);
2179 spin_unlock_irqrestore(&desc->lock, flags);
2180 }
2181 }
2182}
2183
2184/*
2185 * Migrates the IRQ destination in the process context.
2186 */
2187static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask)
2188{
2189 struct irq_desc *desc = irq_to_desc(irq);
2190
2191 if (desc->status & IRQ_LEVEL) {
2192 desc->status |= IRQ_MOVE_PENDING;
2193 desc->pending_mask = mask;
2194 migrate_irq_remapped_level(irq);
2195 return;
2196 }
2197
2198 migrate_ioapic_irq(irq, mask);
2199}
2200#endif
2201
1396asmlinkage void smp_irq_move_cleanup_interrupt(void) 2202asmlinkage void smp_irq_move_cleanup_interrupt(void)
1397{ 2203{
1398 unsigned vector, me; 2204 unsigned vector, me;
1399 ack_APIC_irq(); 2205 ack_APIC_irq();
2206#ifdef CONFIG_X86_64
1400 exit_idle(); 2207 exit_idle();
2208#endif
1401 irq_enter(); 2209 irq_enter();
1402 2210
1403 me = smp_processor_id(); 2211 me = smp_processor_id();
@@ -1406,11 +2214,12 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
1406 struct irq_desc *desc; 2214 struct irq_desc *desc;
1407 struct irq_cfg *cfg; 2215 struct irq_cfg *cfg;
1408 irq = __get_cpu_var(vector_irq)[vector]; 2216 irq = __get_cpu_var(vector_irq)[vector];
1409 if (irq >= NR_IRQS) 2217
2218 desc = irq_to_desc(irq);
2219 if (!desc)
1410 continue; 2220 continue;
1411 2221
1412 desc = irq_desc + irq; 2222 cfg = irq_cfg(irq);
1413 cfg = irq_cfg + irq;
1414 spin_lock(&desc->lock); 2223 spin_lock(&desc->lock);
1415 if (!cfg->move_cleanup_count) 2224 if (!cfg->move_cleanup_count)
1416 goto unlock; 2225 goto unlock;
@@ -1429,7 +2238,7 @@ unlock:
1429 2238
1430static void irq_complete_move(unsigned int irq) 2239static void irq_complete_move(unsigned int irq)
1431{ 2240{
1432 struct irq_cfg *cfg = irq_cfg + irq; 2241 struct irq_cfg *cfg = irq_cfg(irq);
1433 unsigned vector, me; 2242 unsigned vector, me;
1434 2243
1435 if (likely(!cfg->move_in_progress)) 2244 if (likely(!cfg->move_in_progress))
@@ -1449,6 +2258,17 @@ static void irq_complete_move(unsigned int irq)
1449#else 2258#else
1450static inline void irq_complete_move(unsigned int irq) {} 2259static inline void irq_complete_move(unsigned int irq) {}
1451#endif 2260#endif
2261#ifdef CONFIG_INTR_REMAP
2262static void ack_x2apic_level(unsigned int irq)
2263{
2264 ack_x2APIC_irq();
2265}
2266
2267static void ack_x2apic_edge(unsigned int irq)
2268{
2269 ack_x2APIC_irq();
2270}
2271#endif
1452 2272
1453static void ack_apic_edge(unsigned int irq) 2273static void ack_apic_edge(unsigned int irq)
1454{ 2274{
@@ -1457,19 +2277,50 @@ static void ack_apic_edge(unsigned int irq)
1457 ack_APIC_irq(); 2277 ack_APIC_irq();
1458} 2278}
1459 2279
2280atomic_t irq_mis_count;
2281
1460static void ack_apic_level(unsigned int irq) 2282static void ack_apic_level(unsigned int irq)
1461{ 2283{
2284#ifdef CONFIG_X86_32
2285 unsigned long v;
2286 int i;
2287#endif
1462 int do_unmask_irq = 0; 2288 int do_unmask_irq = 0;
1463 2289
1464 irq_complete_move(irq); 2290 irq_complete_move(irq);
1465#ifdef CONFIG_GENERIC_PENDING_IRQ 2291#ifdef CONFIG_GENERIC_PENDING_IRQ
1466 /* If we are moving the irq we need to mask it */ 2292 /* If we are moving the irq we need to mask it */
1467 if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { 2293 if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) {
1468 do_unmask_irq = 1; 2294 do_unmask_irq = 1;
1469 mask_IO_APIC_irq(irq); 2295 mask_IO_APIC_irq(irq);
1470 } 2296 }
1471#endif 2297#endif
1472 2298
2299#ifdef CONFIG_X86_32
2300 /*
2301 * It appears there is an erratum which affects at least version 0x11
2302 * of I/O APIC (that's the 82093AA and cores integrated into various
2303 * chipsets). Under certain conditions a level-triggered interrupt is
 2304 * erroneously delivered as an edge-triggered one, but the respective IRR
2305 * bit gets set nevertheless. As a result the I/O unit expects an EOI
2306 * message but it will never arrive and further interrupts are blocked
2307 * from the source. The exact reason is so far unknown, but the
2308 * phenomenon was observed when two consecutive interrupt requests
2309 * from a given source get delivered to the same CPU and the source is
2310 * temporarily disabled in between.
2311 *
2312 * A workaround is to simulate an EOI message manually. We achieve it
2313 * by setting the trigger mode to edge and then to level when the edge
2314 * trigger mode gets detected in the TMR of a local APIC for a
2315 * level-triggered interrupt. We mask the source for the time of the
2316 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2317 * The idea is from Manfred Spraul. --macro
2318 */
2319 i = irq_cfg(irq)->vector;
2320
2321 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2322#endif
2323
1473 /* 2324 /*
1474 * We must acknowledge the irq before we move it or the acknowledge will 2325 * We must acknowledge the irq before we move it or the acknowledge will
1475 * not propagate properly. 2326 * not propagate properly.
@@ -1508,24 +2359,51 @@ static void ack_apic_level(unsigned int irq)
1508 move_masked_irq(irq); 2359 move_masked_irq(irq);
1509 unmask_IO_APIC_irq(irq); 2360 unmask_IO_APIC_irq(irq);
1510 } 2361 }
2362
2363#ifdef CONFIG_X86_32
2364 if (!(v & (1 << (i & 0x1f)))) {
2365 atomic_inc(&irq_mis_count);
2366 spin_lock(&ioapic_lock);
2367 __mask_and_edge_IO_APIC_irq(irq);
2368 __unmask_and_level_IO_APIC_irq(irq);
2369 spin_unlock(&ioapic_lock);
2370 }
2371#endif
1511} 2372}
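A minimal standalone sketch of the TMR lookup used in the erratum workaround above (not part of the diff; the example vector 0x31 is an arbitrary assumption). The TMR is a 256-bit register file exposed as eight 32-bit words spaced 0x10 apart, so vector i lives in word i / 32 at bit i % 32, which is exactly what the (i & ~0x1f) >> 1 and i & 0x1f expressions compute:

/* Illustrative sketch only; not part of the kernel sources in this diff. */
#include <stdio.h>

int main(void)
{
        unsigned int vector = 0x31;                     /* assumed example vector        */
        unsigned int reg_off = (vector & ~0x1f) >> 1;   /* 0x10 * (vector / 32)          */
        unsigned int bit = vector & 0x1f;               /* bit position within that word */

        /* Prints "APIC_TMR + 0x10, bit 17" for vector 0x31. */
        printf("APIC_TMR + 0x%02x, bit %u\n", reg_off, bit);
        return 0;
}

If that TMR bit reads back as zero after the EOI, the interrupt was latched as edge-triggered and the code above simulates the missing EOI by briefly switching the redirection entry to edge and back to level.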
1512 2373
1513static struct irq_chip ioapic_chip __read_mostly = { 2374static struct irq_chip ioapic_chip __read_mostly = {
1514 .name = "IO-APIC", 2375 .name = "IO-APIC",
1515 .startup = startup_ioapic_irq, 2376 .startup = startup_ioapic_irq,
1516 .mask = mask_IO_APIC_irq, 2377 .mask = mask_IO_APIC_irq,
1517 .unmask = unmask_IO_APIC_irq, 2378 .unmask = unmask_IO_APIC_irq,
1518 .ack = ack_apic_edge, 2379 .ack = ack_apic_edge,
1519 .eoi = ack_apic_level, 2380 .eoi = ack_apic_level,
1520#ifdef CONFIG_SMP 2381#ifdef CONFIG_SMP
1521 .set_affinity = set_ioapic_affinity_irq, 2382 .set_affinity = set_ioapic_affinity_irq,
1522#endif 2383#endif
1523 .retrigger = ioapic_retrigger_irq, 2384 .retrigger = ioapic_retrigger_irq,
1524}; 2385};
1525 2386
2387#ifdef CONFIG_INTR_REMAP
2388static struct irq_chip ir_ioapic_chip __read_mostly = {
2389 .name = "IR-IO-APIC",
2390 .startup = startup_ioapic_irq,
2391 .mask = mask_IO_APIC_irq,
2392 .unmask = unmask_IO_APIC_irq,
2393 .ack = ack_x2apic_edge,
2394 .eoi = ack_x2apic_level,
2395#ifdef CONFIG_SMP
2396 .set_affinity = set_ir_ioapic_affinity_irq,
2397#endif
2398 .retrigger = ioapic_retrigger_irq,
2399};
2400#endif
2401
1526static inline void init_IO_APIC_traps(void) 2402static inline void init_IO_APIC_traps(void)
1527{ 2403{
1528 int irq; 2404 int irq;
2405 struct irq_desc *desc;
2406 struct irq_cfg *cfg;
1529 2407
1530 /* 2408 /*
1531 * NOTE! The local APIC isn't very good at handling 2409 * NOTE! The local APIC isn't very good at handling
@@ -1538,8 +2416,8 @@ static inline void init_IO_APIC_traps(void)
1538 * Also, we've got to be careful not to trash gate 2416 * Also, we've got to be careful not to trash gate
1539 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2417 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1540 */ 2418 */
1541 for (irq = 0; irq < NR_IRQS ; irq++) { 2419 for_each_irq_cfg(irq, cfg) {
1542 if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { 2420 if (IO_APIC_IRQ(irq) && !cfg->vector) {
1543 /* 2421 /*
1544 * Hmm.. We don't have an entry for this, 2422 * Hmm.. We don't have an entry for this,
1545 * so default to an old-fashioned 8259 2423 * so default to an old-fashioned 8259
@@ -1547,27 +2425,33 @@ static inline void init_IO_APIC_traps(void)
1547 */ 2425 */
1548 if (irq < 16) 2426 if (irq < 16)
1549 make_8259A_irq(irq); 2427 make_8259A_irq(irq);
1550 else 2428 else {
2429 desc = irq_to_desc(irq);
1551 /* Strange. Oh, well.. */ 2430 /* Strange. Oh, well.. */
1552 irq_desc[irq].chip = &no_irq_chip; 2431 desc->chip = &no_irq_chip;
2432 }
1553 } 2433 }
1554 } 2434 }
1555} 2435}
1556 2436
1557static void unmask_lapic_irq(unsigned int irq) 2437/*
2438 * The local APIC irq-chip implementation:
2439 */
2440
2441static void mask_lapic_irq(unsigned int irq)
1558{ 2442{
1559 unsigned long v; 2443 unsigned long v;
1560 2444
1561 v = apic_read(APIC_LVT0); 2445 v = apic_read(APIC_LVT0);
1562 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2446 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
1563} 2447}
1564 2448
1565static void mask_lapic_irq(unsigned int irq) 2449static void unmask_lapic_irq(unsigned int irq)
1566{ 2450{
1567 unsigned long v; 2451 unsigned long v;
1568 2452
1569 v = apic_read(APIC_LVT0); 2453 v = apic_read(APIC_LVT0);
1570 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2454 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
1571} 2455}
1572 2456
1573static void ack_lapic_irq (unsigned int irq) 2457static void ack_lapic_irq (unsigned int irq)
@@ -1584,7 +2468,10 @@ static struct irq_chip lapic_chip __read_mostly = {
1584 2468
1585static void lapic_register_intr(int irq) 2469static void lapic_register_intr(int irq)
1586{ 2470{
1587 irq_desc[irq].status &= ~IRQ_LEVEL; 2471 struct irq_desc *desc;
2472
2473 desc = irq_to_desc(irq);
2474 desc->status &= ~IRQ_LEVEL;
1588 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2475 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
1589 "edge"); 2476 "edge");
1590} 2477}
@@ -1592,19 +2479,19 @@ static void lapic_register_intr(int irq)
1592static void __init setup_nmi(void) 2479static void __init setup_nmi(void)
1593{ 2480{
1594 /* 2481 /*
1595 * Dirty trick to enable the NMI watchdog ... 2482 * Dirty trick to enable the NMI watchdog ...
1596 * We put the 8259A master into AEOI mode and 2483 * We put the 8259A master into AEOI mode and
1597 * unmask on all local APICs LVT0 as NMI. 2484 * unmask on all local APICs LVT0 as NMI.
1598 * 2485 *
1599 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') 2486 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
1600 * is from Maciej W. Rozycki - so we do not have to EOI from 2487 * is from Maciej W. Rozycki - so we do not have to EOI from
1601 * the NMI handler or the timer interrupt. 2488 * the NMI handler or the timer interrupt.
1602 */ 2489 */
1603 printk(KERN_INFO "activating NMI Watchdog ..."); 2490 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
1604 2491
1605 enable_NMI_through_LVT0(); 2492 enable_NMI_through_LVT0();
1606 2493
1607 printk(" done.\n"); 2494 apic_printk(APIC_VERBOSE, " done.\n");
1608} 2495}
1609 2496
1610/* 2497/*
@@ -1621,12 +2508,17 @@ static inline void __init unlock_ExtINT_logic(void)
1621 unsigned char save_control, save_freq_select; 2508 unsigned char save_control, save_freq_select;
1622 2509
1623 pin = find_isa_irq_pin(8, mp_INT); 2510 pin = find_isa_irq_pin(8, mp_INT);
2511 if (pin == -1) {
2512 WARN_ON_ONCE(1);
2513 return;
2514 }
1624 apic = find_isa_irq_apic(8, mp_INT); 2515 apic = find_isa_irq_apic(8, mp_INT);
1625 if (pin == -1) 2516 if (apic == -1) {
2517 WARN_ON_ONCE(1);
1626 return; 2518 return;
2519 }
1627 2520
1628 entry0 = ioapic_read_entry(apic, pin); 2521 entry0 = ioapic_read_entry(apic, pin);
1629
1630 clear_IO_APIC_pin(apic, pin); 2522 clear_IO_APIC_pin(apic, pin);
1631 2523
1632 memset(&entry1, 0, sizeof(entry1)); 2524 memset(&entry1, 0, sizeof(entry1));
@@ -1661,23 +2553,38 @@ static inline void __init unlock_ExtINT_logic(void)
1661 ioapic_write_entry(apic, pin, entry0); 2553 ioapic_write_entry(apic, pin, entry0);
1662} 2554}
1663 2555
2556static int disable_timer_pin_1 __initdata;
2557/* Actually the next is obsolete, but keep it for paranoid reasons -AK */
2558static int __init disable_timer_pin_setup(char *arg)
2559{
2560 disable_timer_pin_1 = 1;
2561 return 0;
2562}
2563early_param("disable_timer_pin_1", disable_timer_pin_setup);
2564
2565int timer_through_8259 __initdata;
2566
1664/* 2567/*
1665 * This code may look a bit paranoid, but it's supposed to cooperate with 2568 * This code may look a bit paranoid, but it's supposed to cooperate with
1666 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ 2569 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
1667 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast 2570 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
1668 * fanatically on his truly buggy board. 2571 * fanatically on his truly buggy board.
1669 * 2572 *
1670 * FIXME: really need to revamp this for modern platforms only. 2573 * FIXME: really need to revamp this for all platforms.
1671 */ 2574 */
1672static inline void __init check_timer(void) 2575static inline void __init check_timer(void)
1673{ 2576{
1674 struct irq_cfg *cfg = irq_cfg + 0; 2577 struct irq_cfg *cfg = irq_cfg(0);
1675 int apic1, pin1, apic2, pin2; 2578 int apic1, pin1, apic2, pin2;
1676 unsigned long flags; 2579 unsigned long flags;
2580 unsigned int ver;
1677 int no_pin1 = 0; 2581 int no_pin1 = 0;
1678 2582
1679 local_irq_save(flags); 2583 local_irq_save(flags);
1680 2584
2585 ver = apic_read(APIC_LVR);
2586 ver = GET_APIC_VERSION(ver);
2587
1681 /* 2588 /*
1682 * get/set the timer IRQ vector: 2589 * get/set the timer IRQ vector:
1683 */ 2590 */
@@ -1686,18 +2593,27 @@ static inline void __init check_timer(void)
1686 2593
1687 /* 2594 /*
1688 * As IRQ0 is to be enabled in the 8259A, the virtual 2595 * As IRQ0 is to be enabled in the 8259A, the virtual
1689 * wire has to be disabled in the local APIC. 2596 * wire has to be disabled in the local APIC. Also
2597 * timer interrupts need to be acknowledged manually in
2598 * the 8259A for the i82489DX when using the NMI
2599 * watchdog as that APIC treats NMIs as level-triggered.
2600 * The AEOI mode will finish them in the 8259A
2601 * automatically.
1690 */ 2602 */
1691 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2603 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
1692 init_8259A(1); 2604 init_8259A(1);
2605#ifdef CONFIG_X86_32
2606 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2607#endif
1693 2608
1694 pin1 = find_isa_irq_pin(0, mp_INT); 2609 pin1 = find_isa_irq_pin(0, mp_INT);
1695 apic1 = find_isa_irq_apic(0, mp_INT); 2610 apic1 = find_isa_irq_apic(0, mp_INT);
1696 pin2 = ioapic_i8259.pin; 2611 pin2 = ioapic_i8259.pin;
1697 apic2 = ioapic_i8259.apic; 2612 apic2 = ioapic_i8259.apic;
1698 2613
1699 apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", 2614 apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X "
1700 cfg->vector, apic1, pin1, apic2, pin2); 2615 "apic1=%d pin1=%d apic2=%d pin2=%d\n",
2616 cfg->vector, apic1, pin1, apic2, pin2);
1701 2617
1702 /* 2618 /*
1703 * Some BIOS writers are clueless and report the ExtINTA 2619 * Some BIOS writers are clueless and report the ExtINTA
@@ -1707,6 +2623,10 @@ static inline void __init check_timer(void)
1707 * 8259A. 2623 * 8259A.
1708 */ 2624 */
1709 if (pin1 == -1) { 2625 if (pin1 == -1) {
2626#ifdef CONFIG_INTR_REMAP
2627 if (intr_remapping_enabled)
2628 panic("BIOS bug: timer not connected to IO-APIC");
2629#endif
1710 pin1 = pin2; 2630 pin1 = pin2;
1711 apic1 = apic2; 2631 apic1 = apic2;
1712 no_pin1 = 1; 2632 no_pin1 = 1;
@@ -1724,7 +2644,7 @@ static inline void __init check_timer(void)
1724 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2644 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
1725 } 2645 }
1726 unmask_IO_APIC_irq(0); 2646 unmask_IO_APIC_irq(0);
1727 if (!no_timer_check && timer_irq_works()) { 2647 if (timer_irq_works()) {
1728 if (nmi_watchdog == NMI_IO_APIC) { 2648 if (nmi_watchdog == NMI_IO_APIC) {
1729 setup_nmi(); 2649 setup_nmi();
1730 enable_8259A_irq(0); 2650 enable_8259A_irq(0);
@@ -1733,16 +2653,19 @@ static inline void __init check_timer(void)
1733 clear_IO_APIC_pin(0, pin1); 2653 clear_IO_APIC_pin(0, pin1);
1734 goto out; 2654 goto out;
1735 } 2655 }
2656#ifdef CONFIG_INTR_REMAP
2657 if (intr_remapping_enabled)
2658 panic("timer doesn't work through Interrupt-remapped IO-APIC");
2659#endif
1736 clear_IO_APIC_pin(apic1, pin1); 2660 clear_IO_APIC_pin(apic1, pin1);
1737 if (!no_pin1) 2661 if (!no_pin1)
1738 apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " 2662 apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: "
1739 "8254 timer not connected to IO-APIC\n"); 2663 "8254 timer not connected to IO-APIC\n");
1740 2664
1741 apic_printk(APIC_VERBOSE,KERN_INFO 2665 apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer "
1742 "...trying to set up timer (IRQ0) " 2666 "(IRQ0) through the 8259A ...\n");
1743 "through the 8259A ... "); 2667 apic_printk(APIC_QUIET, KERN_INFO
1744 apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", 2668 "..... (found apic %d pin %d) ...\n", apic2, pin2);
1745 apic2, pin2);
1746 /* 2669 /*
1747 * legacy devices should be connected to IO APIC #0 2670 * legacy devices should be connected to IO APIC #0
1748 */ 2671 */
@@ -1751,7 +2674,7 @@ static inline void __init check_timer(void)
1751 unmask_IO_APIC_irq(0); 2674 unmask_IO_APIC_irq(0);
1752 enable_8259A_irq(0); 2675 enable_8259A_irq(0);
1753 if (timer_irq_works()) { 2676 if (timer_irq_works()) {
1754 apic_printk(APIC_VERBOSE," works.\n"); 2677 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
1755 timer_through_8259 = 1; 2678 timer_through_8259 = 1;
1756 if (nmi_watchdog == NMI_IO_APIC) { 2679 if (nmi_watchdog == NMI_IO_APIC) {
1757 disable_8259A_irq(0); 2680 disable_8259A_irq(0);
@@ -1765,29 +2688,35 @@ static inline void __init check_timer(void)
1765 */ 2688 */
1766 disable_8259A_irq(0); 2689 disable_8259A_irq(0);
1767 clear_IO_APIC_pin(apic2, pin2); 2690 clear_IO_APIC_pin(apic2, pin2);
1768 apic_printk(APIC_VERBOSE," failed.\n"); 2691 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
1769 } 2692 }
1770 2693
1771 if (nmi_watchdog == NMI_IO_APIC) { 2694 if (nmi_watchdog == NMI_IO_APIC) {
1772 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); 2695 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
2696 "through the IO-APIC - disabling NMI Watchdog!\n");
1773 nmi_watchdog = NMI_NONE; 2697 nmi_watchdog = NMI_NONE;
1774 } 2698 }
2699#ifdef CONFIG_X86_32
2700 timer_ack = 0;
2701#endif
1775 2702
1776 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); 2703 apic_printk(APIC_QUIET, KERN_INFO
2704 "...trying to set up timer as Virtual Wire IRQ...\n");
1777 2705
1778 lapic_register_intr(0); 2706 lapic_register_intr(0);
1779 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2707 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
1780 enable_8259A_irq(0); 2708 enable_8259A_irq(0);
1781 2709
1782 if (timer_irq_works()) { 2710 if (timer_irq_works()) {
1783 apic_printk(APIC_VERBOSE," works.\n"); 2711 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1784 goto out; 2712 goto out;
1785 } 2713 }
1786 disable_8259A_irq(0); 2714 disable_8259A_irq(0);
1787 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2715 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
1788 apic_printk(APIC_VERBOSE," failed.\n"); 2716 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
1789 2717
1790 apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); 2718 apic_printk(APIC_QUIET, KERN_INFO
2719 "...trying to set up timer as ExtINT IRQ...\n");
1791 2720
1792 init_8259A(0); 2721 init_8259A(0);
1793 make_8259A_irq(0); 2722 make_8259A_irq(0);
@@ -1796,22 +2725,16 @@ static inline void __init check_timer(void)
1796 unlock_ExtINT_logic(); 2725 unlock_ExtINT_logic();
1797 2726
1798 if (timer_irq_works()) { 2727 if (timer_irq_works()) {
1799 apic_printk(APIC_VERBOSE," works.\n"); 2728 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
1800 goto out; 2729 goto out;
1801 } 2730 }
1802 apic_printk(APIC_VERBOSE," failed :(.\n"); 2731 apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n");
1803 panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); 2732 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2733 "report. Then try booting with the 'noapic' option.\n");
1804out: 2734out:
1805 local_irq_restore(flags); 2735 local_irq_restore(flags);
1806} 2736}
1807 2737
1808static int __init notimercheck(char *s)
1809{
1810 no_timer_check = 1;
1811 return 1;
1812}
1813__setup("no_timer_check", notimercheck);
1814
1815/* 2738/*
1816 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available 2739 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
1817 * to devices. However there may be an I/O APIC pin available for 2740 * to devices. However there may be an I/O APIC pin available for
@@ -1829,27 +2752,49 @@ __setup("no_timer_check", notimercheck);
1829 * the I/O APIC in all cases now. No actual device should request 2752 * the I/O APIC in all cases now. No actual device should request
1830 * it anyway. --macro 2753 * it anyway. --macro
1831 */ 2754 */
1832#define PIC_IRQS (1<<2) 2755#define PIC_IRQS (1 << PIC_CASCADE_IR)
1833 2756
1834void __init setup_IO_APIC(void) 2757void __init setup_IO_APIC(void)
1835{ 2758{
1836 2759
2760#ifdef CONFIG_X86_32
2761 enable_IO_APIC();
2762#else
1837 /* 2763 /*
1838 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 2764 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
1839 */ 2765 */
2766#endif
1840 2767
1841 io_apic_irqs = ~PIC_IRQS; 2768 io_apic_irqs = ~PIC_IRQS;
1842 2769
1843 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 2770 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
1844 2771 /*
2772 * Set up IO-APIC IRQ routing.
2773 */
2774#ifdef CONFIG_X86_32
2775 if (!acpi_ioapic)
2776 setup_ioapic_ids_from_mpc();
2777#endif
1845 sync_Arb_IDs(); 2778 sync_Arb_IDs();
1846 setup_IO_APIC_irqs(); 2779 setup_IO_APIC_irqs();
1847 init_IO_APIC_traps(); 2780 init_IO_APIC_traps();
1848 check_timer(); 2781 check_timer();
1849 if (!acpi_ioapic)
1850 print_IO_APIC();
1851} 2782}
1852 2783
2784/*
2785 * Called after all the initialization is done. If we didn't find any
2786 * APIC bugs then we can allow the modify fast path
2787 */
2788
2789static int __init io_apic_bug_finalize(void)
2790{
2791 if (sis_apic_bug == -1)
2792 sis_apic_bug = 0;
2793 return 0;
2794}
2795
2796late_initcall(io_apic_bug_finalize);
2797
1853struct sysfs_ioapic_data { 2798struct sysfs_ioapic_data {
1854 struct sys_device dev; 2799 struct sys_device dev;
1855 struct IO_APIC_route_entry entry[0]; 2800 struct IO_APIC_route_entry entry[0];
@@ -1937,38 +2882,60 @@ device_initcall(ioapic_init_sysfs);
1937/* 2882/*
1938 * Dynamic irq allocate and deallocation 2883 * Dynamic irq allocate and deallocation
1939 */ 2884 */
1940int create_irq(void) 2885unsigned int create_irq_nr(unsigned int irq_want)
1941{ 2886{
1942 /* Allocate an unused irq */ 2887 /* Allocate an unused irq */
1943 int irq; 2888 unsigned int irq;
1944 int new; 2889 unsigned int new;
1945 unsigned long flags; 2890 unsigned long flags;
2891 struct irq_cfg *cfg_new;
2892
2893 irq_want = nr_irqs - 1;
1946 2894
1947 irq = -ENOSPC; 2895 irq = 0;
1948 spin_lock_irqsave(&vector_lock, flags); 2896 spin_lock_irqsave(&vector_lock, flags);
1949 for (new = (NR_IRQS - 1); new >= 0; new--) { 2897 for (new = irq_want; new > 0; new--) {
1950 if (platform_legacy_irq(new)) 2898 if (platform_legacy_irq(new))
1951 continue; 2899 continue;
1952 if (irq_cfg[new].vector != 0) 2900 cfg_new = irq_cfg(new);
2901 if (cfg_new && cfg_new->vector != 0)
1953 continue; 2902 continue;
2903 /* check if need to create one */
2904 if (!cfg_new)
2905 cfg_new = irq_cfg_alloc(new);
1954 if (__assign_irq_vector(new, TARGET_CPUS) == 0) 2906 if (__assign_irq_vector(new, TARGET_CPUS) == 0)
1955 irq = new; 2907 irq = new;
1956 break; 2908 break;
1957 } 2909 }
1958 spin_unlock_irqrestore(&vector_lock, flags); 2910 spin_unlock_irqrestore(&vector_lock, flags);
1959 2911
1960 if (irq >= 0) { 2912 if (irq > 0) {
1961 dynamic_irq_init(irq); 2913 dynamic_irq_init(irq);
1962 } 2914 }
1963 return irq; 2915 return irq;
1964} 2916}
1965 2917
2918int create_irq(void)
2919{
2920 int irq;
2921
2922 irq = create_irq_nr(nr_irqs - 1);
2923
2924 if (irq == 0)
2925 irq = -1;
2926
2927 return irq;
2928}
2929
1966void destroy_irq(unsigned int irq) 2930void destroy_irq(unsigned int irq)
1967{ 2931{
1968 unsigned long flags; 2932 unsigned long flags;
1969 2933
1970 dynamic_irq_cleanup(irq); 2934 dynamic_irq_cleanup(irq);
1971 2935
2936#ifdef CONFIG_INTR_REMAP
2937 free_irte(irq);
2938#endif
1972 spin_lock_irqsave(&vector_lock, flags); 2939 spin_lock_irqsave(&vector_lock, flags);
1973 __clear_irq_vector(irq); 2940 __clear_irq_vector(irq);
1974 spin_unlock_irqrestore(&vector_lock, flags); 2941 spin_unlock_irqrestore(&vector_lock, flags);
@@ -1980,18 +2947,50 @@ void destroy_irq(unsigned int irq)
1980#ifdef CONFIG_PCI_MSI 2947#ifdef CONFIG_PCI_MSI
1981static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 2948static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
1982{ 2949{
1983 struct irq_cfg *cfg = irq_cfg + irq; 2950 struct irq_cfg *cfg;
1984 int err; 2951 int err;
1985 unsigned dest; 2952 unsigned dest;
1986 cpumask_t tmp; 2953 cpumask_t tmp;
1987 2954
1988 tmp = TARGET_CPUS; 2955 tmp = TARGET_CPUS;
1989 err = assign_irq_vector(irq, tmp); 2956 err = assign_irq_vector(irq, tmp);
1990 if (!err) { 2957 if (err)
1991 cpus_and(tmp, cfg->domain, tmp); 2958 return err;
1992 dest = cpu_mask_to_apicid(tmp); 2959
2960 cfg = irq_cfg(irq);
2961 cpus_and(tmp, cfg->domain, tmp);
2962 dest = cpu_mask_to_apicid(tmp);
2963
2964#ifdef CONFIG_INTR_REMAP
2965 if (irq_remapped(irq)) {
2966 struct irte irte;
2967 int ir_index;
2968 u16 sub_handle;
2969
2970 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
2971 BUG_ON(ir_index == -1);
2972
2973 memset (&irte, 0, sizeof(irte));
2974
2975 irte.present = 1;
2976 irte.dst_mode = INT_DEST_MODE;
2977 irte.trigger_mode = 0; /* edge */
2978 irte.dlvry_mode = INT_DELIVERY_MODE;
2979 irte.vector = cfg->vector;
2980 irte.dest_id = IRTE_DEST(dest);
2981
2982 modify_irte(irq, &irte);
1993 2983
1994 msg->address_hi = MSI_ADDR_BASE_HI; 2984 msg->address_hi = MSI_ADDR_BASE_HI;
2985 msg->data = sub_handle;
2986 msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT |
2987 MSI_ADDR_IR_SHV |
2988 MSI_ADDR_IR_INDEX1(ir_index) |
2989 MSI_ADDR_IR_INDEX2(ir_index);
2990 } else
2991#endif
2992 {
2993 msg->address_hi = MSI_ADDR_BASE_HI;
1995 msg->address_lo = 2994 msg->address_lo =
1996 MSI_ADDR_BASE_LO | 2995 MSI_ADDR_BASE_LO |
1997 ((INT_DEST_MODE == 0) ? 2996 ((INT_DEST_MODE == 0) ?
@@ -2016,10 +3015,11 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
2016#ifdef CONFIG_SMP 3015#ifdef CONFIG_SMP
2017static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) 3016static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2018{ 3017{
2019 struct irq_cfg *cfg = irq_cfg + irq; 3018 struct irq_cfg *cfg;
2020 struct msi_msg msg; 3019 struct msi_msg msg;
2021 unsigned int dest; 3020 unsigned int dest;
2022 cpumask_t tmp; 3021 cpumask_t tmp;
3022 struct irq_desc *desc;
2023 3023
2024 cpus_and(tmp, mask, cpu_online_map); 3024 cpus_and(tmp, mask, cpu_online_map);
2025 if (cpus_empty(tmp)) 3025 if (cpus_empty(tmp))
@@ -2028,6 +3028,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2028 if (assign_irq_vector(irq, mask)) 3028 if (assign_irq_vector(irq, mask))
2029 return; 3029 return;
2030 3030
3031 cfg = irq_cfg(irq);
2031 cpus_and(tmp, cfg->domain, mask); 3032 cpus_and(tmp, cfg->domain, mask);
2032 dest = cpu_mask_to_apicid(tmp); 3033 dest = cpu_mask_to_apicid(tmp);
2033 3034
@@ -2039,8 +3040,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2039 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3040 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2040 3041
2041 write_msi_msg(irq, &msg); 3042 write_msi_msg(irq, &msg);
2042 irq_desc[irq].affinity = mask; 3043 desc = irq_to_desc(irq);
3044 desc->affinity = mask;
2043} 3045}
3046
3047#ifdef CONFIG_INTR_REMAP
3048/*
3049 * Migrate the MSI irq to another cpumask. This migration is
3050 * done in the process context using interrupt-remapping hardware.
3051 */
3052static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
3053{
3054 struct irq_cfg *cfg;
3055 unsigned int dest;
3056 cpumask_t tmp, cleanup_mask;
3057 struct irte irte;
3058 struct irq_desc *desc;
3059
3060 cpus_and(tmp, mask, cpu_online_map);
3061 if (cpus_empty(tmp))
3062 return;
3063
3064 if (get_irte(irq, &irte))
3065 return;
3066
3067 if (assign_irq_vector(irq, mask))
3068 return;
3069
3070 cfg = irq_cfg(irq);
3071 cpus_and(tmp, cfg->domain, mask);
3072 dest = cpu_mask_to_apicid(tmp);
3073
3074 irte.vector = cfg->vector;
3075 irte.dest_id = IRTE_DEST(dest);
3076
3077 /*
3078 * atomically update the IRTE with the new destination and vector.
3079 */
3080 modify_irte(irq, &irte);
3081
3082 /*
3083 * After this point, all the interrupts will start arriving
3084 * at the new destination. So, time to cleanup the previous
3085 * vector allocation.
3086 */
3087 if (cfg->move_in_progress) {
3088 cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map);
3089 cfg->move_cleanup_count = cpus_weight(cleanup_mask);
3090 send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
3091 cfg->move_in_progress = 0;
3092 }
3093
3094 desc = irq_to_desc(irq);
3095 desc->affinity = mask;
3096}
3097#endif
2044#endif /* CONFIG_SMP */ 3098#endif /* CONFIG_SMP */
2045 3099
2046/* 3100/*
@@ -2058,26 +3112,179 @@ static struct irq_chip msi_chip = {
2058 .retrigger = ioapic_retrigger_irq, 3112 .retrigger = ioapic_retrigger_irq,
2059}; 3113};
2060 3114
2061int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) 3115#ifdef CONFIG_INTR_REMAP
3116static struct irq_chip msi_ir_chip = {
3117 .name = "IR-PCI-MSI",
3118 .unmask = unmask_msi_irq,
3119 .mask = mask_msi_irq,
3120 .ack = ack_x2apic_edge,
3121#ifdef CONFIG_SMP
3122 .set_affinity = ir_set_msi_irq_affinity,
3123#endif
3124 .retrigger = ioapic_retrigger_irq,
3125};
3126
3127/*
3128 * Map the PCI dev to the corresponding remapping hardware unit
3129 * and allocate 'nvec' consecutive interrupt-remapping table entries
3130 * in it.
3131 */
3132static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
2062{ 3133{
3134 struct intel_iommu *iommu;
3135 int index;
3136
3137 iommu = map_dev_to_ir(dev);
3138 if (!iommu) {
3139 printk(KERN_ERR
3140 "Unable to map PCI %s to iommu\n", pci_name(dev));
3141 return -ENOENT;
3142 }
3143
3144 index = alloc_irte(iommu, irq, nvec);
3145 if (index < 0) {
3146 printk(KERN_ERR
3147 "Unable to allocate %d IRTE for PCI %s\n", nvec,
3148 pci_name(dev));
3149 return -ENOSPC;
3150 }
3151 return index;
3152}
3153#endif
3154
3155static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq)
3156{
3157 int ret;
2063 struct msi_msg msg; 3158 struct msi_msg msg;
2064 int irq, ret;
2065 irq = create_irq();
2066 if (irq < 0)
2067 return irq;
2068 3159
2069 ret = msi_compose_msg(dev, irq, &msg); 3160 ret = msi_compose_msg(dev, irq, &msg);
3161 if (ret < 0)
3162 return ret;
3163
3164 set_irq_msi(irq, desc);
3165 write_msi_msg(irq, &msg);
3166
3167#ifdef CONFIG_INTR_REMAP
3168 if (irq_remapped(irq)) {
3169 struct irq_desc *desc = irq_to_desc(irq);
3170 /*
3171 * irq migration in process context
3172 */
3173 desc->status |= IRQ_MOVE_PCNTXT;
3174 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3175 } else
3176#endif
3177 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3178
3179 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3180
3181 return 0;
3182}
3183
3184static unsigned int build_irq_for_pci_dev(struct pci_dev *dev)
3185{
3186 unsigned int irq;
3187
3188 irq = dev->bus->number;
3189 irq <<= 8;
3190 irq |= dev->devfn;
3191 irq <<= 12;
3192
3193 return irq;
3194}
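A minimal standalone sketch of the irq_want packing implemented by build_irq_for_pci_dev() above (not part of the diff; the bus and devfn values in main() are arbitrary assumptions). The bus number lands in the top bits, the 8-bit devfn below it, and the final shift leaves 12 bits of per-device headroom; arch_setup_msi_irq() below then adds 0x100 to this value as its allocation hint:

/* Illustrative sketch only; not part of the kernel sources in this diff. */
#include <stdio.h>

static unsigned int build_irq_hint(unsigned int bus, unsigned int devfn)
{
        unsigned int irq = bus;         /* bus number in the highest bits */

        irq <<= 8;
        irq |= devfn;                   /* 8-bit device/function number   */
        irq <<= 12;                     /* 12 bits of per-device headroom */

        return irq;
}

int main(void)
{
        /* Assumed example: bus 0x02, devfn 0x08 -> hint 0x00208000. */
        printf("0x%08x\n", build_irq_hint(0x02, 0x08));
        return 0;
}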
3195
3196int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
3197{
3198 unsigned int irq;
3199 int ret;
3200 unsigned int irq_want;
3201
3202 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3203
3204 irq = create_irq_nr(irq_want);
3205 if (irq == 0)
3206 return -1;
3207
3208#ifdef CONFIG_INTR_REMAP
3209 if (!intr_remapping_enabled)
3210 goto no_ir;
3211
3212 ret = msi_alloc_irte(dev, irq, 1);
3213 if (ret < 0)
3214 goto error;
3215no_ir:
3216#endif
3217 ret = setup_msi_irq(dev, desc, irq);
2070 if (ret < 0) { 3218 if (ret < 0) {
2071 destroy_irq(irq); 3219 destroy_irq(irq);
2072 return ret; 3220 return ret;
2073 } 3221 }
3222 return 0;
2074 3223
2075 set_irq_msi(irq, desc); 3224#ifdef CONFIG_INTR_REMAP
2076 write_msi_msg(irq, &msg); 3225error:
3226 destroy_irq(irq);
3227 return ret;
3228#endif
3229}
2077 3230
2078 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); 3231int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3232{
3233 unsigned int irq;
3234 int ret, sub_handle;
3235 struct msi_desc *desc;
3236 unsigned int irq_want;
2079 3237
3238#ifdef CONFIG_INTR_REMAP
3239 struct intel_iommu *iommu = 0;
3240 int index = 0;
3241#endif
3242
3243 irq_want = build_irq_for_pci_dev(dev) + 0x100;
3244 sub_handle = 0;
3245 list_for_each_entry(desc, &dev->msi_list, list) {
3246 irq = create_irq_nr(irq_want--);
3247 if (irq == 0)
3248 return -1;
3249#ifdef CONFIG_INTR_REMAP
3250 if (!intr_remapping_enabled)
3251 goto no_ir;
3252
3253 if (!sub_handle) {
3254 /*
3255 * allocate the consecutive block of IRTE's
3256 * for 'nvec'
3257 */
3258 index = msi_alloc_irte(dev, irq, nvec);
3259 if (index < 0) {
3260 ret = index;
3261 goto error;
3262 }
3263 } else {
3264 iommu = map_dev_to_ir(dev);
3265 if (!iommu) {
3266 ret = -ENOENT;
3267 goto error;
3268 }
3269 /*
3270 * setup the mapping between the irq and the IRTE
3271 * base index, the sub_handle pointing to the
3272 * appropriate interrupt remap table entry.
3273 */
3274 set_irte_irq(irq, iommu, index, sub_handle);
3275 }
3276no_ir:
3277#endif
3278 ret = setup_msi_irq(dev, desc, irq);
3279 if (ret < 0)
3280 goto error;
3281 sub_handle++;
3282 }
2080 return 0; 3283 return 0;
3284
3285error:
3286 destroy_irq(irq);
3287 return ret;
2081} 3288}
2082 3289
2083void arch_teardown_msi_irq(unsigned int irq) 3290void arch_teardown_msi_irq(unsigned int irq)
@@ -2089,10 +3296,11 @@ void arch_teardown_msi_irq(unsigned int irq)
2089#ifdef CONFIG_SMP 3296#ifdef CONFIG_SMP
2090static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) 3297static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2091{ 3298{
2092 struct irq_cfg *cfg = irq_cfg + irq; 3299 struct irq_cfg *cfg;
2093 struct msi_msg msg; 3300 struct msi_msg msg;
2094 unsigned int dest; 3301 unsigned int dest;
2095 cpumask_t tmp; 3302 cpumask_t tmp;
3303 struct irq_desc *desc;
2096 3304
2097 cpus_and(tmp, mask, cpu_online_map); 3305 cpus_and(tmp, mask, cpu_online_map);
2098 if (cpus_empty(tmp)) 3306 if (cpus_empty(tmp))
@@ -2101,6 +3309,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2101 if (assign_irq_vector(irq, mask)) 3309 if (assign_irq_vector(irq, mask))
2102 return; 3310 return;
2103 3311
3312 cfg = irq_cfg(irq);
2104 cpus_and(tmp, cfg->domain, mask); 3313 cpus_and(tmp, cfg->domain, mask);
2105 dest = cpu_mask_to_apicid(tmp); 3314 dest = cpu_mask_to_apicid(tmp);
2106 3315
@@ -2112,7 +3321,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask)
2112 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3321 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2113 3322
2114 dmar_msi_write(irq, &msg); 3323 dmar_msi_write(irq, &msg);
2115 irq_desc[irq].affinity = mask; 3324 desc = irq_to_desc(irq);
3325 desc->affinity = mask;
2116} 3326}
2117#endif /* CONFIG_SMP */ 3327#endif /* CONFIG_SMP */
2118 3328
@@ -2142,6 +3352,69 @@ int arch_setup_dmar_msi(unsigned int irq)
2142} 3352}
2143#endif 3353#endif
2144 3354
3355#ifdef CONFIG_HPET_TIMER
3356
3357#ifdef CONFIG_SMP
3358static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask)
3359{
3360 struct irq_cfg *cfg;
3361 struct irq_desc *desc;
3362 struct msi_msg msg;
3363 unsigned int dest;
3364 cpumask_t tmp;
3365
3366 cpus_and(tmp, mask, cpu_online_map);
3367 if (cpus_empty(tmp))
3368 return;
3369
3370 if (assign_irq_vector(irq, mask))
3371 return;
3372
3373 cfg = irq_cfg(irq);
3374 cpus_and(tmp, cfg->domain, mask);
3375 dest = cpu_mask_to_apicid(tmp);
3376
3377 hpet_msi_read(irq, &msg);
3378
3379 msg.data &= ~MSI_DATA_VECTOR_MASK;
3380 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3381 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3382 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3383
3384 hpet_msi_write(irq, &msg);
3385 desc = irq_to_desc(irq);
3386 desc->affinity = mask;
3387}
3388#endif /* CONFIG_SMP */
3389
3390struct irq_chip hpet_msi_type = {
3391 .name = "HPET_MSI",
3392 .unmask = hpet_msi_unmask,
3393 .mask = hpet_msi_mask,
3394 .ack = ack_apic_edge,
3395#ifdef CONFIG_SMP
3396 .set_affinity = hpet_msi_set_affinity,
3397#endif
3398 .retrigger = ioapic_retrigger_irq,
3399};
3400
3401int arch_setup_hpet_msi(unsigned int irq)
3402{
3403 int ret;
3404 struct msi_msg msg;
3405
3406 ret = msi_compose_msg(NULL, irq, &msg);
3407 if (ret < 0)
3408 return ret;
3409
3410 hpet_msi_write(irq, &msg);
3411 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq,
3412 "edge");
3413
3414 return 0;
3415}
3416#endif
3417
2145#endif /* CONFIG_PCI_MSI */ 3418#endif /* CONFIG_PCI_MSI */
2146/* 3419/*
2147 * Hypertransport interrupt support 3420 * Hypertransport interrupt support
@@ -2166,9 +3439,10 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
2166 3439
2167static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) 3440static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2168{ 3441{
2169 struct irq_cfg *cfg = irq_cfg + irq; 3442 struct irq_cfg *cfg;
2170 unsigned int dest; 3443 unsigned int dest;
2171 cpumask_t tmp; 3444 cpumask_t tmp;
3445 struct irq_desc *desc;
2172 3446
2173 cpus_and(tmp, mask, cpu_online_map); 3447 cpus_and(tmp, mask, cpu_online_map);
2174 if (cpus_empty(tmp)) 3448 if (cpus_empty(tmp))
@@ -2177,11 +3451,13 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2177 if (assign_irq_vector(irq, mask)) 3451 if (assign_irq_vector(irq, mask))
2178 return; 3452 return;
2179 3453
3454 cfg = irq_cfg(irq);
2180 cpus_and(tmp, cfg->domain, mask); 3455 cpus_and(tmp, cfg->domain, mask);
2181 dest = cpu_mask_to_apicid(tmp); 3456 dest = cpu_mask_to_apicid(tmp);
2182 3457
2183 target_ht_irq(irq, dest, cfg->vector); 3458 target_ht_irq(irq, dest, cfg->vector);
2184 irq_desc[irq].affinity = mask; 3459 desc = irq_to_desc(irq);
3460 desc->affinity = mask;
2185} 3461}
2186#endif 3462#endif
2187 3463
@@ -2198,7 +3474,7 @@ static struct irq_chip ht_irq_chip = {
2198 3474
2199int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2200{ 3476{
2201 struct irq_cfg *cfg = irq_cfg + irq; 3477 struct irq_cfg *cfg;
2202 int err; 3478 int err;
2203 cpumask_t tmp; 3479 cpumask_t tmp;
2204 3480
@@ -2208,6 +3484,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2208 struct ht_irq_msg msg; 3484 struct ht_irq_msg msg;
2209 unsigned dest; 3485 unsigned dest;
2210 3486
3487 cfg = irq_cfg(irq);
2211 cpus_and(tmp, cfg->domain, tmp); 3488 cpus_and(tmp, cfg->domain, tmp);
2212 dest = cpu_mask_to_apicid(tmp); 3489 dest = cpu_mask_to_apicid(tmp);
2213 3490
@@ -2230,20 +3507,196 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2230 3507
2231 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3508 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2232 handle_edge_irq, "edge"); 3509 handle_edge_irq, "edge");
3510
3511 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
2233 } 3512 }
2234 return err; 3513 return err;
2235} 3514}
2236#endif /* CONFIG_HT_IRQ */ 3515#endif /* CONFIG_HT_IRQ */
2237 3516
3517#ifdef CONFIG_X86_64
3518/*
3519 * Re-target the irq to the specified CPU and enable the specified MMR located
3520 * on the specified blade to allow the sending of MSIs to the specified CPU.
3521 */
3522int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3523 unsigned long mmr_offset)
3524{
3525 const cpumask_t *eligible_cpu = get_cpu_mask(cpu);
3526 struct irq_cfg *cfg;
3527 int mmr_pnode;
3528 unsigned long mmr_value;
3529 struct uv_IO_APIC_route_entry *entry;
3530 unsigned long flags;
3531 int err;
3532
3533 err = assign_irq_vector(irq, *eligible_cpu);
3534 if (err != 0)
3535 return err;
3536
3537 spin_lock_irqsave(&vector_lock, flags);
3538 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3539 irq_name);
3540 spin_unlock_irqrestore(&vector_lock, flags);
3541
3542 cfg = irq_cfg(irq);
3543
3544 mmr_value = 0;
3545 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3546 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3547
3548 entry->vector = cfg->vector;
3549 entry->delivery_mode = INT_DELIVERY_MODE;
3550 entry->dest_mode = INT_DEST_MODE;
3551 entry->polarity = 0;
3552 entry->trigger = 0;
3553 entry->mask = 0;
3554 entry->dest = cpu_mask_to_apicid(*eligible_cpu);
3555
3556 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3557 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3558
3559 return irq;
3560}
3561
3562/*
3563 * Disable the specified MMR located on the specified blade so that MSIs are
3564 * no longer allowed to be sent.
3565 */
3566void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3567{
3568 unsigned long mmr_value;
3569 struct uv_IO_APIC_route_entry *entry;
3570 int mmr_pnode;
3571
3572 mmr_value = 0;
3573 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3574 BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3575
3576 entry->mask = 1;
3577
3578 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3579 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3580}
3581#endif /* CONFIG_X86_64 */
3582
3583int __init io_apic_get_redir_entries (int ioapic)
3584{
3585 union IO_APIC_reg_01 reg_01;
3586 unsigned long flags;
3587
3588 spin_lock_irqsave(&ioapic_lock, flags);
3589 reg_01.raw = io_apic_read(ioapic, 1);
3590 spin_unlock_irqrestore(&ioapic_lock, flags);
3591
3592 return reg_01.bits.entries;
3593}
3594
3595int __init probe_nr_irqs(void)
3596{
3597 int idx;
3598 int nr = 0;
3599#ifndef CONFIG_XEN
3600 int nr_min = 32;
3601#else
3602 int nr_min = NR_IRQS;
3603#endif
3604
3605 for (idx = 0; idx < nr_ioapics; idx++)
3606 nr += io_apic_get_redir_entries(idx) + 1;
3607
3608 /* double it for hotplug and msi and nmi */
3609 nr <<= 1;
3610
3611 /* something wrong ? */
3612 if (nr < nr_min)
3613 nr = nr_min;
3614
3615 return nr;
3616}
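A minimal standalone sketch of the sizing heuristic in probe_nr_irqs() above (not part of the diff; two IO-APICs whose registers report 23 as the last redirection-entry index, and the non-Xen floor of 32, are assumptions for the example):

/* Illustrative sketch only; not part of the kernel sources in this diff. */
#include <stdio.h>

int main(void)
{
        int redir_entries[] = { 23, 23 };       /* assumed: register reports last index */
        int nr_min = 32;                        /* non-Xen floor used in the code above */
        int nr = 0;
        unsigned int i;

        for (i = 0; i < sizeof(redir_entries) / sizeof(redir_entries[0]); i++)
                nr += redir_entries[i] + 1;     /* +1 converts last index to pin count  */

        nr <<= 1;                               /* headroom for hotplug, MSI and NMI    */

        if (nr < nr_min)
                nr = nr_min;

        printf("probe_nr_irqs() would return %d\n", nr);       /* 96 for this example */
        return 0;
}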
3617
2238/* -------------------------------------------------------------------------- 3618/* --------------------------------------------------------------------------
2239 ACPI-based IOAPIC Configuration 3619 ACPI-based IOAPIC Configuration
2240 -------------------------------------------------------------------------- */ 3620 -------------------------------------------------------------------------- */
2241 3621
2242#ifdef CONFIG_ACPI 3622#ifdef CONFIG_ACPI
2243 3623
2244#define IO_APIC_MAX_ID 0xFE 3624#ifdef CONFIG_X86_32
3625int __init io_apic_get_unique_id(int ioapic, int apic_id)
3626{
3627 union IO_APIC_reg_00 reg_00;
3628 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
3629 physid_mask_t tmp;
3630 unsigned long flags;
3631 int i = 0;
2245 3632
2246int __init io_apic_get_redir_entries (int ioapic) 3633 /*
3634 * The P4 platform supports up to 256 APIC IDs on two separate APIC
3635 * buses (one for LAPICs, one for IOAPICs), where predecessors only
3636 * support up to 16 on one shared APIC bus.
3637 *
3638 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
3639 * advantage of new APIC bus architecture.
3640 */
3641
3642 if (physids_empty(apic_id_map))
3643 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
3644
3645 spin_lock_irqsave(&ioapic_lock, flags);
3646 reg_00.raw = io_apic_read(ioapic, 0);
3647 spin_unlock_irqrestore(&ioapic_lock, flags);
3648
3649 if (apic_id >= get_physical_broadcast()) {
3650 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
3651 "%d\n", ioapic, apic_id, reg_00.bits.ID);
3652 apic_id = reg_00.bits.ID;
3653 }
3654
3655 /*
3656 * Every APIC in a system must have a unique ID or we get lots of nice
3657 * 'stuck on smp_invalidate_needed IPI wait' messages.
3658 */
3659 if (check_apicid_used(apic_id_map, apic_id)) {
3660
3661 for (i = 0; i < get_physical_broadcast(); i++) {
3662 if (!check_apicid_used(apic_id_map, i))
3663 break;
3664 }
3665
3666 if (i == get_physical_broadcast())
3667 panic("Max apic_id exceeded!\n");
3668
3669 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
3670 "trying %d\n", ioapic, apic_id, i);
3671
3672 apic_id = i;
3673 }
3674
3675 tmp = apicid_to_cpu_present(apic_id);
3676 physids_or(apic_id_map, apic_id_map, tmp);
3677
3678 if (reg_00.bits.ID != apic_id) {
3679 reg_00.bits.ID = apic_id;
3680
3681 spin_lock_irqsave(&ioapic_lock, flags);
3682 io_apic_write(ioapic, 0, reg_00.raw);
3683 reg_00.raw = io_apic_read(ioapic, 0);
3684 spin_unlock_irqrestore(&ioapic_lock, flags);
3685
3686 /* Sanity check */
3687 if (reg_00.bits.ID != apic_id) {
3688 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
3689 return -1;
3690 }
3691 }
3692
3693 apic_printk(APIC_VERBOSE, KERN_INFO
3694 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
3695
3696 return apic_id;
3697}
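A minimal standalone sketch of the "find a free APIC ID" fallback in io_apic_get_unique_id() above (not part of the diff; a plain bitmask stands in for the physid_mask_t bookkeeping, and the in-use IDs and broadcast limit are assumptions):

/* Illustrative sketch only; not part of the kernel sources in this diff. */
#include <stdio.h>

int main(void)
{
        unsigned int used = (1u << 0) | (1u << 1) | (1u << 4);  /* assumed IDs already taken */
        unsigned int broadcast = 15;    /* assumed get_physical_broadcast() value            */
        unsigned int id;

        for (id = 0; id < broadcast; id++)
                if (!(used & (1u << id)))
                        break;                  /* first unused ID, 2 in this example */

        if (id == broadcast)
                printf("Max apic_id exceeded!\n");
        else
                printf("Assigned apic_id %u\n", id);
        return 0;
}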
3698
3699int __init io_apic_get_version(int ioapic)
2247{ 3700{
2248 union IO_APIC_reg_01 reg_01; 3701 union IO_APIC_reg_01 reg_01;
2249 unsigned long flags; 3702 unsigned long flags;
@@ -2252,9 +3705,9 @@ int __init io_apic_get_redir_entries (int ioapic)
2252 reg_01.raw = io_apic_read(ioapic, 1); 3705 reg_01.raw = io_apic_read(ioapic, 1);
2253 spin_unlock_irqrestore(&ioapic_lock, flags); 3706 spin_unlock_irqrestore(&ioapic_lock, flags);
2254 3707
2255 return reg_01.bits.entries; 3708 return reg_01.bits.version;
2256} 3709}
2257 3710#endif
2258 3711
2259int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) 3712int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
2260{ 3713{
@@ -2306,6 +3759,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2306void __init setup_ioapic_dest(void) 3759void __init setup_ioapic_dest(void)
2307{ 3760{
2308 int pin, ioapic, irq, irq_entry; 3761 int pin, ioapic, irq, irq_entry;
3762 struct irq_cfg *cfg;
2309 3763
2310 if (skip_ioapic_setup == 1) 3764 if (skip_ioapic_setup == 1)
2311 return; 3765 return;
@@ -2321,10 +3775,15 @@ void __init setup_ioapic_dest(void)
2321 * when you have too many devices, because at that time only boot 3775 * when you have too many devices, because at that time only boot
2322 * cpu is online. 3776 * cpu is online.
2323 */ 3777 */
2324 if (!irq_cfg[irq].vector) 3778 cfg = irq_cfg(irq);
3779 if (!cfg->vector)
2325 setup_IO_APIC_irq(ioapic, pin, irq, 3780 setup_IO_APIC_irq(ioapic, pin, irq,
2326 irq_trigger(irq_entry), 3781 irq_trigger(irq_entry),
2327 irq_polarity(irq_entry)); 3782 irq_polarity(irq_entry));
3783#ifdef CONFIG_INTR_REMAP
3784 else if (intr_remapping_enabled)
3785 set_ir_ioapic_affinity_irq(irq, TARGET_CPUS);
3786#endif
2328 else 3787 else
2329 set_ioapic_affinity_irq(irq, TARGET_CPUS); 3788 set_ioapic_affinity_irq(irq, TARGET_CPUS);
2330 } 3789 }
@@ -2375,18 +3834,33 @@ void __init ioapic_init_mappings(void)
2375 struct resource *ioapic_res; 3834 struct resource *ioapic_res;
2376 int i; 3835 int i;
2377 3836
3837 irq_2_pin_init();
2378 ioapic_res = ioapic_setup_resources(); 3838 ioapic_res = ioapic_setup_resources();
2379 for (i = 0; i < nr_ioapics; i++) { 3839 for (i = 0; i < nr_ioapics; i++) {
2380 if (smp_found_config) { 3840 if (smp_found_config) {
2381 ioapic_phys = mp_ioapics[i].mp_apicaddr; 3841 ioapic_phys = mp_ioapics[i].mp_apicaddr;
3842#ifdef CONFIG_X86_32
3843 if (!ioapic_phys) {
3844 printk(KERN_ERR
3845 "WARNING: bogus zero IO-APIC "
3846 "address found in MPTABLE, "
3847 "disabling IO/APIC support!\n");
3848 smp_found_config = 0;
3849 skip_ioapic_setup = 1;
3850 goto fake_ioapic_page;
3851 }
3852#endif
2382 } else { 3853 } else {
3854#ifdef CONFIG_X86_32
3855fake_ioapic_page:
3856#endif
2383 ioapic_phys = (unsigned long) 3857 ioapic_phys = (unsigned long)
2384 alloc_bootmem_pages(PAGE_SIZE); 3858 alloc_bootmem_pages(PAGE_SIZE);
2385 ioapic_phys = __pa(ioapic_phys); 3859 ioapic_phys = __pa(ioapic_phys);
2386 } 3860 }
2387 set_fixmap_nocache(idx, ioapic_phys); 3861 set_fixmap_nocache(idx, ioapic_phys);
2388 apic_printk(APIC_VERBOSE, 3862 apic_printk(APIC_VERBOSE,
2389 "mapped IOAPIC to %016lx (%016lx)\n", 3863 "mapped IOAPIC to %08lx (%08lx)\n",
2390 __fix_to_virt(idx), ioapic_phys); 3864 __fix_to_virt(idx), ioapic_phys);
2391 idx++; 3865 idx++;
2392 3866
@@ -2420,4 +3894,3 @@ static int __init ioapic_insert_resources(void)
2420/* Insert the IO APIC resources after PCI initialization has occurred to handle 3894/* Insert the IO APIC resources after PCI initialization has occurred to handle
2421 * IO APICS that are mapped in on a BAR in PCI space. */ 3895 * IO APICS that are mapped in on a BAR in PCI space. */
2422late_initcall(ioapic_insert_resources); 3896late_initcall(ioapic_insert_resources);
2423
diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c
deleted file mode 100644
index 558abf4c796a..000000000000
--- a/arch/x86/kernel/io_apic_32.c
+++ /dev/null
@@ -1,2900 +0,0 @@
1/*
2 * Intel IO-APIC support for multi-Pentium hosts.
3 *
4 * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo
5 *
6 * Many thanks to Stig Venaas for trying out countless experimental
7 * patches and reporting/debugging problems patiently!
8 *
9 * (c) 1999, Multiple IO-APIC support, developed by
10 * Ken-ichi Yaku <yaku@css1.kbnes.nec.co.jp> and
11 * Hidemi Kishimoto <kisimoto@css1.kbnes.nec.co.jp>,
12 * further tested and cleaned up by Zach Brown <zab@redhat.com>
13 * and Ingo Molnar <mingo@redhat.com>
14 *
15 * Fixes
16 * Maciej W. Rozycki : Bits for genuine 82489DX APICs;
17 * thanks to Eric Gilmore
18 * and Rolf G. Tews
19 * for testing these extensively
20 * Paul Diefenbaugh : Added full ACPI support
21 */
22
23#include <linux/mm.h>
24#include <linux/interrupt.h>
25#include <linux/init.h>
26#include <linux/delay.h>
27#include <linux/sched.h>
28#include <linux/bootmem.h>
29#include <linux/mc146818rtc.h>
30#include <linux/compiler.h>
31#include <linux/acpi.h>
32#include <linux/module.h>
33#include <linux/sysdev.h>
34#include <linux/pci.h>
35#include <linux/msi.h>
36#include <linux/htirq.h>
37#include <linux/freezer.h>
38#include <linux/kthread.h>
39#include <linux/jiffies.h> /* time_after() */
40
41#include <asm/io.h>
42#include <asm/smp.h>
43#include <asm/desc.h>
44#include <asm/timer.h>
45#include <asm/i8259.h>
46#include <asm/nmi.h>
47#include <asm/msidef.h>
48#include <asm/hypertransport.h>
49
50#include <mach_apic.h>
51#include <mach_apicdef.h>
52
53int (*ioapic_renumber_irq)(int ioapic, int irq);
54atomic_t irq_mis_count;
55
56/* Where if anywhere is the i8259 connect in external int mode */
57static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
58
59static DEFINE_SPINLOCK(ioapic_lock);
60static DEFINE_SPINLOCK(vector_lock);
61
62int timer_through_8259 __initdata;
63
64/*
65 * Is the SiS APIC rmw bug present ?
66 * -1 = don't know, 0 = no, 1 = yes
67 */
68int sis_apic_bug = -1;
69
70/*
71 * # of IRQ routing registers
72 */
73int nr_ioapic_registers[MAX_IO_APICS];
74
75/* I/O APIC entries */
76struct mp_config_ioapic mp_ioapics[MAX_IO_APICS];
77int nr_ioapics;
78
79/* MP IRQ source entries */
80struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES];
81
82/* # of MP IRQ source entries */
83int mp_irq_entries;
84
85#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
86int mp_bus_id_to_type[MAX_MP_BUSSES];
87#endif
88
89DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
90
91static int disable_timer_pin_1 __initdata;
92
93/*
94 * Rough estimation of how many shared IRQs there are, can
95 * be changed anytime.
96 */
97#define MAX_PLUS_SHARED_IRQS NR_IRQS
98#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS)
99
100/*
101 * This is performance-critical, we want to do it O(1)
102 *
103 * the indexing order of this array favors 1:1 mappings
104 * between pins and IRQs.
105 */
106
107static struct irq_pin_list {
108 int apic, pin, next;
109} irq_2_pin[PIN_MAP_SIZE];
110
111struct io_apic {
112 unsigned int index;
113 unsigned int unused[3];
114 unsigned int data;
115};
116
117static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
118{
119 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
120 + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK);
121}
122
123static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
124{
125 struct io_apic __iomem *io_apic = io_apic_base(apic);
126 writel(reg, &io_apic->index);
127 return readl(&io_apic->data);
128}
129
130static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
131{
132 struct io_apic __iomem *io_apic = io_apic_base(apic);
133 writel(reg, &io_apic->index);
134 writel(value, &io_apic->data);
135}
136
137/*
138 * Re-write a value: to be used for read-modify-write
139 * cycles where the read already set up the index register.
140 *
141 * Older SiS APIC requires we rewrite the index register
142 */
143static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
144{
145 volatile struct io_apic __iomem *io_apic = io_apic_base(apic);
146 if (sis_apic_bug)
147 writel(reg, &io_apic->index);
148 writel(value, &io_apic->data);
149}
150
151union entry_union {
152 struct { u32 w1, w2; };
153 struct IO_APIC_route_entry entry;
154};
155
156static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
157{
158 union entry_union eu;
159 unsigned long flags;
160 spin_lock_irqsave(&ioapic_lock, flags);
161 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
162 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
163 spin_unlock_irqrestore(&ioapic_lock, flags);
164 return eu.entry;
165}
166
167/*
168 * When we write a new IO APIC routing entry, we need to write the high
169 * word first! If the mask bit in the low word is clear, we will enable
170 * the interrupt, and we need to make sure the entry is fully populated
171 * before that happens.
172 */
173static void
174__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
175{
176 union entry_union eu;
177 eu.entry = e;
178 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
179 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
180}
181
182static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
183{
184 unsigned long flags;
185 spin_lock_irqsave(&ioapic_lock, flags);
186 __ioapic_write_entry(apic, pin, e);
187 spin_unlock_irqrestore(&ioapic_lock, flags);
188}
189
190/*
191 * When we mask an IO APIC routing entry, we need to write the low
192 * word first, in order to set the mask bit before we change the
193 * high bits!
194 */
195static void ioapic_mask_entry(int apic, int pin)
196{
197 unsigned long flags;
198 union entry_union eu = { .entry.mask = 1 };
199
200 spin_lock_irqsave(&ioapic_lock, flags);
201 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
202 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
203 spin_unlock_irqrestore(&ioapic_lock, flags);
204}
205
206/*
207 * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
208 * shared ISA-space IRQs, so we have to support them. We are super
209 * fast in the common case, and fast for shared ISA-space IRQs.
210 */
211static void add_pin_to_irq(unsigned int irq, int apic, int pin)
212{
213 static int first_free_entry = NR_IRQS;
214 struct irq_pin_list *entry = irq_2_pin + irq;
215
216 while (entry->next)
217 entry = irq_2_pin + entry->next;
218
219 if (entry->pin != -1) {
220 entry->next = first_free_entry;
221 entry = irq_2_pin + entry->next;
222 if (++first_free_entry >= PIN_MAP_SIZE)
223 panic("io_apic.c: whoops");
224 }
225 entry->apic = apic;
226 entry->pin = pin;
227}
228
229/*
230 * Reroute an IRQ to a different pin.
231 */
232static void __init replace_pin_at_irq(unsigned int irq,
233 int oldapic, int oldpin,
234 int newapic, int newpin)
235{
236 struct irq_pin_list *entry = irq_2_pin + irq;
237
238 while (1) {
239 if (entry->apic == oldapic && entry->pin == oldpin) {
240 entry->apic = newapic;
241 entry->pin = newpin;
242 }
243 if (!entry->next)
244 break;
245 entry = irq_2_pin + entry->next;
246 }
247}
248
249static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable)
250{
251 struct irq_pin_list *entry = irq_2_pin + irq;
252 unsigned int pin, reg;
253
254 for (;;) {
255 pin = entry->pin;
256 if (pin == -1)
257 break;
258 reg = io_apic_read(entry->apic, 0x10 + pin*2);
259 reg &= ~disable;
260 reg |= enable;
261 io_apic_modify(entry->apic, 0x10 + pin*2, reg);
262 if (!entry->next)
263 break;
264 entry = irq_2_pin + entry->next;
265 }
266}
267
268/* mask = 1 */
269static void __mask_IO_APIC_irq(unsigned int irq)
270{
271 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0);
272}
273
274/* mask = 0 */
275static void __unmask_IO_APIC_irq(unsigned int irq)
276{
277 __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED);
278}
279
280/* mask = 1, trigger = 0 */
281static void __mask_and_edge_IO_APIC_irq(unsigned int irq)
282{
283 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED,
284 IO_APIC_REDIR_LEVEL_TRIGGER);
285}
286
287/* mask = 0, trigger = 1 */
288static void __unmask_and_level_IO_APIC_irq(unsigned int irq)
289{
290 __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER,
291 IO_APIC_REDIR_MASKED);
292}
293
294static void mask_IO_APIC_irq(unsigned int irq)
295{
296 unsigned long flags;
297
298 spin_lock_irqsave(&ioapic_lock, flags);
299 __mask_IO_APIC_irq(irq);
300 spin_unlock_irqrestore(&ioapic_lock, flags);
301}
302
303static void unmask_IO_APIC_irq(unsigned int irq)
304{
305 unsigned long flags;
306
307 spin_lock_irqsave(&ioapic_lock, flags);
308 __unmask_IO_APIC_irq(irq);
309 spin_unlock_irqrestore(&ioapic_lock, flags);
310}
311
312static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
313{
314 struct IO_APIC_route_entry entry;
315
316 /* Check delivery_mode to be sure we're not clearing an SMI pin */
317 entry = ioapic_read_entry(apic, pin);
318 if (entry.delivery_mode == dest_SMI)
319 return;
320
321 /*
322 * Disable it in the IO-APIC irq-routing table:
323 */
324 ioapic_mask_entry(apic, pin);
325}
326
327static void clear_IO_APIC(void)
328{
329 int apic, pin;
330
331 for (apic = 0; apic < nr_ioapics; apic++)
332 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
333 clear_IO_APIC_pin(apic, pin);
334}
335
336#ifdef CONFIG_SMP
337static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask)
338{
339 unsigned long flags;
340 int pin;
341 struct irq_pin_list *entry = irq_2_pin + irq;
342 unsigned int apicid_value;
343 cpumask_t tmp;
344
345 cpus_and(tmp, cpumask, cpu_online_map);
346 if (cpus_empty(tmp))
347 tmp = TARGET_CPUS;
348
349 cpus_and(cpumask, tmp, CPU_MASK_ALL);
350
351 apicid_value = cpu_mask_to_apicid(cpumask);
352 /* Prepare to do the io_apic_write */
353 apicid_value = apicid_value << 24;
354 spin_lock_irqsave(&ioapic_lock, flags);
355 for (;;) {
356 pin = entry->pin;
357 if (pin == -1)
358 break;
359 io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
360 if (!entry->next)
361 break;
362 entry = irq_2_pin + entry->next;
363 }
364 irq_desc[irq].affinity = cpumask;
365 spin_unlock_irqrestore(&ioapic_lock, flags);
366}
367
368#if defined(CONFIG_IRQBALANCE)
369# include <asm/processor.h> /* kernel_thread() */
370# include <linux/kernel_stat.h> /* kstat */
371# include <linux/slab.h> /* kmalloc() */
372# include <linux/timer.h>
373
374#define IRQBALANCE_CHECK_ARCH -999
375#define MAX_BALANCED_IRQ_INTERVAL (5*HZ)
376#define MIN_BALANCED_IRQ_INTERVAL (HZ/2)
377#define BALANCED_IRQ_MORE_DELTA (HZ/10)
378#define BALANCED_IRQ_LESS_DELTA (HZ)
379
380static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH;
381static int physical_balance __read_mostly;
382static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL;
383
384static struct irq_cpu_info {
385 unsigned long *last_irq;
386 unsigned long *irq_delta;
387 unsigned long irq;
388} irq_cpu_data[NR_CPUS];
389
390#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq)
391#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq])
392#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq])
393
394#define IDLE_ENOUGH(cpu,now) \
395 (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1))
396
397#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask)
398
399#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i)))
400
401static cpumask_t balance_irq_affinity[NR_IRQS] = {
402 [0 ... NR_IRQS-1] = CPU_MASK_ALL
403};
404
405void set_balance_irq_affinity(unsigned int irq, cpumask_t mask)
406{
407 balance_irq_affinity[irq] = mask;
408}
409
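/*
 * Pick the next CPU after curr_cpu (stepping in the given direction and
 * wrapping modulo NR_CPUS) that is online and allowed by allowed_mask.
 * Prefer an idle CPU; once the scan has gone full circle, settle for a
 * busy one.
 */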
410static unsigned long move(int curr_cpu, cpumask_t allowed_mask,
411 unsigned long now, int direction)
412{
413 int search_idle = 1;
414 int cpu = curr_cpu;
415
416 goto inside;
417
418 do {
419 if (unlikely(cpu == curr_cpu))
420 search_idle = 0;
421inside:
422 if (direction == 1) {
423 cpu++;
424 if (cpu >= NR_CPUS)
425 cpu = 0;
426 } else {
427 cpu--;
428 if (cpu == -1)
429 cpu = NR_CPUS-1;
430 }
431 } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) ||
432 (search_idle && !IDLE_ENOUGH(cpu, now)));
433
434 return cpu;
435}
436
437static inline void balance_irq(int cpu, int irq)
438{
439 unsigned long now = jiffies;
440 cpumask_t allowed_mask;
441 unsigned int new_cpu;
442
443 if (irqbalance_disabled)
444 return;
445
446 cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]);
447 new_cpu = move(cpu, allowed_mask, now, 1);
448 if (cpu != new_cpu)
449 set_pending_irq(irq, cpumask_of_cpu(new_cpu));
450}
451
452static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold)
453{
454 int i, j;
455
456 for_each_online_cpu(i) {
457 for (j = 0; j < NR_IRQS; j++) {
458 if (!irq_desc[j].action)
459 continue;
460 /* Is it a significant load ? */
461 if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) <
462 useful_load_threshold)
463 continue;
464 balance_irq(i, j);
465 }
466 }
467 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
468 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
469 return;
470}
471
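/*
 * One balancing pass: accumulate per-package IRQ count deltas since the
 * last pass, find the most and least loaded packages, then pick the IRQ
 * on the heavy package whose delta best fits half the imbalance and
 * retarget it (via set_pending_irq) at the least loaded allowed sibling.
 */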
472static void do_irq_balance(void)
473{
474 int i, j;
475 unsigned long max_cpu_irq = 0, min_cpu_irq = (~0);
476 unsigned long move_this_load = 0;
477 int max_loaded = 0, min_loaded = 0;
478 int load;
479 unsigned long useful_load_threshold = balanced_irq_interval + 10;
480 int selected_irq;
481 int tmp_loaded, first_attempt = 1;
482 unsigned long tmp_cpu_irq;
483 unsigned long imbalance = 0;
484 cpumask_t allowed_mask, target_cpu_mask, tmp;
485
486 for_each_possible_cpu(i) {
487 int package_index;
488 CPU_IRQ(i) = 0;
489 if (!cpu_online(i))
490 continue;
491 package_index = CPU_TO_PACKAGEINDEX(i);
492 for (j = 0; j < NR_IRQS; j++) {
493 unsigned long value_now, delta;
494 /* Is this an active IRQ or balancing disabled ? */
495 if (!irq_desc[j].action || irq_balancing_disabled(j))
496 continue;
497 if (package_index == i)
498 IRQ_DELTA(package_index, j) = 0;
499 /* Determine the total count per processor per IRQ */
500 value_now = (unsigned long) kstat_cpu(i).irqs[j];
501
502 /* Determine the activity per processor per IRQ */
503 delta = value_now - LAST_CPU_IRQ(i, j);
504
505 /* Update last_cpu_irq[][] for the next time */
506 LAST_CPU_IRQ(i, j) = value_now;
507
508 /* Ignore IRQs whose rate is less than the clock */
509 if (delta < useful_load_threshold)
510 continue;
511 /* update the load for the processor or package total */
512 IRQ_DELTA(package_index, j) += delta;
513
514 /* Keep track of the higher numbered sibling as well */
515 if (i != package_index)
516 CPU_IRQ(i) += delta;
517 /*
518 * We have sibling A and sibling B in the package
519 *
520 * cpu_irq[A] = load for cpu A + load for cpu B
521 * cpu_irq[B] = load for cpu B
522 */
523 CPU_IRQ(package_index) += delta;
524 }
525 }
526 /* Find the least loaded processor package */
527 for_each_online_cpu(i) {
528 if (i != CPU_TO_PACKAGEINDEX(i))
529 continue;
530 if (min_cpu_irq > CPU_IRQ(i)) {
531 min_cpu_irq = CPU_IRQ(i);
532 min_loaded = i;
533 }
534 }
535 max_cpu_irq = ULONG_MAX;
536
537tryanothercpu:
538 /*
539 * Look for heaviest loaded processor.
540 * We may come back to get the next heaviest loaded processor.
541 * Skip processors with trivial loads.
542 */
543 tmp_cpu_irq = 0;
544 tmp_loaded = -1;
545 for_each_online_cpu(i) {
546 if (i != CPU_TO_PACKAGEINDEX(i))
547 continue;
548 if (max_cpu_irq <= CPU_IRQ(i))
549 continue;
550 if (tmp_cpu_irq < CPU_IRQ(i)) {
551 tmp_cpu_irq = CPU_IRQ(i);
552 tmp_loaded = i;
553 }
554 }
555
556 if (tmp_loaded == -1) {
557 /*
558 * A small number of heavy interrupt sources can load some
559 * of the cpus too much. We use Ingo's original approach to
560 * rotate them around.
561 */
562 if (!first_attempt && imbalance >= useful_load_threshold) {
563 rotate_irqs_among_cpus(useful_load_threshold);
564 return;
565 }
566 goto not_worth_the_effort;
567 }
568
569 first_attempt = 0; /* heaviest search */
570 max_cpu_irq = tmp_cpu_irq; /* load */
571 max_loaded = tmp_loaded; /* processor */
572 imbalance = (max_cpu_irq - min_cpu_irq) / 2;
573
574 /*
575 * If the imbalance is less than roughly 10% of the max load,
576 * we hit diminishing returns - quit.
577 */
578 if (imbalance < (max_cpu_irq >> 3))
579 goto not_worth_the_effort;
580
581tryanotherirq:
582 /* if we select an IRQ to move that can't go where we want, then
583 * see if there is another one to try.
584 */
585 move_this_load = 0;
586 selected_irq = -1;
587 for (j = 0; j < NR_IRQS; j++) {
588 /* Is this an active IRQ? */
589 if (!irq_desc[j].action)
590 continue;
591 if (imbalance <= IRQ_DELTA(max_loaded, j))
592 continue;
593 /* Try to find the IRQ that is closest to the imbalance
594 * without going over.
595 */
596 if (move_this_load < IRQ_DELTA(max_loaded, j)) {
597 move_this_load = IRQ_DELTA(max_loaded, j);
598 selected_irq = j;
599 }
600 }
601 if (selected_irq == -1)
602 goto tryanothercpu;
603
604 imbalance = move_this_load;
605
606 /* In the physical_balance case we accumulated both load
607 * values in one of the siblings' cpu_irq[] slots, so that the
608 * same code can be used for physical and logical processors
609 * as much as possible.
610 *
611 * NOTE: the cpu_irq[] array holds the sum of the load for
612 * sibling A and sibling B in the slot for the lowest numbered
613 * sibling (A), _AND_ the load for sibling B in the slot for
614 * the higher numbered sibling.
615 *
616 * We seek the least loaded sibling by making the comparison
617 * (A+B)/2 vs B
618 */
619 load = CPU_IRQ(min_loaded) >> 1;
620 for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) {
621 if (load > CPU_IRQ(j)) {
622 /* This won't change cpu_sibling_map[min_loaded] */
623 load = CPU_IRQ(j);
624 min_loaded = j;
625 }
626 }
627
628 cpus_and(allowed_mask,
629 cpu_online_map,
630 balance_irq_affinity[selected_irq]);
631 target_cpu_mask = cpumask_of_cpu(min_loaded);
632 cpus_and(tmp, target_cpu_mask, allowed_mask);
633
634 if (!cpus_empty(tmp)) {
635 /* mark for change destination */
636 set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded));
637
638 /* Since we made a change, come back sooner to
639 * check for more variation.
640 */
641 balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL,
642 balanced_irq_interval - BALANCED_IRQ_LESS_DELTA);
643 return;
644 }
645 goto tryanotherirq;
646
647not_worth_the_effort:
648 /*
649 * if we did not find an IRQ to move, then adjust the time interval
650 * upward
651 */
652 balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL,
653 balanced_irq_interval + BALANCED_IRQ_MORE_DELTA);
654 return;
655}
656
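/*
 * Main loop of the "kirqd" kernel thread: push all IRQs to CPU 0 as a
 * known starting point, then run do_irq_balance() roughly every
 * balanced_irq_interval jiffies.
 */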
657static int balanced_irq(void *unused)
658{
659 int i;
660 unsigned long prev_balance_time = jiffies;
661 long time_remaining = balanced_irq_interval;
662
663 /* push everything to CPU 0 to give us a starting point. */
664 for (i = 0 ; i < NR_IRQS ; i++) {
665 irq_desc[i].pending_mask = cpumask_of_cpu(0);
666 set_pending_irq(i, cpumask_of_cpu(0));
667 }
668
669 set_freezable();
670 for ( ; ; ) {
671 time_remaining = schedule_timeout_interruptible(time_remaining);
672 try_to_freeze();
673 if (time_after(jiffies,
674 prev_balance_time+balanced_irq_interval)) {
675 preempt_disable();
676 do_irq_balance();
677 prev_balance_time = jiffies;
678 time_remaining = balanced_irq_interval;
679 preempt_enable();
680 }
681 }
682 return 0;
683}
684
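/*
 * Decide whether IRQ balancing should run at all (the subarchitecture may
 * veto it, and a single online CPU makes it pointless), allocate the
 * per-CPU accounting arrays and spawn the "kirqd" thread.
 */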
685static int __init balanced_irq_init(void)
686{
687 int i;
688 struct cpuinfo_x86 *c;
689 cpumask_t tmp;
690
691 cpus_shift_right(tmp, cpu_online_map, 2);
692 c = &boot_cpu_data;
693 /* When not overridden by the command line, ask the subarchitecture. */
694 if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH)
695 irqbalance_disabled = NO_BALANCE_IRQ;
696 if (irqbalance_disabled)
697 return 0;
698
699 /* disable irqbalance completely if there is only one processor online */
700 if (num_online_cpus() < 2) {
701 irqbalance_disabled = 1;
702 return 0;
703 }
704 /*
705 * Enable physical balance only if more than 1 physical processor
706 * is present
707 */
708 if (smp_num_siblings > 1 && !cpus_empty(tmp))
709 physical_balance = 1;
710
711 for_each_online_cpu(i) {
712 irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
713 irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL);
714 if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) {
715 printk(KERN_ERR "balanced_irq_init: out of memory");
716 goto failed;
717 }
718 }
719
720 printk(KERN_INFO "Starting balanced_irq\n");
721 if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd")))
722 return 0;
723 printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq");
724failed:
725 for_each_possible_cpu(i) {
726 kfree(irq_cpu_data[i].irq_delta);
727 irq_cpu_data[i].irq_delta = NULL;
728 kfree(irq_cpu_data[i].last_irq);
729 irq_cpu_data[i].last_irq = NULL;
730 }
731 return 0;
732}
733
734int __devinit irqbalance_disable(char *str)
735{
736 irqbalance_disabled = 1;
737 return 1;
738}
739
740__setup("noirqbalance", irqbalance_disable);
741
742late_initcall(balanced_irq_init);
743#endif /* CONFIG_IRQBALANCE */
744#endif /* CONFIG_SMP */
745
746#ifndef CONFIG_SMP
747void send_IPI_self(int vector)
748{
749 unsigned int cfg;
750
751 /*
752 * Wait for idle.
753 */
754 apic_wait_icr_idle();
755 cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL;
756 /*
757 * Send the IPI. The write to APIC_ICR fires this off.
758 */
759 apic_write_around(APIC_ICR, cfg);
760}
761#endif /* !CONFIG_SMP */
762
763
764/*
765 * Support for broken MP BIOSes: enables hand-redirection of PIRQ0-7 to
766 * specific CPU-side IRQs.
767 */
768
769#define MAX_PIRQS 8
770static int pirq_entries [MAX_PIRQS];
771static int pirqs_enabled;
772int skip_ioapic_setup;
773
774static int __init ioapic_pirq_setup(char *str)
775{
776 int i, max;
777 int ints[MAX_PIRQS+1];
778
779 get_options(str, ARRAY_SIZE(ints), ints);
780
781 for (i = 0; i < MAX_PIRQS; i++)
782 pirq_entries[i] = -1;
783
784 pirqs_enabled = 1;
785 apic_printk(APIC_VERBOSE, KERN_INFO
786 "PIRQ redirection, working around broken MP-BIOS.\n");
787 max = MAX_PIRQS;
788 if (ints[0] < MAX_PIRQS)
789 max = ints[0];
790
791 for (i = 0; i < max; i++) {
792 apic_printk(APIC_VERBOSE, KERN_DEBUG
793 "... PIRQ%d -> IRQ %d\n", i, ints[i+1]);
794 /*
795 * PIRQs are mapped upside down, usually.
796 */
797 pirq_entries[MAX_PIRQS-i-1] = ints[i+1];
798 }
799 return 1;
800}
801
802__setup("pirq=", ioapic_pirq_setup);
803
804/*
805 * Find the IRQ entry number of a certain pin.
806 */
807static int find_irq_entry(int apic, int pin, int type)
808{
809 int i;
810
811 for (i = 0; i < mp_irq_entries; i++)
812 if (mp_irqs[i].mp_irqtype == type &&
813 (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid ||
814 mp_irqs[i].mp_dstapic == MP_APIC_ALL) &&
815 mp_irqs[i].mp_dstirq == pin)
816 return i;
817
818 return -1;
819}
820
821/*
822 * Find the pin to which IRQ[irq] (ISA) is connected
823 */
824static int __init find_isa_irq_pin(int irq, int type)
825{
826 int i;
827
828 for (i = 0; i < mp_irq_entries; i++) {
829 int lbus = mp_irqs[i].mp_srcbus;
830
831 if (test_bit(lbus, mp_bus_not_pci) &&
832 (mp_irqs[i].mp_irqtype == type) &&
833 (mp_irqs[i].mp_srcbusirq == irq))
834
835 return mp_irqs[i].mp_dstirq;
836 }
837 return -1;
838}
839
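/*
 * Find the IO-APIC that serves the given ISA IRQ: the counterpart of
 * find_isa_irq_pin(), returning the APIC index instead of the pin.
 */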
840static int __init find_isa_irq_apic(int irq, int type)
841{
842 int i;
843
844 for (i = 0; i < mp_irq_entries; i++) {
845 int lbus = mp_irqs[i].mp_srcbus;
846
847 if (test_bit(lbus, mp_bus_not_pci) &&
848 (mp_irqs[i].mp_irqtype == type) &&
849 (mp_irqs[i].mp_srcbusirq == irq))
850 break;
851 }
852 if (i < mp_irq_entries) {
853 int apic;
854 for (apic = 0; apic < nr_ioapics; apic++) {
855 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic)
856 return apic;
857 }
858 }
859
860 return -1;
861}
862
863/*
864 * Find a specific PCI IRQ entry.
865 * Not an __init, possibly needed by modules
866 */
867static int pin_2_irq(int idx, int apic, int pin);
868
869int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
870{
871 int apic, i, best_guess = -1;
872
873 apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, "
874 "slot:%d, pin:%d.\n", bus, slot, pin);
875 if (test_bit(bus, mp_bus_not_pci)) {
876 printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
877 return -1;
878 }
879 for (i = 0; i < mp_irq_entries; i++) {
880 int lbus = mp_irqs[i].mp_srcbus;
881
882 for (apic = 0; apic < nr_ioapics; apic++)
883 if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic ||
884 mp_irqs[i].mp_dstapic == MP_APIC_ALL)
885 break;
886
887 if (!test_bit(lbus, mp_bus_not_pci) &&
888 !mp_irqs[i].mp_irqtype &&
889 (bus == lbus) &&
890 (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) {
891 int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq);
892
893 if (!(apic || IO_APIC_IRQ(irq)))
894 continue;
895
896 if (pin == (mp_irqs[i].mp_srcbusirq & 3))
897 return irq;
898 /*
899 * Use the first all-but-pin matching entry as a
900 * best-guess fuzzy result for broken mptables.
901 */
902 if (best_guess < 0)
903 best_guess = irq;
904 }
905 }
906 return best_guess;
907}
908EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
909
910/*
911 * This function is currently only a helper for the i386 SMP boot process,
912 * where we need to reprogram the ioredtbls to cater for the CPUs which have
913 * come online, so the mask in all cases should simply be TARGET_CPUS.
914 */
915#ifdef CONFIG_SMP
916void __init setup_ioapic_dest(void)
917{
918 int pin, ioapic, irq, irq_entry;
919
920 if (skip_ioapic_setup == 1)
921 return;
922
923 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
924 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
925 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
926 if (irq_entry == -1)
927 continue;
928 irq = pin_2_irq(irq_entry, ioapic, pin);
929 set_ioapic_affinity_irq(irq, TARGET_CPUS);
930 }
931
932 }
933}
934#endif
935
936#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
937/*
938 * EISA Edge/Level control register, ELCR
939 */
940static int EISA_ELCR(unsigned int irq)
941{
942 if (irq < 16) {
943 unsigned int port = 0x4d0 + (irq >> 3);
944 return (inb(port) >> (irq & 7)) & 1;
945 }
946 apic_printk(APIC_VERBOSE, KERN_INFO
947 "Broken MPtable reports ISA irq %d\n", irq);
948 return 0;
949}
950#endif
951
952/* ISA interrupts are always polarity zero edge triggered,
953 * when listed as conforming in the MP table. */
954
955#define default_ISA_trigger(idx) (0)
956#define default_ISA_polarity(idx) (0)
957
958/* EISA interrupts are always polarity zero and can be edge or level
959 * trigger depending on the ELCR value. If an interrupt is listed as
960 * EISA conforming in the MP table, that means its trigger type must
961 * be read in from the ELCR */
962
963#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq))
964#define default_EISA_polarity(idx) default_ISA_polarity(idx)
965
966/* PCI interrupts are always polarity one level triggered,
967 * when listed as conforming in the MP table. */
968
969#define default_PCI_trigger(idx) (1)
970#define default_PCI_polarity(idx) (1)
971
972/* MCA interrupts are always polarity zero level triggered,
973 * when listed as conforming in the MP table. */
974
975#define default_MCA_trigger(idx) (1)
976#define default_MCA_polarity(idx) default_ISA_polarity(idx)
977
978static int MPBIOS_polarity(int idx)
979{
980 int bus = mp_irqs[idx].mp_srcbus;
981 int polarity;
982
983 /*
984 * Determine IRQ line polarity (high active or low active):
985 */
986 switch (mp_irqs[idx].mp_irqflag & 3) {
987 case 0: /* conforms, ie. bus-type dependent polarity */
988 {
989 polarity = test_bit(bus, mp_bus_not_pci)?
990 default_ISA_polarity(idx):
991 default_PCI_polarity(idx);
992 break;
993 }
994 case 1: /* high active */
995 {
996 polarity = 0;
997 break;
998 }
999 case 2: /* reserved */
1000 {
1001 printk(KERN_WARNING "broken BIOS!!\n");
1002 polarity = 1;
1003 break;
1004 }
1005 case 3: /* low active */
1006 {
1007 polarity = 1;
1008 break;
1009 }
1010 default: /* invalid */
1011 {
1012 printk(KERN_WARNING "broken BIOS!!\n");
1013 polarity = 1;
1014 break;
1015 }
1016 }
1017 return polarity;
1018}
1019
1020static int MPBIOS_trigger(int idx)
1021{
1022 int bus = mp_irqs[idx].mp_srcbus;
1023 int trigger;
1024
1025 /*
1026 * Determine IRQ trigger mode (edge or level sensitive):
1027 */
1028 switch ((mp_irqs[idx].mp_irqflag>>2) & 3) {
1029 case 0: /* conforms, ie. bus-type dependent */
1030 {
1031 trigger = test_bit(bus, mp_bus_not_pci)?
1032 default_ISA_trigger(idx):
1033 default_PCI_trigger(idx);
1034#if defined(CONFIG_EISA) || defined(CONFIG_MCA)
1035 switch (mp_bus_id_to_type[bus]) {
1036 case MP_BUS_ISA: /* ISA pin */
1037 {
1038 /* set before the switch */
1039 break;
1040 }
1041 case MP_BUS_EISA: /* EISA pin */
1042 {
1043 trigger = default_EISA_trigger(idx);
1044 break;
1045 }
1046 case MP_BUS_PCI: /* PCI pin */
1047 {
1048 /* set before the switch */
1049 break;
1050 }
1051 case MP_BUS_MCA: /* MCA pin */
1052 {
1053 trigger = default_MCA_trigger(idx);
1054 break;
1055 }
1056 default:
1057 {
1058 printk(KERN_WARNING "broken BIOS!!\n");
1059 trigger = 1;
1060 break;
1061 }
1062 }
1063#endif
1064 break;
1065 }
1066 case 1: /* edge */
1067 {
1068 trigger = 0;
1069 break;
1070 }
1071 case 2: /* reserved */
1072 {
1073 printk(KERN_WARNING "broken BIOS!!\n");
1074 trigger = 1;
1075 break;
1076 }
1077 case 3: /* level */
1078 {
1079 trigger = 1;
1080 break;
1081 }
1082 default: /* invalid */
1083 {
1084 printk(KERN_WARNING "broken BIOS!!\n");
1085 trigger = 0;
1086 break;
1087 }
1088 }
1089 return trigger;
1090}
1091
1092static inline int irq_polarity(int idx)
1093{
1094 return MPBIOS_polarity(idx);
1095}
1096
1097static inline int irq_trigger(int idx)
1098{
1099 return MPBIOS_trigger(idx);
1100}
1101
1102static int pin_2_irq(int idx, int apic, int pin)
1103{
1104 int irq, i;
1105 int bus = mp_irqs[idx].mp_srcbus;
1106
1107 /*
1108 * Debugging check, we are in big trouble if this message pops up!
1109 */
1110 if (mp_irqs[idx].mp_dstirq != pin)
1111 printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n");
1112
1113 if (test_bit(bus, mp_bus_not_pci))
1114 irq = mp_irqs[idx].mp_srcbusirq;
1115 else {
1116 /*
1117 * PCI IRQs are mapped in order
1118 */
1119 i = irq = 0;
1120 while (i < apic)
1121 irq += nr_ioapic_registers[i++];
1122 irq += pin;
1123
1124 /*
1125 * For MPS mode, so far only needed by ES7000 platform
1126 */
1127 if (ioapic_renumber_irq)
1128 irq = ioapic_renumber_irq(apic, irq);
1129 }
1130
1131 /*
1132 * PCI IRQ command line redirection. Yes, limits are hardcoded.
1133 */
1134 if ((pin >= 16) && (pin <= 23)) {
1135 if (pirq_entries[pin-16] != -1) {
1136 if (!pirq_entries[pin-16]) {
1137 apic_printk(APIC_VERBOSE, KERN_DEBUG
1138 "disabling PIRQ%d\n", pin-16);
1139 } else {
1140 irq = pirq_entries[pin-16];
1141 apic_printk(APIC_VERBOSE, KERN_DEBUG
1142 "using PIRQ%d -> IRQ %d\n",
1143 pin-16, irq);
1144 }
1145 }
1146 }
1147 return irq;
1148}
1149
1150static inline int IO_APIC_irq_trigger(int irq)
1151{
1152 int apic, idx, pin;
1153
1154 for (apic = 0; apic < nr_ioapics; apic++) {
1155 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1156 idx = find_irq_entry(apic, pin, mp_INT);
1157 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1158 return irq_trigger(idx);
1159 }
1160 }
1161 /*
1162 * nonexistent IRQs are edge default
1163 */
1164 return 0;
1165}
1166
1167/* irq_vector[] is sized by the sum of all RTEs in all I/O APICs. */
1168static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 };
1169
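/*
 * Allocate a vector for an IRQ. Vectors are handed out in steps of 8
 * starting at FIRST_DEVICE_VECTOR so that allocations are spread across
 * interrupt priority levels; the search wraps below first_system_vector
 * and skips vectors already marked in used_vectors.
 */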
1170static int __assign_irq_vector(int irq)
1171{
1172 static int current_vector = FIRST_DEVICE_VECTOR, current_offset;
1173 int vector, offset;
1174
1175 BUG_ON((unsigned)irq >= NR_IRQ_VECTORS);
1176
1177 if (irq_vector[irq] > 0)
1178 return irq_vector[irq];
1179
1180 vector = current_vector;
1181 offset = current_offset;
1182next:
1183 vector += 8;
1184 if (vector >= first_system_vector) {
1185 offset = (offset + 1) % 8;
1186 vector = FIRST_DEVICE_VECTOR + offset;
1187 }
1188 if (vector == current_vector)
1189 return -ENOSPC;
1190 if (test_and_set_bit(vector, used_vectors))
1191 goto next;
1192
1193 current_vector = vector;
1194 current_offset = offset;
1195 irq_vector[irq] = vector;
1196
1197 return vector;
1198}
1199
1200static int assign_irq_vector(int irq)
1201{
1202 unsigned long flags;
1203 int vector;
1204
1205 spin_lock_irqsave(&vector_lock, flags);
1206 vector = __assign_irq_vector(irq);
1207 spin_unlock_irqrestore(&vector_lock, flags);
1208
1209 return vector;
1210}
1211
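/*
 * Nothing to do here: in this implementation vectors are assigned
 * globally by assign_irq_vector() rather than set up per CPU.
 */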
1212void setup_vector_irq(int cpu)
1213{
1214}
1215
1216static struct irq_chip ioapic_chip;
1217
1218#define IOAPIC_AUTO -1
1219#define IOAPIC_EDGE 0
1220#define IOAPIC_LEVEL 1
1221
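/*
 * Install the flow handler for an IO-APIC interrupt: level-triggered
 * (or auto-detected level) IRQs get the fasteoi handler, everything else
 * the edge handler; the vector's IDT gate is set in either case.
 */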
1222static void ioapic_register_intr(int irq, int vector, unsigned long trigger)
1223{
1224 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1225 trigger == IOAPIC_LEVEL) {
1226 irq_desc[irq].status |= IRQ_LEVEL;
1227 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1228 handle_fasteoi_irq, "fasteoi");
1229 } else {
1230 irq_desc[irq].status &= ~IRQ_LEVEL;
1231 set_irq_chip_and_handler_name(irq, &ioapic_chip,
1232 handle_edge_irq, "edge");
1233 }
1234 set_intr_gate(vector, interrupt[irq]);
1235}
1236
1237static void __init setup_IO_APIC_irqs(void)
1238{
1239 struct IO_APIC_route_entry entry;
1240 int apic, pin, idx, irq, first_notcon = 1, vector;
1241
1242 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1243
1244 for (apic = 0; apic < nr_ioapics; apic++) {
1245 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1246
1247 /*
1248 * add it to the IO-APIC irq-routing table:
1249 */
1250 memset(&entry, 0, sizeof(entry));
1251
1252 entry.delivery_mode = INT_DELIVERY_MODE;
1253 entry.dest_mode = INT_DEST_MODE;
1254 entry.mask = 0; /* enable IRQ */
1255 entry.dest.logical.logical_dest =
1256 cpu_mask_to_apicid(TARGET_CPUS);
1257
1258 idx = find_irq_entry(apic, pin, mp_INT);
1259 if (idx == -1) {
1260 if (first_notcon) {
1261 apic_printk(APIC_VERBOSE, KERN_DEBUG
1262 " IO-APIC (apicid-pin) %d-%d",
1263 mp_ioapics[apic].mp_apicid,
1264 pin);
1265 first_notcon = 0;
1266 } else
1267 apic_printk(APIC_VERBOSE, ", %d-%d",
1268 mp_ioapics[apic].mp_apicid, pin);
1269 continue;
1270 }
1271
1272 if (!first_notcon) {
1273 apic_printk(APIC_VERBOSE, " not connected.\n");
1274 first_notcon = 1;
1275 }
1276
1277 entry.trigger = irq_trigger(idx);
1278 entry.polarity = irq_polarity(idx);
1279
1280 if (irq_trigger(idx)) {
1281 entry.trigger = 1;
1282 entry.mask = 1;
1283 }
1284
1285 irq = pin_2_irq(idx, apic, pin);
1286 /*
1287 * skip adding the timer int on secondary nodes, which causes
1288 * a small but painful rift in the time-space continuum
1289 */
1290 if (multi_timer_check(apic, irq))
1291 continue;
1292 else
1293 add_pin_to_irq(irq, apic, pin);
1294
1295 if (!apic && !IO_APIC_IRQ(irq))
1296 continue;
1297
1298 if (IO_APIC_IRQ(irq)) {
1299 vector = assign_irq_vector(irq);
1300 entry.vector = vector;
1301 ioapic_register_intr(irq, vector, IOAPIC_AUTO);
1302
1303 if (!apic && (irq < 16))
1304 disable_8259A_irq(irq);
1305 }
1306 ioapic_write_entry(apic, pin, entry);
1307 }
1308 }
1309
1310 if (!first_notcon)
1311 apic_printk(APIC_VERBOSE, " not connected.\n");
1312}
1313
1314/*
1315 * Set up the timer pin, possibly with the 8259A-master behind.
1316 */
1317static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin,
1318 int vector)
1319{
1320 struct IO_APIC_route_entry entry;
1321
1322 memset(&entry, 0, sizeof(entry));
1323
1324 /*
1325 * We use logical delivery to get the timer IRQ
1326 * to the first CPU.
1327 */
1328 entry.dest_mode = INT_DEST_MODE;
1329 entry.mask = 1; /* mask IRQ now */
1330 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
1331 entry.delivery_mode = INT_DELIVERY_MODE;
1332 entry.polarity = 0;
1333 entry.trigger = 0;
1334 entry.vector = vector;
1335
1336 /*
1337 * The timer IRQ doesn't have to know that behind the
1338 * scenes we may have an 8259A-master in AEOI mode ...
1339 */
1340 ioapic_register_intr(0, vector, IOAPIC_EDGE);
1341
1342 /*
1343 * Add it to the IO-APIC irq-routing table:
1344 */
1345 ioapic_write_entry(apic, pin, entry);
1346}
1347
1348void __init print_IO_APIC(void)
1349{
1350 int apic, i;
1351 union IO_APIC_reg_00 reg_00;
1352 union IO_APIC_reg_01 reg_01;
1353 union IO_APIC_reg_02 reg_02;
1354 union IO_APIC_reg_03 reg_03;
1355 unsigned long flags;
1356
1357 if (apic_verbosity == APIC_QUIET)
1358 return;
1359
1360 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1361 for (i = 0; i < nr_ioapics; i++)
1362 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1363 mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]);
1364
1365 /*
1366 * We are a bit conservative about what we expect. We have to
1367 * know about every hardware change ASAP.
1368 */
1369 printk(KERN_INFO "testing the IO APIC.......................\n");
1370
1371 for (apic = 0; apic < nr_ioapics; apic++) {
1372
1373 spin_lock_irqsave(&ioapic_lock, flags);
1374 reg_00.raw = io_apic_read(apic, 0);
1375 reg_01.raw = io_apic_read(apic, 1);
1376 if (reg_01.bits.version >= 0x10)
1377 reg_02.raw = io_apic_read(apic, 2);
1378 if (reg_01.bits.version >= 0x20)
1379 reg_03.raw = io_apic_read(apic, 3);
1380 spin_unlock_irqrestore(&ioapic_lock, flags);
1381
1382 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid);
1383 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1384 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1385 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
1386 printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS);
1387
1388 printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw);
1389 printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries);
1390
1391 printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ);
1392 printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version);
1393
1394 /*
1395 * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02;
1396 * reading it returns the value of the previously read register,
1397 * so ignore it if reg_02 == reg_01.
1398 */
1399 if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) {
1400 printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw);
1401 printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration);
1402 }
1403
1404 /*
1405 * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02
1406 * or reg_03; reading them returns the value of the previously read
1407 * register, so ignore reg_03 if it equals reg_01 or reg_02.
1408 */
1409 if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw &&
1410 reg_03.raw != reg_01.raw) {
1411 printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw);
1412 printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT);
1413 }
1414
1415 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1416
1417 printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol"
1418 " Stat Dest Deli Vect: \n");
1419
1420 for (i = 0; i <= reg_01.bits.entries; i++) {
1421 struct IO_APIC_route_entry entry;
1422
1423 entry = ioapic_read_entry(apic, i);
1424
1425 printk(KERN_DEBUG " %02x %03X %02X ",
1426 i,
1427 entry.dest.logical.logical_dest,
1428 entry.dest.physical.physical_dest
1429 );
1430
1431 printk("%1d %1d %1d %1d %1d %1d %1d %02X\n",
1432 entry.mask,
1433 entry.trigger,
1434 entry.irr,
1435 entry.polarity,
1436 entry.delivery_status,
1437 entry.dest_mode,
1438 entry.delivery_mode,
1439 entry.vector
1440 );
1441 }
1442 }
1443 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1444 for (i = 0; i < NR_IRQS; i++) {
1445 struct irq_pin_list *entry = irq_2_pin + i;
1446 if (entry->pin < 0)
1447 continue;
1448 printk(KERN_DEBUG "IRQ%d ", i);
1449 for (;;) {
1450 printk("-> %d:%d", entry->apic, entry->pin);
1451 if (!entry->next)
1452 break;
1453 entry = irq_2_pin + entry->next;
1454 }
1455 printk("\n");
1456 }
1457
1458 printk(KERN_INFO ".................................... done.\n");
1459
1460 return;
1461}
1462
1463#if 0
1464
1465static void print_APIC_bitfield(int base)
1466{
1467 unsigned int v;
1468 int i, j;
1469
1470 if (apic_verbosity == APIC_QUIET)
1471 return;
1472
1473 printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG);
1474 for (i = 0; i < 8; i++) {
1475 v = apic_read(base + i*0x10);
1476 for (j = 0; j < 32; j++) {
1477 if (v & (1<<j))
1478 printk("1");
1479 else
1480 printk("0");
1481 }
1482 printk("\n");
1483 }
1484}
1485
1486void /*__init*/ print_local_APIC(void *dummy)
1487{
1488 unsigned int v, ver, maxlvt;
1489
1490 if (apic_verbosity == APIC_QUIET)
1491 return;
1492
1493 printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1494 smp_processor_id(), hard_smp_processor_id());
1495 v = apic_read(APIC_ID);
1496 printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v,
1497 GET_APIC_ID(read_apic_id()));
1498 v = apic_read(APIC_LVR);
1499 printk(KERN_INFO "... APIC VERSION: %08x\n", v);
1500 ver = GET_APIC_VERSION(v);
1501 maxlvt = lapic_get_maxlvt();
1502
1503 v = apic_read(APIC_TASKPRI);
1504 printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK);
1505
1506 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1507 v = apic_read(APIC_ARBPRI);
1508 printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v,
1509 v & APIC_ARBPRI_MASK);
1510 v = apic_read(APIC_PROCPRI);
1511 printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v);
1512 }
1513
1514 v = apic_read(APIC_EOI);
1515 printk(KERN_DEBUG "... APIC EOI: %08x\n", v);
1516 v = apic_read(APIC_RRR);
1517 printk(KERN_DEBUG "... APIC RRR: %08x\n", v);
1518 v = apic_read(APIC_LDR);
1519 printk(KERN_DEBUG "... APIC LDR: %08x\n", v);
1520 v = apic_read(APIC_DFR);
1521 printk(KERN_DEBUG "... APIC DFR: %08x\n", v);
1522 v = apic_read(APIC_SPIV);
1523 printk(KERN_DEBUG "... APIC SPIV: %08x\n", v);
1524
1525 printk(KERN_DEBUG "... APIC ISR field:\n");
1526 print_APIC_bitfield(APIC_ISR);
1527 printk(KERN_DEBUG "... APIC TMR field:\n");
1528 print_APIC_bitfield(APIC_TMR);
1529 printk(KERN_DEBUG "... APIC IRR field:\n");
1530 print_APIC_bitfield(APIC_IRR);
1531
1532 if (APIC_INTEGRATED(ver)) { /* !82489DX */
1533 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
1534 apic_write(APIC_ESR, 0);
1535 v = apic_read(APIC_ESR);
1536 printk(KERN_DEBUG "... APIC ESR: %08x\n", v);
1537 }
1538
1539 v = apic_read(APIC_ICR);
1540 printk(KERN_DEBUG "... APIC ICR: %08x\n", v);
1541 v = apic_read(APIC_ICR2);
1542 printk(KERN_DEBUG "... APIC ICR2: %08x\n", v);
1543
1544 v = apic_read(APIC_LVTT);
1545 printk(KERN_DEBUG "... APIC LVTT: %08x\n", v);
1546
1547 if (maxlvt > 3) { /* PC is LVT#4. */
1548 v = apic_read(APIC_LVTPC);
1549 printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v);
1550 }
1551 v = apic_read(APIC_LVT0);
1552 printk(KERN_DEBUG "... APIC LVT0: %08x\n", v);
1553 v = apic_read(APIC_LVT1);
1554 printk(KERN_DEBUG "... APIC LVT1: %08x\n", v);
1555
1556 if (maxlvt > 2) { /* ERR is LVT#3. */
1557 v = apic_read(APIC_LVTERR);
1558 printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v);
1559 }
1560
1561 v = apic_read(APIC_TMICT);
1562 printk(KERN_DEBUG "... APIC TMICT: %08x\n", v);
1563 v = apic_read(APIC_TMCCT);
1564 printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
1565 v = apic_read(APIC_TDCR);
1566 printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
1567 printk("\n");
1568}
1569
1570void print_all_local_APICs(void)
1571{
1572 on_each_cpu(print_local_APIC, NULL, 1);
1573}
1574
1575void /*__init*/ print_PIC(void)
1576{
1577 unsigned int v;
1578 unsigned long flags;
1579
1580 if (apic_verbosity == APIC_QUIET)
1581 return;
1582
1583 printk(KERN_DEBUG "\nprinting PIC contents\n");
1584
1585 spin_lock_irqsave(&i8259A_lock, flags);
1586
1587 v = inb(0xa1) << 8 | inb(0x21);
1588 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
1589
1590 v = inb(0xa0) << 8 | inb(0x20);
1591 printk(KERN_DEBUG "... PIC IRR: %04x\n", v);
1592
1593 outb(0x0b, 0xa0);
1594 outb(0x0b, 0x20);
1595 v = inb(0xa0) << 8 | inb(0x20);
1596 outb(0x0a, 0xa0);
1597 outb(0x0a, 0x20);
1598
1599 spin_unlock_irqrestore(&i8259A_lock, flags);
1600
1601 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1602
1603 v = inb(0x4d1) << 8 | inb(0x4d0);
1604 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1605}
1606
1607#endif /* 0 */
1608
1609static void __init enable_IO_APIC(void)
1610{
1611 union IO_APIC_reg_01 reg_01;
1612 int i8259_apic, i8259_pin;
1613 int i, apic;
1614 unsigned long flags;
1615
1616 for (i = 0; i < PIN_MAP_SIZE; i++) {
1617 irq_2_pin[i].pin = -1;
1618 irq_2_pin[i].next = 0;
1619 }
1620 if (!pirqs_enabled)
1621 for (i = 0; i < MAX_PIRQS; i++)
1622 pirq_entries[i] = -1;
1623
1624 /*
1625 * The number of IO-APIC IRQ registers (== #pins):
1626 */
1627 for (apic = 0; apic < nr_ioapics; apic++) {
1628 spin_lock_irqsave(&ioapic_lock, flags);
1629 reg_01.raw = io_apic_read(apic, 1);
1630 spin_unlock_irqrestore(&ioapic_lock, flags);
1631 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1632 }
1633 for (apic = 0; apic < nr_ioapics; apic++) {
1634 int pin;
1635 /* See if any of the pins is in ExtINT mode */
1636 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
1637 struct IO_APIC_route_entry entry;
1638 entry = ioapic_read_entry(apic, pin);
1639
1640
1641 /* If the interrupt line is enabled and in ExtInt mode,
1642 * we have found the pin where the i8259 is connected.
1643 */
1644 if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) {
1645 ioapic_i8259.apic = apic;
1646 ioapic_i8259.pin = pin;
1647 goto found_i8259;
1648 }
1649 }
1650 }
1651 found_i8259:
1652 /* Look to see whether the MP table has reported the ExtINT */
1653 /* If we could not find the appropriate pin by looking at the ioapic,
1654 * the i8259 is probably not connected to the ioapic, but give the
1655 * mptable a chance anyway.
1656 */
1657 i8259_pin = find_isa_irq_pin(0, mp_ExtINT);
1658 i8259_apic = find_isa_irq_apic(0, mp_ExtINT);
1659 /* Trust the MP table if nothing is set up in the hardware */
1660 if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) {
1661 printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n");
1662 ioapic_i8259.pin = i8259_pin;
1663 ioapic_i8259.apic = i8259_apic;
1664 }
1665 /* Complain if the MP table and the hardware disagree */
1666 if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) &&
1667 (i8259_pin >= 0) && (ioapic_i8259.pin >= 0))
1668 {
1669 printk(KERN_WARNING "ExtINT in hardware and MP table differ\n");
1670 }
1671
1672 /*
1673 * Do not trust the IO-APIC being empty at bootup
1674 */
1675 clear_IO_APIC();
1676}
1677
1678/*
1679 * Not an __init, needed by the reboot code
1680 */
1681void disable_IO_APIC(void)
1682{
1683 /*
1684 * Clear the IO-APIC before rebooting:
1685 */
1686 clear_IO_APIC();
1687
1688 /*
1689 * If the i8259 is routed through an IOAPIC
1690 * Put that IOAPIC in virtual wire mode
1691 * so legacy interrupts can be delivered.
1692 */
1693 if (ioapic_i8259.pin != -1) {
1694 struct IO_APIC_route_entry entry;
1695
1696 memset(&entry, 0, sizeof(entry));
1697 entry.mask = 0; /* Enabled */
1698 entry.trigger = 0; /* Edge */
1699 entry.irr = 0;
1700 entry.polarity = 0; /* High */
1701 entry.delivery_status = 0;
1702 entry.dest_mode = 0; /* Physical */
1703 entry.delivery_mode = dest_ExtINT; /* ExtInt */
1704 entry.vector = 0;
1705 entry.dest.physical.physical_dest =
1706 GET_APIC_ID(read_apic_id());
1707
1708 /*
1709 * Add it to the IO-APIC irq-routing table:
1710 */
1711 ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry);
1712 }
1713 disconnect_bsp_APIC(ioapic_i8259.pin != -1);
1714}
1715
1716/*
1717 * function to set the IO-APIC physical IDs based on the
1718 * values stored in the MPC table.
1719 *
1720 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
1721 */
1722
1723static void __init setup_ioapic_ids_from_mpc(void)
1724{
1725 union IO_APIC_reg_00 reg_00;
1726 physid_mask_t phys_id_present_map;
1727 int apic;
1728 int i;
1729 unsigned char old_id;
1730 unsigned long flags;
1731
1732#ifdef CONFIG_X86_NUMAQ
1733 if (found_numaq)
1734 return;
1735#endif
1736
1737 /*
1738 * Don't check I/O APIC IDs for xAPIC systems. They have
1739 * no meaning without the serial APIC bus.
1740 */
1741 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
1742 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
1743 return;
1744 /*
1745 * This is broken; anything with a real cpu count has to
1746 * circumvent this idiocy regardless.
1747 */
1748 phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map);
1749
1750 /*
1751 * Set the IOAPIC ID to the value stored in the MPC table.
1752 */
1753 for (apic = 0; apic < nr_ioapics; apic++) {
1754
1755 /* Read the register 0 value */
1756 spin_lock_irqsave(&ioapic_lock, flags);
1757 reg_00.raw = io_apic_read(apic, 0);
1758 spin_unlock_irqrestore(&ioapic_lock, flags);
1759
1760 old_id = mp_ioapics[apic].mp_apicid;
1761
1762 if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) {
1763 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1764 apic, mp_ioapics[apic].mp_apicid);
1765 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1766 reg_00.bits.ID);
1767 mp_ioapics[apic].mp_apicid = reg_00.bits.ID;
1768 }
1769
1770 /*
1771 * Sanity check, is the ID really free? Every APIC in a
1772 * system must have a unique ID or we get lots of nice
1773 * 'stuck on smp_invalidate_needed IPI wait' messages.
1774 */
1775 if (check_apicid_used(phys_id_present_map,
1776 mp_ioapics[apic].mp_apicid)) {
1777 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1778 apic, mp_ioapics[apic].mp_apicid);
1779 for (i = 0; i < get_physical_broadcast(); i++)
1780 if (!physid_isset(i, phys_id_present_map))
1781 break;
1782 if (i >= get_physical_broadcast())
1783 panic("Max APIC ID exceeded!\n");
1784 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1785 i);
1786 physid_set(i, phys_id_present_map);
1787 mp_ioapics[apic].mp_apicid = i;
1788 } else {
1789 physid_mask_t tmp;
1790 tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid);
1791 apic_printk(APIC_VERBOSE, "Setting %d in the "
1792 "phys_id_present_map\n",
1793 mp_ioapics[apic].mp_apicid);
1794 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1795 }
1796
1797
1798 /*
1799 * We need to adjust the IRQ routing table
1800 * if the ID changed.
1801 */
1802 if (old_id != mp_ioapics[apic].mp_apicid)
1803 for (i = 0; i < mp_irq_entries; i++)
1804 if (mp_irqs[i].mp_dstapic == old_id)
1805 mp_irqs[i].mp_dstapic
1806 = mp_ioapics[apic].mp_apicid;
1807
1808 /*
1809 * Read the right value from the MPC table and
1810 * write it into the ID register.
1811 */
1812 apic_printk(APIC_VERBOSE, KERN_INFO
1813 "...changing IO-APIC physical APIC ID to %d ...",
1814 mp_ioapics[apic].mp_apicid);
1815
1816 reg_00.bits.ID = mp_ioapics[apic].mp_apicid;
1817 spin_lock_irqsave(&ioapic_lock, flags);
1818 io_apic_write(apic, 0, reg_00.raw);
1819 spin_unlock_irqrestore(&ioapic_lock, flags);
1820
1821 /*
1822 * Sanity check
1823 */
1824 spin_lock_irqsave(&ioapic_lock, flags);
1825 reg_00.raw = io_apic_read(apic, 0);
1826 spin_unlock_irqrestore(&ioapic_lock, flags);
1827 if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid)
1828 printk("could not set ID!\n");
1829 else
1830 apic_printk(APIC_VERBOSE, " ok.\n");
1831 }
1832}
1833
1834int no_timer_check __initdata;
1835
1836static int __init notimercheck(char *s)
1837{
1838 no_timer_check = 1;
1839 return 1;
1840}
1841__setup("no_timer_check", notimercheck);
1842
1843/*
1844 * There is a nasty bug in some older SMP boards: their mptable lies
1845 * about the timer IRQ. We do the following to work around the situation:
1846 *
1847 * - timer IRQ defaults to IO-APIC IRQ
1848 * - if this function detects that timer IRQs are defunct, then we fall
1849 * back to ISA timer IRQs
1850 */
1851static int __init timer_irq_works(void)
1852{
1853 unsigned long t1 = jiffies;
1854 unsigned long flags;
1855
1856 if (no_timer_check)
1857 return 1;
1858
1859 local_save_flags(flags);
1860 local_irq_enable();
1861 /* Let ten ticks pass... */
1862 mdelay((10 * 1000) / HZ);
1863 local_irq_restore(flags);
1864
1865 /*
1866 * Expect a few ticks at least, to be sure some possible
1867 * glue logic does not lock up after one or two first
1868 * ticks in a non-ExtINT mode. Also the local APIC
1869 * might have cached one ExtINT interrupt. Finally, at
1870 * least one tick may be lost due to delays.
1871 */
1872 if (time_after(jiffies, t1 + 4))
1873 return 1;
1874
1875 return 0;
1876}
1877
1878/*
1879 * In the SMP+IOAPIC case it might happen that there are an unspecified
1880 * number of pending IRQ events unhandled. These cases are very rare,
1881 * so we 'resend' these IRQs via IPIs, to the same CPU. It's much
1882 * better to do it this way as thus we do not have to be aware of
1883 * 'pending' interrupts in the IRQ path, except at this point.
1884 */
1885/*
1886 * Edge triggered needs to resend any interrupt
1887 * that was delayed but this is now handled in the device
1888 * independent code.
1889 */
1890
1891/*
1892 * Startup quirk:
1893 *
1894 * Starting up an edge-triggered IO-APIC interrupt is
1895 * nasty - we need to make sure that we get the edge.
1896 * If it is already asserted for some reason, we need to
1897 * return 1 to indicate that it was pending.
1898 *
1899 * This is not complete - we should be able to fake
1900 * an edge even if it isn't on the 8259A...
1901 *
1902 * (We do this for level-triggered IRQs too - it cannot hurt.)
1903 */
1904static unsigned int startup_ioapic_irq(unsigned int irq)
1905{
1906 int was_pending = 0;
1907 unsigned long flags;
1908
1909 spin_lock_irqsave(&ioapic_lock, flags);
1910 if (irq < 16) {
1911 disable_8259A_irq(irq);
1912 if (i8259A_irq_pending(irq))
1913 was_pending = 1;
1914 }
1915 __unmask_IO_APIC_irq(irq);
1916 spin_unlock_irqrestore(&ioapic_lock, flags);
1917
1918 return was_pending;
1919}
1920
1921static void ack_ioapic_irq(unsigned int irq)
1922{
1923 move_native_irq(irq);
1924 ack_APIC_irq();
1925}
1926
1927static void ack_ioapic_quirk_irq(unsigned int irq)
1928{
1929 unsigned long v;
1930 int i;
1931
1932 move_native_irq(irq);
1933/*
1934 * It appears there is an erratum which affects at least version 0x11
1935 * of I/O APIC (that's the 82093AA and cores integrated into various
1936 * chipsets). Under certain conditions a level-triggered interrupt is
1937 * erroneously delivered as edge-triggered one but the respective IRR
1938 * bit gets set nevertheless. As a result the I/O unit expects an EOI
1939 * message but it will never arrive and further interrupts are blocked
1940 * from the source. The exact reason is so far unknown, but the
1941 * phenomenon was observed when two consecutive interrupt requests
1942 * from a given source get delivered to the same CPU and the source is
1943 * temporarily disabled in between.
1944 *
1945 * A workaround is to simulate an EOI message manually. We achieve it
1946 * by setting the trigger mode to edge and then to level when the edge
1947 * trigger mode gets detected in the TMR of a local APIC for a
1948 * level-triggered interrupt. We mask the source for the time of the
1949 * operation to prevent an edge-triggered interrupt escaping meanwhile.
1950 * The idea is from Manfred Spraul. --macro
1951 */
1952 i = irq_vector[irq];
1953
1954 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
1955
1956 ack_APIC_irq();
1957
1958 if (!(v & (1 << (i & 0x1f)))) {
1959 atomic_inc(&irq_mis_count);
1960 spin_lock(&ioapic_lock);
1961 __mask_and_edge_IO_APIC_irq(irq);
1962 __unmask_and_level_IO_APIC_irq(irq);
1963 spin_unlock(&ioapic_lock);
1964 }
1965}
1966
1967static int ioapic_retrigger_irq(unsigned int irq)
1968{
1969 send_IPI_self(irq_vector[irq]);
1970
1971 return 1;
1972}
1973
1974static struct irq_chip ioapic_chip __read_mostly = {
1975 .name = "IO-APIC",
1976 .startup = startup_ioapic_irq,
1977 .mask = mask_IO_APIC_irq,
1978 .unmask = unmask_IO_APIC_irq,
1979 .ack = ack_ioapic_irq,
1980 .eoi = ack_ioapic_quirk_irq,
1981#ifdef CONFIG_SMP
1982 .set_affinity = set_ioapic_affinity_irq,
1983#endif
1984 .retrigger = ioapic_retrigger_irq,
1985};
1986
1987
1988static inline void init_IO_APIC_traps(void)
1989{
1990 int irq;
1991
1992 /*
1993 * NOTE! The local APIC isn't very good at handling
1994 * multiple interrupts at the same interrupt level.
1995 * As the interrupt level is determined by taking the
1996 * vector number and shifting that right by 4, we
1997 * want to spread these out a bit so that they don't
1998 * all fall in the same interrupt level.
1999 *
2000 * Also, we've got to be careful not to trash gate
2001 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2002 */
2003 for (irq = 0; irq < NR_IRQS ; irq++) {
2004 if (IO_APIC_IRQ(irq) && !irq_vector[irq]) {
2005 /*
2006 * Hmm.. We don't have an entry for this,
2007 * so default to an old-fashioned 8259
2008 * interrupt if we can..
2009 */
2010 if (irq < 16)
2011 make_8259A_irq(irq);
2012 else
2013 /* Strange. Oh, well.. */
2014 irq_desc[irq].chip = &no_irq_chip;
2015 }
2016 }
2017}
2018
2019/*
2020 * The local APIC irq-chip implementation:
2021 */
2022
2023static void ack_lapic_irq(unsigned int irq)
2024{
2025 ack_APIC_irq();
2026}
2027
2028static void mask_lapic_irq(unsigned int irq)
2029{
2030 unsigned long v;
2031
2032 v = apic_read(APIC_LVT0);
2033 apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED);
2034}
2035
2036static void unmask_lapic_irq(unsigned int irq)
2037{
2038 unsigned long v;
2039
2040 v = apic_read(APIC_LVT0);
2041 apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED);
2042}
2043
2044static struct irq_chip lapic_chip __read_mostly = {
2045 .name = "local-APIC",
2046 .mask = mask_lapic_irq,
2047 .unmask = unmask_lapic_irq,
2048 .ack = ack_lapic_irq,
2049};
2050
2051static void lapic_register_intr(int irq, int vector)
2052{
2053 irq_desc[irq].status &= ~IRQ_LEVEL;
2054 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2055 "edge");
2056 set_intr_gate(vector, interrupt[irq]);
2057}
2058
2059static void __init setup_nmi(void)
2060{
2061 /*
2062 * Dirty trick to enable the NMI watchdog ...
2063 * We put the 8259A master into AEOI mode and
2064 * unmask LVT0 on all local APICs as NMI.
2065 *
2066 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2067 * is from Maciej W. Rozycki - so we do not have to EOI from
2068 * the NMI handler or the timer interrupt.
2069 */
2070 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2071
2072 enable_NMI_through_LVT0();
2073
2074 apic_printk(APIC_VERBOSE, " done.\n");
2075}
2076
2077/*
2078 * This looks a bit hackish but it's about the only way of sending
2079 * a few INTA cycles to 8259As and any associated glue logic. ICR does
2080 * not support the ExtINT mode, unfortunately. We need to send these
2081 * cycles as some i82489DX-based boards have glue logic that keeps the
2082 * 8259A interrupt line asserted until INTA. --macro
2083 */
2084static inline void __init unlock_ExtINT_logic(void)
2085{
2086 int apic, pin, i;
2087 struct IO_APIC_route_entry entry0, entry1;
2088 unsigned char save_control, save_freq_select;
2089
2090 pin = find_isa_irq_pin(8, mp_INT);
2091 if (pin == -1) {
2092 WARN_ON_ONCE(1);
2093 return;
2094 }
2095 apic = find_isa_irq_apic(8, mp_INT);
2096 if (apic == -1) {
2097 WARN_ON_ONCE(1);
2098 return;
2099 }
2100
2101 entry0 = ioapic_read_entry(apic, pin);
2102 clear_IO_APIC_pin(apic, pin);
2103
2104 memset(&entry1, 0, sizeof(entry1));
2105
2106 entry1.dest_mode = 0; /* physical delivery */
2107 entry1.mask = 0; /* unmask IRQ now */
2108 entry1.dest.physical.physical_dest = hard_smp_processor_id();
2109 entry1.delivery_mode = dest_ExtINT;
2110 entry1.polarity = entry0.polarity;
2111 entry1.trigger = 0;
2112 entry1.vector = 0;
2113
2114 ioapic_write_entry(apic, pin, entry1);
2115
2116 save_control = CMOS_READ(RTC_CONTROL);
2117 save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
2118 CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6,
2119 RTC_FREQ_SELECT);
2120 CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL);
2121
2122 i = 100;
2123 while (i-- > 0) {
2124 mdelay(10);
2125 if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF)
2126 i -= 10;
2127 }
2128
2129 CMOS_WRITE(save_control, RTC_CONTROL);
2130 CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
2131 clear_IO_APIC_pin(apic, pin);
2132
2133 ioapic_write_entry(apic, pin, entry0);
2134}
2135
2136/*
2137 * This code may look a bit paranoid, but it's supposed to cooperate with
2138 * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ
2139 * is so screwy. Thanks to Brian Perkins for testing/hacking this beast
2140 * fanatically on his truly buggy board.
2141 */
2142static inline void __init check_timer(void)
2143{
2144 int apic1, pin1, apic2, pin2;
2145 int no_pin1 = 0;
2146 int vector;
2147 unsigned int ver;
2148 unsigned long flags;
2149
2150 local_irq_save(flags);
2151
2152 ver = apic_read(APIC_LVR);
2153 ver = GET_APIC_VERSION(ver);
2154
2155 /*
2156 * get/set the timer IRQ vector:
2157 */
2158 disable_8259A_irq(0);
2159 vector = assign_irq_vector(0);
2160 set_intr_gate(vector, interrupt[0]);
2161
2162 /*
2163 * As IRQ0 is to be enabled in the 8259A, the virtual
2164 * wire has to be disabled in the local APIC. Also
2165 * timer interrupts need to be acknowledged manually in
2166 * the 8259A for the i82489DX when using the NMI
2167 * watchdog as that APIC treats NMIs as level-triggered.
2168 * The AEOI mode will finish them in the 8259A
2169 * automatically.
2170 */
2171 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2172 init_8259A(1);
2173 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2174
2175 pin1 = find_isa_irq_pin(0, mp_INT);
2176 apic1 = find_isa_irq_apic(0, mp_INT);
2177 pin2 = ioapic_i8259.pin;
2178 apic2 = ioapic_i8259.apic;
2179
2180 printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n",
2181 vector, apic1, pin1, apic2, pin2);
2182
2183 /*
2184 * Some BIOS writers are clueless and report the ExtINTA
2185 * I/O APIC input from the cascaded 8259A as the timer
2186 * interrupt input. So just in case, if only one pin
2187 * was found above, try it both directly and through the
2188 * 8259A.
2189 */
2190 if (pin1 == -1) {
2191 pin1 = pin2;
2192 apic1 = apic2;
2193 no_pin1 = 1;
2194 } else if (pin2 == -1) {
2195 pin2 = pin1;
2196 apic2 = apic1;
2197 }
2198
2199 if (pin1 != -1) {
2200 /*
2201 * Ok, does IRQ0 through the IOAPIC work?
2202 */
2203 if (no_pin1) {
2204 add_pin_to_irq(0, apic1, pin1);
2205 setup_timer_IRQ0_pin(apic1, pin1, vector);
2206 }
2207 unmask_IO_APIC_irq(0);
2208 if (timer_irq_works()) {
2209 if (nmi_watchdog == NMI_IO_APIC) {
2210 setup_nmi();
2211 enable_8259A_irq(0);
2212 }
2213 if (disable_timer_pin_1 > 0)
2214 clear_IO_APIC_pin(0, pin1);
2215 goto out;
2216 }
2217 clear_IO_APIC_pin(apic1, pin1);
2218 if (!no_pin1)
2219 printk(KERN_ERR "..MP-BIOS bug: "
2220 "8254 timer not connected to IO-APIC\n");
2221
2222 printk(KERN_INFO "...trying to set up timer (IRQ0) "
2223 "through the 8259A ... ");
2224 printk("\n..... (found pin %d) ...", pin2);
2225 /*
2226 * legacy devices should be connected to IO APIC #0
2227 */
2228 replace_pin_at_irq(0, apic1, pin1, apic2, pin2);
2229 setup_timer_IRQ0_pin(apic2, pin2, vector);
2230 unmask_IO_APIC_irq(0);
2231 enable_8259A_irq(0);
2232 if (timer_irq_works()) {
2233 printk("works.\n");
2234 timer_through_8259 = 1;
2235 if (nmi_watchdog == NMI_IO_APIC) {
2236 disable_8259A_irq(0);
2237 setup_nmi();
2238 enable_8259A_irq(0);
2239 }
2240 goto out;
2241 }
2242 /*
2243 * Cleanup, just in case ...
2244 */
2245 disable_8259A_irq(0);
2246 clear_IO_APIC_pin(apic2, pin2);
2247 printk(" failed.\n");
2248 }
2249
2250 if (nmi_watchdog == NMI_IO_APIC) {
2251 printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n");
2252 nmi_watchdog = NMI_NONE;
2253 }
2254 timer_ack = 0;
2255
2256 printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ...");
2257
2258 lapic_register_intr(0, vector);
2259 apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */
2260 enable_8259A_irq(0);
2261
2262 if (timer_irq_works()) {
2263 printk(" works.\n");
2264 goto out;
2265 }
2266 disable_8259A_irq(0);
2267 apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector);
2268 printk(" failed.\n");
2269
2270 printk(KERN_INFO "...trying to set up timer as ExtINT IRQ...");
2271
2272 init_8259A(0);
2273 make_8259A_irq(0);
2274 apic_write_around(APIC_LVT0, APIC_DM_EXTINT);
2275
2276 unlock_ExtINT_logic();
2277
2278 if (timer_irq_works()) {
2279 printk(" works.\n");
2280 goto out;
2281 }
2282 printk(" failed :(.\n");
2283 panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a "
2284 "report. Then try booting with the 'noapic' option");
2285out:
2286 local_irq_restore(flags);
2287}
2288
2289/*
2290 * Traditionally ISA IRQ2 is the cascade IRQ, and is not available
2291 * to devices. However there may be an I/O APIC pin available for
2292 * this interrupt regardless. The pin may be left unconnected, but
2293 * typically it will be reused as an ExtINT cascade interrupt for
2294 * the master 8259A. In the MPS case such a pin will normally be
2295 * reported as an ExtINT interrupt in the MP table. With ACPI
2296 * there is no provision for ExtINT interrupts, and in the absence
2297 * of an override it would be treated as an ordinary ISA I/O APIC
2298 * interrupt, that is edge-triggered and unmasked by default. We
2299 * used to do this, but it caused problems on some systems because
2300 * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using
2301 * the same ExtINT cascade interrupt to drive the local APIC of the
2302 * bootstrap processor. Therefore we refrain from routing IRQ2 to
2303 * the I/O APIC in all cases now. No actual device should request
2304 * it anyway. --macro
2305 */
2306#define PIC_IRQS (1 << PIC_CASCADE_IR)
2307
2308void __init setup_IO_APIC(void)
2309{
2310 int i;
2311
2312 /* Reserve all the system vectors. */
2313 for (i = first_system_vector; i < NR_VECTORS; i++)
2314 set_bit(i, used_vectors);
2315
2316 enable_IO_APIC();
2317
2318 io_apic_irqs = ~PIC_IRQS;
2319
2320 printk("ENABLING IO-APIC IRQs\n");
2321
2322 /*
2323 * Set up IO-APIC IRQ routing.
2324 */
2325 if (!acpi_ioapic)
2326 setup_ioapic_ids_from_mpc();
2327 sync_Arb_IDs();
2328 setup_IO_APIC_irqs();
2329 init_IO_APIC_traps();
2330 check_timer();
2331 if (!acpi_ioapic)
2332 print_IO_APIC();
2333}
2334
2335/*
2336 * Called after all the initialization is done. If we didn't find any
2337 * APIC bugs then we can allow the modify fast path.
2338 */
2339
2340static int __init io_apic_bug_finalize(void)
2341{
2342 if (sis_apic_bug == -1)
2343 sis_apic_bug = 0;
2344 return 0;
2345}
2346
2347late_initcall(io_apic_bug_finalize);
2348
2349struct sysfs_ioapic_data {
2350 struct sys_device dev;
2351 struct IO_APIC_route_entry entry[0];
2352};
2353static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS];
2354
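/*
 * Suspend/resume support: save every redirection table entry of an
 * IO-APIC at suspend time so that resume can restore the APIC ID and
 * rewrite the entries.
 */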
2355static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
2356{
2357 struct IO_APIC_route_entry *entry;
2358 struct sysfs_ioapic_data *data;
2359 int i;
2360
2361 data = container_of(dev, struct sysfs_ioapic_data, dev);
2362 entry = data->entry;
2363 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2364 entry[i] = ioapic_read_entry(dev->id, i);
2365
2366 return 0;
2367}
2368
2369static int ioapic_resume(struct sys_device *dev)
2370{
2371 struct IO_APIC_route_entry *entry;
2372 struct sysfs_ioapic_data *data;
2373 unsigned long flags;
2374 union IO_APIC_reg_00 reg_00;
2375 int i;
2376
2377 data = container_of(dev, struct sysfs_ioapic_data, dev);
2378 entry = data->entry;
2379
2380 spin_lock_irqsave(&ioapic_lock, flags);
2381 reg_00.raw = io_apic_read(dev->id, 0);
2382 if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) {
2383 reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid;
2384 io_apic_write(dev->id, 0, reg_00.raw);
2385 }
2386 spin_unlock_irqrestore(&ioapic_lock, flags);
2387 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
2388 ioapic_write_entry(dev->id, i, entry[i]);
2389
2390 return 0;
2391}
2392
2393static struct sysdev_class ioapic_sysdev_class = {
2394 .name = "ioapic",
2395 .suspend = ioapic_suspend,
2396 .resume = ioapic_resume,
2397};
2398
2399static int __init ioapic_init_sysfs(void)
2400{
2401 struct sys_device *dev;
2402 int i, size, error = 0;
2403
2404 error = sysdev_class_register(&ioapic_sysdev_class);
2405 if (error)
2406 return error;
2407
2408 for (i = 0; i < nr_ioapics; i++) {
2409 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
2410 * sizeof(struct IO_APIC_route_entry);
2411 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
2412 if (!mp_ioapic_data[i]) {
2413 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2414 continue;
2415 }
2416 dev = &mp_ioapic_data[i]->dev;
2417 dev->id = i;
2418 dev->cls = &ioapic_sysdev_class;
2419 error = sysdev_register(dev);
2420 if (error) {
2421 kfree(mp_ioapic_data[i]);
2422 mp_ioapic_data[i] = NULL;
2423 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
2424 continue;
2425 }
2426 }
2427
2428 return 0;
2429}
2430
2431device_initcall(ioapic_init_sysfs);
2432
2433/*
2434 * Dynamic IRQ allocation and deallocation
2435 */
2436int create_irq(void)
2437{
2438 /* Allocate an unused irq */
2439 int irq, new, vector = 0;
2440 unsigned long flags;
2441
2442 irq = -ENOSPC;
2443 spin_lock_irqsave(&vector_lock, flags);
2444 for (new = (NR_IRQS - 1); new >= 0; new--) {
2445 if (platform_legacy_irq(new))
2446 continue;
2447 if (irq_vector[new] != 0)
2448 continue;
2449 vector = __assign_irq_vector(new);
2450 if (likely(vector > 0))
2451 irq = new;
2452 break;
2453 }
2454 spin_unlock_irqrestore(&vector_lock, flags);
2455
2456 if (irq >= 0) {
2457 set_intr_gate(vector, interrupt[irq]);
2458 dynamic_irq_init(irq);
2459 }
2460 return irq;
2461}
2462
2463void destroy_irq(unsigned int irq)
2464{
2465 unsigned long flags;
2466
2467 dynamic_irq_cleanup(irq);
2468
2469 spin_lock_irqsave(&vector_lock, flags);
2470 clear_bit(irq_vector[irq], used_vectors);
2471 irq_vector[irq] = 0;
2472 spin_unlock_irqrestore(&vector_lock, flags);
2473}
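create_irq() and destroy_irq() are the arch-level hooks that dynamic users such as the MSI and HT code below build on. As a rough illustration of how a caller consumes the pair — my_handler, my_setup, my_teardown and "my_dev" are hypothetical names, not part of this patch:

#include <linux/interrupt.h>

static irqreturn_t my_handler(int irq, void *dev_id)
{
        /* acknowledge the (hypothetical) device here */
        return IRQ_HANDLED;
}

static int my_setup(void *my_dev)
{
        int irq = create_irq();         /* -ENOSPC if no free vector */

        if (irq < 0)
                return irq;

        if (request_irq(irq, my_handler, 0, "my_dev", my_dev)) {
                destroy_irq(irq);
                return -EBUSY;
        }
        return irq;
}

static void my_teardown(int irq, void *my_dev)
{
        free_irq(irq, my_dev);
        destroy_irq(irq);
}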
2474
2475/*
2476 * MSI message composition
2477 */
2478#ifdef CONFIG_PCI_MSI
2479static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg)
2480{
2481 int vector;
2482 unsigned dest;
2483
2484 vector = assign_irq_vector(irq);
2485 if (vector >= 0) {
2486 dest = cpu_mask_to_apicid(TARGET_CPUS);
2487
2488 msg->address_hi = MSI_ADDR_BASE_HI;
2489 msg->address_lo =
2490 MSI_ADDR_BASE_LO |
2491 ((INT_DEST_MODE == 0) ?
2492                        MSI_ADDR_DEST_MODE_PHYSICAL :
2493 MSI_ADDR_DEST_MODE_LOGICAL) |
2494 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2495 MSI_ADDR_REDIRECTION_CPU:
2496 MSI_ADDR_REDIRECTION_LOWPRI) |
2497 MSI_ADDR_DEST_ID(dest);
2498
2499 msg->data =
2500 MSI_DATA_TRIGGER_EDGE |
2501 MSI_DATA_LEVEL_ASSERT |
2502 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2503                        MSI_DATA_DELIVERY_FIXED :
2504 MSI_DATA_DELIVERY_LOWPRI) |
2505 MSI_DATA_VECTOR(vector);
2506 }
2507 return vector;
2508}
2509
2510#ifdef CONFIG_SMP
2511static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask)
2512{
2513 struct msi_msg msg;
2514 unsigned int dest;
2515 cpumask_t tmp;
2516 int vector;
2517
2518 cpus_and(tmp, mask, cpu_online_map);
2519 if (cpus_empty(tmp))
2520 tmp = TARGET_CPUS;
2521
2522 vector = assign_irq_vector(irq);
2523 if (vector < 0)
2524 return;
2525
2526 dest = cpu_mask_to_apicid(mask);
2527
2528 read_msi_msg(irq, &msg);
2529
2530 msg.data &= ~MSI_DATA_VECTOR_MASK;
2531 msg.data |= MSI_DATA_VECTOR(vector);
2532 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
2533 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
2534
2535 write_msi_msg(irq, &msg);
2536 irq_desc[irq].affinity = mask;
2537}
2538#endif /* CONFIG_SMP */
2539
2540/*
2541 * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices,
2542 * which implement the MSI or MSI-X Capability Structure.
2543 */
2544static struct irq_chip msi_chip = {
2545 .name = "PCI-MSI",
2546 .unmask = unmask_msi_irq,
2547 .mask = mask_msi_irq,
2548 .ack = ack_ioapic_irq,
2549#ifdef CONFIG_SMP
2550 .set_affinity = set_msi_irq_affinity,
2551#endif
2552 .retrigger = ioapic_retrigger_irq,
2553};
2554
2555int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc)
2556{
2557 struct msi_msg msg;
2558 int irq, ret;
2559 irq = create_irq();
2560 if (irq < 0)
2561 return irq;
2562
2563 ret = msi_compose_msg(dev, irq, &msg);
2564 if (ret < 0) {
2565 destroy_irq(irq);
2566 return ret;
2567 }
2568
2569 set_irq_msi(irq, desc);
2570 write_msi_msg(irq, &msg);
2571
2572 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq,
2573 "edge");
2574
2575 return 0;
2576}
2577
2578void arch_teardown_msi_irq(unsigned int irq)
2579{
2580 destroy_irq(irq);
2581}
2582
2583#endif /* CONFIG_PCI_MSI */
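To place arch_setup_msi_irq() in context: a PCI driver never calls it directly. It asks the MSI core for a vector via pci_enable_msi(), and the core calls back into the arch hook above to allocate the IRQ and compose the message. A hypothetical sketch of the driver side (my_pci_probe and my_handler are illustrative names):

#include <linux/pci.h>
#include <linux/interrupt.h>

static irqreturn_t my_handler(int irq, void *dev_id);   /* as in the earlier sketch */

static int my_pci_probe(struct pci_dev *pdev)
{
        int err;

        err = pci_enable_msi(pdev);     /* ends up in arch_setup_msi_irq() */
        if (err)
                return err;             /* caller may fall back to legacy INTx */

        /* pdev->irq now refers to the freshly created MSI interrupt */
        err = request_irq(pdev->irq, my_handler, 0, "my_pci_dev", pdev);
        if (err)
                pci_disable_msi(pdev);
        return err;
}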
2584
2585/*
2586 * Hypertransport interrupt support
2587 */
2588#ifdef CONFIG_HT_IRQ
2589
2590#ifdef CONFIG_SMP
2591
2592static void target_ht_irq(unsigned int irq, unsigned int dest)
2593{
2594 struct ht_irq_msg msg;
2595 fetch_ht_irq_msg(irq, &msg);
2596
2597 msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK);
2598 msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK);
2599
2600 msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest);
2601 msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest);
2602
2603 write_ht_irq_msg(irq, &msg);
2604}
2605
2606static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask)
2607{
2608 unsigned int dest;
2609 cpumask_t tmp;
2610
2611 cpus_and(tmp, mask, cpu_online_map);
2612 if (cpus_empty(tmp))
2613 tmp = TARGET_CPUS;
2614
2615 cpus_and(mask, tmp, CPU_MASK_ALL);
2616
2617 dest = cpu_mask_to_apicid(mask);
2618
2619 target_ht_irq(irq, dest);
2620 irq_desc[irq].affinity = mask;
2621}
2622#endif
2623
2624static struct irq_chip ht_irq_chip = {
2625 .name = "PCI-HT",
2626 .mask = mask_ht_irq,
2627 .unmask = unmask_ht_irq,
2628 .ack = ack_ioapic_irq,
2629#ifdef CONFIG_SMP
2630 .set_affinity = set_ht_irq_affinity,
2631#endif
2632 .retrigger = ioapic_retrigger_irq,
2633};
2634
2635int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
2636{
2637 int vector;
2638
2639 vector = assign_irq_vector(irq);
2640 if (vector >= 0) {
2641 struct ht_irq_msg msg;
2642 unsigned dest;
2643 cpumask_t tmp;
2644
2645 cpus_clear(tmp);
2646 cpu_set(vector >> 8, tmp);
2647 dest = cpu_mask_to_apicid(tmp);
2648
2649 msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest);
2650
2651 msg.address_lo =
2652 HT_IRQ_LOW_BASE |
2653 HT_IRQ_LOW_DEST_ID(dest) |
2654 HT_IRQ_LOW_VECTOR(vector) |
2655 ((INT_DEST_MODE == 0) ?
2656 HT_IRQ_LOW_DM_PHYSICAL :
2657 HT_IRQ_LOW_DM_LOGICAL) |
2658 HT_IRQ_LOW_RQEOI_EDGE |
2659 ((INT_DELIVERY_MODE != dest_LowestPrio) ?
2660 HT_IRQ_LOW_MT_FIXED :
2661 HT_IRQ_LOW_MT_ARBITRATED) |
2662 HT_IRQ_LOW_IRQ_MASKED;
2663
2664 write_ht_irq_msg(irq, &msg);
2665
2666 set_irq_chip_and_handler_name(irq, &ht_irq_chip,
2667 handle_edge_irq, "edge");
2668 }
2669 return vector;
2670}
2671#endif /* CONFIG_HT_IRQ */
2672
2673/* --------------------------------------------------------------------------
2674 ACPI-based IOAPIC Configuration
2675 -------------------------------------------------------------------------- */
2676
2677#ifdef CONFIG_ACPI
2678
2679int __init io_apic_get_unique_id(int ioapic, int apic_id)
2680{
2681 union IO_APIC_reg_00 reg_00;
2682 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
2683 physid_mask_t tmp;
2684 unsigned long flags;
2685 int i = 0;
2686
2687 /*
2688 * The P4 platform supports up to 256 APIC IDs on two separate APIC
2689 * buses (one for LAPICs, one for IOAPICs), whereas its predecessors
2690 * support only up to 16 IDs on one shared APIC bus.
2691 *
2692 * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full
2693 * advantage of the new APIC bus architecture.
2694 */
2695
2696 if (physids_empty(apic_id_map))
2697 apic_id_map = ioapic_phys_id_map(phys_cpu_present_map);
2698
2699 spin_lock_irqsave(&ioapic_lock, flags);
2700 reg_00.raw = io_apic_read(ioapic, 0);
2701 spin_unlock_irqrestore(&ioapic_lock, flags);
2702
2703 if (apic_id >= get_physical_broadcast()) {
2704 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
2705 "%d\n", ioapic, apic_id, reg_00.bits.ID);
2706 apic_id = reg_00.bits.ID;
2707 }
2708
2709 /*
2710 * Every APIC in a system must have a unique ID or we get lots of nice
2711 * 'stuck on smp_invalidate_needed IPI wait' messages.
2712 */
2713 if (check_apicid_used(apic_id_map, apic_id)) {
2714
2715 for (i = 0; i < get_physical_broadcast(); i++) {
2716 if (!check_apicid_used(apic_id_map, i))
2717 break;
2718 }
2719
2720 if (i == get_physical_broadcast())
2721 panic("Max apic_id exceeded!\n");
2722
2723 printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
2724 "trying %d\n", ioapic, apic_id, i);
2725
2726 apic_id = i;
2727 }
2728
2729 tmp = apicid_to_cpu_present(apic_id);
2730 physids_or(apic_id_map, apic_id_map, tmp);
2731
2732 if (reg_00.bits.ID != apic_id) {
2733 reg_00.bits.ID = apic_id;
2734
2735 spin_lock_irqsave(&ioapic_lock, flags);
2736 io_apic_write(ioapic, 0, reg_00.raw);
2737 reg_00.raw = io_apic_read(ioapic, 0);
2738 spin_unlock_irqrestore(&ioapic_lock, flags);
2739
2740 /* Sanity check */
2741 if (reg_00.bits.ID != apic_id) {
2742 printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic);
2743 return -1;
2744 }
2745 }
2746
2747 apic_printk(APIC_VERBOSE, KERN_INFO
2748 "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
2749
2750 return apic_id;
2751}
2752
2753
2754int __init io_apic_get_version(int ioapic)
2755{
2756 union IO_APIC_reg_01 reg_01;
2757 unsigned long flags;
2758
2759 spin_lock_irqsave(&ioapic_lock, flags);
2760 reg_01.raw = io_apic_read(ioapic, 1);
2761 spin_unlock_irqrestore(&ioapic_lock, flags);
2762
2763 return reg_01.bits.version;
2764}
2765
2766
2767int __init io_apic_get_redir_entries(int ioapic)
2768{
2769 union IO_APIC_reg_01 reg_01;
2770 unsigned long flags;
2771
2772 spin_lock_irqsave(&ioapic_lock, flags);
2773 reg_01.raw = io_apic_read(ioapic, 1);
2774 spin_unlock_irqrestore(&ioapic_lock, flags);
2775
2776 return reg_01.bits.entries;
2777}
2778
2779
2780int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low)
2781{
2782 struct IO_APIC_route_entry entry;
2783
2784 if (!IO_APIC_IRQ(irq)) {
2785 printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
2786 ioapic);
2787 return -EINVAL;
2788 }
2789
2790 /*
2791 * Generate a PCI IRQ routing entry and program the IOAPIC accordingly.
2792 * Note that we mask (disable) IRQs now -- these get enabled when the
2793 * corresponding device driver registers for this IRQ.
2794 */
2795
2796 memset(&entry, 0, sizeof(entry));
2797
2798 entry.delivery_mode = INT_DELIVERY_MODE;
2799 entry.dest_mode = INT_DEST_MODE;
2800 entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS);
2801 entry.trigger = edge_level;
2802 entry.polarity = active_high_low;
2803 entry.mask = 1;
2804
2805 /*
2806 * IRQs < 16 are already in the irq_2_pin[] map
2807 */
2808 if (irq >= 16)
2809 add_pin_to_irq(irq, ioapic, pin);
2810
2811 entry.vector = assign_irq_vector(irq);
2812
2813 apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
2814 "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
2815 mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq,
2816 edge_level, active_high_low);
2817
2818 ioapic_register_intr(irq, entry.vector, edge_level);
2819
2820 if (!ioapic && (irq < 16))
2821 disable_8259A_irq(irq);
2822
2823 ioapic_write_entry(ioapic, pin, entry);
2824
2825 return 0;
2826}
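The main caller of this routine is the ACPI interrupt setup code (mp_register_gsi() in arch/x86/kernel/acpi/boot.c), which maps the ACPI trigger and polarity flags onto the last two arguments. Schematically — the real routine also resolves the GSI to an (ioapic, pin) pair and filters duplicates — the call looks roughly like:

        io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
                                triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
                                polarity == ACPI_ACTIVE_HIGH ? 0 : 1);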
2827
2828int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
2829{
2830 int i;
2831
2832 if (skip_ioapic_setup)
2833 return -1;
2834
2835 for (i = 0; i < mp_irq_entries; i++)
2836 if (mp_irqs[i].mp_irqtype == mp_INT &&
2837 mp_irqs[i].mp_srcbusirq == bus_irq)
2838 break;
2839 if (i >= mp_irq_entries)
2840 return -1;
2841
2842 *trigger = irq_trigger(i);
2843 *polarity = irq_polarity(i);
2844 return 0;
2845}
2846
2847#endif /* CONFIG_ACPI */
2848
2849static int __init parse_disable_timer_pin_1(char *arg)
2850{
2851 disable_timer_pin_1 = 1;
2852 return 0;
2853}
2854early_param("disable_timer_pin_1", parse_disable_timer_pin_1);
2855
2856static int __init parse_enable_timer_pin_1(char *arg)
2857{
2858 disable_timer_pin_1 = -1;
2859 return 0;
2860}
2861early_param("enable_timer_pin_1", parse_enable_timer_pin_1);
2862
2863static int __init parse_noapic(char *arg)
2864{
2865 /* disable IO-APIC */
2866 disable_ioapic_setup();
2867 return 0;
2868}
2869early_param("noapic", parse_noapic);
2870
2871void __init ioapic_init_mappings(void)
2872{
2873 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
2874 int i;
2875
2876 for (i = 0; i < nr_ioapics; i++) {
2877 if (smp_found_config) {
2878 ioapic_phys = mp_ioapics[i].mp_apicaddr;
2879 if (!ioapic_phys) {
2880 printk(KERN_ERR
2881 "WARNING: bogus zero IO-APIC "
2882 "address found in MPTABLE, "
2883 "disabling IO/APIC support!\n");
2884 smp_found_config = 0;
2885 skip_ioapic_setup = 1;
2886 goto fake_ioapic_page;
2887 }
2888 } else {
2889fake_ioapic_page:
2890 ioapic_phys = (unsigned long)
2891 alloc_bootmem_pages(PAGE_SIZE);
2892 ioapic_phys = __pa(ioapic_phys);
2893 }
2894 set_fixmap_nocache(idx, ioapic_phys);
2895 printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
2896 __fix_to_virt(idx), ioapic_phys);
2897 idx++;
2898 }
2899}
2900
diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c
index 5921e5f0a640..720d2607aacb 100644
--- a/arch/x86/kernel/io_delay.c
+++ b/arch/x86/kernel/io_delay.c
@@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = {
92 DMI_MATCH(DMI_BOARD_NAME, "30BF") 92 DMI_MATCH(DMI_BOARD_NAME, "30BF")
93 } 93 }
94 }, 94 },
95 {
96 .callback = dmi_io_delay_0xed_port,
97 .ident = "Presario F700",
98 .matches = {
99 DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"),
100 DMI_MATCH(DMI_BOARD_NAME, "30D3")
101 }
102 },
95 { } 103 { }
96}; 104};
97 105
@@ -103,6 +111,9 @@ void __init io_delay_init(void)
103 111
104static int __init io_delay_param(char *s) 112static int __init io_delay_param(char *s)
105{ 113{
114 if (!s)
115 return -EINVAL;
116
106 if (!strcmp(s, "0x80")) 117 if (!strcmp(s, "0x80"))
107 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; 118 io_delay_type = CONFIG_IO_DELAY_TYPE_0X80;
108 else if (!strcmp(s, "0xed")) 119 else if (!strcmp(s, "0xed"))
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 50e5e4a31c85..191914302744 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,6 +14,7 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <asm/syscalls.h>
17 18
18/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ 19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
19static void set_bitmap(unsigned long *bitmap, unsigned int base, 20static void set_bitmap(unsigned long *bitmap, unsigned int base,
diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c
index 9d98cda39ad9..f1c688e46f35 100644
--- a/arch/x86/kernel/ipi.c
+++ b/arch/x86/kernel/ipi.c
@@ -20,6 +20,8 @@
20 20
21#ifdef CONFIG_X86_32 21#ifdef CONFIG_X86_32
22#include <mach_apic.h> 22#include <mach_apic.h>
23#include <mach_ipi.h>
24
23/* 25/*
24 * the following functions deal with sending IPIs between CPUs. 26 * the following functions deal with sending IPIs between CPUs.
25 * 27 *
@@ -70,7 +72,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector)
70 /* 72 /*
71 * Send the IPI. The write to APIC_ICR fires this off. 73 * Send the IPI. The write to APIC_ICR fires this off.
72 */ 74 */
73 apic_write_around(APIC_ICR, cfg); 75 apic_write(APIC_ICR, cfg);
74} 76}
75 77
76void send_IPI_self(int vector) 78void send_IPI_self(int vector)
@@ -98,7 +100,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
98 * prepare target chip field 100 * prepare target chip field
99 */ 101 */
100 cfg = __prepare_ICR2(mask); 102 cfg = __prepare_ICR2(mask);
101 apic_write_around(APIC_ICR2, cfg); 103 apic_write(APIC_ICR2, cfg);
102 104
103 /* 105 /*
104 * program the ICR 106 * program the ICR
@@ -108,7 +110,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector)
108 /* 110 /*
109 * Send the IPI. The write to APIC_ICR fires this off. 111 * Send the IPI. The write to APIC_ICR fires this off.
110 */ 112 */
111 apic_write_around(APIC_ICR, cfg); 113 apic_write(APIC_ICR, cfg);
112} 114}
113 115
114/* 116/*
@@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector)
147} 149}
148 150
149/* must come after the send_IPI functions above for inlining */ 151/* must come after the send_IPI functions above for inlining */
150#include <mach_ipi.h>
151static int convert_apicid_to_cpu(int apic_id) 152static int convert_apicid_to_cpu(int apic_id)
152{ 153{
153 int i; 154 int i;
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
new file mode 100644
index 000000000000..d1d4dc52f649
--- /dev/null
+++ b/arch/x86/kernel/irq.c
@@ -0,0 +1,189 @@
1/*
2 * Common interrupt code for 32 and 64 bit
3 */
4#include <linux/cpu.h>
5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h>
7#include <linux/seq_file.h>
8
9#include <asm/apic.h>
10#include <asm/io_apic.h>
11#include <asm/smp.h>
12
13atomic_t irq_err_count;
14
15/*
16 * 'What should we do if we get a hw irq event on an illegal vector?'
17 * Each architecture has to answer this for itself.
18 */
19void ack_bad_irq(unsigned int irq)
20{
21 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
22
23#ifdef CONFIG_X86_LOCAL_APIC
24 /*
25 * Currently unexpected vectors happen only on SMP and APIC.
26 * We _must_ ack these because every local APIC has only N
27 * irq slots per priority level, and a 'hanging, unacked' IRQ
28 * holds up an irq slot - in excessive cases (when multiple
29 * unexpected vectors occur) that might lock up the APIC
30 * completely.
31 * But only ack when the APIC is enabled -AK
32 */
33 if (cpu_has_apic)
34 ack_APIC_irq();
35#endif
36}
37
38#ifdef CONFIG_X86_32
39# define irq_stats(x) (&per_cpu(irq_stat, x))
40#else
41# define irq_stats(x) cpu_pda(x)
42#endif
43/*
44 * /proc/interrupts printing:
45 */
46static int show_other_interrupts(struct seq_file *p)
47{
48 int j;
49
50 seq_printf(p, "NMI: ");
51 for_each_online_cpu(j)
52 seq_printf(p, "%10u ", irq_stats(j)->__nmi_count);
53 seq_printf(p, " Non-maskable interrupts\n");
54#ifdef CONFIG_X86_LOCAL_APIC
55 seq_printf(p, "LOC: ");
56 for_each_online_cpu(j)
57 seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs);
58 seq_printf(p, " Local timer interrupts\n");
59#endif
60#ifdef CONFIG_SMP
61 seq_printf(p, "RES: ");
62 for_each_online_cpu(j)
63 seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count);
64 seq_printf(p, " Rescheduling interrupts\n");
65 seq_printf(p, "CAL: ");
66 for_each_online_cpu(j)
67 seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
68 seq_printf(p, " Function call interrupts\n");
69 seq_printf(p, "TLB: ");
70 for_each_online_cpu(j)
71 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
72 seq_printf(p, " TLB shootdowns\n");
73#endif
74#ifdef CONFIG_X86_MCE
75 seq_printf(p, "TRM: ");
76 for_each_online_cpu(j)
77 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
78 seq_printf(p, " Thermal event interrupts\n");
79# ifdef CONFIG_X86_64
80 seq_printf(p, "THR: ");
81 for_each_online_cpu(j)
82 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
83 seq_printf(p, " Threshold APIC interrupts\n");
84# endif
85#endif
86#ifdef CONFIG_X86_LOCAL_APIC
87 seq_printf(p, "SPU: ");
88 for_each_online_cpu(j)
89 seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);
90 seq_printf(p, " Spurious interrupts\n");
91#endif
92 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
93#if defined(CONFIG_X86_IO_APIC)
94 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
95#endif
96 return 0;
97}
98
99int show_interrupts(struct seq_file *p, void *v)
100{
101 unsigned long flags, any_count = 0;
102 int i = *(loff_t *) v, j;
103 struct irqaction *action;
104 struct irq_desc *desc;
105
106 if (i > nr_irqs)
107 return 0;
108
109 if (i == nr_irqs)
110 return show_other_interrupts(p);
111
112 /* print header */
113 if (i == 0) {
114 seq_printf(p, " ");
115 for_each_online_cpu(j)
116 seq_printf(p, "CPU%-8d", j);
117 seq_putc(p, '\n');
118 }
119
120 desc = irq_to_desc(i);
121 spin_lock_irqsave(&desc->lock, flags);
122#ifndef CONFIG_SMP
123 any_count = kstat_irqs(i);
124#else
125 for_each_online_cpu(j)
126 any_count |= kstat_irqs_cpu(i, j);
127#endif
128 action = desc->action;
129 if (!action && !any_count)
130 goto out;
131
132 seq_printf(p, "%3d: ", i);
133#ifndef CONFIG_SMP
134 seq_printf(p, "%10u ", kstat_irqs(i));
135#else
136 for_each_online_cpu(j)
137 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
138#endif
139 seq_printf(p, " %8s", desc->chip->name);
140 seq_printf(p, "-%-8s", desc->name);
141
142 if (action) {
143 seq_printf(p, " %s", action->name);
144 while ((action = action->next) != NULL)
145 seq_printf(p, ", %s", action->name);
146 }
147
148 seq_putc(p, '\n');
149out:
150 spin_unlock_irqrestore(&desc->lock, flags);
151 return 0;
152}
153
154/*
155 * /proc/stat helpers
156 */
157u64 arch_irq_stat_cpu(unsigned int cpu)
158{
159 u64 sum = irq_stats(cpu)->__nmi_count;
160
161#ifdef CONFIG_X86_LOCAL_APIC
162 sum += irq_stats(cpu)->apic_timer_irqs;
163#endif
164#ifdef CONFIG_SMP
165 sum += irq_stats(cpu)->irq_resched_count;
166 sum += irq_stats(cpu)->irq_call_count;
167 sum += irq_stats(cpu)->irq_tlb_count;
168#endif
169#ifdef CONFIG_X86_MCE
170 sum += irq_stats(cpu)->irq_thermal_count;
171# ifdef CONFIG_X86_64
172 sum += irq_stats(cpu)->irq_threshold_count;
173#endif
174#endif
175#ifdef CONFIG_X86_LOCAL_APIC
176 sum += irq_stats(cpu)->irq_spurious_count;
177#endif
178 return sum;
179}
180
181u64 arch_irq_stat(void)
182{
183 u64 sum = atomic_read(&irq_err_count);
184
185#ifdef CONFIG_X86_IO_APIC
186 sum += atomic_read(&irq_mis_count);
187#endif
188 return sum;
189}
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 47a6f6f12478..a51382672de0 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat);
25DEFINE_PER_CPU(struct pt_regs *, irq_regs); 25DEFINE_PER_CPU(struct pt_regs *, irq_regs);
26EXPORT_PER_CPU_SYMBOL(irq_regs); 26EXPORT_PER_CPU_SYMBOL(irq_regs);
27 27
28/*
29 * 'what should we do if we get a hw irq event on an illegal vector'.
30 * each architecture has to answer this themselves.
31 */
32void ack_bad_irq(unsigned int irq)
33{
34 printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
35
36#ifdef CONFIG_X86_LOCAL_APIC
37 /*
38 * Currently unexpected vectors happen only on SMP and APIC.
39 * We _must_ ack these because every local APIC has only N
40 * irq slots per priority level, and a 'hanging, unacked' IRQ
41 * holds up an irq slot - in excessive cases (when multiple
42 * unexpected vectors occur) that might lock up the APIC
43 * completely.
44 * But only ack when the APIC is enabled -AK
45 */
46 if (cpu_has_apic)
47 ack_APIC_irq();
48#endif
49}
50
51#ifdef CONFIG_DEBUG_STACKOVERFLOW 28#ifdef CONFIG_DEBUG_STACKOVERFLOW
52/* Debugging check for stack overflow: is there less than 1KB free? */ 29/* Debugging check for stack overflow: is there less than 1KB free? */
53static int check_stack_overflow(void) 30static int check_stack_overflow(void)
@@ -83,11 +60,8 @@ union irq_ctx {
83static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; 60static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly;
84static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; 61static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly;
85 62
86static char softirq_stack[NR_CPUS * THREAD_SIZE] 63static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
87 __attribute__((__section__(".bss.page_aligned"))); 64static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss;
88
89static char hardirq_stack[NR_CPUS * THREAD_SIZE]
90 __attribute__((__section__(".bss.page_aligned")));
91 65
92static void call_on_stack(void *func, void *stack) 66static void call_on_stack(void *func, void *stack)
93{ 67{
@@ -226,20 +200,25 @@ unsigned int do_IRQ(struct pt_regs *regs)
226{ 200{
227 struct pt_regs *old_regs; 201 struct pt_regs *old_regs;
228 /* high bit used in ret_from_ code */ 202 /* high bit used in ret_from_ code */
229 int overflow, irq = ~regs->orig_ax; 203 int overflow;
230 struct irq_desc *desc = irq_desc + irq; 204 unsigned vector = ~regs->orig_ax;
205 struct irq_desc *desc;
206 unsigned irq;
231 207
232 if (unlikely((unsigned)irq >= NR_IRQS)) {
233 printk(KERN_EMERG "%s: cannot handle IRQ %d\n",
234 __func__, irq);
235 BUG();
236 }
237 208
238 old_regs = set_irq_regs(regs); 209 old_regs = set_irq_regs(regs);
239 irq_enter(); 210 irq_enter();
211 irq = __get_cpu_var(vector_irq)[vector];
240 212
241 overflow = check_stack_overflow(); 213 overflow = check_stack_overflow();
242 214
215 desc = irq_to_desc(irq);
216 if (unlikely(!desc)) {
217 printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n",
218 __func__, irq, vector, smp_processor_id());
219 BUG();
220 }
221
243 if (!execute_on_irq_stack(overflow, desc, irq)) { 222 if (!execute_on_irq_stack(overflow, desc, irq)) {
244 if (unlikely(overflow)) 223 if (unlikely(overflow))
245 print_stack_overflow(); 224 print_stack_overflow();
@@ -251,146 +230,6 @@ unsigned int do_IRQ(struct pt_regs *regs)
251 return 1; 230 return 1;
252} 231}
253 232
254/*
255 * Interrupt statistics:
256 */
257
258atomic_t irq_err_count;
259
260/*
261 * /proc/interrupts printing:
262 */
263
264int show_interrupts(struct seq_file *p, void *v)
265{
266 int i = *(loff_t *) v, j;
267 struct irqaction * action;
268 unsigned long flags;
269
270 if (i == 0) {
271 seq_printf(p, " ");
272 for_each_online_cpu(j)
273 seq_printf(p, "CPU%-8d",j);
274 seq_putc(p, '\n');
275 }
276
277 if (i < NR_IRQS) {
278 unsigned any_count = 0;
279
280 spin_lock_irqsave(&irq_desc[i].lock, flags);
281#ifndef CONFIG_SMP
282 any_count = kstat_irqs(i);
283#else
284 for_each_online_cpu(j)
285 any_count |= kstat_cpu(j).irqs[i];
286#endif
287 action = irq_desc[i].action;
288 if (!action && !any_count)
289 goto skip;
290 seq_printf(p, "%3d: ",i);
291#ifndef CONFIG_SMP
292 seq_printf(p, "%10u ", kstat_irqs(i));
293#else
294 for_each_online_cpu(j)
295 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
296#endif
297 seq_printf(p, " %8s", irq_desc[i].chip->name);
298 seq_printf(p, "-%-8s", irq_desc[i].name);
299
300 if (action) {
301 seq_printf(p, " %s", action->name);
302 while ((action = action->next) != NULL)
303 seq_printf(p, ", %s", action->name);
304 }
305
306 seq_putc(p, '\n');
307skip:
308 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
309 } else if (i == NR_IRQS) {
310 seq_printf(p, "NMI: ");
311 for_each_online_cpu(j)
312 seq_printf(p, "%10u ", nmi_count(j));
313 seq_printf(p, " Non-maskable interrupts\n");
314#ifdef CONFIG_X86_LOCAL_APIC
315 seq_printf(p, "LOC: ");
316 for_each_online_cpu(j)
317 seq_printf(p, "%10u ",
318 per_cpu(irq_stat,j).apic_timer_irqs);
319 seq_printf(p, " Local timer interrupts\n");
320#endif
321#ifdef CONFIG_SMP
322 seq_printf(p, "RES: ");
323 for_each_online_cpu(j)
324 seq_printf(p, "%10u ",
325 per_cpu(irq_stat,j).irq_resched_count);
326 seq_printf(p, " Rescheduling interrupts\n");
327 seq_printf(p, "CAL: ");
328 for_each_online_cpu(j)
329 seq_printf(p, "%10u ",
330 per_cpu(irq_stat,j).irq_call_count);
331 seq_printf(p, " function call interrupts\n");
332 seq_printf(p, "TLB: ");
333 for_each_online_cpu(j)
334 seq_printf(p, "%10u ",
335 per_cpu(irq_stat,j).irq_tlb_count);
336 seq_printf(p, " TLB shootdowns\n");
337#endif
338#ifdef CONFIG_X86_MCE
339 seq_printf(p, "TRM: ");
340 for_each_online_cpu(j)
341 seq_printf(p, "%10u ",
342 per_cpu(irq_stat,j).irq_thermal_count);
343 seq_printf(p, " Thermal event interrupts\n");
344#endif
345#ifdef CONFIG_X86_LOCAL_APIC
346 seq_printf(p, "SPU: ");
347 for_each_online_cpu(j)
348 seq_printf(p, "%10u ",
349 per_cpu(irq_stat,j).irq_spurious_count);
350 seq_printf(p, " Spurious interrupts\n");
351#endif
352 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
353#if defined(CONFIG_X86_IO_APIC)
354 seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count));
355#endif
356 }
357 return 0;
358}
359
360/*
361 * /proc/stat helpers
362 */
363u64 arch_irq_stat_cpu(unsigned int cpu)
364{
365 u64 sum = nmi_count(cpu);
366
367#ifdef CONFIG_X86_LOCAL_APIC
368 sum += per_cpu(irq_stat, cpu).apic_timer_irqs;
369#endif
370#ifdef CONFIG_SMP
371 sum += per_cpu(irq_stat, cpu).irq_resched_count;
372 sum += per_cpu(irq_stat, cpu).irq_call_count;
373 sum += per_cpu(irq_stat, cpu).irq_tlb_count;
374#endif
375#ifdef CONFIG_X86_MCE
376 sum += per_cpu(irq_stat, cpu).irq_thermal_count;
377#endif
378#ifdef CONFIG_X86_LOCAL_APIC
379 sum += per_cpu(irq_stat, cpu).irq_spurious_count;
380#endif
381 return sum;
382}
383
384u64 arch_irq_stat(void)
385{
386 u64 sum = atomic_read(&irq_err_count);
387
388#ifdef CONFIG_X86_IO_APIC
389 sum += atomic_read(&irq_mis_count);
390#endif
391 return sum;
392}
393
394#ifdef CONFIG_HOTPLUG_CPU 233#ifdef CONFIG_HOTPLUG_CPU
395#include <mach_apic.h> 234#include <mach_apic.h>
396 235
@@ -398,20 +237,22 @@ void fixup_irqs(cpumask_t map)
398{ 237{
399 unsigned int irq; 238 unsigned int irq;
400 static int warned; 239 static int warned;
240 struct irq_desc *desc;
401 241
402 for (irq = 0; irq < NR_IRQS; irq++) { 242 for_each_irq_desc(irq, desc) {
403 cpumask_t mask; 243 cpumask_t mask;
244
404 if (irq == 2) 245 if (irq == 2)
405 continue; 246 continue;
406 247
407 cpus_and(mask, irq_desc[irq].affinity, map); 248 cpus_and(mask, desc->affinity, map);
408 if (any_online_cpu(mask) == NR_CPUS) { 249 if (any_online_cpu(mask) == NR_CPUS) {
409 printk("Breaking affinity for irq %i\n", irq); 250 printk("Breaking affinity for irq %i\n", irq);
410 mask = map; 251 mask = map;
411 } 252 }
412 if (irq_desc[irq].chip->set_affinity) 253 if (desc->chip->set_affinity)
413 irq_desc[irq].chip->set_affinity(irq, mask); 254 desc->chip->set_affinity(irq, mask);
414 else if (irq_desc[irq].action && !(warned++)) 255 else if (desc->action && !(warned++))
415 printk("Cannot set affinity for irq %i\n", irq); 256 printk("Cannot set affinity for irq %i\n", irq);
416 } 257 }
417 258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 1f78b238d8d2..60eb84eb77a0 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -18,28 +18,6 @@
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/smp.h> 19#include <asm/smp.h>
20 20
21atomic_t irq_err_count;
22
23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'.
25 * each architecture has to answer this themselves.
26 */
27void ack_bad_irq(unsigned int irq)
28{
29 printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq);
30 /*
31 * Currently unexpected vectors happen only on SMP and APIC.
32 * We _must_ ack these because every local APIC has only N
33 * irq slots per priority level, and a 'hanging, unacked' IRQ
34 * holds up an irq slot - in excessive cases (when multiple
35 * unexpected vectors occur) that might lock up the APIC
36 * completely.
37 * But don't ack when the APIC is disabled. -AK
38 */
39 if (!disable_apic)
40 ack_APIC_irq();
41}
42
43#ifdef CONFIG_DEBUG_STACKOVERFLOW 21#ifdef CONFIG_DEBUG_STACKOVERFLOW
44/* 22/*
45 * Probabilistic stack overflow check: 23 * Probabilistic stack overflow check:
@@ -65,122 +43,6 @@ static inline void stack_overflow_check(struct pt_regs *regs)
65#endif 43#endif
66 44
67/* 45/*
68 * Generic, controller-independent functions:
69 */
70
71int show_interrupts(struct seq_file *p, void *v)
72{
73 int i = *(loff_t *) v, j;
74 struct irqaction * action;
75 unsigned long flags;
76
77 if (i == 0) {
78 seq_printf(p, " ");
79 for_each_online_cpu(j)
80 seq_printf(p, "CPU%-8d",j);
81 seq_putc(p, '\n');
82 }
83
84 if (i < NR_IRQS) {
85 unsigned any_count = 0;
86
87 spin_lock_irqsave(&irq_desc[i].lock, flags);
88#ifndef CONFIG_SMP
89 any_count = kstat_irqs(i);
90#else
91 for_each_online_cpu(j)
92 any_count |= kstat_cpu(j).irqs[i];
93#endif
94 action = irq_desc[i].action;
95 if (!action && !any_count)
96 goto skip;
97 seq_printf(p, "%3d: ",i);
98#ifndef CONFIG_SMP
99 seq_printf(p, "%10u ", kstat_irqs(i));
100#else
101 for_each_online_cpu(j)
102 seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
103#endif
104 seq_printf(p, " %8s", irq_desc[i].chip->name);
105 seq_printf(p, "-%-8s", irq_desc[i].name);
106
107 if (action) {
108 seq_printf(p, " %s", action->name);
109 while ((action = action->next) != NULL)
110 seq_printf(p, ", %s", action->name);
111 }
112 seq_putc(p, '\n');
113skip:
114 spin_unlock_irqrestore(&irq_desc[i].lock, flags);
115 } else if (i == NR_IRQS) {
116 seq_printf(p, "NMI: ");
117 for_each_online_cpu(j)
118 seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count);
119 seq_printf(p, " Non-maskable interrupts\n");
120 seq_printf(p, "LOC: ");
121 for_each_online_cpu(j)
122 seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs);
123 seq_printf(p, " Local timer interrupts\n");
124#ifdef CONFIG_SMP
125 seq_printf(p, "RES: ");
126 for_each_online_cpu(j)
127 seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count);
128 seq_printf(p, " Rescheduling interrupts\n");
129 seq_printf(p, "CAL: ");
130 for_each_online_cpu(j)
131 seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count);
132 seq_printf(p, " function call interrupts\n");
133 seq_printf(p, "TLB: ");
134 for_each_online_cpu(j)
135 seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count);
136 seq_printf(p, " TLB shootdowns\n");
137#endif
138#ifdef CONFIG_X86_MCE
139 seq_printf(p, "TRM: ");
140 for_each_online_cpu(j)
141 seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count);
142 seq_printf(p, " Thermal event interrupts\n");
143 seq_printf(p, "THR: ");
144 for_each_online_cpu(j)
145 seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count);
146 seq_printf(p, " Threshold APIC interrupts\n");
147#endif
148 seq_printf(p, "SPU: ");
149 for_each_online_cpu(j)
150 seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count);
151 seq_printf(p, " Spurious interrupts\n");
152 seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count));
153 }
154 return 0;
155}
156
157/*
158 * /proc/stat helpers
159 */
160u64 arch_irq_stat_cpu(unsigned int cpu)
161{
162 u64 sum = cpu_pda(cpu)->__nmi_count;
163
164 sum += cpu_pda(cpu)->apic_timer_irqs;
165#ifdef CONFIG_SMP
166 sum += cpu_pda(cpu)->irq_resched_count;
167 sum += cpu_pda(cpu)->irq_call_count;
168 sum += cpu_pda(cpu)->irq_tlb_count;
169#endif
170#ifdef CONFIG_X86_MCE
171 sum += cpu_pda(cpu)->irq_thermal_count;
172 sum += cpu_pda(cpu)->irq_threshold_count;
173#endif
174 sum += cpu_pda(cpu)->irq_spurious_count;
175 return sum;
176}
177
178u64 arch_irq_stat(void)
179{
180 return atomic_read(&irq_err_count);
181}
182
183/*
184 * do_IRQ handles all normal device IRQ's (the special 46 * do_IRQ handles all normal device IRQ's (the special
185 * SMP cross-CPU interrupts have their own specific 47 * SMP cross-CPU interrupts have their own specific
186 * handlers). 48 * handlers).
@@ -188,6 +50,7 @@ u64 arch_irq_stat(void)
188asmlinkage unsigned int do_IRQ(struct pt_regs *regs) 50asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
189{ 51{
190 struct pt_regs *old_regs = set_irq_regs(regs); 52 struct pt_regs *old_regs = set_irq_regs(regs);
53 struct irq_desc *desc;
191 54
192 /* high bit used in ret_from_ code */ 55 /* high bit used in ret_from_ code */
193 unsigned vector = ~regs->orig_ax; 56 unsigned vector = ~regs->orig_ax;
@@ -201,8 +64,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs)
201 stack_overflow_check(regs); 64 stack_overflow_check(regs);
202#endif 65#endif
203 66
204 if (likely(irq < NR_IRQS)) 67 desc = irq_to_desc(irq);
205 generic_handle_irq(irq); 68 if (likely(desc))
69 generic_handle_irq_desc(irq, desc);
206 else { 70 else {
207 if (!disable_apic) 71 if (!disable_apic)
208 ack_APIC_irq(); 72 ack_APIC_irq();
@@ -223,8 +87,9 @@ void fixup_irqs(cpumask_t map)
223{ 87{
224 unsigned int irq; 88 unsigned int irq;
225 static int warned; 89 static int warned;
90 struct irq_desc *desc;
226 91
227 for (irq = 0; irq < NR_IRQS; irq++) { 92 for_each_irq_desc(irq, desc) {
228 cpumask_t mask; 93 cpumask_t mask;
229 int break_affinity = 0; 94 int break_affinity = 0;
230 int set_affinity = 1; 95 int set_affinity = 1;
@@ -233,32 +98,32 @@ void fixup_irqs(cpumask_t map)
233 continue; 98 continue;
234 99
235 /* interrupts are disabled at this point */ 100 /* interrupts are disabled at this point */
236 spin_lock(&irq_desc[irq].lock); 101 spin_lock(&desc->lock);
237 102
238 if (!irq_has_action(irq) || 103 if (!irq_has_action(irq) ||
239 cpus_equal(irq_desc[irq].affinity, map)) { 104 cpus_equal(desc->affinity, map)) {
240 spin_unlock(&irq_desc[irq].lock); 105 spin_unlock(&desc->lock);
241 continue; 106 continue;
242 } 107 }
243 108
244 cpus_and(mask, irq_desc[irq].affinity, map); 109 cpus_and(mask, desc->affinity, map);
245 if (cpus_empty(mask)) { 110 if (cpus_empty(mask)) {
246 break_affinity = 1; 111 break_affinity = 1;
247 mask = map; 112 mask = map;
248 } 113 }
249 114
250 if (irq_desc[irq].chip->mask) 115 if (desc->chip->mask)
251 irq_desc[irq].chip->mask(irq); 116 desc->chip->mask(irq);
252 117
253 if (irq_desc[irq].chip->set_affinity) 118 if (desc->chip->set_affinity)
254 irq_desc[irq].chip->set_affinity(irq, mask); 119 desc->chip->set_affinity(irq, mask);
255 else if (!(warned++)) 120 else if (!(warned++))
256 set_affinity = 0; 121 set_affinity = 0;
257 122
258 if (irq_desc[irq].chip->unmask) 123 if (desc->chip->unmask)
259 irq_desc[irq].chip->unmask(irq); 124 desc->chip->unmask(irq);
260 125
261 spin_unlock(&irq_desc[irq].lock); 126 spin_unlock(&desc->lock);
262 127
263 if (break_affinity && set_affinity) 128 if (break_affinity && set_affinity)
264 printk("Broke affinity for irq %i\n", irq); 129 printk("Broke affinity for irq %i\n", irq);
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
index d66914287ee1..845aa9803e80 100644
--- a/arch/x86/kernel/irqinit_32.c
+++ b/arch/x86/kernel/irqinit_32.c
@@ -69,11 +69,48 @@ void __init init_ISA_irqs (void)
69 * 16 old-style INTA-cycle interrupts: 69 * 16 old-style INTA-cycle interrupts:
70 */ 70 */
71 for (i = 0; i < 16; i++) { 71 for (i = 0; i < 16; i++) {
 72 /* first use of this irq_desc */
73 struct irq_desc *desc = irq_to_desc(i);
74
75 desc->status = IRQ_DISABLED;
76 desc->action = NULL;
77 desc->depth = 1;
78
72 set_irq_chip_and_handler_name(i, &i8259A_chip, 79 set_irq_chip_and_handler_name(i, &i8259A_chip,
73 handle_level_irq, "XT"); 80 handle_level_irq, "XT");
74 } 81 }
75} 82}
76 83
84/*
 85 * IRQ2 is the cascade interrupt to the second interrupt controller
86 */
87static struct irqaction irq2 = {
88 .handler = no_action,
89 .mask = CPU_MASK_NONE,
90 .name = "cascade",
91};
92
93DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
94 [0 ... IRQ0_VECTOR - 1] = -1,
95 [IRQ0_VECTOR] = 0,
96 [IRQ1_VECTOR] = 1,
97 [IRQ2_VECTOR] = 2,
98 [IRQ3_VECTOR] = 3,
99 [IRQ4_VECTOR] = 4,
100 [IRQ5_VECTOR] = 5,
101 [IRQ6_VECTOR] = 6,
102 [IRQ7_VECTOR] = 7,
103 [IRQ8_VECTOR] = 8,
104 [IRQ9_VECTOR] = 9,
105 [IRQ10_VECTOR] = 10,
106 [IRQ11_VECTOR] = 11,
107 [IRQ12_VECTOR] = 12,
108 [IRQ13_VECTOR] = 13,
109 [IRQ14_VECTOR] = 14,
110 [IRQ15_VECTOR] = 15,
111 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
112};
113
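This per-CPU table is the inverse mapping that the reworked do_IRQ() in irq_32.c (earlier in this diff) now consults; the hot-path lookup is simply:

        unsigned irq = __get_cpu_var(vector_irq)[vector];       /* -1 if the vector is unassigned */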
77/* Overridden in paravirt.c */ 114/* Overridden in paravirt.c */
78void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 115void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
79 116
@@ -89,15 +126,50 @@ void __init native_init_IRQ(void)
89 * us. (some of these will be overridden and become 126 * us. (some of these will be overridden and become
90 * 'special' SMP interrupts) 127 * 'special' SMP interrupts)
91 */ 128 */
92 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { 129 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
93 int vector = FIRST_EXTERNAL_VECTOR + i;
94 if (i >= NR_IRQS)
95 break;
96 /* SYSCALL_VECTOR was reserved in trap_init. */ 130 /* SYSCALL_VECTOR was reserved in trap_init. */
97 if (!test_bit(vector, used_vectors)) 131 if (i != SYSCALL_VECTOR)
98 set_intr_gate(vector, interrupt[i]); 132 set_intr_gate(i, interrupt[i]);
99 } 133 }
100 134
135
136#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
137 /*
138 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
139 * IPI, driven by wakeup.
140 */
141 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
142
143 /* IPI for invalidation */
144 alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt);
145
146 /* IPI for generic function call */
147 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
148
149 /* IPI for single call function */
150 set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt);
151
152 /* Low priority IPI to cleanup after moving an irq */
153 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
154#endif
155
156#ifdef CONFIG_X86_LOCAL_APIC
157 /* self generated IPI for local APIC timer */
158 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
159
160 /* IPI vectors for APIC spurious and error interrupts */
161 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
162 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
163#endif
164
165#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
166 /* thermal monitor LVT interrupt */
167 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
168#endif
169
170 if (!acpi_ioapic)
171 setup_irq(2, &irq2);
172
101 /* setup after call gates are initialised (usually add in 173 /* setup after call gates are initialised (usually add in
102 * the architecture specific gates) 174 * the architecture specific gates)
103 */ 175 */
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
index 0373e88de95a..ff0235391285 100644
--- a/arch/x86/kernel/irqinit_64.c
+++ b/arch/x86/kernel/irqinit_64.c
@@ -43,10 +43,11 @@
43 43
44#define BUILD_IRQ(nr) \ 44#define BUILD_IRQ(nr) \
45 asmlinkage void IRQ_NAME(nr); \ 45 asmlinkage void IRQ_NAME(nr); \
46 asm("\n.p2align\n" \ 46 asm("\n.text\n.p2align\n" \
47 "IRQ" #nr "_interrupt:\n\t" \ 47 "IRQ" #nr "_interrupt:\n\t" \
48 "push $~(" #nr ") ; " \ 48 "push $~(" #nr ") ; " \
49 "jmp common_interrupt"); 49 "jmp common_interrupt\n" \
50 ".previous");
50 51
51#define BI(x,y) \ 52#define BI(x,y) \
52 BUILD_IRQ(x##y) 53 BUILD_IRQ(x##y)
@@ -134,51 +135,33 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
134 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 135 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
135}; 136};
136 137
137static void __init init_ISA_irqs (void) 138void __init init_ISA_irqs(void)
138{ 139{
139 int i; 140 int i;
140 141
141 init_bsp_APIC(); 142 init_bsp_APIC();
142 init_8259A(0); 143 init_8259A(0);
143 144
144 for (i = 0; i < NR_IRQS; i++) { 145 for (i = 0; i < 16; i++) {
145 irq_desc[i].status = IRQ_DISABLED; 146 /* first use of this irq_desc */
146 irq_desc[i].action = NULL; 147 struct irq_desc *desc = irq_to_desc(i);
147 irq_desc[i].depth = 1;
148 148
149 if (i < 16) { 149 desc->status = IRQ_DISABLED;
150 /* 150 desc->action = NULL;
151 * 16 old-style INTA-cycle interrupts: 151 desc->depth = 1;
152 */ 152
153 set_irq_chip_and_handler_name(i, &i8259A_chip, 153 /*
154 * 16 old-style INTA-cycle interrupts:
155 */
156 set_irq_chip_and_handler_name(i, &i8259A_chip,
154 handle_level_irq, "XT"); 157 handle_level_irq, "XT");
155 } else {
156 /*
157 * 'high' PCI IRQs filled in on demand
158 */
159 irq_desc[i].chip = &no_irq_chip;
160 }
161 } 158 }
162} 159}
163 160
164void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); 161void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
165 162
166void __init native_init_IRQ(void) 163static void __init smp_intr_init(void)
167{ 164{
168 int i;
169
170 init_ISA_irqs();
171 /*
172 * Cover the whole vector space, no vector can escape
173 * us. (some of these will be overridden and become
174 * 'special' SMP interrupts)
175 */
176 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
177 int vector = FIRST_EXTERNAL_VECTOR + i;
178 if (vector != IA32_SYSCALL_VECTOR)
179 set_intr_gate(vector, interrupt[i]);
180 }
181
182#ifdef CONFIG_SMP 165#ifdef CONFIG_SMP
183 /* 166 /*
184 * The reschedule interrupt is a CPU-to-CPU reschedule-helper 167 * The reschedule interrupt is a CPU-to-CPU reschedule-helper
@@ -206,6 +189,12 @@ void __init native_init_IRQ(void)
206 /* Low priority IPI to cleanup after moving an irq */ 189 /* Low priority IPI to cleanup after moving an irq */
207 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); 190 set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
208#endif 191#endif
192}
193
194static void __init apic_intr_init(void)
195{
196 smp_intr_init();
197
209 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); 198 alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
210 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); 199 alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
211 200
@@ -215,6 +204,25 @@ void __init native_init_IRQ(void)
215 /* IPI vectors for APIC spurious and error interrupts */ 204 /* IPI vectors for APIC spurious and error interrupts */
216 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 205 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
217 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 206 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
207}
208
209void __init native_init_IRQ(void)
210{
211 int i;
212
213 init_ISA_irqs();
214 /*
215 * Cover the whole vector space, no vector can escape
216 * us. (some of these will be overridden and become
217 * 'special' SMP interrupts)
218 */
219 for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
220 int vector = FIRST_EXTERNAL_VECTOR + i;
221 if (vector != IA32_SYSCALL_VECTOR)
222 set_intr_gate(vector, interrupt[i]);
223 }
224
225 apic_intr_init();
218 226
219 if (!acpi_ioapic) 227 if (!acpi_ioapic)
220 setup_irq(2, &irq2); 228 setup_irq(2, &irq2);
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index 7377ccb21335..304d8bad6559 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges);
16static u32 *flush_words; 16static u32 *flush_words;
17 17
18struct pci_device_id k8_nb_ids[] = { 18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, 19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, 20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) },
21 {} 22 {}
22}; 23};
23EXPORT_SYMBOL(k8_nb_ids); 24EXPORT_SYMBOL(k8_nb_ids);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index c03205991718..ff7d3b0124f1 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -12,9 +12,13 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/io.h> 13#include <linux/io.h>
14#include <linux/mm.h> 14#include <linux/mm.h>
15#include <linux/module.h>
15 16
16#include <asm/setup.h> 17#include <asm/setup.h>
17 18
19struct dentry *arch_debugfs_dir;
20EXPORT_SYMBOL(arch_debugfs_dir);
21
18#ifdef CONFIG_DEBUG_BOOT_PARAMS 22#ifdef CONFIG_DEBUG_BOOT_PARAMS
19struct setup_data_node { 23struct setup_data_node {
20 u64 paddr; 24 u64 paddr;
@@ -135,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent)
135 if (PageHighMem(pg)) { 139 if (PageHighMem(pg)) {
136 data = ioremap_cache(pa_data, sizeof(*data)); 140 data = ioremap_cache(pa_data, sizeof(*data));
137 if (!data) { 141 if (!data) {
142 kfree(node);
138 error = -ENXIO; 143 error = -ENXIO;
139 goto err_dir; 144 goto err_dir;
140 } 145 }
@@ -209,6 +214,10 @@ static int __init arch_kdebugfs_init(void)
209{ 214{
210 int error = 0; 215 int error = 0;
211 216
217 arch_debugfs_dir = debugfs_create_dir("x86", NULL);
218 if (!arch_debugfs_dir)
219 return -ENOMEM;
220
212#ifdef CONFIG_DEBUG_BOOT_PARAMS 221#ifdef CONFIG_DEBUG_BOOT_PARAMS
213 error = boot_params_kdebugfs_init(); 222 error = boot_params_kdebugfs_init();
214#endif 223#endif
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index f47f0eb886b8..10435a120d22 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -69,6 +69,9 @@ static int gdb_x86vector = -1;
69 */ 69 */
70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) 70void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
71{ 71{
72#ifndef CONFIG_X86_32
73 u32 *gdb_regs32 = (u32 *)gdb_regs;
74#endif
72 gdb_regs[GDB_AX] = regs->ax; 75 gdb_regs[GDB_AX] = regs->ax;
73 gdb_regs[GDB_BX] = regs->bx; 76 gdb_regs[GDB_BX] = regs->bx;
74 gdb_regs[GDB_CX] = regs->cx; 77 gdb_regs[GDB_CX] = regs->cx;
@@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
76 gdb_regs[GDB_SI] = regs->si; 79 gdb_regs[GDB_SI] = regs->si;
77 gdb_regs[GDB_DI] = regs->di; 80 gdb_regs[GDB_DI] = regs->di;
78 gdb_regs[GDB_BP] = regs->bp; 81 gdb_regs[GDB_BP] = regs->bp;
79 gdb_regs[GDB_PS] = regs->flags;
80 gdb_regs[GDB_PC] = regs->ip; 82 gdb_regs[GDB_PC] = regs->ip;
81#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
84 gdb_regs[GDB_PS] = regs->flags;
82 gdb_regs[GDB_DS] = regs->ds; 85 gdb_regs[GDB_DS] = regs->ds;
83 gdb_regs[GDB_ES] = regs->es; 86 gdb_regs[GDB_ES] = regs->es;
84 gdb_regs[GDB_CS] = regs->cs; 87 gdb_regs[GDB_CS] = regs->cs;
@@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
94 gdb_regs[GDB_R13] = regs->r13; 97 gdb_regs[GDB_R13] = regs->r13;
95 gdb_regs[GDB_R14] = regs->r14; 98 gdb_regs[GDB_R14] = regs->r14;
96 gdb_regs[GDB_R15] = regs->r15; 99 gdb_regs[GDB_R15] = regs->r15;
100 gdb_regs32[GDB_PS] = regs->flags;
101 gdb_regs32[GDB_CS] = regs->cs;
102 gdb_regs32[GDB_SS] = regs->ss;
97#endif 103#endif
98 gdb_regs[GDB_SP] = regs->sp; 104 gdb_regs[GDB_SP] = regs->sp;
99} 105}
@@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
112 */ 118 */
113void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) 119void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
114{ 120{
121#ifndef CONFIG_X86_32
122 u32 *gdb_regs32 = (u32 *)gdb_regs;
123#endif
115 gdb_regs[GDB_AX] = 0; 124 gdb_regs[GDB_AX] = 0;
116 gdb_regs[GDB_BX] = 0; 125 gdb_regs[GDB_BX] = 0;
117 gdb_regs[GDB_CX] = 0; 126 gdb_regs[GDB_CX] = 0;
@@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
129 gdb_regs[GDB_FS] = 0xFFFF; 138 gdb_regs[GDB_FS] = 0xFFFF;
130 gdb_regs[GDB_GS] = 0xFFFF; 139 gdb_regs[GDB_GS] = 0xFFFF;
131#else 140#else
132 gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); 141 gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8);
133 gdb_regs[GDB_PC] = 0; 142 gdb_regs32[GDB_CS] = __KERNEL_CS;
143 gdb_regs32[GDB_SS] = __KERNEL_DS;
144 gdb_regs[GDB_PC] = p->thread.ip;
134 gdb_regs[GDB_R8] = 0; 145 gdb_regs[GDB_R8] = 0;
135 gdb_regs[GDB_R9] = 0; 146 gdb_regs[GDB_R9] = 0;
136 gdb_regs[GDB_R10] = 0; 147 gdb_regs[GDB_R10] = 0;
@@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
153 */ 164 */
154void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) 165void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
155{ 166{
167#ifndef CONFIG_X86_32
168 u32 *gdb_regs32 = (u32 *)gdb_regs;
169#endif
156 regs->ax = gdb_regs[GDB_AX]; 170 regs->ax = gdb_regs[GDB_AX];
157 regs->bx = gdb_regs[GDB_BX]; 171 regs->bx = gdb_regs[GDB_BX];
158 regs->cx = gdb_regs[GDB_CX]; 172 regs->cx = gdb_regs[GDB_CX];
@@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
160 regs->si = gdb_regs[GDB_SI]; 174 regs->si = gdb_regs[GDB_SI];
161 regs->di = gdb_regs[GDB_DI]; 175 regs->di = gdb_regs[GDB_DI];
162 regs->bp = gdb_regs[GDB_BP]; 176 regs->bp = gdb_regs[GDB_BP];
163 regs->flags = gdb_regs[GDB_PS];
164 regs->ip = gdb_regs[GDB_PC]; 177 regs->ip = gdb_regs[GDB_PC];
165#ifdef CONFIG_X86_32 178#ifdef CONFIG_X86_32
179 regs->flags = gdb_regs[GDB_PS];
166 regs->ds = gdb_regs[GDB_DS]; 180 regs->ds = gdb_regs[GDB_DS];
167 regs->es = gdb_regs[GDB_ES]; 181 regs->es = gdb_regs[GDB_ES];
168 regs->cs = gdb_regs[GDB_CS]; 182 regs->cs = gdb_regs[GDB_CS];
@@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
175 regs->r13 = gdb_regs[GDB_R13]; 189 regs->r13 = gdb_regs[GDB_R13];
176 regs->r14 = gdb_regs[GDB_R14]; 190 regs->r14 = gdb_regs[GDB_R14];
177 regs->r15 = gdb_regs[GDB_R15]; 191 regs->r15 = gdb_regs[GDB_R15];
192 regs->flags = gdb_regs32[GDB_PS];
193 regs->cs = gdb_regs32[GDB_CS];
194 regs->ss = gdb_regs32[GDB_SS];
178#endif 195#endif
179} 196}
180 197
@@ -378,10 +395,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
378 if (remcomInBuffer[0] == 's') { 395 if (remcomInBuffer[0] == 's') {
379 linux_regs->flags |= X86_EFLAGS_TF; 396 linux_regs->flags |= X86_EFLAGS_TF;
380 kgdb_single_step = 1; 397 kgdb_single_step = 1;
381 if (kgdb_contthread) { 398 atomic_set(&kgdb_cpu_doing_single_step,
382 atomic_set(&kgdb_cpu_doing_single_step, 399 raw_smp_processor_id());
383 raw_smp_processor_id());
384 }
385 } 400 }
386 401
387 get_debugreg(dr6, 6); 402 get_debugreg(dr6, 6);
@@ -440,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
440 return NOTIFY_DONE; 455 return NOTIFY_DONE;
441 456
442 case DIE_NMI_IPI: 457 case DIE_NMI_IPI:
443 if (atomic_read(&kgdb_active) != -1) { 458 /* Just ignore, we will handle the roundup on DIE_NMI. */
444 /* KGDB CPU roundup */
445 kgdb_nmicallback(raw_smp_processor_id(), regs);
446 was_in_debug_nmi[raw_smp_processor_id()] = 1;
447 touch_nmi_watchdog();
448 }
449 return NOTIFY_DONE; 459 return NOTIFY_DONE;
450 460
451 case DIE_NMIUNKNOWN: 461 case DIE_NMIUNKNOWN:
@@ -466,9 +476,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
466 476
467 case DIE_DEBUG: 477 case DIE_DEBUG:
468 if (atomic_read(&kgdb_cpu_doing_single_step) == 478 if (atomic_read(&kgdb_cpu_doing_single_step) ==
469 raw_smp_processor_id() && 479 raw_smp_processor_id()) {
470 user_mode(regs)) 480 if (user_mode(regs))
471 return single_step_cont(regs, args); 481 return single_step_cont(regs, args);
482 break;
483 } else if (test_thread_flag(TIF_SINGLESTEP))
484 /* This means a user thread is single stepping
485 * a system call which should be ignored
486 */
487 return NOTIFY_DONE;
472 /* fall through */ 488 /* fall through */
473 default: 489 default:
474 if (user_mode(regs)) 490 if (user_mode(regs))
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index b8c6743a13da..6c27679ec6aa 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
431 regs->ip = (unsigned long)p->ainsn.insn; 431 regs->ip = (unsigned long)p->ainsn.insn;
432} 432}
433 433
434/* Called with kretprobe_lock held */
435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 434void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
436 struct pt_regs *regs) 435 struct pt_regs *regs)
437{ 436{
@@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
682 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; 681 unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline;
683 682
684 INIT_HLIST_HEAD(&empty_rp); 683 INIT_HLIST_HEAD(&empty_rp);
685 spin_lock_irqsave(&kretprobe_lock, flags); 684 kretprobe_hash_lock(current, &head, &flags);
686 head = kretprobe_inst_table_head(current);
687 /* fixup registers */ 685 /* fixup registers */
688#ifdef CONFIG_X86_64 686#ifdef CONFIG_X86_64
689 regs->cs = __KERNEL_CS; 687 regs->cs = __KERNEL_CS;
@@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
732 730
733 kretprobe_assert(ri, orig_ret_address, trampoline_address); 731 kretprobe_assert(ri, orig_ret_address, trampoline_address);
734 732
735 spin_unlock_irqrestore(&kretprobe_lock, flags); 733 kretprobe_hash_unlock(current, &flags);
736 734
737 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 735 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
738 hlist_del(&ri->hlist); 736 hlist_del(&ri->hlist);
@@ -860,7 +858,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs)
860 858
861 resume_execution(cur, regs, kcb); 859 resume_execution(cur, regs, kcb);
862 regs->flags |= kcb->kprobe_saved_flags; 860 regs->flags |= kcb->kprobe_saved_flags;
863 trace_hardirqs_fixup_flags(regs->flags);
864 861
865 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { 862 if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) {
866 kcb->kprobe_status = KPROBE_HIT_SSDONE; 863 kcb->kprobe_status = KPROBE_HIT_SSDONE;
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8b7a3cf37d2b..478bca986eca 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -178,7 +178,7 @@ static void kvm_flush_tlb(void)
178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb); 178 kvm_deferred_mmu_op(&ftlb, sizeof ftlb);
179} 179}
180 180
181static void kvm_release_pt(u32 pfn) 181static void kvm_release_pt(unsigned long pfn)
182{ 182{
183 struct kvm_mmu_op_release_pt rpt = { 183 struct kvm_mmu_op_release_pt rpt = {
184 .header.op = KVM_MMU_OP_RELEASE_PT, 184 .header.op = KVM_MMU_OP_RELEASE_PT,
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 87edf1ceb1df..774ac4991568 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void)
78 return ret; 78 return ret;
79} 79}
80 80
81/*
82 * If we don't do that, there is the possibility that the guest
83 * will calibrate under heavy load - thus, getting a lower lpj -
84 * and execute the delays themselves without load. This is wrong,
85 * because no delay loop can finish beforehand.
86 * Any heuristic is subject to failure, because ultimately a large
87 * pool of guests can be running and trouble each other. So we preset
88 * lpj here
89 */
90static unsigned long kvm_get_tsc_khz(void)
91{
92 return preset_lpj;
93}
94
95static void kvm_get_preset_lpj(void)
96{
97 struct pvclock_vcpu_time_info *src;
98 unsigned long khz;
99 u64 lpj;
100
101 src = &per_cpu(hv_clock, 0);
102 khz = pvclock_tsc_khz(src);
103
104 lpj = ((u64)khz * 1000);
105 do_div(lpj, HZ);
106 preset_lpj = lpj;
107}
108
81static struct clocksource kvm_clock = { 109static struct clocksource kvm_clock = {
82 .name = "kvm-clock", 110 .name = "kvm-clock",
83 .read = kvm_clock_read, 111 .read = kvm_clock_read,
@@ -113,7 +141,7 @@ static void kvm_setup_secondary_clock(void)
113#endif 141#endif
114 142
115#ifdef CONFIG_SMP 143#ifdef CONFIG_SMP
116void __init kvm_smp_prepare_boot_cpu(void) 144static void __init kvm_smp_prepare_boot_cpu(void)
117{ 145{
118 WARN_ON(kvm_register_clock("primary cpu clock")); 146 WARN_ON(kvm_register_clock("primary cpu clock"));
119 native_smp_prepare_boot_cpu(); 147 native_smp_prepare_boot_cpu();
@@ -153,6 +181,7 @@ void __init kvmclock_init(void)
153 pv_time_ops.get_wallclock = kvm_get_wallclock; 181 pv_time_ops.get_wallclock = kvm_get_wallclock;
154 pv_time_ops.set_wallclock = kvm_set_wallclock; 182 pv_time_ops.set_wallclock = kvm_set_wallclock;
155 pv_time_ops.sched_clock = kvm_clock_read; 183 pv_time_ops.sched_clock = kvm_clock_read;
184 pv_time_ops.get_tsc_khz = kvm_get_tsc_khz;
156#ifdef CONFIG_X86_LOCAL_APIC 185#ifdef CONFIG_X86_LOCAL_APIC
157 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; 186 pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock;
158#endif 187#endif
@@ -163,6 +192,7 @@ void __init kvmclock_init(void)
163#ifdef CONFIG_KEXEC 192#ifdef CONFIG_KEXEC
164 machine_ops.crash_shutdown = kvm_crash_shutdown; 193 machine_ops.crash_shutdown = kvm_crash_shutdown;
165#endif 194#endif
195 kvm_get_preset_lpj();
166 clocksource_register(&kvm_clock); 196 clocksource_register(&kvm_clock);
167 } 197 }
168} 198}
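kvm_get_preset_lpj() above derives loops-per-jiffy from the TSC frequency the host reports instead of letting the guest calibrate it, so a guest booted under heavy load cannot end up with an lpj that is too small. The arithmetic is just khz * 1000 / HZ; a standalone worked example with assumed numbers:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned long khz = 2400000;		/* assumed: host reports 2.4 GHz */
	unsigned long hz  = 1000;		/* assumed CONFIG_HZ=1000        */
	uint64_t lpj = (uint64_t)khz * 1000 / hz;

	/* 2,400,000,000 cycles/s split into 1000 jiffies -> 2,400,000 */
	printf("preset lpj = %llu\n", (unsigned long long)lpj);
	return 0;
}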
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a8449571858a..eee32b43fee3 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -18,6 +18,7 @@
18#include <asm/ldt.h> 18#include <asm/ldt.h>
19#include <asm/desc.h> 19#include <asm/desc.h>
20#include <asm/mmu_context.h> 20#include <asm/mmu_context.h>
21#include <asm/syscalls.h>
21 22
22#ifdef CONFIG_SMP 23#ifdef CONFIG_SMP
23static void flush_ldt(void *current_mm) 24static void flush_ldt(void *current_mm)
@@ -51,6 +52,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
51 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, 52 memset(newldt + oldsize * LDT_ENTRY_SIZE, 0,
52 (mincount - oldsize) * LDT_ENTRY_SIZE); 53 (mincount - oldsize) * LDT_ENTRY_SIZE);
53 54
55 paravirt_alloc_ldt(newldt, mincount);
56
54#ifdef CONFIG_X86_64 57#ifdef CONFIG_X86_64
55 /* CHECKME: Do we really need this ? */ 58 /* CHECKME: Do we really need this ? */
56 wmb(); 59 wmb();
@@ -62,12 +65,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
62 65
63 if (reload) { 66 if (reload) {
64#ifdef CONFIG_SMP 67#ifdef CONFIG_SMP
65 cpumask_t mask;
66
67 preempt_disable(); 68 preempt_disable();
68 load_LDT(pc); 69 load_LDT(pc);
69 mask = cpumask_of_cpu(smp_processor_id()); 70 if (!cpus_equal(current->mm->cpu_vm_mask,
70 if (!cpus_equal(current->mm->cpu_vm_mask, mask)) 71 cpumask_of_cpu(smp_processor_id())))
71 smp_call_function(flush_ldt, current->mm, 1); 72 smp_call_function(flush_ldt, current->mm, 1);
72 preempt_enable(); 73 preempt_enable();
73#else 74#else
@@ -75,6 +76,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
75#endif 76#endif
76 } 77 }
77 if (oldsize) { 78 if (oldsize) {
79 paravirt_free_ldt(oldldt, oldsize);
78 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) 80 if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
79 vfree(oldldt); 81 vfree(oldldt);
80 else 82 else
@@ -86,10 +88,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
86static inline int copy_ldt(mm_context_t *new, mm_context_t *old) 88static inline int copy_ldt(mm_context_t *new, mm_context_t *old)
87{ 89{
88 int err = alloc_ldt(new, old->size, 0); 90 int err = alloc_ldt(new, old->size, 0);
91 int i;
89 92
90 if (err < 0) 93 if (err < 0)
91 return err; 94 return err;
92 memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); 95
96 for(i = 0; i < old->size; i++)
97 write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE);
93 return 0; 98 return 0;
94} 99}
95 100
@@ -126,6 +131,7 @@ void destroy_context(struct mm_struct *mm)
126 if (mm == current->active_mm) 131 if (mm == current->active_mm)
127 clear_LDT(); 132 clear_LDT();
128#endif 133#endif
134 paravirt_free_ldt(mm->context.ldt, mm->context.size);
129 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) 135 if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
130 vfree(mm->context.ldt); 136 vfree(mm->context.ldt);
131 else 137 else
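copy_ldt() now installs every descriptor through write_ldt_entry() instead of one bulk memcpy, so a paravirt backend gets to see (and, e.g. under Xen, validate) each entry as it is written. A condensed sketch of the pattern, with the callback shape as an assumption:

#define DESC_SIZE 8	/* one descriptor, like LDT_ENTRY_SIZE on x86 */

/* Copy descriptors one at a time so a per-entry hook can inspect each. */
static int copy_descriptors(void *dst, const char *src, int count,
			    void (*write_entry)(void *ldt, int idx,
						const void *desc))
{
	int i;

	for (i = 0; i < count; i++)
		write_entry(dst, i, src + i * DESC_SIZE);
	return 0;
}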
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index 8864230d55af..0732adba05ca 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -12,6 +12,7 @@
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/numa.h> 13#include <linux/numa.h>
14#include <linux/ftrace.h> 14#include <linux/ftrace.h>
15#include <linux/suspend.h>
15 16
16#include <asm/pgtable.h> 17#include <asm/pgtable.h>
17#include <asm/pgalloc.h> 18#include <asm/pgalloc.h>
@@ -22,6 +23,7 @@
22#include <asm/cpufeature.h> 23#include <asm/cpufeature.h>
23#include <asm/desc.h> 24#include <asm/desc.h>
24#include <asm/system.h> 25#include <asm/system.h>
26#include <asm/cacheflush.h>
25 27
26#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) 28#define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE)))
27static u32 kexec_pgd[1024] PAGE_ALIGNED; 29static u32 kexec_pgd[1024] PAGE_ALIGNED;
@@ -77,7 +79,7 @@ static void load_segments(void)
77/* 79/*
78 * An architecture hook called to validate the 80 * An architecture hook called to validate the
79 * proposed image and prepare the control pages 81 * proposed image and prepare the control pages
80 * as needed. The pages for KEXEC_CONTROL_CODE_SIZE 82 * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE
81 * have been allocated, but the segments have not yet 83 * have been allocated, but the segments have not yet
82 * been copied into the kernel. 84 * been copied into the kernel.
83 * 85 *
@@ -85,10 +87,12 @@ static void load_segments(void)
85 * reboot code buffer to allow us to avoid allocations 87 * reboot code buffer to allow us to avoid allocations
86 * later. 88 * later.
87 * 89 *
88 * Currently nothing. 90 * Make control page executable.
89 */ 91 */
90int machine_kexec_prepare(struct kimage *image) 92int machine_kexec_prepare(struct kimage *image)
91{ 93{
94 if (nx_enabled)
95 set_pages_x(image->control_code_page, 1);
92 return 0; 96 return 0;
93} 97}
94 98
@@ -98,27 +102,54 @@ int machine_kexec_prepare(struct kimage *image)
98 */ 102 */
99void machine_kexec_cleanup(struct kimage *image) 103void machine_kexec_cleanup(struct kimage *image)
100{ 104{
105 if (nx_enabled)
106 set_pages_nx(image->control_code_page, 1);
101} 107}
102 108
103/* 109/*
104 * Do not allocate memory (or fail in any way) in machine_kexec(). 110 * Do not allocate memory (or fail in any way) in machine_kexec().
105 * We are past the point of no return, committed to rebooting now. 111 * We are past the point of no return, committed to rebooting now.
106 */ 112 */
107NORET_TYPE void machine_kexec(struct kimage *image) 113void machine_kexec(struct kimage *image)
108{ 114{
109 unsigned long page_list[PAGES_NR]; 115 unsigned long page_list[PAGES_NR];
110 void *control_page; 116 void *control_page;
117 int save_ftrace_enabled;
118 asmlinkage unsigned long
119 (*relocate_kernel_ptr)(unsigned long indirection_page,
120 unsigned long control_page,
121 unsigned long start_address,
122 unsigned int has_pae,
123 unsigned int preserve_context);
124
125#ifdef CONFIG_KEXEC_JUMP
126 if (kexec_image->preserve_context)
127 save_processor_state();
128#endif
111 129
112 tracer_disable(); 130 save_ftrace_enabled = __ftrace_enabled_save();
113 131
114 /* Interrupts aren't acceptable while we reboot */ 132 /* Interrupts aren't acceptable while we reboot */
115 local_irq_disable(); 133 local_irq_disable();
116 134
135 if (image->preserve_context) {
136#ifdef CONFIG_X86_IO_APIC
137 /* We need to put APICs in legacy mode so that we can
138 * get timer interrupts in second kernel. kexec/kdump
139 * paths already have calls to disable_IO_APIC() in
140 * one form or other. kexec jump path also need
141 * one.
142 */
143 disable_IO_APIC();
144#endif
145 }
146
117 control_page = page_address(image->control_code_page); 147 control_page = page_address(image->control_code_page);
118 memcpy(control_page, relocate_kernel, PAGE_SIZE); 148 memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE);
119 149
150 relocate_kernel_ptr = control_page;
120 page_list[PA_CONTROL_PAGE] = __pa(control_page); 151 page_list[PA_CONTROL_PAGE] = __pa(control_page);
121 page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; 152 page_list[VA_CONTROL_PAGE] = (unsigned long)control_page;
122 page_list[PA_PGD] = __pa(kexec_pgd); 153 page_list[PA_PGD] = __pa(kexec_pgd);
123 page_list[VA_PGD] = (unsigned long)kexec_pgd; 154 page_list[VA_PGD] = (unsigned long)kexec_pgd;
124#ifdef CONFIG_X86_PAE 155#ifdef CONFIG_X86_PAE
@@ -131,6 +162,7 @@ NORET_TYPE void machine_kexec(struct kimage *image)
131 page_list[VA_PTE_0] = (unsigned long)kexec_pte0; 162 page_list[VA_PTE_0] = (unsigned long)kexec_pte0;
132 page_list[PA_PTE_1] = __pa(kexec_pte1); 163 page_list[PA_PTE_1] = __pa(kexec_pte1);
133 page_list[VA_PTE_1] = (unsigned long)kexec_pte1; 164 page_list[VA_PTE_1] = (unsigned long)kexec_pte1;
165 page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT);
134 166
135 /* The segment registers are funny things, they have both a 167 /* The segment registers are funny things, they have both a
136 * visible and an invisible part. Whenever the visible part is 168 * visible and an invisible part. Whenever the visible part is
@@ -149,8 +181,17 @@ NORET_TYPE void machine_kexec(struct kimage *image)
149 set_idt(phys_to_virt(0),0); 181 set_idt(phys_to_virt(0),0);
150 182
151 /* now call it */ 183 /* now call it */
152 relocate_kernel((unsigned long)image->head, (unsigned long)page_list, 184 image->start = relocate_kernel_ptr((unsigned long)image->head,
153 image->start, cpu_has_pae); 185 (unsigned long)page_list,
186 image->start, cpu_has_pae,
187 image->preserve_context);
188
189#ifdef CONFIG_KEXEC_JUMP
190 if (kexec_image->preserve_context)
191 restore_processor_state();
192#endif
193
194 __ftrace_enabled_restore(save_ftrace_enabled);
154} 195}
155 196
156void arch_crash_save_vmcoreinfo(void) 197void arch_crash_save_vmcoreinfo(void)
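Besides marking the control page executable and letting relocate_kernel() return an entry point for the kexec-jump case, the hunk above passes the swap page by physical address: page_to_pfn(page) << PAGE_SHIFT is the page frame number scaled back to bytes, i.e. what page_to_phys() computes. A one-function sketch of that conversion:

#include <linux/mm.h>

/* Sketch: struct page -> physical address, as used for PA_SWAP_PAGE.
 * (Ignores >4GB PAE addresses, where unsigned long would truncate.) */
static unsigned long page_phys_addr(struct page *page)
{
	return page_to_pfn(page) << PAGE_SHIFT;	/* pfn * page size */
}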
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 9dd9262693a3..c43caa3a91f3 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image)
181 * Do not allocate memory (or fail in any way) in machine_kexec(). 181 * Do not allocate memory (or fail in any way) in machine_kexec().
182 * We are past the point of no return, committed to rebooting now. 182 * We are past the point of no return, committed to rebooting now.
183 */ 183 */
184NORET_TYPE void machine_kexec(struct kimage *image) 184void machine_kexec(struct kimage *image)
185{ 185{
186 unsigned long page_list[PAGES_NR]; 186 unsigned long page_list[PAGES_NR];
187 void *control_page; 187 void *control_page;
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
index 07c0f828f488..3b599518c322 100644
--- a/arch/x86/kernel/mfgpt_32.c
+++ b/arch/x86/kernel/mfgpt_32.c
@@ -33,6 +33,8 @@
33#include <linux/module.h> 33#include <linux/module.h>
34#include <asm/geode.h> 34#include <asm/geode.h>
35 35
36#define MFGPT_DEFAULT_IRQ 7
37
36static struct mfgpt_timer_t { 38static struct mfgpt_timer_t {
37 unsigned int avail:1; 39 unsigned int avail:1;
38} mfgpt_timers[MFGPT_MAX_TIMERS]; 40} mfgpt_timers[MFGPT_MAX_TIMERS];
@@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
157} 159}
158EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); 160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
159 161
160int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) 162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
161{ 163{
162 u32 val, dummy; 164 u32 zsel, lpc, dummy;
163 int offset; 165 int shift;
164 166
165 if (timer < 0 || timer >= MFGPT_MAX_TIMERS) 167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
166 return -EIO; 168 return -EIO;
167 169
168 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) 170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen when forcing an IRQ; calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
169 return -EIO; 181 return -EIO;
170 182
171 rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); 183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
172 188
173 offset = (timer % 4) * 4; 189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
174 190 if (*irq < 1 || *irq == 2 || *irq > 15)
175 val &= ~((0xF << offset) | (0xF << (offset + 16))); 191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
176 195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
177 if (enable) { 199 if (enable) {
178 val |= (irq & 0x0F) << (offset); 200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
179 val |= (irq & 0x0F) << (offset + 16); 201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
180 } 202 }
181 203
182 wrmsr(MSR_PIC_ZSEL_LOW, val, dummy);
183 return 0; 204 return 0;
184} 205}
185 206
@@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
242static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; 263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
243static u16 mfgpt_event_clock; 264static u16 mfgpt_event_clock;
244 265
245static int irq = 7; 266static int irq;
246static int __init mfgpt_setup(char *str) 267static int __init mfgpt_setup(char *str)
247{ 268{
248 get_option(&str, &irq); 269 get_option(&str, &irq);
@@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void)
346 mfgpt_event_clock = timer; 367 mfgpt_event_clock = timer;
347 368
348 /* Set up the IRQ on the MFGPT side */ 369 /* Set up the IRQ on the MFGPT side */
349 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { 370 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
350 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); 371 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
351 return -EIO; 372 return -EIO;
352 } 373 }
@@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void)
374 &mfgpt_clockevent); 395 &mfgpt_clockevent);
375 396
376 printk(KERN_INFO 397 printk(KERN_INFO
377 "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); 398 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
399 timer, irq);
378 clockevents_register_device(&mfgpt_clockevent); 400 clockevents_register_device(&mfgpt_clockevent);
379 401
380 return 0; 402 return 0;
381 403
382err: 404err:
383 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); 405 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
384 printk(KERN_ERR 406 printk(KERN_ERR
385 "mfgpt-timer: Unable to set up the MFGPT clock source\n"); 407 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
386 return -EIO; 408 return -EIO;
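The IRQ-routing rework above addresses a 4-bit field inside MSR_PIC_ZSEL_LOW per timer/comparator pair, at shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4, and refuses the slot when that field already reads 2 (owned by VSA). A small standalone example of the field arithmetic, with assumed timer and IRQ values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	int timer = 5, use_cmp2 = 1, irq = 7;			/* assumed values   */
	int shift = ((use_cmp2 ? 4 : 0) + timer % 4) * 4;	/* (4 + 1) * 4 = 20 */
	uint32_t zsel = 0;			/* pretend MSR_PIC_ZSEL_LOW reads 0 */

	/* A field value of 2 would mean VSA owns the line, so hands off. */
	if (((zsel >> shift) & 0xF) == 2)
		return 1;

	zsel = (zsel & ~(0xFu << shift)) | ((uint32_t)irq << shift);
	printf("shift = %d, new ZSEL = %#010x\n", shift, (unsigned)zsel);	/* 20, 0x00700000 */
	return 0;
}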
diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c
deleted file mode 100644
index 56b933119a04..000000000000
--- a/arch/x86/kernel/microcode.c
+++ /dev/null
@@ -1,851 +0,0 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows to upgrade microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to speculative
59 * nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73
74//#define DEBUG /* pr_debug */
75#include <linux/capability.h>
76#include <linux/kernel.h>
77#include <linux/init.h>
78#include <linux/sched.h>
79#include <linux/smp_lock.h>
80#include <linux/cpumask.h>
81#include <linux/module.h>
82#include <linux/slab.h>
83#include <linux/vmalloc.h>
84#include <linux/miscdevice.h>
85#include <linux/spinlock.h>
86#include <linux/mm.h>
87#include <linux/fs.h>
88#include <linux/mutex.h>
89#include <linux/cpu.h>
90#include <linux/firmware.h>
91#include <linux/platform_device.h>
92
93#include <asm/msr.h>
94#include <asm/uaccess.h>
95#include <asm/processor.h>
96
97MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver");
98MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
99MODULE_LICENSE("GPL");
100
101#define MICROCODE_VERSION "1.14a"
102
103#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */
104#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */
105#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */
106#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */
107#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */
108#define DWSIZE (sizeof (u32))
109#define get_totalsize(mc) \
110 (((microcode_t *)mc)->hdr.totalsize ? \
111 ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE)
112#define get_datasize(mc) \
113 (((microcode_t *)mc)->hdr.datasize ? \
114 ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
115
116#define sigmatch(s1, s2, p1, p2) \
117 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
118
119#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
120
121/* serialize access to the physical write to MSR 0x79 */
122static DEFINE_SPINLOCK(microcode_update_lock);
123
124/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
125static DEFINE_MUTEX(microcode_mutex);
126
127static struct ucode_cpu_info {
128 int valid;
129 unsigned int sig;
130 unsigned int pf;
131 unsigned int rev;
132 microcode_t *mc;
133} ucode_cpu_info[NR_CPUS];
134
135static void collect_cpu_info(int cpu_num)
136{
137 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
138 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
139 unsigned int val[2];
140
141 /* We should bind the task to the CPU */
142 BUG_ON(raw_smp_processor_id() != cpu_num);
143 uci->pf = uci->rev = 0;
144 uci->mc = NULL;
145 uci->valid = 1;
146
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
150 "processor\n", cpu_num);
151 uci->valid = 0;
152 return;
153 }
154
155 uci->sig = cpuid_eax(0x00000001);
156
157 if ((c->x86_model >= 5) || (c->x86 > 6)) {
158 /* get processor flags from MSR 0x17 */
159 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
160 uci->pf = 1 << ((val[1] >> 18) & 7);
161 }
162
163 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
164 /* see notes above for revision 1.07. Apparent chip bug */
165 sync_core();
166 /* get the current revision from MSR 0x8B */
167 rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev);
168 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
169 uci->sig, uci->pf, uci->rev);
170}
171
172static inline int microcode_update_match(int cpu_num,
173 microcode_header_t *mc_header, int sig, int pf)
174{
175 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
176
177 if (!sigmatch(sig, uci->sig, pf, uci->pf)
178 || mc_header->rev <= uci->rev)
179 return 0;
180 return 1;
181}
182
183static int microcode_sanity_check(void *mc)
184{
185 microcode_header_t *mc_header = mc;
186 struct extended_sigtable *ext_header = NULL;
187 struct extended_signature *ext_sig;
188 unsigned long total_size, data_size, ext_table_size;
189 int sum, orig_sum, ext_sigcount = 0, i;
190
191 total_size = get_totalsize(mc_header);
192 data_size = get_datasize(mc_header);
193 if (data_size + MC_HEADER_SIZE > total_size) {
194 printk(KERN_ERR "microcode: error! "
195 "Bad data size in microcode data file\n");
196 return -EINVAL;
197 }
198
199 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
200 printk(KERN_ERR "microcode: error! "
201 "Unknown microcode update format\n");
202 return -EINVAL;
203 }
204 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
205 if (ext_table_size) {
206 if ((ext_table_size < EXT_HEADER_SIZE)
207 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
208 printk(KERN_ERR "microcode: error! "
209 "Small exttable size in microcode data file\n");
210 return -EINVAL;
211 }
212 ext_header = mc + MC_HEADER_SIZE + data_size;
213 if (ext_table_size != exttable_size(ext_header)) {
214 printk(KERN_ERR "microcode: error! "
215 "Bad exttable size in microcode data file\n");
216 return -EFAULT;
217 }
218 ext_sigcount = ext_header->count;
219 }
220
221 /* check extended table checksum */
222 if (ext_table_size) {
223 int ext_table_sum = 0;
224 int *ext_tablep = (int *)ext_header;
225
226 i = ext_table_size / DWSIZE;
227 while (i--)
228 ext_table_sum += ext_tablep[i];
229 if (ext_table_sum) {
230 printk(KERN_WARNING "microcode: aborting, "
231 "bad extended signature table checksum\n");
232 return -EINVAL;
233 }
234 }
235
236 /* calculate the checksum */
237 orig_sum = 0;
238 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
239 while (i--)
240 orig_sum += ((int *)mc)[i];
241 if (orig_sum) {
242 printk(KERN_ERR "microcode: aborting, bad checksum\n");
243 return -EINVAL;
244 }
245 if (!ext_table_size)
246 return 0;
247 /* check extended signature checksum */
248 for (i = 0; i < ext_sigcount; i++) {
249 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
250 EXT_SIGNATURE_SIZE * i;
251 sum = orig_sum
252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
254 if (sum) {
255 printk(KERN_ERR "microcode: aborting, bad checksum\n");
256 return -EINVAL;
257 }
258 }
259 return 0;
260}
261
262/*
263 * return 0 - no update found
264 * return 1 - found update
265 * return < 0 - error
266 */
267static int get_maching_microcode(void *mc, int cpu)
268{
269 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
270 microcode_header_t *mc_header = mc;
271 struct extended_sigtable *ext_header;
272 unsigned long total_size = get_totalsize(mc_header);
273 int ext_sigcount, i;
274 struct extended_signature *ext_sig;
275 void *new_mc;
276
277 if (microcode_update_match(cpu, mc_header,
278 mc_header->sig, mc_header->pf))
279 goto find;
280
281 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
282 return 0;
283
284 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
285 ext_sigcount = ext_header->count;
286 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
287 for (i = 0; i < ext_sigcount; i++) {
288 if (microcode_update_match(cpu, mc_header,
289 ext_sig->sig, ext_sig->pf))
290 goto find;
291 ext_sig++;
292 }
293 return 0;
294find:
295 pr_debug("microcode: CPU%d found a matching microcode update with"
296 " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev);
297 new_mc = vmalloc(total_size);
298 if (!new_mc) {
299 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
300 return -ENOMEM;
301 }
302
303 /* free previous update file */
304 vfree(uci->mc);
305
306 memcpy(new_mc, mc, total_size);
307 uci->mc = new_mc;
308 return 1;
309}
310
311static void apply_microcode(int cpu)
312{
313 unsigned long flags;
314 unsigned int val[2];
315 int cpu_num = raw_smp_processor_id();
316 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
317
318 /* We should bind the task to the CPU */
319 BUG_ON(cpu_num != cpu);
320
321 if (uci->mc == NULL)
322 return;
323
324 /* serialize access to the physical write to MSR 0x79 */
325 spin_lock_irqsave(&microcode_update_lock, flags);
326
327 /* write microcode via MSR 0x79 */
328 wrmsr(MSR_IA32_UCODE_WRITE,
329 (unsigned long) uci->mc->bits,
330 (unsigned long) uci->mc->bits >> 16 >> 16);
331 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
332
333 /* see notes above for revision 1.07. Apparent chip bug */
334 sync_core();
335
336 /* get the current revision from MSR 0x8B */
337 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
338
339 spin_unlock_irqrestore(&microcode_update_lock, flags);
340 if (val[1] != uci->mc->hdr.rev) {
341 printk(KERN_ERR "microcode: CPU%d update from revision "
342 "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]);
343 return;
344 }
345 printk(KERN_INFO "microcode: CPU%d updated from revision "
346 "0x%x to 0x%x, date = %08x \n",
347 cpu_num, uci->rev, val[1], uci->mc->hdr.date);
348 uci->rev = val[1];
349}
350
351#ifdef CONFIG_MICROCODE_OLD_INTERFACE
352static void __user *user_buffer; /* user area microcode data buffer */
353static unsigned int user_buffer_size; /* it's size */
354
355static long get_next_ucode(void **mc, long offset)
356{
357 microcode_header_t mc_header;
358 unsigned long total_size;
359
360 /* No more data */
361 if (offset >= user_buffer_size)
362 return 0;
363 if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) {
364 printk(KERN_ERR "microcode: error! Can not read user data\n");
365 return -EFAULT;
366 }
367 total_size = get_totalsize(&mc_header);
368 if (offset + total_size > user_buffer_size) {
369 printk(KERN_ERR "microcode: error! Bad total size in microcode "
370 "data file\n");
371 return -EINVAL;
372 }
373 *mc = vmalloc(total_size);
374 if (!*mc)
375 return -ENOMEM;
376 if (copy_from_user(*mc, user_buffer + offset, total_size)) {
377 printk(KERN_ERR "microcode: error! Can not read user data\n");
378 vfree(*mc);
379 return -EFAULT;
380 }
381 return offset + total_size;
382}
383
384static int do_microcode_update (void)
385{
386 long cursor = 0;
387 int error = 0;
388 void *new_mc = NULL;
389 int cpu;
390 cpumask_t old;
391
392 old = current->cpus_allowed;
393
394 while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) {
395 error = microcode_sanity_check(new_mc);
396 if (error)
397 goto out;
398 /*
399 * It's possible the data file has multiple matching ucode,
400 * lets keep searching till the latest version
401 */
402 for_each_online_cpu(cpu) {
403 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
404
405 if (!uci->valid)
406 continue;
407 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
408 error = get_maching_microcode(new_mc, cpu);
409 if (error < 0)
410 goto out;
411 if (error == 1)
412 apply_microcode(cpu);
413 }
414 vfree(new_mc);
415 }
416out:
417 if (cursor > 0)
418 vfree(new_mc);
419 if (cursor < 0)
420 error = cursor;
421 set_cpus_allowed_ptr(current, &old);
422 return error;
423}
424
425static int microcode_open (struct inode *unused1, struct file *unused2)
426{
427 cycle_kernel_lock();
428 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
429}
430
431static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos)
432{
433 ssize_t ret;
434
435 if ((len >> PAGE_SHIFT) > num_physpages) {
436 printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages);
437 return -EINVAL;
438 }
439
440 get_online_cpus();
441 mutex_lock(&microcode_mutex);
442
443 user_buffer = (void __user *) buf;
444 user_buffer_size = (int) len;
445
446 ret = do_microcode_update();
447 if (!ret)
448 ret = (ssize_t)len;
449
450 mutex_unlock(&microcode_mutex);
451 put_online_cpus();
452
453 return ret;
454}
455
456static const struct file_operations microcode_fops = {
457 .owner = THIS_MODULE,
458 .write = microcode_write,
459 .open = microcode_open,
460};
461
462static struct miscdevice microcode_dev = {
463 .minor = MICROCODE_MINOR,
464 .name = "microcode",
465 .fops = &microcode_fops,
466};
467
468static int __init microcode_dev_init (void)
469{
470 int error;
471
472 error = misc_register(&microcode_dev);
473 if (error) {
474 printk(KERN_ERR
475 "microcode: can't misc_register on minor=%d\n",
476 MICROCODE_MINOR);
477 return error;
478 }
479
480 return 0;
481}
482
483static void microcode_dev_exit (void)
484{
485 misc_deregister(&microcode_dev);
486}
487
488MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
489#else
490#define microcode_dev_init() 0
491#define microcode_dev_exit() do { } while(0)
492#endif
493
494static long get_next_ucode_from_buffer(void **mc, const u8 *buf,
495 unsigned long size, long offset)
496{
497 microcode_header_t *mc_header;
498 unsigned long total_size;
499
500 /* No more data */
501 if (offset >= size)
502 return 0;
503 mc_header = (microcode_header_t *)(buf + offset);
504 total_size = get_totalsize(mc_header);
505
506 if (offset + total_size > size) {
507 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
508 return -EINVAL;
509 }
510
511 *mc = vmalloc(total_size);
512 if (!*mc) {
513 printk(KERN_ERR "microcode: error! Can not allocate memory\n");
514 return -ENOMEM;
515 }
516 memcpy(*mc, buf + offset, total_size);
517 return offset + total_size;
518}
519
520/* fake device for request_firmware */
521static struct platform_device *microcode_pdev;
522
523static int cpu_request_microcode(int cpu)
524{
525 char name[30];
526 struct cpuinfo_x86 *c = &cpu_data(cpu);
527 const struct firmware *firmware;
528 const u8 *buf;
529 unsigned long size;
530 long offset = 0;
531 int error;
532 void *mc;
533
534 /* We should bind the task to the CPU */
535 BUG_ON(cpu != raw_smp_processor_id());
536 sprintf(name,"intel-ucode/%02x-%02x-%02x",
537 c->x86, c->x86_model, c->x86_mask);
538 error = request_firmware(&firmware, name, &microcode_pdev->dev);
539 if (error) {
540 pr_debug("microcode: data file %s load failed\n", name);
541 return error;
542 }
543 buf = firmware->data;
544 size = firmware->size;
545 while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset))
546 > 0) {
547 error = microcode_sanity_check(mc);
548 if (error)
549 break;
550 error = get_maching_microcode(mc, cpu);
551 if (error < 0)
552 break;
553 /*
554 * It's possible the data file has multiple matching ucode,
555 * lets keep searching till the latest version
556 */
557 if (error == 1) {
558 apply_microcode(cpu);
559 error = 0;
560 }
561 vfree(mc);
562 }
563 if (offset > 0)
564 vfree(mc);
565 if (offset < 0)
566 error = offset;
567 release_firmware(firmware);
568
569 return error;
570}
571
572static int apply_microcode_check_cpu(int cpu)
573{
574 struct cpuinfo_x86 *c = &cpu_data(cpu);
575 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
576 cpumask_t old;
577 unsigned int val[2];
578 int err = 0;
579
580 /* Check if the microcode is available */
581 if (!uci->mc)
582 return 0;
583
584 old = current->cpus_allowed;
585 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
586
587 /* Check if the microcode we have in memory matches the CPU */
588 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
589 cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001))
590 err = -EINVAL;
591
592 if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) {
593 /* get processor flags from MSR 0x17 */
594 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
595 if (uci->pf != (1 << ((val[1] >> 18) & 7)))
596 err = -EINVAL;
597 }
598
599 if (!err) {
600 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
601 /* see notes above for revision 1.07. Apparent chip bug */
602 sync_core();
603 /* get the current revision from MSR 0x8B */
604 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
605 if (uci->rev != val[1])
606 err = -EINVAL;
607 }
608
609 if (!err)
610 apply_microcode(cpu);
611 else
612 printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:"
613 " sig=0x%x, pf=0x%x, rev=0x%x\n",
614 cpu, uci->sig, uci->pf, uci->rev);
615
616 set_cpus_allowed_ptr(current, &old);
617 return err;
618}
619
620static void microcode_init_cpu(int cpu, int resume)
621{
622 cpumask_t old;
623 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
624
625 old = current->cpus_allowed;
626
627 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
628 mutex_lock(&microcode_mutex);
629 collect_cpu_info(cpu);
630 if (uci->valid && system_state == SYSTEM_RUNNING && !resume)
631 cpu_request_microcode(cpu);
632 mutex_unlock(&microcode_mutex);
633 set_cpus_allowed_ptr(current, &old);
634}
635
636static void microcode_fini_cpu(int cpu)
637{
638 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
639
640 mutex_lock(&microcode_mutex);
641 uci->valid = 0;
642 vfree(uci->mc);
643 uci->mc = NULL;
644 mutex_unlock(&microcode_mutex);
645}
646
647static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz)
648{
649 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
650 char *end;
651 unsigned long val = simple_strtoul(buf, &end, 0);
652 int err = 0;
653 int cpu = dev->id;
654
655 if (end == buf)
656 return -EINVAL;
657 if (val == 1) {
658 cpumask_t old;
659
660 old = current->cpus_allowed;
661
662 get_online_cpus();
663 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
664
665 mutex_lock(&microcode_mutex);
666 if (uci->valid)
667 err = cpu_request_microcode(cpu);
668 mutex_unlock(&microcode_mutex);
669 put_online_cpus();
670 set_cpus_allowed_ptr(current, &old);
671 }
672 if (err)
673 return err;
674 return sz;
675}
676
677static ssize_t version_show(struct sys_device *dev, char *buf)
678{
679 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
680
681 return sprintf(buf, "0x%x\n", uci->rev);
682}
683
684static ssize_t pf_show(struct sys_device *dev, char *buf)
685{
686 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
687
688 return sprintf(buf, "0x%x\n", uci->pf);
689}
690
691static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
692static SYSDEV_ATTR(version, 0400, version_show, NULL);
693static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
694
695static struct attribute *mc_default_attrs[] = {
696 &attr_reload.attr,
697 &attr_version.attr,
698 &attr_processor_flags.attr,
699 NULL
700};
701
702static struct attribute_group mc_attr_group = {
703 .attrs = mc_default_attrs,
704 .name = "microcode",
705};
706
707static int __mc_sysdev_add(struct sys_device *sys_dev, int resume)
708{
709 int err, cpu = sys_dev->id;
710 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
711
712 if (!cpu_online(cpu))
713 return 0;
714
715 pr_debug("microcode: CPU%d added\n", cpu);
716 memset(uci, 0, sizeof(*uci));
717
718 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
719 if (err)
720 return err;
721
722 microcode_init_cpu(cpu, resume);
723
724 return 0;
725}
726
727static int mc_sysdev_add(struct sys_device *sys_dev)
728{
729 return __mc_sysdev_add(sys_dev, 0);
730}
731
732static int mc_sysdev_remove(struct sys_device *sys_dev)
733{
734 int cpu = sys_dev->id;
735
736 if (!cpu_online(cpu))
737 return 0;
738
739 pr_debug("microcode: CPU%d removed\n", cpu);
740 microcode_fini_cpu(cpu);
741 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
742 return 0;
743}
744
745static int mc_sysdev_resume(struct sys_device *dev)
746{
747 int cpu = dev->id;
748
749 if (!cpu_online(cpu))
750 return 0;
751 pr_debug("microcode: CPU%d resumed\n", cpu);
752 /* only CPU 0 will apply ucode here */
753 apply_microcode(0);
754 return 0;
755}
756
757static struct sysdev_driver mc_sysdev_driver = {
758 .add = mc_sysdev_add,
759 .remove = mc_sysdev_remove,
760 .resume = mc_sysdev_resume,
761};
762
763static __cpuinit int
764mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
765{
766 unsigned int cpu = (unsigned long)hcpu;
767 struct sys_device *sys_dev;
768
769 sys_dev = get_cpu_sysdev(cpu);
770 switch (action) {
771 case CPU_UP_CANCELED_FROZEN:
772 /* The CPU refused to come up during a system resume */
773 microcode_fini_cpu(cpu);
774 break;
775 case CPU_ONLINE:
776 case CPU_DOWN_FAILED:
777 mc_sysdev_add(sys_dev);
778 break;
779 case CPU_ONLINE_FROZEN:
780 /* System-wide resume is in progress, try to apply microcode */
781 if (apply_microcode_check_cpu(cpu)) {
782 /* The application of microcode failed */
783 microcode_fini_cpu(cpu);
784 __mc_sysdev_add(sys_dev, 1);
785 break;
786 }
787 case CPU_DOWN_FAILED_FROZEN:
788 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
789 printk(KERN_ERR "microcode: Failed to create the sysfs "
790 "group for CPU%d\n", cpu);
791 break;
792 case CPU_DOWN_PREPARE:
793 mc_sysdev_remove(sys_dev);
794 break;
795 case CPU_DOWN_PREPARE_FROZEN:
796 /* Suspend is in progress, only remove the interface */
797 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
798 break;
799 }
800 return NOTIFY_OK;
801}
802
803static struct notifier_block __refdata mc_cpu_notifier = {
804 .notifier_call = mc_cpu_callback,
805};
806
807static int __init microcode_init (void)
808{
809 int error;
810
811 printk(KERN_INFO
812 "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " <tigran@aivazian.fsnet.co.uk>\n");
813
814 error = microcode_dev_init();
815 if (error)
816 return error;
817 microcode_pdev = platform_device_register_simple("microcode", -1,
818 NULL, 0);
819 if (IS_ERR(microcode_pdev)) {
820 microcode_dev_exit();
821 return PTR_ERR(microcode_pdev);
822 }
823
824 get_online_cpus();
825 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
826 put_online_cpus();
827 if (error) {
828 microcode_dev_exit();
829 platform_device_unregister(microcode_pdev);
830 return error;
831 }
832
833 register_hotcpu_notifier(&mc_cpu_notifier);
834 return 0;
835}
836
837static void __exit microcode_exit (void)
838{
839 microcode_dev_exit();
840
841 unregister_hotcpu_notifier(&mc_cpu_notifier);
842
843 get_online_cpus();
844 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
845 put_online_cpus();
846
847 platform_device_unregister(microcode_pdev);
848}
849
850module_init(microcode_init)
851module_exit(microcode_exit)
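The monolithic driver deleted above is split into a generic core and per-vendor loaders (microcode_amd.c and microcode_core.c follow below). Its sanity check relies on Intel updates being self-checksumming: all 32-bit words of header plus data sum to zero mod 2^32, so orig_sum == 0 means the image is intact. A tiny standalone illustration of that invariant, with made-up data:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Made-up "image": pick the last dword so everything sums to 0 mod 2^32. */
	uint32_t image[4] = { 0x11111111, 0x22222222, 0x33333333, 0 };
	uint32_t sum = 0;
	int i;

	image[3] = -(image[0] + image[1] + image[2]);	/* 0x9999999A */
	for (i = 0; i < 4; i++)
		sum += image[i];
	printf("checksum over image = %#x (0 means intact)\n", (unsigned)sum);
	return 0;
}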
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
new file mode 100644
index 000000000000..7a1f8eeac2c7
--- /dev/null
+++ b/arch/x86/kernel/microcode_amd.c
@@ -0,0 +1,435 @@
1/*
2 * AMD CPU Microcode Update Driver for Linux
3 * Copyright (C) 2008 Advanced Micro Devices Inc.
4 *
5 * Author: Peter Oruba <peter.oruba@amd.com>
6 *
7 * Based on work by:
8 * Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
9 *
10 * This driver allows upgrading the microcode on AMD
11 * family 0x10 and 0x11 processors.
12 *
13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details.
15*/
16
17#include <linux/capability.h>
18#include <linux/kernel.h>
19#include <linux/init.h>
20#include <linux/sched.h>
21#include <linux/cpumask.h>
22#include <linux/module.h>
23#include <linux/slab.h>
24#include <linux/vmalloc.h>
25#include <linux/miscdevice.h>
26#include <linux/spinlock.h>
27#include <linux/mm.h>
28#include <linux/fs.h>
29#include <linux/mutex.h>
30#include <linux/cpu.h>
31#include <linux/firmware.h>
32#include <linux/platform_device.h>
33#include <linux/pci.h>
34#include <linux/pci_ids.h>
35
36#include <asm/msr.h>
37#include <asm/uaccess.h>
38#include <asm/processor.h>
39#include <asm/microcode.h>
40
41MODULE_DESCRIPTION("AMD Microcode Update Driver");
42MODULE_AUTHOR("Peter Oruba <peter.oruba@amd.com>");
43MODULE_LICENSE("GPL v2");
44
45#define UCODE_MAGIC 0x00414d44
46#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000
47#define UCODE_UCODE_TYPE 0x00000001
48
49struct equiv_cpu_entry {
50 unsigned int installed_cpu;
51 unsigned int fixed_errata_mask;
52 unsigned int fixed_errata_compare;
53 unsigned int equiv_cpu;
54};
55
56struct microcode_header_amd {
57 unsigned int data_code;
58 unsigned int patch_id;
59 unsigned char mc_patch_data_id[2];
60 unsigned char mc_patch_data_len;
61 unsigned char init_flag;
62 unsigned int mc_patch_data_checksum;
63 unsigned int nb_dev_id;
64 unsigned int sb_dev_id;
65 unsigned char processor_rev_id[2];
66 unsigned char nb_rev_id;
67 unsigned char sb_rev_id;
68 unsigned char bios_api_rev;
69 unsigned char reserved1[3];
70 unsigned int match_reg[8];
71};
72
73struct microcode_amd {
74 struct microcode_header_amd hdr;
75 unsigned int mpb[0];
76};
77
78#define UCODE_MAX_SIZE (2048)
79#define DEFAULT_UCODE_DATASIZE (896)
80#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd))
81#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
82#define DWSIZE (sizeof(u32))
83/* For now we support a fixed ucode total size only */
84#define get_totalsize(mc) \
85 ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \
86 + MC_HEADER_SIZE)
87
88/* serialize access to the physical write */
89static DEFINE_SPINLOCK(microcode_update_lock);
90
91static struct equiv_cpu_entry *equiv_cpu_table;
92
93static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
94{
95 struct cpuinfo_x86 *c = &cpu_data(cpu);
96
97 memset(csig, 0, sizeof(*csig));
98
99 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
100 printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n",
101 cpu);
102 return -1;
103 }
104
105 asm volatile("movl %1, %%ecx; rdmsr"
106 : "=a" (csig->rev)
107 : "i" (0x0000008B) : "ecx");
108
109 printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n",
110 csig->rev);
111
112 return 0;
113}
114
115static int get_matching_microcode(int cpu, void *mc, int rev)
116{
117 struct microcode_header_amd *mc_header = mc;
118 struct pci_dev *nb_pci_dev, *sb_pci_dev;
119 unsigned int current_cpu_id;
120 unsigned int equiv_cpu_id = 0x00;
121 unsigned int i = 0;
122
123 BUG_ON(equiv_cpu_table == NULL);
124 current_cpu_id = cpuid_eax(0x00000001);
125
126 while (equiv_cpu_table[i].installed_cpu != 0) {
127 if (current_cpu_id == equiv_cpu_table[i].installed_cpu) {
128 equiv_cpu_id = equiv_cpu_table[i].equiv_cpu;
129 break;
130 }
131 i++;
132 }
133
134 if (!equiv_cpu_id) {
135 printk(KERN_ERR "microcode: CPU%d cpu_id "
136 "not found in equivalent cpu table \n", cpu);
137 return 0;
138 }
139
140 if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) {
141 printk(KERN_ERR
142 "microcode: CPU%d patch does not match "
143 "(patch is %x, cpu extended is %x) \n",
144 cpu, mc_header->processor_rev_id[0],
145 (equiv_cpu_id & 0xff));
146 return 0;
147 }
148
149 if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) {
150 printk(KERN_ERR "microcode: CPU%d patch does not match "
151 "(patch is %x, cpu base id is %x) \n",
152 cpu, mc_header->processor_rev_id[1],
153 ((equiv_cpu_id >> 16) & 0xff));
154
155 return 0;
156 }
157
158 /* ucode may be northbridge specific */
159 if (mc_header->nb_dev_id) {
160 nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
161 (mc_header->nb_dev_id & 0xff),
162 NULL);
163 if ((!nb_pci_dev) ||
164 (mc_header->nb_rev_id != nb_pci_dev->revision)) {
165 printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu);
166 pci_dev_put(nb_pci_dev);
167 return 0;
168 }
169 pci_dev_put(nb_pci_dev);
170 }
171
172 /* ucode may be southbridge specific */
173 if (mc_header->sb_dev_id) {
174 sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD,
175 (mc_header->sb_dev_id & 0xff),
176 NULL);
177 if ((!sb_pci_dev) ||
178 (mc_header->sb_rev_id != sb_pci_dev->revision)) {
179 printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu);
180 pci_dev_put(sb_pci_dev);
181 return 0;
182 }
183 pci_dev_put(sb_pci_dev);
184 }
185
186 if (mc_header->patch_id <= rev)
187 return 0;
188
189 return 1;
190}
191
192static void apply_microcode_amd(int cpu)
193{
194 unsigned long flags;
195 unsigned int eax, edx;
196 unsigned int rev;
197 int cpu_num = raw_smp_processor_id();
198 struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
199 struct microcode_amd *mc_amd = uci->mc;
200 unsigned long addr;
201
202 /* We should bind the task to the CPU */
203 BUG_ON(cpu_num != cpu);
204
205 if (mc_amd == NULL)
206 return;
207
208 spin_lock_irqsave(&microcode_update_lock, flags);
209
210 addr = (unsigned long)&mc_amd->hdr.data_code;
211 edx = (unsigned int)(((unsigned long)upper_32_bits(addr)));
212 eax = (unsigned int)(((unsigned long)lower_32_bits(addr)));
213
214 asm volatile("movl %0, %%ecx; wrmsr" :
215 : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx");
216
217 /* get patch id after patching */
218 asm volatile("movl %1, %%ecx; rdmsr"
219 : "=a" (rev)
220 : "i" (0x0000008B) : "ecx");
221
222 spin_unlock_irqrestore(&microcode_update_lock, flags);
223
224 /* check current patch id and patch's id for match */
225 if (rev != mc_amd->hdr.patch_id) {
226 printk(KERN_ERR "microcode: CPU%d update from revision "
227 "0x%x to 0x%x failed\n", cpu_num,
228 mc_amd->hdr.patch_id, rev);
229 return;
230 }
231
232 printk(KERN_INFO "microcode: CPU%d updated from revision "
233 "0x%x to 0x%x \n",
234 cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id);
235
236 uci->cpu_sig.rev = rev;
237}
238
239static void * get_next_ucode(u8 *buf, unsigned int size,
240 int (*get_ucode_data)(void *, const void *, size_t),
241 unsigned int *mc_size)
242{
243 unsigned int total_size;
244#define UCODE_CONTAINER_SECTION_HDR 8
245 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR];
246 void *mc;
247
248 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR))
249 return NULL;
250
251 if (section_hdr[0] != UCODE_UCODE_TYPE) {
252 printk(KERN_ERR "microcode: error! "
253 "Wrong microcode payload type field\n");
254 return NULL;
255 }
256
257 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
258
259 printk(KERN_INFO "microcode: size %u, total_size %u\n",
260 size, total_size);
261
262 if (total_size > size || total_size > UCODE_MAX_SIZE) {
263 printk(KERN_ERR "microcode: error! Bad data in microcode data file\n");
264 return NULL;
265 }
266
267 mc = vmalloc(UCODE_MAX_SIZE);
268 if (mc) {
269 memset(mc, 0, UCODE_MAX_SIZE);
270 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) {
271 vfree(mc);
272 mc = NULL;
273 } else
274 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
275 }
276#undef UCODE_CONTAINER_SECTION_HDR
277 return mc;
278}
279
280
281static int install_equiv_cpu_table(u8 *buf,
282 int (*get_ucode_data)(void *, const void *, size_t))
283{
284#define UCODE_CONTAINER_HEADER_SIZE 12
285 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE];
286 unsigned int *buf_pos = (unsigned int *)container_hdr;
287 unsigned long size;
288
289 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE))
290 return 0;
291
292 size = buf_pos[2];
293
294 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
295 printk(KERN_ERR "microcode: error! "
296 "Wrong microcode equivalnet cpu table\n");
297 return 0;
298 }
299
300 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
301 if (!equiv_cpu_table) {
302 printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n");
303 return 0;
304 }
305
306 buf += UCODE_CONTAINER_HEADER_SIZE;
307 if (get_ucode_data(equiv_cpu_table, buf, size)) {
308 vfree(equiv_cpu_table);
309 return 0;
310 }
311
312 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
313#undef UCODE_CONTAINER_HEADER_SIZE
314}
315
316static void free_equiv_cpu_table(void)
317{
318 if (equiv_cpu_table) {
319 vfree(equiv_cpu_table);
320 equiv_cpu_table = NULL;
321 }
322}
323
324static int generic_load_microcode(int cpu, void *data, size_t size,
325 int (*get_ucode_data)(void *, const void *, size_t))
326{
327 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
328 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
329 int new_rev = uci->cpu_sig.rev;
330 unsigned int leftover;
331 unsigned long offset;
332
333 offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data);
334 if (!offset) {
335 printk(KERN_ERR "microcode: installing equivalent cpu table failed\n");
336 return -EINVAL;
337 }
338
339 ucode_ptr += offset;
340 leftover = size - offset;
341
342 while (leftover) {
343 unsigned int uninitialized_var(mc_size);
344 struct microcode_header_amd *mc_header;
345
346 mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size);
347 if (!mc)
348 break;
349
350 mc_header = (struct microcode_header_amd *)mc;
351 if (get_matching_microcode(cpu, mc, new_rev)) {
352 if (new_mc)
353 vfree(new_mc);
354 new_rev = mc_header->patch_id;
355 new_mc = mc;
356 } else
357 vfree(mc);
358
359 ucode_ptr += mc_size;
360 leftover -= mc_size;
361 }
362
363 if (new_mc) {
364 if (!leftover) {
365 if (uci->mc)
366 vfree(uci->mc);
367 uci->mc = new_mc;
368 pr_debug("microcode: CPU%d found a matching microcode update with"
369 " version 0x%x (current=0x%x)\n",
370 cpu, new_rev, uci->cpu_sig.rev);
371 } else
372 vfree(new_mc);
373 }
374
375 free_equiv_cpu_table();
376
377 return (int)leftover;
378}
379
380static int get_ucode_fw(void *to, const void *from, size_t n)
381{
382 memcpy(to, from, n);
383 return 0;
384}
385
386static int request_microcode_fw(int cpu, struct device *device)
387{
388 const char *fw_name = "amd-ucode/microcode_amd.bin";
389 const struct firmware *firmware;
390 int ret;
391
392 /* We should bind the task to the CPU */
393 BUG_ON(cpu != raw_smp_processor_id());
394
395 ret = request_firmware(&firmware, fw_name, device);
396 if (ret) {
397 printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name);
398 return ret;
399 }
400
401 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
402 &get_ucode_fw);
403
404 release_firmware(firmware);
405
406 return ret;
407}
408
409static int request_microcode_user(int cpu, const void __user *buf, size_t size)
410{
411 printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode"
412 "is not supported\n");
413 return -1;
414}
415
416static void microcode_fini_cpu_amd(int cpu)
417{
418 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
419
420 vfree(uci->mc);
421 uci->mc = NULL;
422}
423
424static struct microcode_ops microcode_amd_ops = {
425 .request_microcode_user = request_microcode_user,
426 .request_microcode_fw = request_microcode_fw,
427 .collect_cpu_info = collect_cpu_info_amd,
428 .apply_microcode = apply_microcode_amd,
429 .microcode_fini_cpu = microcode_fini_cpu_amd,
430};
431
432struct microcode_ops * __init init_amd_microcode(void)
433{
434 return &microcode_amd_ops;
435}
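get_next_ucode() above walks the firmware container section by section; as the code reads the 8-byte section header, byte 0 carries the payload type (UCODE_UCODE_TYPE == 1) and bytes 4-5 a little-endian payload size. A standalone parse of that layout with an invented header:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Invented section header: type 1 (ucode), payload size 896 (0x0380). */
	uint8_t hdr[8] = { 0x01, 0, 0, 0, 0x80, 0x03, 0, 0 };
	unsigned int total_size;

	if (hdr[0] != 0x01) {			/* UCODE_UCODE_TYPE */
		fprintf(stderr, "wrong section type\n");
		return 1;
	}
	total_size = hdr[4] + (hdr[5] << 8);	/* little-endian 16-bit size */
	printf("payload size = %u bytes\n", total_size);	/* 896 */
	return 0;
}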
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
new file mode 100644
index 000000000000..936d8d55f230
--- /dev/null
+++ b/arch/x86/kernel/microcode_core.c
@@ -0,0 +1,508 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading the microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100#define MICROCODE_VERSION "2.00"
101
102struct microcode_ops *microcode_ops;
103
104/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
105static DEFINE_MUTEX(microcode_mutex);
106
107struct ucode_cpu_info ucode_cpu_info[NR_CPUS];
108EXPORT_SYMBOL_GPL(ucode_cpu_info);
109
110#ifdef CONFIG_MICROCODE_OLD_INTERFACE
111static int do_microcode_update(const void __user *buf, size_t size)
112{
113 cpumask_t old;
114 int error = 0;
115 int cpu;
116
117 old = current->cpus_allowed;
118
119 for_each_online_cpu(cpu) {
120 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
121
122 if (!uci->valid)
123 continue;
124
125 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
126 error = microcode_ops->request_microcode_user(cpu, buf, size);
127 if (error < 0)
128 goto out;
129 if (!error)
130 microcode_ops->apply_microcode(cpu);
131 }
132out:
133 set_cpus_allowed_ptr(current, &old);
134 return error;
135}
136
137static int microcode_open(struct inode *unused1, struct file *unused2)
138{
139 cycle_kernel_lock();
140 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
141}
142
143static ssize_t microcode_write(struct file *file, const char __user *buf,
144 size_t len, loff_t *ppos)
145{
146 ssize_t ret;
147
148 if ((len >> PAGE_SHIFT) > num_physpages) {
149 printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
150 num_physpages);
151 return -EINVAL;
152 }
153
154 get_online_cpus();
155 mutex_lock(&microcode_mutex);
156
157 ret = do_microcode_update(buf, len);
158 if (!ret)
159 ret = (ssize_t)len;
160
161 mutex_unlock(&microcode_mutex);
162 put_online_cpus();
163
164 return ret;
165}
166
167static const struct file_operations microcode_fops = {
168 .owner = THIS_MODULE,
169 .write = microcode_write,
170 .open = microcode_open,
171};
172
173static struct miscdevice microcode_dev = {
174 .minor = MICROCODE_MINOR,
175 .name = "microcode",
176 .fops = &microcode_fops,
177};
178
179static int __init microcode_dev_init(void)
180{
181 int error;
182
183 error = misc_register(&microcode_dev);
184 if (error) {
185 printk(KERN_ERR
186 "microcode: can't misc_register on minor=%d\n",
187 MICROCODE_MINOR);
188 return error;
189 }
190
191 return 0;
192}
193
194static void microcode_dev_exit(void)
195{
196 misc_deregister(&microcode_dev);
197}
198
199MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
200#else
201#define microcode_dev_init() 0
202#define microcode_dev_exit() do { } while (0)
203#endif
204
205/* fake device for request_firmware */
206struct platform_device *microcode_pdev;
207
208static ssize_t reload_store(struct sys_device *dev,
209 struct sysdev_attribute *attr,
210 const char *buf, size_t sz)
211{
212 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
213 char *end;
214 unsigned long val = simple_strtoul(buf, &end, 0);
215 int err = 0;
216 int cpu = dev->id;
217
218 if (end == buf)
219 return -EINVAL;
220 if (val == 1) {
221 cpumask_t old = current->cpus_allowed;
222
223 get_online_cpus();
224 if (cpu_online(cpu)) {
225 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
226 mutex_lock(&microcode_mutex);
227 if (uci->valid) {
228 err = microcode_ops->request_microcode_fw(cpu,
229 &microcode_pdev->dev);
230 if (!err)
231 microcode_ops->apply_microcode(cpu);
232 }
233 mutex_unlock(&microcode_mutex);
234 set_cpus_allowed_ptr(current, &old);
235 }
236 put_online_cpus();
237 }
238 if (err)
239 return err;
240 return sz;
241}
242
243static ssize_t version_show(struct sys_device *dev,
244 struct sysdev_attribute *attr, char *buf)
245{
246 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
247
248 return sprintf(buf, "0x%x\n", uci->cpu_sig.rev);
249}
250
251static ssize_t pf_show(struct sys_device *dev,
252 struct sysdev_attribute *attr, char *buf)
253{
254 struct ucode_cpu_info *uci = ucode_cpu_info + dev->id;
255
256 return sprintf(buf, "0x%x\n", uci->cpu_sig.pf);
257}
258
259static SYSDEV_ATTR(reload, 0200, NULL, reload_store);
260static SYSDEV_ATTR(version, 0400, version_show, NULL);
261static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL);
262
263static struct attribute *mc_default_attrs[] = {
264 &attr_reload.attr,
265 &attr_version.attr,
266 &attr_processor_flags.attr,
267 NULL
268};
269
270static struct attribute_group mc_attr_group = {
271 .attrs = mc_default_attrs,
272 .name = "microcode",
273};
274
275static void microcode_fini_cpu(int cpu)
276{
277 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
278
279 mutex_lock(&microcode_mutex);
280 microcode_ops->microcode_fini_cpu(cpu);
281 uci->valid = 0;
282 mutex_unlock(&microcode_mutex);
283}
284
285static void collect_cpu_info(int cpu)
286{
287 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
288
289 memset(uci, 0, sizeof(*uci));
290 if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
291 uci->valid = 1;
292}
293
294static int microcode_resume_cpu(int cpu)
295{
296 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
297 struct cpu_signature nsig;
298
299 pr_debug("microcode: CPU%d resumed\n", cpu);
300
301 if (!uci->mc)
302 return 1;
303
304 /*
305 * Let's verify that the 'cached' ucode does belong
306 * to this cpu (a bit of paranoia):
307 */
308 if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
309 microcode_fini_cpu(cpu);
310 return -1;
311 }
312
313 if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) {
314 microcode_fini_cpu(cpu);
315 /* Should we look for a new ucode here? */
316 return 1;
317 }
318
319 return 0;
320}
321
322void microcode_update_cpu(int cpu)
323{
324 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
325 int err = 0;
326
327 /*
 328	 * Check if the system resume is in progress (uci->valid is set),
 329	 * otherwise just request a firmware image:
330 */
331 if (uci->valid) {
332 err = microcode_resume_cpu(cpu);
333 } else {
334 collect_cpu_info(cpu);
335 if (uci->valid && system_state == SYSTEM_RUNNING)
336 err = microcode_ops->request_microcode_fw(cpu,
337 &microcode_pdev->dev);
338 }
339 if (!err)
340 microcode_ops->apply_microcode(cpu);
341}
342
343static void microcode_init_cpu(int cpu)
344{
345 cpumask_t old = current->cpus_allowed;
346
347 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
348 /* We should bind the task to the CPU */
349 BUG_ON(raw_smp_processor_id() != cpu);
350
351 mutex_lock(&microcode_mutex);
352 microcode_update_cpu(cpu);
353 mutex_unlock(&microcode_mutex);
354
355 set_cpus_allowed_ptr(current, &old);
356}
357
358static int mc_sysdev_add(struct sys_device *sys_dev)
359{
360 int err, cpu = sys_dev->id;
361 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
362
363 if (!cpu_online(cpu))
364 return 0;
365
366 pr_debug("microcode: CPU%d added\n", cpu);
367 memset(uci, 0, sizeof(*uci));
368
369 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
370 if (err)
371 return err;
372
373 microcode_init_cpu(cpu);
374 return 0;
375}
376
377static int mc_sysdev_remove(struct sys_device *sys_dev)
378{
379 int cpu = sys_dev->id;
380
381 if (!cpu_online(cpu))
382 return 0;
383
384 pr_debug("microcode: CPU%d removed\n", cpu);
385 microcode_fini_cpu(cpu);
386 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
387 return 0;
388}
389
390static int mc_sysdev_resume(struct sys_device *dev)
391{
392 int cpu = dev->id;
393
394 if (!cpu_online(cpu))
395 return 0;
396
397 /* only CPU 0 will apply ucode here */
398 microcode_update_cpu(0);
399 return 0;
400}
401
402static struct sysdev_driver mc_sysdev_driver = {
403 .add = mc_sysdev_add,
404 .remove = mc_sysdev_remove,
405 .resume = mc_sysdev_resume,
406};
407
408static __cpuinit int
409mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
410{
411 unsigned int cpu = (unsigned long)hcpu;
412 struct sys_device *sys_dev;
413
414 sys_dev = get_cpu_sysdev(cpu);
415 switch (action) {
416 case CPU_ONLINE:
417 case CPU_ONLINE_FROZEN:
418 microcode_init_cpu(cpu);
419 case CPU_DOWN_FAILED:
420 case CPU_DOWN_FAILED_FROZEN:
421 pr_debug("microcode: CPU%d added\n", cpu);
422 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
423 printk(KERN_ERR "microcode: Failed to create the sysfs "
424 "group for CPU%d\n", cpu);
425 break;
426 case CPU_DOWN_PREPARE:
427 case CPU_DOWN_PREPARE_FROZEN:
428 /* Suspend is in progress, only remove the interface */
429 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
430 pr_debug("microcode: CPU%d removed\n", cpu);
431 break;
432 case CPU_DEAD:
433 case CPU_UP_CANCELED_FROZEN:
434 /* The CPU refused to come up during a system resume */
435 microcode_fini_cpu(cpu);
436 break;
437 }
438 return NOTIFY_OK;
439}
440
441static struct notifier_block __refdata mc_cpu_notifier = {
442 .notifier_call = mc_cpu_callback,
443};
444
445static int __init microcode_init(void)
446{
447 struct cpuinfo_x86 *c = &cpu_data(0);
448 int error;
449
450 if (c->x86_vendor == X86_VENDOR_INTEL)
451 microcode_ops = init_intel_microcode();
452 else if (c->x86_vendor == X86_VENDOR_AMD)
453 microcode_ops = init_amd_microcode();
454
455 if (!microcode_ops) {
456 printk(KERN_ERR "microcode: no support for this CPU vendor\n");
457 return -ENODEV;
458 }
459
460 error = microcode_dev_init();
461 if (error)
462 return error;
463 microcode_pdev = platform_device_register_simple("microcode", -1,
464 NULL, 0);
465 if (IS_ERR(microcode_pdev)) {
466 microcode_dev_exit();
467 return PTR_ERR(microcode_pdev);
468 }
469
470 get_online_cpus();
471 error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
472 put_online_cpus();
473 if (error) {
474 microcode_dev_exit();
475 platform_device_unregister(microcode_pdev);
476 return error;
477 }
478
479 register_hotcpu_notifier(&mc_cpu_notifier);
480
481 printk(KERN_INFO
482 "Microcode Update Driver: v" MICROCODE_VERSION
483 " <tigran@aivazian.fsnet.co.uk>"
484 " <peter.oruba@amd.com>\n");
485
486 return 0;
487}
488
489static void __exit microcode_exit(void)
490{
491 microcode_dev_exit();
492
493 unregister_hotcpu_notifier(&mc_cpu_notifier);
494
495 get_online_cpus();
496 sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
497 put_online_cpus();
498
499 platform_device_unregister(microcode_pdev);
500
501 microcode_ops = NULL;
502
503 printk(KERN_INFO
504 "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
505}
506
507module_init(microcode_init);
508module_exit(microcode_exit);
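Not part of the patch above, only an illustrative sketch: the legacy /dev/cpu/microcode path kept under CONFIG_MICROCODE_OLD_INTERFACE expects a complete microcode image per write(), since do_microcode_update() hands each write buffer to the vendor parser as a whole set of update blocks. A minimal userspace feeder written under that assumption could look like the following (the device path matches the misc device registered above; the image-file argument and the bare error handling are assumptions, not something this patch provides):

/* Illustrative only: feed one complete microcode image to the legacy device. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct stat st;
	char *buf;
	int in, dev;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <microcode-image>\n", argv[0]);
		return 1;
	}
	in = open(argv[1], O_RDONLY);
	if (in < 0 || fstat(in, &st) < 0) {
		perror(argv[1]);
		return 1;
	}
	buf = malloc(st.st_size);
	if (!buf || read(in, buf, st.st_size) != st.st_size) {
		perror("read");
		return 1;
	}
	dev = open("/dev/cpu/microcode", O_WRONLY);	/* open() requires CAP_SYS_RAWIO */
	if (dev < 0) {
		perror("/dev/cpu/microcode");
		return 1;
	}
	/* the driver parses each write() as a complete set of update blocks */
	if (write(dev, buf, st.st_size) != st.st_size) {
		perror("write");
		return 1;
	}
	close(dev);
	close(in);
	free(buf);
	return 0;
}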
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
new file mode 100644
index 000000000000..622dc4a21784
--- /dev/null
+++ b/arch/x86/kernel/microcode_intel.c
@@ -0,0 +1,480 @@
1/*
2 * Intel CPU Microcode Update Driver for Linux
3 *
4 * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk>
5 * 2006 Shaohua Li <shaohua.li@intel.com>
6 *
7 * This driver allows upgrading microcode on Intel processors
8 * belonging to IA-32 family - PentiumPro, Pentium II,
9 * Pentium III, Xeon, Pentium 4, etc.
10 *
11 * Reference: Section 8.11 of Volume 3a, IA-32 Intel® Architecture
12 * Software Developer's Manual
13 * Order Number 253668 or free download from:
14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm
16 *
17 * For more information, go to http://www.urbanmyth.org/microcode
18 *
19 * This program is free software; you can redistribute it and/or
20 * modify it under the terms of the GNU General Public License
21 * as published by the Free Software Foundation; either version
22 * 2 of the License, or (at your option) any later version.
23 *
24 * 1.0 16 Feb 2000, Tigran Aivazian <tigran@sco.com>
25 * Initial release.
26 * 1.01 18 Feb 2000, Tigran Aivazian <tigran@sco.com>
27 * Added read() support + cleanups.
28 * 1.02 21 Feb 2000, Tigran Aivazian <tigran@sco.com>
29 * Added 'device trimming' support. open(O_WRONLY) zeroes
30 * and frees the saved copy of applied microcode.
31 * 1.03 29 Feb 2000, Tigran Aivazian <tigran@sco.com>
32 * Made to use devfs (/dev/cpu/microcode) + cleanups.
33 * 1.04 06 Jun 2000, Simon Trimmer <simon@veritas.com>
34 * Added misc device support (now uses both devfs and misc).
35 * Added MICROCODE_IOCFREE ioctl to clear memory.
36 * 1.05 09 Jun 2000, Simon Trimmer <simon@veritas.com>
37 * Messages for error cases (non Intel & no suitable microcode).
38 * 1.06 03 Aug 2000, Tigran Aivazian <tigran@veritas.com>
39 * Removed ->release(). Removed exclusive open and status bitmap.
40 * Added microcode_rwsem to serialize read()/write()/ioctl().
41 * Removed global kernel lock usage.
42 * 1.07 07 Sep 2000, Tigran Aivazian <tigran@veritas.com>
43 * Write 0 to 0x8B msr and then cpuid before reading revision,
44 * so that it works even if there were no update done by the
45 * BIOS. Otherwise, reading from 0x8B gives junk (which happened
46 * to be 0 on my machine which is why it worked even when I
47 * disabled update by the BIOS)
48 * Thanks to Eric W. Biederman <ebiederman@lnxi.com> for the fix.
49 * 1.08 11 Dec 2000, Richard Schaal <richard.schaal@intel.com> and
50 * Tigran Aivazian <tigran@veritas.com>
51 * Intel Pentium 4 processor support and bugfixes.
52 * 1.09 30 Oct 2001, Tigran Aivazian <tigran@veritas.com>
53 * Bugfix for HT (Hyper-Threading) enabled processors
54 * whereby processor resources are shared by all logical processors
55 * in a single CPU package.
56 * 1.10 28 Feb 2002 Asit K Mallick <asit.k.mallick@intel.com> and
57 * Tigran Aivazian <tigran@veritas.com>,
58 * Serialize updates as required on HT processors due to
59 * speculative nature of implementation.
60 * 1.11 22 Mar 2002 Tigran Aivazian <tigran@veritas.com>
61 * Fix the panic when writing zero-length microcode chunk.
62 * 1.12 29 Sep 2003 Nitin Kamble <nitin.a.kamble@intel.com>,
63 * Jun Nakajima <jun.nakajima@intel.com>
64 * Support for the microcode updates in the new format.
65 * 1.13 10 Oct 2003 Tigran Aivazian <tigran@veritas.com>
66 * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl
67 * because we no longer hold a copy of applied microcode
68 * in kernel memory.
69 * 1.14 25 Jun 2004 Tigran Aivazian <tigran@veritas.com>
70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug.
72 */
73#include <linux/capability.h>
74#include <linux/kernel.h>
75#include <linux/init.h>
76#include <linux/sched.h>
77#include <linux/smp_lock.h>
78#include <linux/cpumask.h>
79#include <linux/module.h>
80#include <linux/slab.h>
81#include <linux/vmalloc.h>
82#include <linux/miscdevice.h>
83#include <linux/spinlock.h>
84#include <linux/mm.h>
85#include <linux/fs.h>
86#include <linux/mutex.h>
87#include <linux/cpu.h>
88#include <linux/firmware.h>
89#include <linux/platform_device.h>
90
91#include <asm/msr.h>
92#include <asm/uaccess.h>
93#include <asm/processor.h>
94#include <asm/microcode.h>
95
96MODULE_DESCRIPTION("Microcode Update Driver");
97MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
98MODULE_LICENSE("GPL");
99
100struct microcode_header_intel {
101 unsigned int hdrver;
102 unsigned int rev;
103 unsigned int date;
104 unsigned int sig;
105 unsigned int cksum;
106 unsigned int ldrver;
107 unsigned int pf;
108 unsigned int datasize;
109 unsigned int totalsize;
110 unsigned int reserved[3];
111};
112
113struct microcode_intel {
114 struct microcode_header_intel hdr;
115 unsigned int bits[0];
116};
117
118/* microcode format is extended from prescott processors */
119struct extended_signature {
120 unsigned int sig;
121 unsigned int pf;
122 unsigned int cksum;
123};
124
125struct extended_sigtable {
126 unsigned int count;
127 unsigned int cksum;
128 unsigned int reserved[3];
129 struct extended_signature sigs[0];
130};
131
132#define DEFAULT_UCODE_DATASIZE (2000)
133#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel))
134#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE)
135#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable))
136#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature))
137#define DWSIZE (sizeof(u32))
138#define get_totalsize(mc) \
139 (((struct microcode_intel *)mc)->hdr.totalsize ? \
140 ((struct microcode_intel *)mc)->hdr.totalsize : \
141 DEFAULT_UCODE_TOTALSIZE)
142
143#define get_datasize(mc) \
144 (((struct microcode_intel *)mc)->hdr.datasize ? \
145 ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE)
146
147#define sigmatch(s1, s2, p1, p2) \
148 (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))
149
150#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
151
152/* serialize access to the physical write to MSR 0x79 */
153static DEFINE_SPINLOCK(microcode_update_lock);
154
155static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
156{
157 struct cpuinfo_x86 *c = &cpu_data(cpu_num);
158 unsigned int val[2];
159
160 memset(csig, 0, sizeof(*csig));
161
162 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
163 cpu_has(c, X86_FEATURE_IA64)) {
164 printk(KERN_ERR "microcode: CPU%d not a capable Intel "
165 "processor\n", cpu_num);
166 return -1;
167 }
168
169 csig->sig = cpuid_eax(0x00000001);
170
171 if ((c->x86_model >= 5) || (c->x86 > 6)) {
172 /* get processor flags from MSR 0x17 */
173 rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
174 csig->pf = 1 << ((val[1] >> 18) & 7);
175 }
176
177 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
178 /* see notes above for revision 1.07. Apparent chip bug */
179 sync_core();
180 /* get the current revision from MSR 0x8B */
181 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
182 pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
183 csig->sig, csig->pf, csig->rev);
184
185 return 0;
186}
187
188static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf)
189{
190 return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1;
191}
192
193static inline int
194update_match_revision(struct microcode_header_intel *mc_header, int rev)
195{
196 return (mc_header->rev <= rev) ? 0 : 1;
197}
198
199static int microcode_sanity_check(void *mc)
200{
201 struct microcode_header_intel *mc_header = mc;
202 struct extended_sigtable *ext_header = NULL;
203 struct extended_signature *ext_sig;
204 unsigned long total_size, data_size, ext_table_size;
205 int sum, orig_sum, ext_sigcount = 0, i;
206
207 total_size = get_totalsize(mc_header);
208 data_size = get_datasize(mc_header);
209 if (data_size + MC_HEADER_SIZE > total_size) {
210 printk(KERN_ERR "microcode: error! "
211 "Bad data size in microcode data file\n");
212 return -EINVAL;
213 }
214
215 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
216 printk(KERN_ERR "microcode: error! "
217 "Unknown microcode update format\n");
218 return -EINVAL;
219 }
220 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
221 if (ext_table_size) {
222 if ((ext_table_size < EXT_HEADER_SIZE)
223 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
224 printk(KERN_ERR "microcode: error! "
225 "Small exttable size in microcode data file\n");
226 return -EINVAL;
227 }
228 ext_header = mc + MC_HEADER_SIZE + data_size;
229 if (ext_table_size != exttable_size(ext_header)) {
230 printk(KERN_ERR "microcode: error! "
231 "Bad exttable size in microcode data file\n");
232 return -EFAULT;
233 }
234 ext_sigcount = ext_header->count;
235 }
236
237 /* check extended table checksum */
238 if (ext_table_size) {
239 int ext_table_sum = 0;
240 int *ext_tablep = (int *)ext_header;
241
242 i = ext_table_size / DWSIZE;
243 while (i--)
244 ext_table_sum += ext_tablep[i];
245 if (ext_table_sum) {
246 printk(KERN_WARNING "microcode: aborting, "
247 "bad extended signature table checksum\n");
248 return -EINVAL;
249 }
250 }
251
252 /* calculate the checksum */
253 orig_sum = 0;
254 i = (MC_HEADER_SIZE + data_size) / DWSIZE;
255 while (i--)
256 orig_sum += ((int *)mc)[i];
257 if (orig_sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n");
259 return -EINVAL;
260 }
261 if (!ext_table_size)
262 return 0;
263 /* check extended signature checksum */
264 for (i = 0; i < ext_sigcount; i++) {
265 ext_sig = (void *)ext_header + EXT_HEADER_SIZE +
266 EXT_SIGNATURE_SIZE * i;
267 sum = orig_sum
268 - (mc_header->sig + mc_header->pf + mc_header->cksum)
269 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
270 if (sum) {
271 printk(KERN_ERR "microcode: aborting, bad checksum\n");
272 return -EINVAL;
273 }
274 }
275 return 0;
276}
277
278/*
279 * return 0 - no update found
280 * return 1 - found update
281 */
282static int
283get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
284{
285 struct microcode_header_intel *mc_header = mc;
286 struct extended_sigtable *ext_header;
287 unsigned long total_size = get_totalsize(mc_header);
288 int ext_sigcount, i;
289 struct extended_signature *ext_sig;
290
291 if (!update_match_revision(mc_header, rev))
292 return 0;
293
294 if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf))
295 return 1;
296
297 /* Look for ext. headers: */
298 if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE)
299 return 0;
300
301 ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE;
302 ext_sigcount = ext_header->count;
303 ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
304
305 for (i = 0; i < ext_sigcount; i++) {
306 if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf))
307 return 1;
308 ext_sig++;
309 }
310 return 0;
311}
312
313static void apply_microcode(int cpu)
314{
315 unsigned long flags;
316 unsigned int val[2];
317 int cpu_num = raw_smp_processor_id();
318 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
319 struct microcode_intel *mc_intel = uci->mc;
320
321 /* We should bind the task to the CPU */
322 BUG_ON(cpu_num != cpu);
323
324 if (mc_intel == NULL)
325 return;
326
327 /* serialize access to the physical write to MSR 0x79 */
328 spin_lock_irqsave(&microcode_update_lock, flags);
329
330 /* write microcode via MSR 0x79 */
331 wrmsr(MSR_IA32_UCODE_WRITE,
332 (unsigned long) mc_intel->bits,
333 (unsigned long) mc_intel->bits >> 16 >> 16);
334 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
335
336 /* see notes above for revision 1.07. Apparent chip bug */
337 sync_core();
338
339 /* get the current revision from MSR 0x8B */
340 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
341
342 spin_unlock_irqrestore(&microcode_update_lock, flags);
343 if (val[1] != mc_intel->hdr.rev) {
344 printk(KERN_ERR "microcode: CPU%d update from revision "
345 "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]);
346 return;
347 }
348 printk(KERN_INFO "microcode: CPU%d updated from revision "
349 "0x%x to 0x%x, date = %04x-%02x-%02x \n",
350 cpu_num, uci->cpu_sig.rev, val[1],
351 mc_intel->hdr.date & 0xffff,
352 mc_intel->hdr.date >> 24,
353 (mc_intel->hdr.date >> 16) & 0xff);
354 uci->cpu_sig.rev = val[1];
355}
356
357static int generic_load_microcode(int cpu, void *data, size_t size,
358 int (*get_ucode_data)(void *, const void *, size_t))
359{
360 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
361 u8 *ucode_ptr = data, *new_mc = NULL, *mc;
362 int new_rev = uci->cpu_sig.rev;
363 unsigned int leftover = size;
364
365 while (leftover) {
366 struct microcode_header_intel mc_header;
367 unsigned int mc_size;
368
369 if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header)))
370 break;
371
372 mc_size = get_totalsize(&mc_header);
373 if (!mc_size || mc_size > leftover) {
 374			printk(KERN_ERR "microcode: error! "
 375				"Bad data in microcode data file\n");
376 break;
377 }
378
379 mc = vmalloc(mc_size);
380 if (!mc)
381 break;
382
383 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
384 microcode_sanity_check(mc) < 0) {
385 vfree(mc);
386 break;
387 }
388
389 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
390 if (new_mc)
391 vfree(new_mc);
392 new_rev = mc_header.rev;
393 new_mc = mc;
394 } else
395 vfree(mc);
396
397 ucode_ptr += mc_size;
398 leftover -= mc_size;
399 }
400
401 if (new_mc) {
402 if (!leftover) {
403 if (uci->mc)
404 vfree(uci->mc);
405 uci->mc = (struct microcode_intel *)new_mc;
406 pr_debug("microcode: CPU%d found a matching microcode update with"
407 " version 0x%x (current=0x%x)\n",
408 cpu, new_rev, uci->cpu_sig.rev);
409 } else
410 vfree(new_mc);
411 }
412
413 return (int)leftover;
414}
415
416static int get_ucode_fw(void *to, const void *from, size_t n)
417{
418 memcpy(to, from, n);
419 return 0;
420}
421
422static int request_microcode_fw(int cpu, struct device *device)
423{
424 char name[30];
425 struct cpuinfo_x86 *c = &cpu_data(cpu);
426 const struct firmware *firmware;
427 int ret;
428
429 /* We should bind the task to the CPU */
430 BUG_ON(cpu != raw_smp_processor_id());
431 sprintf(name, "intel-ucode/%02x-%02x-%02x",
432 c->x86, c->x86_model, c->x86_mask);
433 ret = request_firmware(&firmware, name, device);
434 if (ret) {
435 pr_debug("microcode: data file %s load failed\n", name);
436 return ret;
437 }
438
439 ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size,
440 &get_ucode_fw);
441
442 release_firmware(firmware);
443
444 return ret;
445}
446
447static int get_ucode_user(void *to, const void *from, size_t n)
448{
449 return copy_from_user(to, from, n);
450}
451
452static int request_microcode_user(int cpu, const void __user *buf, size_t size)
453{
454 /* We should bind the task to the CPU */
455 BUG_ON(cpu != raw_smp_processor_id());
456
457 return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user);
458}
459
460static void microcode_fini_cpu(int cpu)
461{
462 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
463
464 vfree(uci->mc);
465 uci->mc = NULL;
466}
467
468struct microcode_ops microcode_intel_ops = {
469 .request_microcode_user = request_microcode_user,
470 .request_microcode_fw = request_microcode_fw,
471 .collect_cpu_info = collect_cpu_info,
472 .apply_microcode = apply_microcode,
473 .microcode_fini_cpu = microcode_fini_cpu,
474};
475
476struct microcode_ops * __init init_intel_microcode(void)
477{
478 return &microcode_intel_ops;
479}
480
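Not part of the patch, only an illustrative sketch of the Intel image handling above: an old-format header that leaves datasize/totalsize at zero resolves to DEFAULT_UCODE_DATASIZE (2000) data bytes plus the 48-byte header, and sigmatch() accepts an update when the CPUID signatures are equal and the platform-flag masks intersect, or when both masks are zero (pre-Pentium 4 parts). The standalone program below mirrors those macros; the signature and flag values are hypothetical, chosen only to exercise the three cases.

/* Illustrative only: size defaults and sigmatch() behaviour from microcode_intel.c. */
#include <stdio.h>

#define DEFAULT_UCODE_DATASIZE	2000	/* data bytes assumed when hdr.datasize == 0 */
#define MC_HEADER_SIZE		48	/* 12 u32 fields in struct microcode_header_intel */

#define sigmatch(s1, s2, p1, p2) \
	(((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0))))

int main(void)
{
	unsigned int datasize = 0, totalsize = 0;	/* old-format header leaves both zero */

	datasize  = datasize  ? datasize  : DEFAULT_UCODE_DATASIZE;
	totalsize = totalsize ? totalsize : DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE;
	printf("data=%u total=%u\n", datasize, totalsize);		/* data=2000 total=2048 */

	/* same signature, overlapping platform-flag masks -> match */
	printf("match=%d\n", sigmatch(0x106a5, 0x106a5, 0x10, 0x12));	/* match=1 */
	/* same signature, both platform-flag masks zero -> match */
	printf("match=%d\n", sigmatch(0x651, 0x651, 0, 0));		/* match=1 */
	/* same signature, disjoint platform-flag masks -> no match */
	printf("match=%d\n", sigmatch(0x106a5, 0x106a5, 0x10, 0x02));	/* match=0 */
	return 0;
}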
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index fdfdc550b366..efc2f361fe85 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = {
238 {} 238 {}
239}; 239};
240 240
241void __init check_enable_amd_mmconf_dmi(void) 241void __cpuinit check_enable_amd_mmconf_dmi(void)
242{ 242{
243 dmi_check_system(mmconf_dmi_table); 243 dmi_check_system(mmconf_dmi_table);
244} 244}
diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c
index a888e67f5874..6ba87830d4b1 100644
--- a/arch/x86/kernel/module_64.c
+++ b/arch/x86/kernel/module_64.c
@@ -22,6 +22,7 @@
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/string.h> 23#include <linux/string.h>
24#include <linux/kernel.h> 24#include <linux/kernel.h>
25#include <linux/mm.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/bug.h> 27#include <linux/bug.h>
27 28
@@ -150,7 +151,8 @@ int module_finalize(const Elf_Ehdr *hdr,
150 const Elf_Shdr *sechdrs, 151 const Elf_Shdr *sechdrs,
151 struct module *me) 152 struct module *me)
152{ 153{
153 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; 154 const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL,
155 *para = NULL;
154 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; 156 char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset;
155 157
156 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { 158 for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) {
@@ -160,6 +162,8 @@ int module_finalize(const Elf_Ehdr *hdr,
160 alt = s; 162 alt = s;
161 if (!strcmp(".smp_locks", secstrings + s->sh_name)) 163 if (!strcmp(".smp_locks", secstrings + s->sh_name))
162 locks= s; 164 locks= s;
165 if (!strcmp(".parainstructions", secstrings + s->sh_name))
166 para = s;
163 } 167 }
164 168
165 if (alt) { 169 if (alt) {
@@ -175,6 +179,11 @@ int module_finalize(const Elf_Ehdr *hdr,
175 tseg, tseg + text->sh_size); 179 tseg, tseg + text->sh_size);
176 } 180 }
177 181
182 if (para) {
183 void *pseg = (void *)para->sh_addr;
184 apply_paravirt(pseg, pseg + para->sh_size);
185 }
186
178 return module_bug_finalize(hdr, sechdrs, me); 187 return module_bug_finalize(hdr, sechdrs, me);
179} 188}
180 189
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 3b25e49380c6..f98f4e1dba09 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -27,6 +27,7 @@
27#include <asm/bios_ebda.h> 27#include <asm/bios_ebda.h>
28#include <asm/e820.h> 28#include <asm/e820.h>
29#include <asm/trampoline.h> 29#include <asm/trampoline.h>
30#include <asm/setup.h>
30 31
31#include <mach_apic.h> 32#include <mach_apic.h>
32#ifdef CONFIG_X86_32 33#ifdef CONFIG_X86_32
@@ -48,77 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len)
48 return sum & 0xFF; 49 return sum & 0xFF;
49} 50}
50 51
51#ifdef CONFIG_X86_NUMAQ 52static void __init MP_processor_info(struct mpc_config_processor *m)
52int found_numaq;
53/*
54 * Have to match translation table entries to main table entries by counter
55 * hence the mpc_record variable .... can't see a less disgusting way of
56 * doing this ....
57 */
58struct mpc_config_translation {
59 unsigned char mpc_type;
60 unsigned char trans_len;
61 unsigned char trans_type;
62 unsigned char trans_quad;
63 unsigned char trans_global;
64 unsigned char trans_local;
65 unsigned short trans_reserved;
66};
67
68
69static int mpc_record;
70static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
71 __cpuinitdata;
72
73static inline int generate_logical_apicid(int quad, int phys_apicid)
74{
75 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
76}
77
78
79static inline int mpc_apic_id(struct mpc_config_processor *m,
80 struct mpc_config_translation *translation_record)
81{
82 int quad = translation_record->trans_quad;
83 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
84
85 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
86 m->mpc_apicid,
87 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
88 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
89 m->mpc_apicver, quad, logical_apicid);
90 return logical_apicid;
91}
92
93int mp_bus_id_to_node[MAX_MP_BUSSES];
94
95int mp_bus_id_to_local[MAX_MP_BUSSES];
96
97static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name,
98 struct mpc_config_translation *translation)
99{
100 int quad = translation->trans_quad;
101 int local = translation->trans_local;
102
103 mp_bus_id_to_node[m->mpc_busid] = quad;
104 mp_bus_id_to_local[m->mpc_busid] = local;
105 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
106 m->mpc_busid, name, quad);
107}
108
109int quad_local_to_mp_bus_id [NR_CPUS/4][4];
110static void mpc_oem_pci_bus(struct mpc_config_bus *m,
111 struct mpc_config_translation *translation)
112{
113 int quad = translation->trans_quad;
114 int local = translation->trans_local;
115
116 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
117}
118
119#endif
120
121static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
122{ 53{
123 int apicid; 54 int apicid;
124 char *bootup_cpu = ""; 55 char *bootup_cpu = "";
@@ -127,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m)
127 disabled_cpus++; 58 disabled_cpus++;
128 return; 59 return;
129 } 60 }
130#ifdef CONFIG_X86_NUMAQ 61
131 if (found_numaq) 62 if (x86_quirks->mpc_apic_id)
132 apicid = mpc_apic_id(m, translation_table[mpc_record]); 63 apicid = x86_quirks->mpc_apic_id(m);
133 else 64 else
134 apicid = m->mpc_apicid; 65 apicid = m->mpc_apicid;
135#else 66
136 apicid = m->mpc_apicid;
137#endif
138 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { 67 if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) {
139 bootup_cpu = " (Bootup-CPU)"; 68 bootup_cpu = " (Bootup-CPU)";
140 boot_cpu_physical_apicid = m->mpc_apicid; 69 boot_cpu_physical_apicid = m->mpc_apicid;
@@ -151,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
151 memcpy(str, m->mpc_bustype, 6); 80 memcpy(str, m->mpc_bustype, 6);
152 str[6] = 0; 81 str[6] = 0;
153 82
154#ifdef CONFIG_X86_NUMAQ 83 if (x86_quirks->mpc_oem_bus_info)
155 if (found_numaq) 84 x86_quirks->mpc_oem_bus_info(m, str);
156 mpc_oem_bus_info(m, str, translation_table[mpc_record]); 85 else
157#else 86 apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str);
158 printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str);
159#endif
160 87
161#if MAX_MP_BUSSES < 256 88#if MAX_MP_BUSSES < 256
162 if (m->mpc_busid >= MAX_MP_BUSSES) { 89 if (m->mpc_busid >= MAX_MP_BUSSES) {
@@ -173,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m)
173 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; 100 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA;
174#endif 101#endif
175 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { 102 } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) {
176#ifdef CONFIG_X86_NUMAQ 103 if (x86_quirks->mpc_oem_pci_bus)
177 if (found_numaq) 104 x86_quirks->mpc_oem_pci_bus(m);
178 mpc_oem_pci_bus(m, translation_table[mpc_record]); 105
179#endif
180 clear_bit(m->mpc_busid, mp_bus_not_pci); 106 clear_bit(m->mpc_busid, mp_bus_not_pci);
181#if defined(CONFIG_EISA) || defined (CONFIG_MCA) 107#if defined(CONFIG_EISA) || defined (CONFIG_MCA)
182 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; 108 mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI;
@@ -228,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m)
228 154
229static void print_MP_intsrc_info(struct mpc_config_intsrc *m) 155static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
230{ 156{
231 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 157 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
232 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 158 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
233 m->mpc_irqtype, m->mpc_irqflag & 3, 159 m->mpc_irqtype, m->mpc_irqflag & 3,
234 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, 160 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus,
@@ -237,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m)
237 163
238static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) 164static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq)
239{ 165{
240 printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," 166 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
241 " IRQ %02x, APIC ID %x, APIC INT %02x\n", 167 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
242 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, 168 mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3,
243 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, 169 (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus,
@@ -309,90 +235,13 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m)
309 235
310static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) 236static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m)
311{ 237{
312 printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," 238 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
313 " IRQ %02x, APIC ID %x, APIC LINT %02x\n", 239 " IRQ %02x, APIC ID %x, APIC LINT %02x\n",
314 m->mpc_irqtype, m->mpc_irqflag & 3, 240 m->mpc_irqtype, m->mpc_irqflag & 3,
315 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, 241 (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid,
316 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); 242 m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint);
317} 243}
318 244
319#ifdef CONFIG_X86_NUMAQ
320static void __init MP_translation_info(struct mpc_config_translation *m)
321{
322 printk(KERN_INFO
323 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
324 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
325 m->trans_local);
326
327 if (mpc_record >= MAX_MPC_ENTRY)
328 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
329 else
330 translation_table[mpc_record] = m; /* stash this for later */
331 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
332 node_set_online(m->trans_quad);
333}
334
335/*
336 * Read/parse the MPC oem tables
337 */
338
339static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
340 unsigned short oemsize)
341{
342 int count = sizeof(*oemtable); /* the header size */
343 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
344
345 mpc_record = 0;
346 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
347 oemtable);
348 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
349 printk(KERN_WARNING
350 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
351 oemtable->oem_signature[0], oemtable->oem_signature[1],
352 oemtable->oem_signature[2], oemtable->oem_signature[3]);
353 return;
354 }
355 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
356 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
357 return;
358 }
359 while (count < oemtable->oem_length) {
360 switch (*oemptr) {
361 case MP_TRANSLATION:
362 {
363 struct mpc_config_translation *m =
364 (struct mpc_config_translation *)oemptr;
365 MP_translation_info(m);
366 oemptr += sizeof(*m);
367 count += sizeof(*m);
368 ++mpc_record;
369 break;
370 }
371 default:
372 {
373 printk(KERN_WARNING
374 "Unrecognised OEM table entry type! - %d\n",
375 (int)*oemptr);
376 return;
377 }
378 }
379 }
380}
381
382void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
383 char *productid)
384{
385 if (strncmp(oem, "IBM NUMA", 8))
386 printk("Warning! Not a NUMA-Q system!\n");
387 else
388 found_numaq = 1;
389
390 if (mpc->mpc_oemptr)
391 smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr,
392 mpc->mpc_oemsize);
393}
394#endif /* CONFIG_X86_NUMAQ */
395
396/* 245/*
397 * Read/parse the MPC 246 * Read/parse the MPC
398 */ 247 */
@@ -457,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
457 } else 306 } else
458 mps_oem_check(mpc, oem, str); 307 mps_oem_check(mpc, oem, str);
459#endif 308#endif
460
461 /* save the local APIC address, it might be non-default */ 309 /* save the local APIC address, it might be non-default */
462 if (!acpi_lapic) 310 if (!acpi_lapic)
463 mp_lapic_addr = mpc->mpc_lapic; 311 mp_lapic_addr = mpc->mpc_lapic;
@@ -465,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
465 if (early) 313 if (early)
466 return 1; 314 return 1;
467 315
316 if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) {
317 struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr;
318 x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize);
319 }
320
468 /* 321 /*
469 * Now process the configuration blocks. 322 * Now process the configuration blocks.
470 */ 323 */
471#ifdef CONFIG_X86_NUMAQ 324 if (x86_quirks->mpc_record)
472 mpc_record = 0; 325 *x86_quirks->mpc_record = 0;
473#endif 326
474 while (count < mpc->mpc_length) { 327 while (count < mpc->mpc_length) {
475 switch (*mpt) { 328 switch (*mpt) {
476 case MP_PROCESSOR: 329 case MP_PROCESSOR:
@@ -536,16 +389,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early)
536 count = mpc->mpc_length; 389 count = mpc->mpc_length;
537 break; 390 break;
538 } 391 }
539#ifdef CONFIG_X86_NUMAQ 392 if (x86_quirks->mpc_record)
540 ++mpc_record; 393 (*x86_quirks->mpc_record)++;
541#endif
542 } 394 }
543 395
544#ifdef CONFIG_X86_GENERICARCH 396#ifdef CONFIG_X86_GENERICARCH
545 generic_bigsmp_probe(); 397 generic_bigsmp_probe();
546#endif 398#endif
547 399
400#ifdef CONFIG_X86_32
548 setup_apic_routing(); 401 setup_apic_routing();
402#endif
549 if (!num_processors) 403 if (!num_processors)
550 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 404 printk(KERN_ERR "MPTABLE: no processors registered!\n");
551 return num_processors; 405 return num_processors;
@@ -632,7 +486,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
632} 486}
633 487
634 488
635static void construct_ioapic_table(int mpc_default_type) 489static void __init construct_ioapic_table(int mpc_default_type)
636{ 490{
637 struct mpc_config_ioapic ioapic; 491 struct mpc_config_ioapic ioapic;
638 struct mpc_config_bus bus; 492 struct mpc_config_bus bus;
@@ -677,7 +531,7 @@ static void construct_ioapic_table(int mpc_default_type)
677 construct_default_ioirq_mptable(mpc_default_type); 531 construct_default_ioirq_mptable(mpc_default_type);
678} 532}
679#else 533#else
680static inline void construct_ioapic_table(int mpc_default_type) { } 534static inline void __init construct_ioapic_table(int mpc_default_type) { }
681#endif 535#endif
682 536
683static inline void __init construct_default_ISA_mptable(int mpc_default_type) 537static inline void __init construct_default_ISA_mptable(int mpc_default_type)
@@ -726,20 +580,14 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type)
726static struct intel_mp_floating *mpf_found; 580static struct intel_mp_floating *mpf_found;
727 581
728/* 582/*
729 * Machine specific quirk for finding the SMP config before other setup
730 * activities destroy the table:
731 */
732int (*mach_get_smp_config_quirk)(unsigned int early);
733
734/*
735 * Scan the memory blocks for an SMP configuration block. 583 * Scan the memory blocks for an SMP configuration block.
736 */ 584 */
737static void __init __get_smp_config(unsigned int early) 585static void __init __get_smp_config(unsigned int early)
738{ 586{
739 struct intel_mp_floating *mpf = mpf_found; 587 struct intel_mp_floating *mpf = mpf_found;
740 588
741 if (mach_get_smp_config_quirk) { 589 if (x86_quirks->mach_get_smp_config) {
742 if (mach_get_smp_config_quirk(early)) 590 if (x86_quirks->mach_get_smp_config(early))
743 return; 591 return;
744 } 592 }
745 if (acpi_lapic && early) 593 if (acpi_lapic && early)
@@ -849,7 +697,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
849 unsigned int *bp = phys_to_virt(base); 697 unsigned int *bp = phys_to_virt(base);
850 struct intel_mp_floating *mpf; 698 struct intel_mp_floating *mpf;
851 699
852 printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); 700 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
701 bp, length);
853 BUILD_BUG_ON(sizeof(*mpf) != 16); 702 BUILD_BUG_ON(sizeof(*mpf) != 16);
854 703
855 while (length > 0) { 704 while (length > 0) {
@@ -899,14 +748,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
899 return 0; 748 return 0;
900} 749}
901 750
902int (*mach_find_smp_config_quirk)(unsigned int reserve);
903
904static void __init __find_smp_config(unsigned int reserve) 751static void __init __find_smp_config(unsigned int reserve)
905{ 752{
906 unsigned int address; 753 unsigned int address;
907 754
908 if (mach_find_smp_config_quirk) { 755 if (x86_quirks->mach_find_smp_config) {
909 if (mach_find_smp_config_quirk(reserve)) 756 if (x86_quirks->mach_find_smp_config(reserve))
910 return; 757 return;
911 } 758 }
912 /* 759 /*
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index a153b3905f60..82a7c7ed6d45 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -72,21 +72,28 @@ static ssize_t msr_read(struct file *file, char __user *buf,
72 u32 data[2]; 72 u32 data[2];
73 u32 reg = *ppos; 73 u32 reg = *ppos;
74 int cpu = iminor(file->f_path.dentry->d_inode); 74 int cpu = iminor(file->f_path.dentry->d_inode);
75 int err; 75 int err = 0;
76 ssize_t bytes = 0;
76 77
77 if (count % 8) 78 if (count % 8)
78 return -EINVAL; /* Invalid chunk size */ 79 return -EINVAL; /* Invalid chunk size */
79 80
80 for (; count; count -= 8) { 81 for (; count; count -= 8) {
81 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); 82 err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]);
82 if (err) 83 if (err) {
83 return -EIO; 84 if (err == -EFAULT) /* Fix idiotic error code */
84 if (copy_to_user(tmp, &data, 8)) 85 err = -EIO;
85 return -EFAULT; 86 break;
87 }
88 if (copy_to_user(tmp, &data, 8)) {
89 err = -EFAULT;
90 break;
91 }
86 tmp += 2; 92 tmp += 2;
93 bytes += 8;
87 } 94 }
88 95
89 return ((char __user *)tmp) - buf; 96 return bytes ? bytes : err;
90} 97}
91 98
92static ssize_t msr_write(struct file *file, const char __user *buf, 99static ssize_t msr_write(struct file *file, const char __user *buf,
@@ -96,21 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
96 u32 data[2]; 103 u32 data[2];
97 u32 reg = *ppos; 104 u32 reg = *ppos;
98 int cpu = iminor(file->f_path.dentry->d_inode); 105 int cpu = iminor(file->f_path.dentry->d_inode);
99 int err; 106 int err = 0;
107 ssize_t bytes = 0;
100 108
101 if (count % 8) 109 if (count % 8)
102 return -EINVAL; /* Invalid chunk size */ 110 return -EINVAL; /* Invalid chunk size */
103 111
104 for (; count; count -= 8) { 112 for (; count; count -= 8) {
105 if (copy_from_user(&data, tmp, 8)) 113 if (copy_from_user(&data, tmp, 8)) {
106 return -EFAULT; 114 err = -EFAULT;
115 break;
116 }
107 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); 117 err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
108 if (err) 118 if (err) {
109 return -EIO; 119 if (err == -EFAULT) /* Fix idiotic error code */
120 err = -EIO;
121 break;
122 }
110 tmp += 2; 123 tmp += 2;
124 bytes += 8;
111 } 125 }
112 126
113 return ((char __user *)tmp) - buf; 127 return bytes ? bytes : err;
114} 128}
115 129
116static int msr_open(struct inode *inode, struct file *file) 130static int msr_open(struct inode *inode, struct file *file)
@@ -131,7 +145,7 @@ static int msr_open(struct inode *inode, struct file *file)
131 ret = -EIO; /* MSR not supported */ 145 ret = -EIO; /* MSR not supported */
132out: 146out:
133 unlock_kernel(); 147 unlock_kernel();
134 return 0; 148 return ret;
135} 149}
136 150
137/* 151/*
@@ -149,7 +163,7 @@ static int __cpuinit msr_device_create(int cpu)
149{ 163{
150 struct device *dev; 164 struct device *dev;
151 165
152 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), 166 dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL,
153 "msr%d", cpu); 167 "msr%d", cpu);
154 return IS_ERR(dev) ? PTR_ERR(dev) : 0; 168 return IS_ERR(dev) ? PTR_ERR(dev) : 0;
155} 169}
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index ec024b3baad0..2c97f07f1c2c 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data)
114} 114}
115#endif 115#endif
116 116
117static void report_broken_nmi(int cpu, int *prev_nmi_count)
118{
119 printk(KERN_CONT "\n");
120
121 printk(KERN_WARNING
122 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
123 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
124
125 printk(KERN_WARNING
126 "Please report this to bugzilla.kernel.org,\n");
127 printk(KERN_WARNING
128 "and attach the output of the 'dmesg' command.\n");
129
130 per_cpu(wd_enabled, cpu) = 0;
131 atomic_dec(&nmi_active);
132}
133
117int __init check_nmi_watchdog(void) 134int __init check_nmi_watchdog(void)
118{ 135{
119 unsigned int *prev_nmi_count; 136 unsigned int *prev_nmi_count;
@@ -141,15 +158,8 @@ int __init check_nmi_watchdog(void)
141 for_each_online_cpu(cpu) { 158 for_each_online_cpu(cpu) {
142 if (!per_cpu(wd_enabled, cpu)) 159 if (!per_cpu(wd_enabled, cpu))
143 continue; 160 continue;
144 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { 161 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
145 printk(KERN_WARNING "WARNING: CPU#%d: NMI " 162 report_broken_nmi(cpu, prev_nmi_count);
146 "appears to be stuck (%d->%d)!\n",
147 cpu,
148 prev_nmi_count[cpu],
149 get_nmi_count(cpu));
150 per_cpu(wd_enabled, cpu) = 0;
151 atomic_dec(&nmi_active);
152 }
153 } 163 }
154 endflag = 1; 164 endflag = 1;
155 if (!atomic_read(&nmi_active)) { 165 if (!atomic_read(&nmi_active)) {
@@ -263,7 +273,7 @@ late_initcall(init_lapic_nmi_sysfs);
263 273
264static void __acpi_nmi_enable(void *__unused) 274static void __acpi_nmi_enable(void *__unused)
265{ 275{
266 apic_write_around(APIC_LVT0, APIC_DM_NMI); 276 apic_write(APIC_LVT0, APIC_DM_NMI);
267} 277}
268 278
269/* 279/*
@@ -277,7 +287,7 @@ void acpi_nmi_enable(void)
277 287
278static void __acpi_nmi_disable(void *__unused) 288static void __acpi_nmi_disable(void *__unused)
279{ 289{
280 apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); 290 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
281} 291}
282 292
283/* 293/*
@@ -289,6 +299,15 @@ void acpi_nmi_disable(void)
289 on_each_cpu(__acpi_nmi_disable, NULL, 1); 299 on_each_cpu(__acpi_nmi_disable, NULL, 1);
290} 300}
291 301
302/*
303 * This function is called as soon the LAPIC NMI watchdog driver has everything
304 * in place and it's ready to check if the NMIs belong to the NMI watchdog
305 */
306void cpu_nmi_set_wd_enabled(void)
307{
308 __get_cpu_var(wd_enabled) = 1;
309}
310
292void setup_apic_nmi_watchdog(void *unused) 311void setup_apic_nmi_watchdog(void *unused)
293{ 312{
294 if (__get_cpu_var(wd_enabled)) 313 if (__get_cpu_var(wd_enabled))
@@ -301,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused)
301 320
302 switch (nmi_watchdog) { 321 switch (nmi_watchdog) {
303 case NMI_LOCAL_APIC: 322 case NMI_LOCAL_APIC:
304 /* enable it before to avoid race with handler */
305 __get_cpu_var(wd_enabled) = 1;
306 if (lapic_watchdog_init(nmi_hz) < 0) { 323 if (lapic_watchdog_init(nmi_hz) < 0) {
307 __get_cpu_var(wd_enabled) = 0; 324 __get_cpu_var(wd_enabled) = 0;
308 return; 325 return;
@@ -448,6 +465,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
448 465
449#ifdef CONFIG_SYSCTL 466#ifdef CONFIG_SYSCTL
450 467
468static int __init setup_unknown_nmi_panic(char *str)
469{
470 unknown_nmi_panic = 1;
471 return 1;
472}
473__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
474
451static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) 475static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
452{ 476{
453 unsigned char reason = get_nmi_reason(); 477 unsigned char reason = get_nmi_reason();
diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c
index a23e8233b9ac..4caff39078e0 100644
--- a/arch/x86/kernel/numaq_32.c
+++ b/arch/x86/kernel/numaq_32.c
@@ -33,6 +33,7 @@
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/mpspec.h> 34#include <asm/mpspec.h>
35#include <asm/e820.h> 35#include <asm/e820.h>
36#include <asm/setup.h>
36 37
37#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) 38#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
38 39
@@ -71,6 +72,195 @@ static void __init smp_dump_qct(void)
71 } 72 }
72} 73}
73 74
75
76void __cpuinit numaq_tsc_disable(void)
77{
78 if (!found_numaq)
79 return;
80
81 if (num_online_nodes() > 1) {
82 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
83 setup_clear_cpu_cap(X86_FEATURE_TSC);
84 }
85}
86
87static int __init numaq_pre_time_init(void)
88{
89 numaq_tsc_disable();
90 return 0;
91}
92
93int found_numaq;
94/*
95 * Have to match translation table entries to main table entries by counter
96 * hence the mpc_record variable .... can't see a less disgusting way of
97 * doing this ....
98 */
99struct mpc_config_translation {
100 unsigned char mpc_type;
101 unsigned char trans_len;
102 unsigned char trans_type;
103 unsigned char trans_quad;
104 unsigned char trans_global;
105 unsigned char trans_local;
106 unsigned short trans_reserved;
107};
108
109/* x86_quirks member */
110static int mpc_record;
111static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY]
112 __cpuinitdata;
113
114static inline int generate_logical_apicid(int quad, int phys_apicid)
115{
116 return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1);
117}
118
119/* x86_quirks member */
120static int mpc_apic_id(struct mpc_config_processor *m)
121{
122 int quad = translation_table[mpc_record]->trans_quad;
123 int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid);
124
125 printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n",
126 m->mpc_apicid,
127 (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8,
128 (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4,
129 m->mpc_apicver, quad, logical_apicid);
130 return logical_apicid;
131}
132
133int mp_bus_id_to_node[MAX_MP_BUSSES];
134
135int mp_bus_id_to_local[MAX_MP_BUSSES];
136
137/* x86_quirks member */
138static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name)
139{
140 int quad = translation_table[mpc_record]->trans_quad;
141 int local = translation_table[mpc_record]->trans_local;
142
143 mp_bus_id_to_node[m->mpc_busid] = quad;
144 mp_bus_id_to_local[m->mpc_busid] = local;
145 printk(KERN_INFO "Bus #%d is %s (node %d)\n",
146 m->mpc_busid, name, quad);
147}
148
149int quad_local_to_mp_bus_id [NR_CPUS/4][4];
150
151/* x86_quirks member */
152static void mpc_oem_pci_bus(struct mpc_config_bus *m)
153{
154 int quad = translation_table[mpc_record]->trans_quad;
155 int local = translation_table[mpc_record]->trans_local;
156
157 quad_local_to_mp_bus_id[quad][local] = m->mpc_busid;
158}
159
160static void __init MP_translation_info(struct mpc_config_translation *m)
161{
162 printk(KERN_INFO
163 "Translation: record %d, type %d, quad %d, global %d, local %d\n",
164 mpc_record, m->trans_type, m->trans_quad, m->trans_global,
165 m->trans_local);
166
167 if (mpc_record >= MAX_MPC_ENTRY)
168 printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n");
169 else
170 translation_table[mpc_record] = m; /* stash this for later */
171 if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad))
172 node_set_online(m->trans_quad);
173}
174
175static int __init mpf_checksum(unsigned char *mp, int len)
176{
177 int sum = 0;
178
179 while (len--)
180 sum += *mp++;
181
182 return sum & 0xFF;
183}
184
185/*
186 * Read/parse the MPC oem tables
187 */
188
189static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable,
190 unsigned short oemsize)
191{
192 int count = sizeof(*oemtable); /* the header size */
193 unsigned char *oemptr = ((unsigned char *)oemtable) + count;
194
195 mpc_record = 0;
196 printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n",
197 oemtable);
198 if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) {
199 printk(KERN_WARNING
200 "SMP mpc oemtable: bad signature [%c%c%c%c]!\n",
201 oemtable->oem_signature[0], oemtable->oem_signature[1],
202 oemtable->oem_signature[2], oemtable->oem_signature[3]);
203 return;
204 }
205 if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) {
206 printk(KERN_WARNING "SMP oem mptable: checksum error!\n");
207 return;
208 }
209 while (count < oemtable->oem_length) {
210 switch (*oemptr) {
211 case MP_TRANSLATION:
212 {
213 struct mpc_config_translation *m =
214 (struct mpc_config_translation *)oemptr;
215 MP_translation_info(m);
216 oemptr += sizeof(*m);
217 count += sizeof(*m);
218 ++mpc_record;
219 break;
220 }
221 default:
222 {
223 printk(KERN_WARNING
224 "Unrecognised OEM table entry type! - %d\n",
225 (int)*oemptr);
226 return;
227 }
228 }
229 }
230}
231
232static int __init numaq_setup_ioapic_ids(void)
233{
234 /* so can skip it */
235 return 1;
236}
237
238static struct x86_quirks numaq_x86_quirks __initdata = {
239 .arch_pre_time_init = numaq_pre_time_init,
240 .arch_time_init = NULL,
241 .arch_pre_intr_init = NULL,
242 .arch_memory_setup = NULL,
243 .arch_intr_init = NULL,
244 .arch_trap_init = NULL,
245 .mach_get_smp_config = NULL,
246 .mach_find_smp_config = NULL,
247 .mpc_record = &mpc_record,
248 .mpc_apic_id = mpc_apic_id,
249 .mpc_oem_bus_info = mpc_oem_bus_info,
250 .mpc_oem_pci_bus = mpc_oem_pci_bus,
251 .smp_read_mpc_oem = smp_read_mpc_oem,
252 .setup_ioapic_ids = numaq_setup_ioapic_ids,
253};
254
255void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem,
256 char *productid)
257{
258 if (strncmp(oem, "IBM NUMA", 8))
259 printk("Warning! Not a NUMA-Q system!\n");
260 else
261 found_numaq = 1;
262}
263
74static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
75{ 265{
76 /* 266 /*
@@ -82,6 +272,9 @@ static __init void early_check_numaq(void)
82 */ 272 */
83 if (smp_found_config) 273 if (smp_found_config)
84 early_get_smp_config(); 274 early_get_smp_config();
275
276 if (found_numaq)
277 x86_quirks = &numaq_x86_quirks;
85} 278}
86 279
87int __init get_memcfg_numaq(void) 280int __init get_memcfg_numaq(void)
@@ -92,14 +285,3 @@ int __init get_memcfg_numaq(void)
92 smp_dump_qct(); 285 smp_dump_qct();
93 return 1; 286 return 1;
94} 287}
95
96void __init numaq_tsc_disable(void)
97{
98 if (!found_numaq)
99 return;
100
101 if (num_online_nodes() > 1) {
102 printk(KERN_DEBUG "NUMAQ: disabling TSC\n");
103 setup_clear_cpu_cap(X86_FEATURE_TSC);
104 }
105}
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 3e6672274807..7a13fac63a1f 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd);
190static void __init platform_detect(void) 190static void __init platform_detect(void)
191{ 191{
192 size_t propsize; 192 size_t propsize;
193 u32 rev; 193 __be32 rev;
194 194
195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, 195 if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
196 &propsize) || propsize != 4) { 196 &propsize) || propsize != 4) {
197 printk(KERN_ERR "ofw: getprop call failed!\n"); 197 printk(KERN_ERR "ofw: getprop call failed!\n");
198 rev = 0; 198 rev = cpu_to_be32(0);
199 } 199 }
200 olpc_platform_info.boardrev = be32_to_cpu(rev); 200 olpc_platform_info.boardrev = be32_to_cpu(rev);
201} 201}
@@ -203,7 +203,7 @@ static void __init platform_detect(void)
203static void __init platform_detect(void) 203static void __init platform_detect(void)
204{ 204{
205 /* stopgap until OFW support is added to the kernel */ 205 /* stopgap until OFW support is added to the kernel */
206 olpc_platform_info.boardrev = be32_to_cpu(0xc2); 206 olpc_platform_info.boardrev = 0xc2;
207} 207}
208#endif 208#endif
209 209
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
new file mode 100644
index 000000000000..0e9f1982b1dd
--- /dev/null
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -0,0 +1,37 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/spinlock.h>
6#include <linux/module.h>
7
8#include <asm/paravirt.h>
9
10static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
11{
12 __raw_spin_lock(lock);
13}
14
15struct pv_lock_ops pv_lock_ops = {
16#ifdef CONFIG_SMP
17 .spin_is_locked = __ticket_spin_is_locked,
18 .spin_is_contended = __ticket_spin_is_contended,
19
20 .spin_lock = __ticket_spin_lock,
21 .spin_lock_flags = default_spin_lock_flags,
22 .spin_trylock = __ticket_spin_trylock,
23 .spin_unlock = __ticket_spin_unlock,
24#endif
25};
26EXPORT_SYMBOL(pv_lock_ops);
27
28void __init paravirt_use_bytelocks(void)
29{
30#ifdef CONFIG_SMP
31 pv_lock_ops.spin_is_locked = __byte_spin_is_locked;
32 pv_lock_ops.spin_is_contended = __byte_spin_is_contended;
33 pv_lock_ops.spin_lock = __byte_spin_lock;
34 pv_lock_ops.spin_trylock = __byte_spin_trylock;
35 pv_lock_ops.spin_unlock = __byte_spin_unlock;
36#endif
37}
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index e0f571d58c19..e4c8fb608873 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -29,6 +29,7 @@
29#include <asm/desc.h> 29#include <asm/desc.h>
30#include <asm/setup.h> 30#include <asm/setup.h>
31#include <asm/arch_hooks.h> 31#include <asm/arch_hooks.h>
32#include <asm/pgtable.h>
32#include <asm/time.h> 33#include <asm/time.h>
33#include <asm/pgalloc.h> 34#include <asm/pgalloc.h>
34#include <asm/irq.h> 35#include <asm/irq.h>
@@ -123,6 +124,7 @@ static void *get_call_destination(u8 type)
123 .pv_irq_ops = pv_irq_ops, 124 .pv_irq_ops = pv_irq_ops,
124 .pv_apic_ops = pv_apic_ops, 125 .pv_apic_ops = pv_apic_ops,
125 .pv_mmu_ops = pv_mmu_ops, 126 .pv_mmu_ops = pv_mmu_ops,
127 .pv_lock_ops = pv_lock_ops,
126 }; 128 };
127 return *((void **)&tmpl + type); 129 return *((void **)&tmpl + type);
128} 130}
@@ -317,6 +319,7 @@ struct pv_cpu_ops pv_cpu_ops = {
317#endif 319#endif
318 .wbinvd = native_wbinvd, 320 .wbinvd = native_wbinvd,
319 .read_msr = native_read_msr_safe, 321 .read_msr = native_read_msr_safe,
322 .read_msr_amd = native_read_msr_amd_safe,
320 .write_msr = native_write_msr_safe, 323 .write_msr = native_write_msr_safe,
321 .read_tsc = native_read_tsc, 324 .read_tsc = native_read_tsc,
322 .read_pmc = native_read_pmc, 325 .read_pmc = native_read_pmc,
@@ -335,6 +338,10 @@ struct pv_cpu_ops pv_cpu_ops = {
335 .write_ldt_entry = native_write_ldt_entry, 338 .write_ldt_entry = native_write_ldt_entry,
336 .write_gdt_entry = native_write_gdt_entry, 339 .write_gdt_entry = native_write_gdt_entry,
337 .write_idt_entry = native_write_idt_entry, 340 .write_idt_entry = native_write_idt_entry,
341
342 .alloc_ldt = paravirt_nop,
343 .free_ldt = paravirt_nop,
344
338 .load_sp0 = native_load_sp0, 345 .load_sp0 = native_load_sp0,
339 346
340#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) 347#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
@@ -360,9 +367,6 @@ struct pv_cpu_ops pv_cpu_ops = {
360 367
361struct pv_apic_ops pv_apic_ops = { 368struct pv_apic_ops pv_apic_ops = {
362#ifdef CONFIG_X86_LOCAL_APIC 369#ifdef CONFIG_X86_LOCAL_APIC
363 .apic_write = native_apic_write,
364 .apic_write_atomic = native_apic_write_atomic,
365 .apic_read = native_apic_read,
366 .setup_boot_clock = setup_boot_APIC_clock, 370 .setup_boot_clock = setup_boot_APIC_clock,
367 .setup_secondary_clock = setup_secondary_APIC_clock, 371 .setup_secondary_clock = setup_secondary_APIC_clock,
368 .startup_ipi_hook = paravirt_nop, 372 .startup_ipi_hook = paravirt_nop,
@@ -373,6 +377,9 @@ struct pv_mmu_ops pv_mmu_ops = {
373#ifndef CONFIG_X86_64 377#ifndef CONFIG_X86_64
374 .pagetable_setup_start = native_pagetable_setup_start, 378 .pagetable_setup_start = native_pagetable_setup_start,
375 .pagetable_setup_done = native_pagetable_setup_done, 379 .pagetable_setup_done = native_pagetable_setup_done,
380#else
381 .pagetable_setup_start = paravirt_nop,
382 .pagetable_setup_done = paravirt_nop,
376#endif 383#endif
377 384
378 .read_cr2 = native_read_cr2, 385 .read_cr2 = native_read_cr2,
@@ -428,7 +435,7 @@ struct pv_mmu_ops pv_mmu_ops = {
428#endif /* PAGETABLE_LEVELS >= 3 */ 435#endif /* PAGETABLE_LEVELS >= 3 */
429 436
430 .pte_val = native_pte_val, 437 .pte_val = native_pte_val,
431 .pte_flags = native_pte_val, 438 .pte_flags = native_pte_flags,
432 .pgd_val = native_pgd_val, 439 .pgd_val = native_pgd_val,
433 440
434 .make_pte = native_make_pte, 441 .make_pte = native_make_pte,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index 58262218781b..9fe644f4861d 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
23 start = start_##ops##_##x; \ 23 start = start_##ops##_##x; \
24 end = end_##ops##_##x; \ 24 end = end_##ops##_##x; \
25 goto patch_site 25 goto patch_site
26 switch(type) { 26 switch (type) {
27 PATCH_SITE(pv_irq_ops, irq_disable); 27 PATCH_SITE(pv_irq_ops, irq_disable);
28 PATCH_SITE(pv_irq_ops, irq_enable); 28 PATCH_SITE(pv_irq_ops, irq_enable);
29 PATCH_SITE(pv_irq_ops, restore_fl); 29 PATCH_SITE(pv_irq_ops, restore_fl);
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 6959b5c45df4..e1e731d78f38 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -29,6 +29,7 @@
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/spinlock.h> 30#include <linux/spinlock.h>
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h>
32#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
33#include <linux/bitops.h> 34#include <linux/bitops.h>
34#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
@@ -36,7 +37,8 @@
36#include <linux/delay.h> 37#include <linux/delay.h>
37#include <linux/scatterlist.h> 38#include <linux/scatterlist.h>
38#include <linux/iommu-helper.h> 39#include <linux/iommu-helper.h>
39#include <asm/gart.h> 40
41#include <asm/iommu.h>
40#include <asm/calgary.h> 42#include <asm/calgary.h>
41#include <asm/tce.h> 43#include <asm/tce.h>
42#include <asm/pci-direct.h> 44#include <asm/pci-direct.h>
@@ -167,6 +169,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl);
167static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); 169static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev);
168static void calioc2_tce_cache_blast(struct iommu_table *tbl); 170static void calioc2_tce_cache_blast(struct iommu_table *tbl);
169static void calioc2_dump_error_regs(struct iommu_table *tbl); 171static void calioc2_dump_error_regs(struct iommu_table *tbl);
172static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl);
173static void get_tce_space_from_tar(void);
170 174
171static struct cal_chipset_ops calgary_chip_ops = { 175static struct cal_chipset_ops calgary_chip_ops = {
172 .handle_quirks = calgary_handle_quirks, 176 .handle_quirks = calgary_handle_quirks,
@@ -213,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap,
213 217
214#endif /* CONFIG_IOMMU_DEBUG */ 218#endif /* CONFIG_IOMMU_DEBUG */
215 219
216static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen)
217{
218 unsigned int npages;
219
220 npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK);
221 npages >>= PAGE_SHIFT;
222
223 return npages;
224}
225
226static inline int translation_enabled(struct iommu_table *tbl) 220static inline int translation_enabled(struct iommu_table *tbl)
227{ 221{
228 /* only PHBs with translation enabled have an IOMMU table */ 222 /* only PHBs with translation enabled have an IOMMU table */
@@ -257,7 +251,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
257 badbit, tbl, start_addr, npages); 251 badbit, tbl, start_addr, npages);
258 } 252 }
259 253
260 set_bit_string(tbl->it_map, index, npages); 254 iommu_area_reserve(tbl->it_map, index, npages);
261 255
262 spin_unlock_irqrestore(&tbl->it_lock, flags); 256 spin_unlock_irqrestore(&tbl->it_lock, flags);
263} 257}
@@ -339,9 +333,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
339 /* were we called with bad_dma_address? */ 333 /* were we called with bad_dma_address? */
340 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 334 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE);
341 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 335 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) {
342 printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " 336 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
343 "address 0x%Lx\n", dma_addr); 337 "address 0x%Lx\n", dma_addr);
344 WARN_ON(1);
345 return; 338 return;
346 } 339 }
347 340
@@ -405,27 +398,11 @@ static void calgary_unmap_sg(struct device *dev,
405 if (dmalen == 0) 398 if (dmalen == 0)
406 break; 399 break;
407 400
408 npages = num_dma_pages(dma, dmalen); 401 npages = iommu_num_pages(dma, dmalen, PAGE_SIZE);
409 iommu_free(tbl, dma, npages); 402 iommu_free(tbl, dma, npages);
410 } 403 }
411} 404}
412 405
413static int calgary_nontranslate_map_sg(struct device* dev,
414 struct scatterlist *sg, int nelems, int direction)
415{
416 struct scatterlist *s;
417 int i;
418
419 for_each_sg(sg, s, nelems, i) {
420 struct page *p = sg_page(s);
421
422 BUG_ON(!p);
423 s->dma_address = virt_to_bus(sg_virt(s));
424 s->dma_length = s->length;
425 }
426 return nelems;
427}
428
429static int calgary_map_sg(struct device *dev, struct scatterlist *sg, 406static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
430 int nelems, int direction) 407 int nelems, int direction)
431{ 408{
@@ -436,14 +413,11 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
436 unsigned long entry; 413 unsigned long entry;
437 int i; 414 int i;
438 415
439 if (!translation_enabled(tbl))
440 return calgary_nontranslate_map_sg(dev, sg, nelems, direction);
441
442 for_each_sg(sg, s, nelems, i) { 416 for_each_sg(sg, s, nelems, i) {
443 BUG_ON(!sg_page(s)); 417 BUG_ON(!sg_page(s));
444 418
445 vaddr = (unsigned long) sg_virt(s); 419 vaddr = (unsigned long) sg_virt(s);
446 npages = num_dma_pages(vaddr, s->length); 420 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
447 421
448 entry = iommu_range_alloc(dev, tbl, npages); 422 entry = iommu_range_alloc(dev, tbl, npages);
449 if (entry == bad_dma_address) { 423 if (entry == bad_dma_address) {
@@ -474,21 +448,15 @@ error:
474static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, 448static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr,
475 size_t size, int direction) 449 size_t size, int direction)
476{ 450{
477 dma_addr_t dma_handle = bad_dma_address;
478 void *vaddr = phys_to_virt(paddr); 451 void *vaddr = phys_to_virt(paddr);
479 unsigned long uaddr; 452 unsigned long uaddr;
480 unsigned int npages; 453 unsigned int npages;
481 struct iommu_table *tbl = find_iommu_table(dev); 454 struct iommu_table *tbl = find_iommu_table(dev);
482 455
483 uaddr = (unsigned long)vaddr; 456 uaddr = (unsigned long)vaddr;
484 npages = num_dma_pages(uaddr, size); 457 npages = iommu_num_pages(uaddr, size, PAGE_SIZE);
485 458
486 if (translation_enabled(tbl)) 459 return iommu_alloc(dev, tbl, vaddr, npages, direction);
487 dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction);
488 else
489 dma_handle = virt_to_bus(vaddr);
490
491 return dma_handle;
492} 460}
493 461
494static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, 462static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
@@ -497,10 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle,
497 struct iommu_table *tbl = find_iommu_table(dev); 465 struct iommu_table *tbl = find_iommu_table(dev);
498 unsigned int npages; 466 unsigned int npages;
499 467
500 if (!translation_enabled(tbl)) 468 npages = iommu_num_pages(dma_handle, size, PAGE_SIZE);
501 return;
502
503 npages = num_dma_pages(dma_handle, size);
504 iommu_free(tbl, dma_handle, npages); 469 iommu_free(tbl, dma_handle, npages);
505} 470}
506 471
@@ -516,24 +481,20 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
516 npages = size >> PAGE_SHIFT; 481 npages = size >> PAGE_SHIFT;
517 order = get_order(size); 482 order = get_order(size);
518 483
484 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
485
519 /* alloc enough pages (and possibly more) */ 486 /* alloc enough pages (and possibly more) */
520 ret = (void *)__get_free_pages(flag, order); 487 ret = (void *)__get_free_pages(flag, order);
521 if (!ret) 488 if (!ret)
522 goto error; 489 goto error;
523 memset(ret, 0, size); 490 memset(ret, 0, size);
524 491
525 if (translation_enabled(tbl)) { 492 /* set up tces to cover the allocated range */
526 /* set up tces to cover the allocated range */ 493 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
527 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 494 if (mapping == bad_dma_address)
528 if (mapping == bad_dma_address) 495 goto free;
529 goto free; 496 *dma_handle = mapping;
530
531 *dma_handle = mapping;
532 } else /* non translated slot */
533 *dma_handle = virt_to_bus(ret);
534
535 return ret; 497 return ret;
536
537free: 498free:
538 free_pages((unsigned long)ret, get_order(size)); 499 free_pages((unsigned long)ret, get_order(size));
539 ret = NULL; 500 ret = NULL;
@@ -541,8 +502,22 @@ error:
541 return ret; 502 return ret;
542} 503}
543 504
544static const struct dma_mapping_ops calgary_dma_ops = { 505static void calgary_free_coherent(struct device *dev, size_t size,
506 void *vaddr, dma_addr_t dma_handle)
507{
508 unsigned int npages;
509 struct iommu_table *tbl = find_iommu_table(dev);
510
511 size = PAGE_ALIGN(size);
512 npages = size >> PAGE_SHIFT;
513
514 iommu_free(tbl, dma_handle, npages);
515 free_pages((unsigned long)vaddr, get_order(size));
516}
517
518static struct dma_mapping_ops calgary_dma_ops = {
545 .alloc_coherent = calgary_alloc_coherent, 519 .alloc_coherent = calgary_alloc_coherent,
520 .free_coherent = calgary_free_coherent,
546 .map_single = calgary_map_single, 521 .map_single = calgary_map_single,
547 .unmap_single = calgary_unmap_single, 522 .unmap_single = calgary_unmap_single,
548 .map_sg = calgary_map_sg, 523 .map_sg = calgary_map_sg,
@@ -830,7 +805,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar)
830 805
831 tbl = pci_iommu(dev->bus); 806 tbl = pci_iommu(dev->bus);
832 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; 807 tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space;
833 tce_free(tbl, 0, tbl->it_size); 808
809 if (is_kdump_kernel())
810 calgary_init_bitmap_from_tce_table(tbl);
811 else
812 tce_free(tbl, 0, tbl->it_size);
834 813
835 if (is_calgary(dev->device)) 814 if (is_calgary(dev->device))
836 tbl->chip_ops = &calgary_chip_ops; 815 tbl->chip_ops = &calgary_chip_ops;
@@ -1209,6 +1188,10 @@ static int __init calgary_init(void)
1209 if (ret) 1188 if (ret)
1210 return ret; 1189 return ret;
1211 1190
1191 /* Purely for kdump kernel case */
1192 if (is_kdump_kernel())
1193 get_tce_space_from_tar();
1194
1212 do { 1195 do {
1213 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); 1196 dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev);
1214 if (!dev) 1197 if (!dev)
@@ -1230,6 +1213,16 @@ static int __init calgary_init(void)
1230 goto error; 1213 goto error;
1231 } while (1); 1214 } while (1);
1232 1215
1216 dev = NULL;
1217 for_each_pci_dev(dev) {
1218 struct iommu_table *tbl;
1219
1220 tbl = find_iommu_table(&dev->dev);
1221
1222 if (translation_enabled(tbl))
1223 dev->dev.archdata.dma_ops = &calgary_dma_ops;
1224 }
1225
1233 return ret; 1226 return ret;
1234 1227
1235error: 1228error:
@@ -1251,6 +1244,7 @@ error:
1251 calgary_disable_translation(dev); 1244 calgary_disable_translation(dev);
1252 calgary_free_bus(dev); 1245 calgary_free_bus(dev);
1253 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ 1246 pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */
1247 dev->dev.archdata.dma_ops = NULL;
1254 } while (1); 1248 } while (1);
1255 1249
1256 return ret; 1250 return ret;
@@ -1280,13 +1274,15 @@ static inline int __init determine_tce_table_size(u64 ram)
1280static int __init build_detail_arrays(void) 1274static int __init build_detail_arrays(void)
1281{ 1275{
1282 unsigned long ptr; 1276 unsigned long ptr;
1283 int i, scal_detail_size, rio_detail_size; 1277 unsigned numnodes, i;
1278 int scal_detail_size, rio_detail_size;
1284 1279
1285 if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ 1280 numnodes = rio_table_hdr->num_scal_dev;
1281 if (numnodes > MAX_NUMNODES){
1286 printk(KERN_WARNING 1282 printk(KERN_WARNING
1287 "Calgary: MAX_NUMNODES too low! Defined as %d, " 1283 "Calgary: MAX_NUMNODES too low! Defined as %d, "
1288 "but system has %d nodes.\n", 1284 "but system has %d nodes.\n",
1289 MAX_NUMNODES, rio_table_hdr->num_scal_dev); 1285 MAX_NUMNODES, numnodes);
1290 return -ENODEV; 1286 return -ENODEV;
1291 } 1287 }
1292 1288
@@ -1307,8 +1303,7 @@ static int __init build_detail_arrays(void)
1307 } 1303 }
1308 1304
1309 ptr = ((unsigned long)rio_table_hdr) + 3; 1305 ptr = ((unsigned long)rio_table_hdr) + 3;
1310 for (i = 0; i < rio_table_hdr->num_scal_dev; 1306 for (i = 0; i < numnodes; i++, ptr += scal_detail_size)
1311 i++, ptr += scal_detail_size)
1312 scal_devs[i] = (struct scal_detail *)ptr; 1307 scal_devs[i] = (struct scal_detail *)ptr;
1313 1308
1314 for (i = 0; i < rio_table_hdr->num_rio_dev; 1309 for (i = 0; i < rio_table_hdr->num_rio_dev;
@@ -1339,6 +1334,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1339 return (val != 0xffffffff); 1334 return (val != 0xffffffff);
1340} 1335}
1341 1336
1337/*
1338 * calgary_init_bitmap_from_tce_table():
1339 * Function for kdump case. In the second/kdump kernel initialize
1340 * the bitmap based on the tce table entries obtained from first kernel
1341 */
1342static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1343{
1344 u64 *tp;
1345 unsigned int index;
1346 tp = ((u64 *)tbl->it_base);
1347 for (index = 0 ; index < tbl->it_size; index++) {
1348 if (*tp != 0x0)
1349 set_bit(index, tbl->it_map);
1350 tp++;
1351 }
1352}
1353
1354/*
1355 * get_tce_space_from_tar():
1356 * Function for kdump case. Get the tce tables from first kernel
1357 * by reading the contents of the base address register of calgary iommu
1358 */
1359static void __init get_tce_space_from_tar(void)
1360{
1361 int bus;
1362 void __iomem *target;
1363 unsigned long tce_space;
1364
1365 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1366 struct calgary_bus_info *info = &bus_info[bus];
1367 unsigned short pci_device;
1368 u32 val;
1369
1370 val = read_pci_config(bus, 0, 0, 0);
1371 pci_device = (val & 0xFFFF0000) >> 16;
1372
1373 if (!is_cal_pci_dev(pci_device))
1374 continue;
1375 if (info->translation_disabled)
1376 continue;
1377
1378 if (calgary_bus_has_devices(bus, pci_device) ||
1379 translate_empty_slots) {
1380 target = calgary_reg(bus_info[bus].bbar,
1381 tar_offset(bus));
1382 tce_space = be64_to_cpu(readq(target));
1383 tce_space = tce_space & TAR_SW_BITS;
1384
1385 tce_space = tce_space & (~specified_table_size);
1386 info->tce_space = (u64 *)__va(tce_space);
1387 }
1388 }
1389 return;
1390}
1391
1342void __init detect_calgary(void) 1392void __init detect_calgary(void)
1343{ 1393{
1344 int bus; 1394 int bus;
@@ -1394,7 +1444,8 @@ void __init detect_calgary(void)
1394 return; 1444 return;
1395 } 1445 }
1396 1446
1397 specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); 1447 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
1448 saved_max_pfn : max_pfn) * PAGE_SIZE);
1398 1449
1399 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { 1450 for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) {
1400 struct calgary_bus_info *info = &bus_info[bus]; 1451 struct calgary_bus_info *info = &bus_info[bus];
@@ -1412,10 +1463,16 @@ void __init detect_calgary(void)
1412 1463
1413 if (calgary_bus_has_devices(bus, pci_device) || 1464 if (calgary_bus_has_devices(bus, pci_device) ||
1414 translate_empty_slots) { 1465 translate_empty_slots) {
1415 tbl = alloc_tce_table(); 1466 /*
1416 if (!tbl) 1467 * If it is kdump kernel, find and use tce tables
1417 goto cleanup; 1468 * from first kernel, else allocate tce tables here
1418 info->tce_space = tbl; 1469 */
1470 if (!is_kdump_kernel()) {
1471 tbl = alloc_tce_table();
1472 if (!tbl)
1473 goto cleanup;
1474 info->tce_space = tbl;
1475 }
1419 calgary_found = 1; 1476 calgary_found = 1;
1420 } 1477 }
1421 } 1478 }
@@ -1430,6 +1487,10 @@ void __init detect_calgary(void)
1430 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " 1487 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
1431 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, 1488 "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
1432 debugging ? "enabled" : "disabled"); 1489 debugging ? "enabled" : "disabled");
1490
1491 /* swiotlb for devices that aren't behind the Calgary. */
1492 if (max_pfn > MAX_DMA32_PFN)
1493 swiotlb = 1;
1433 } 1494 }
1434 return; 1495 return;
1435 1496
@@ -1446,7 +1507,7 @@ int __init calgary_iommu_init(void)
1446{ 1507{
1447 int ret; 1508 int ret;
1448 1509
1449 if (no_iommu || swiotlb) 1510 if (no_iommu || (swiotlb && !calgary_detected))
1450 return -ENODEV; 1511 return -ENODEV;
1451 1512
1452 if (!calgary_detected) 1513 if (!calgary_detected)
@@ -1459,15 +1520,14 @@ int __init calgary_iommu_init(void)
1459 if (ret) { 1520 if (ret) {
1460 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " 1521 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1461 "falling back to no_iommu\n", ret); 1522 "falling back to no_iommu\n", ret);
1462 if (max_pfn > MAX_DMA32_PFN)
1463 printk(KERN_ERR "WARNING more than 4GB of memory, "
1464 "32bit PCI may malfunction.\n");
1465 return ret; 1523 return ret;
1466 } 1524 }
1467 1525
1468 force_iommu = 1; 1526 force_iommu = 1;
1469 bad_dma_address = 0x0; 1527 bad_dma_address = 0x0;
1470 dma_ops = &calgary_dma_ops; 1528 /* dma_ops is set to swiotlb or nommu */
1529 if (!dma_ops)
1530 dma_ops = &nommu_dma_ops;
1471 1531
1472 return 0; 1532 return 0;
1473} 1533}
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 8467ec2320f1..1972266e8ba5 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -5,14 +5,11 @@
5 5
6#include <asm/proto.h> 6#include <asm/proto.h>
7#include <asm/dma.h> 7#include <asm/dma.h>
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/calgary.h> 9#include <asm/calgary.h>
10#include <asm/amd_iommu.h> 10#include <asm/amd_iommu.h>
11 11
12int forbid_dac __read_mostly; 12struct dma_mapping_ops *dma_ops;
13EXPORT_SYMBOL(forbid_dac);
14
15const struct dma_mapping_ops *dma_ops;
16EXPORT_SYMBOL(dma_ops); 13EXPORT_SYMBOL(dma_ops);
17 14
18static int iommu_sac_force __read_mostly; 15static int iommu_sac_force __read_mostly;
@@ -42,11 +39,12 @@ EXPORT_SYMBOL(bad_dma_address);
42/* Dummy device used for NULL arguments (normally ISA). Better would 39/* Dummy device used for NULL arguments (normally ISA). Better would
43 be probably a smaller DMA mask, but this is bug-to-bug compatible 40 be probably a smaller DMA mask, but this is bug-to-bug compatible
44 to older i386. */ 41 to older i386. */
45struct device fallback_dev = { 42struct device x86_dma_fallback_dev = {
46 .bus_id = "fallback device", 43 .bus_id = "fallback device",
47 .coherent_dma_mask = DMA_32BIT_MASK, 44 .coherent_dma_mask = DMA_32BIT_MASK,
48 .dma_mask = &fallback_dev.coherent_dma_mask, 45 .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask,
49}; 46};
47EXPORT_SYMBOL(x86_dma_fallback_dev);
50 48
51int dma_set_mask(struct device *dev, u64 mask) 49int dma_set_mask(struct device *dev, u64 mask)
52{ 50{
@@ -83,7 +81,7 @@ void __init dma32_reserve_bootmem(void)
83 * using 512M as goal 81 * using 512M as goal
84 */ 82 */
85 align = 64ULL<<20; 83 align = 64ULL<<20;
86 size = round_up(dma32_bootmem_size, align); 84 size = roundup(dma32_bootmem_size, align);
87 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 85 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
88 512ULL<<20); 86 512ULL<<20);
89 if (dma32_bootmem_ptr) 87 if (dma32_bootmem_ptr)
@@ -114,24 +112,57 @@ void __init pci_iommu_alloc(void)
114 * The order of these functions is important for 112 * The order of these functions is important for
115 * fall-back/fail-over reasons 113 * fall-back/fail-over reasons
116 */ 114 */
117#ifdef CONFIG_GART_IOMMU
118 gart_iommu_hole_init(); 115 gart_iommu_hole_init();
119#endif
120 116
121#ifdef CONFIG_CALGARY_IOMMU
122 detect_calgary(); 117 detect_calgary();
123#endif
124 118
125 detect_intel_iommu(); 119 detect_intel_iommu();
126 120
127 amd_iommu_detect(); 121 amd_iommu_detect();
128 122
129#ifdef CONFIG_SWIOTLB
130 pci_swiotlb_init(); 123 pci_swiotlb_init();
131#endif
132} 124}
125
126unsigned long iommu_nr_pages(unsigned long addr, unsigned long len)
127{
128 unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE);
129
130 return size >> PAGE_SHIFT;
131}
132EXPORT_SYMBOL(iommu_nr_pages);
133#endif 133#endif
134 134
135void *dma_generic_alloc_coherent(struct device *dev, size_t size,
136 dma_addr_t *dma_addr, gfp_t flag)
137{
138 unsigned long dma_mask;
139 struct page *page;
140 dma_addr_t addr;
141
142 dma_mask = dma_alloc_coherent_mask(dev, flag);
143
144 flag |= __GFP_ZERO;
145again:
146 page = alloc_pages_node(dev_to_node(dev), flag, get_order(size));
147 if (!page)
148 return NULL;
149
150 addr = page_to_phys(page);
151 if (!is_buffer_dma_capable(dma_mask, addr, size)) {
152 __free_pages(page, get_order(size));
153
154 if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) {
155 flag = (flag & ~GFP_DMA32) | GFP_DMA;
156 goto again;
157 }
158
159 return NULL;
160 }
161
162 *dma_addr = addr;
163 return page_address(page);
164}
165
135/* 166/*
136 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter 167 * See <Documentation/x86_64/boot-options.txt> for the iommu kernel parameter
137 * documentation. 168 * documentation.
@@ -184,9 +215,7 @@ static __init int iommu_setup(char *p)
184 swiotlb = 1; 215 swiotlb = 1;
185#endif 216#endif
186 217
187#ifdef CONFIG_GART_IOMMU
188 gart_parse_options(p); 218 gart_parse_options(p);
189#endif
190 219
191#ifdef CONFIG_CALGARY_IOMMU 220#ifdef CONFIG_CALGARY_IOMMU
192 if (!strncmp(p, "calgary", 7)) 221 if (!strncmp(p, "calgary", 7))
@@ -201,136 +230,19 @@ static __init int iommu_setup(char *p)
201} 230}
202early_param("iommu", iommu_setup); 231early_param("iommu", iommu_setup);
203 232
204#ifdef CONFIG_X86_32
205int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
206 dma_addr_t device_addr, size_t size, int flags)
207{
208 void __iomem *mem_base = NULL;
209 int pages = size >> PAGE_SHIFT;
210 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
211
212 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
213 goto out;
214 if (!size)
215 goto out;
216 if (dev->dma_mem)
217 goto out;
218
219 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
220
221 mem_base = ioremap(bus_addr, size);
222 if (!mem_base)
223 goto out;
224
225 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
226 if (!dev->dma_mem)
227 goto out;
228 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
229 if (!dev->dma_mem->bitmap)
230 goto free1_out;
231
232 dev->dma_mem->virt_base = mem_base;
233 dev->dma_mem->device_base = device_addr;
234 dev->dma_mem->size = pages;
235 dev->dma_mem->flags = flags;
236
237 if (flags & DMA_MEMORY_MAP)
238 return DMA_MEMORY_MAP;
239
240 return DMA_MEMORY_IO;
241
242 free1_out:
243 kfree(dev->dma_mem);
244 out:
245 if (mem_base)
246 iounmap(mem_base);
247 return 0;
248}
249EXPORT_SYMBOL(dma_declare_coherent_memory);
250
251void dma_release_declared_memory(struct device *dev)
252{
253 struct dma_coherent_mem *mem = dev->dma_mem;
254
255 if (!mem)
256 return;
257 dev->dma_mem = NULL;
258 iounmap(mem->virt_base);
259 kfree(mem->bitmap);
260 kfree(mem);
261}
262EXPORT_SYMBOL(dma_release_declared_memory);
263
264void *dma_mark_declared_memory_occupied(struct device *dev,
265 dma_addr_t device_addr, size_t size)
266{
267 struct dma_coherent_mem *mem = dev->dma_mem;
268 int pos, err;
269 int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1);
270
271 pages >>= PAGE_SHIFT;
272
273 if (!mem)
274 return ERR_PTR(-EINVAL);
275
276 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
277 err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages));
278 if (err != 0)
279 return ERR_PTR(err);
280 return mem->virt_base + (pos << PAGE_SHIFT);
281}
282EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
283
284static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size,
285 dma_addr_t *dma_handle, void **ret)
286{
287 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
288 int order = get_order(size);
289
290 if (mem) {
291 int page = bitmap_find_free_region(mem->bitmap, mem->size,
292 order);
293 if (page >= 0) {
294 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
295 *ret = mem->virt_base + (page << PAGE_SHIFT);
296 memset(*ret, 0, size);
297 }
298 if (mem->flags & DMA_MEMORY_EXCLUSIVE)
299 *ret = NULL;
300 }
301 return (mem != NULL);
302}
303
304static int dma_release_coherent(struct device *dev, int order, void *vaddr)
305{
306 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
307
308 if (mem && vaddr >= mem->virt_base && vaddr <
309 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
310 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
311
312 bitmap_release_region(mem->bitmap, page, order);
313 return 1;
314 }
315 return 0;
316}
317#else
318#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0)
319#define dma_release_coherent(dev, order, vaddr) (0)
320#endif /* CONFIG_X86_32 */
321
322int dma_supported(struct device *dev, u64 mask) 233int dma_supported(struct device *dev, u64 mask)
323{ 234{
235 struct dma_mapping_ops *ops = get_dma_ops(dev);
236
324#ifdef CONFIG_PCI 237#ifdef CONFIG_PCI
325 if (mask > 0xffffffff && forbid_dac > 0) { 238 if (mask > 0xffffffff && forbid_dac > 0) {
326 printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", 239 dev_info(dev, "PCI: Disallowing DAC for device\n");
327 dev->bus_id);
328 return 0; 240 return 0;
329 } 241 }
330#endif 242#endif
331 243
332 if (dma_ops->dma_supported) 244 if (ops->dma_supported)
333 return dma_ops->dma_supported(dev, mask); 245 return ops->dma_supported(dev, mask);
334 246
335 /* Copied from i386. Doesn't make much sense, because it will 247 /* Copied from i386. Doesn't make much sense, because it will
336 only work for pci_alloc_coherent. 248 only work for pci_alloc_coherent.
@@ -351,8 +263,7 @@ int dma_supported(struct device *dev, u64 mask)
351 type. Normally this doesn't make any difference, but gives 263 type. Normally this doesn't make any difference, but gives
352 more gentle handling of IOMMU overflow. */ 264 more gentle handling of IOMMU overflow. */
353 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { 265 if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) {
354 printk(KERN_INFO "%s: Force SAC with mask %Lx\n", 266 dev_info(dev, "Force SAC with mask %Lx\n", mask);
355 dev->bus_id, mask);
356 return 0; 267 return 0;
357 } 268 }
358 269
@@ -360,157 +271,15 @@ int dma_supported(struct device *dev, u64 mask)
360} 271}
361EXPORT_SYMBOL(dma_supported); 272EXPORT_SYMBOL(dma_supported);
362 273
363/* Allocate DMA memory on node near device */
364static noinline struct page *
365dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order)
366{
367 int node;
368
369 node = dev_to_node(dev);
370
371 return alloc_pages_node(node, gfp, order);
372}
373
374/*
375 * Allocate memory for a coherent mapping.
376 */
377void *
378dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle,
379 gfp_t gfp)
380{
381 void *memory = NULL;
382 struct page *page;
383 unsigned long dma_mask = 0;
384 dma_addr_t bus;
385 int noretry = 0;
386
387 /* ignore region specifiers */
388 gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
389
390 if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory))
391 return memory;
392
393 if (!dev) {
394 dev = &fallback_dev;
395 gfp |= GFP_DMA;
396 }
397 dma_mask = dev->coherent_dma_mask;
398 if (dma_mask == 0)
399 dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK;
400
401 /* Device not DMA able */
402 if (dev->dma_mask == NULL)
403 return NULL;
404
405 /* Don't invoke OOM killer or retry in lower 16MB DMA zone */
406 if (gfp & __GFP_DMA)
407 noretry = 1;
408
409#ifdef CONFIG_X86_64
410 /* Why <=? Even when the mask is smaller than 4GB it is often
411 larger than 16MB and in this case we have a chance of
412 finding fitting memory in the next higher zone first. If
413 not retry with true GFP_DMA. -AK */
414 if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
415 gfp |= GFP_DMA32;
416 if (dma_mask < DMA_32BIT_MASK)
417 noretry = 1;
418 }
419#endif
420
421 again:
422 page = dma_alloc_pages(dev,
423 noretry ? gfp | __GFP_NORETRY : gfp, get_order(size));
424 if (page == NULL)
425 return NULL;
426
427 {
428 int high, mmu;
429 bus = page_to_phys(page);
430 memory = page_address(page);
431 high = (bus + size) >= dma_mask;
432 mmu = high;
433 if (force_iommu && !(gfp & GFP_DMA))
434 mmu = 1;
435 else if (high) {
436 free_pages((unsigned long)memory,
437 get_order(size));
438
439 /* Don't use the 16MB ZONE_DMA unless absolutely
440 needed. It's better to use remapping first. */
441 if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) {
442 gfp = (gfp & ~GFP_DMA32) | GFP_DMA;
443 goto again;
444 }
445
446 /* Let low level make its own zone decisions */
447 gfp &= ~(GFP_DMA32|GFP_DMA);
448
449 if (dma_ops->alloc_coherent)
450 return dma_ops->alloc_coherent(dev, size,
451 dma_handle, gfp);
452 return NULL;
453 }
454
455 memset(memory, 0, size);
456 if (!mmu) {
457 *dma_handle = bus;
458 return memory;
459 }
460 }
461
462 if (dma_ops->alloc_coherent) {
463 free_pages((unsigned long)memory, get_order(size));
464 gfp &= ~(GFP_DMA|GFP_DMA32);
465 return dma_ops->alloc_coherent(dev, size, dma_handle, gfp);
466 }
467
468 if (dma_ops->map_simple) {
469 *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory),
470 size,
471 PCI_DMA_BIDIRECTIONAL);
472 if (*dma_handle != bad_dma_address)
473 return memory;
474 }
475
476 if (panic_on_overflow)
477 panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n",
478 (unsigned long)size);
479 free_pages((unsigned long)memory, get_order(size));
480 return NULL;
481}
482EXPORT_SYMBOL(dma_alloc_coherent);
483
484/*
485 * Unmap coherent memory.
486 * The caller must ensure that the device has finished accessing the mapping.
487 */
488void dma_free_coherent(struct device *dev, size_t size,
489 void *vaddr, dma_addr_t bus)
490{
491 int order = get_order(size);
492 WARN_ON(irqs_disabled()); /* for portability */
493 if (dma_release_coherent(dev, order, vaddr))
494 return;
495 if (dma_ops->unmap_single)
496 dma_ops->unmap_single(dev, bus, size, 0);
497 free_pages((unsigned long)vaddr, order);
498}
499EXPORT_SYMBOL(dma_free_coherent);
500
501static int __init pci_iommu_init(void) 274static int __init pci_iommu_init(void)
502{ 275{
503#ifdef CONFIG_CALGARY_IOMMU
504 calgary_iommu_init(); 276 calgary_iommu_init();
505#endif
506 277
507 intel_iommu_init(); 278 intel_iommu_init();
508 279
509 amd_iommu_init(); 280 amd_iommu_init();
510 281
511#ifdef CONFIG_GART_IOMMU
512 gart_iommu_init(); 282 gart_iommu_init();
513#endif
514 283
515 no_iommu_init(); 284 no_iommu_init();
516 return 0; 285 return 0;
@@ -522,17 +291,3 @@ void pci_iommu_shutdown(void)
522} 291}
523/* Must execute after PCI subsystem */ 292/* Must execute after PCI subsystem */
524fs_initcall(pci_iommu_init); 293fs_initcall(pci_iommu_init);
525
526#ifdef CONFIG_PCI
527/* Many VIA bridges seem to corrupt data for DAC. Disable it here */
528
529static __devinit void via_no_dac(struct pci_dev *dev)
530{
531 if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) {
532 printk(KERN_INFO "PCI: VIA PCI bridge detected."
533 "Disabling DAC.\n");
534 forbid_dac = 1;
535 }
536}
537DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac);
538#endif
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index c3fe78406d18..e3f75bbcedea 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -27,11 +27,12 @@
27#include <linux/scatterlist.h> 27#include <linux/scatterlist.h>
28#include <linux/iommu-helper.h> 28#include <linux/iommu-helper.h>
29#include <linux/sysdev.h> 29#include <linux/sysdev.h>
30#include <linux/io.h>
30#include <asm/atomic.h> 31#include <asm/atomic.h>
31#include <asm/io.h>
32#include <asm/mtrr.h> 32#include <asm/mtrr.h>
33#include <asm/pgtable.h> 33#include <asm/pgtable.h>
34#include <asm/proto.h> 34#include <asm/proto.h>
35#include <asm/iommu.h>
35#include <asm/gart.h> 36#include <asm/gart.h>
36#include <asm/cacheflush.h> 37#include <asm/cacheflush.h>
37#include <asm/swiotlb.h> 38#include <asm/swiotlb.h>
@@ -66,9 +67,6 @@ static u32 gart_unmapped_entry;
66 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) 67 (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT)
67#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) 68#define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28))
68 69
69#define to_pages(addr, size) \
70 (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT)
71
72#define EMERGENCY_PAGES 32 /* = 128KB */ 70#define EMERGENCY_PAGES 32 /* = 128KB */
73 71
74#ifdef CONFIG_AGP 72#ifdef CONFIG_AGP
@@ -82,9 +80,10 @@ AGPEXTERN int agp_memory_reserved;
82AGPEXTERN __u32 *agp_gatt_table; 80AGPEXTERN __u32 *agp_gatt_table;
83 81
84static unsigned long next_bit; /* protected by iommu_bitmap_lock */ 82static unsigned long next_bit; /* protected by iommu_bitmap_lock */
85static int need_flush; /* global flush state. set for each gart wrap */ 83static bool need_flush; /* global flush state. set for each gart wrap */
86 84
87static unsigned long alloc_iommu(struct device *dev, int size) 85static unsigned long alloc_iommu(struct device *dev, int size,
86 unsigned long align_mask)
88{ 87{
89 unsigned long offset, flags; 88 unsigned long offset, flags;
90 unsigned long boundary_size; 89 unsigned long boundary_size;
@@ -92,26 +91,27 @@ static unsigned long alloc_iommu(struct device *dev, int size)
92 91
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 92 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 93 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, 94 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 95 PAGE_SIZE) >> PAGE_SHIFT;
97 96
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 97 spin_lock_irqsave(&iommu_bitmap_lock, flags);
99 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, 98 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit,
100 size, base_index, boundary_size, 0); 99 size, base_index, boundary_size, align_mask);
101 if (offset == -1) { 100 if (offset == -1) {
102 need_flush = 1; 101 need_flush = true;
103 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, 102 offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0,
104 size, base_index, boundary_size, 0); 103 size, base_index, boundary_size,
104 align_mask);
105 } 105 }
106 if (offset != -1) { 106 if (offset != -1) {
107 next_bit = offset+size; 107 next_bit = offset+size;
108 if (next_bit >= iommu_pages) { 108 if (next_bit >= iommu_pages) {
109 next_bit = 0; 109 next_bit = 0;
110 need_flush = 1; 110 need_flush = true;
111 } 111 }
112 } 112 }
113 if (iommu_fullflush) 113 if (iommu_fullflush)
114 need_flush = 1; 114 need_flush = true;
115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 115 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
116 116
117 return offset; 117 return offset;
@@ -136,7 +136,7 @@ static void flush_gart(void)
136 spin_lock_irqsave(&iommu_bitmap_lock, flags); 136 spin_lock_irqsave(&iommu_bitmap_lock, flags);
137 if (need_flush) { 137 if (need_flush) {
138 k8_flush_garts(); 138 k8_flush_garts();
139 need_flush = 0; 139 need_flush = false;
140 } 140 }
141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 141 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
142} 142}
@@ -175,7 +175,8 @@ static void dump_leak(void)
175 iommu_leak_pages); 175 iommu_leak_pages);
176 for (i = 0; i < iommu_leak_pages; i += 2) { 176 for (i = 0; i < iommu_leak_pages; i += 2) {
177 printk(KERN_DEBUG "%lu: ", iommu_pages-i); 177 printk(KERN_DEBUG "%lu: ", iommu_pages-i);
178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); 178 printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
179 0);
179 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); 180 printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
180 } 181 }
181 printk(KERN_DEBUG "\n"); 182 printk(KERN_DEBUG "\n");
@@ -197,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir)
197 * out. Hopefully no network devices use single mappings that big. 198 * out. Hopefully no network devices use single mappings that big.
198 */ 199 */
199 200
200 printk(KERN_ERR 201 dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size);
201 "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n",
202 size, dev->bus_id);
203 202
204 if (size > PAGE_SIZE*EMERGENCY_PAGES) { 203 if (size > PAGE_SIZE*EMERGENCY_PAGES) {
205 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) 204 if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL)
@@ -216,34 +215,24 @@ static void iommu_full(struct device *dev, size_t size, int dir)
216static inline int 215static inline int
217need_iommu(struct device *dev, unsigned long addr, size_t size) 216need_iommu(struct device *dev, unsigned long addr, size_t size)
218{ 217{
219 u64 mask = *dev->dma_mask; 218 return force_iommu ||
220 int high = addr + size > mask; 219 !is_buffer_dma_capable(*dev->dma_mask, addr, size);
221 int mmu = high;
222
223 if (force_iommu)
224 mmu = 1;
225
226 return mmu;
227} 220}
228 221
229static inline int 222static inline int
230nonforced_iommu(struct device *dev, unsigned long addr, size_t size) 223nonforced_iommu(struct device *dev, unsigned long addr, size_t size)
231{ 224{
232 u64 mask = *dev->dma_mask; 225 return !is_buffer_dma_capable(*dev->dma_mask, addr, size);
233 int high = addr + size > mask;
234 int mmu = high;
235
236 return mmu;
237} 226}
238 227
239/* Map a single continuous physical area into the IOMMU. 228/* Map a single continuous physical area into the IOMMU.
240 * Caller needs to check if the iommu is needed and flush. 229 * Caller needs to check if the iommu is needed and flush.
241 */ 230 */
242static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, 231static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
243 size_t size, int dir) 232 size_t size, int dir, unsigned long align_mask)
244{ 233{
245 unsigned long npages = to_pages(phys_mem, size); 234 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
246 unsigned long iommu_page = alloc_iommu(dev, npages); 235 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask);
247 int i; 236 int i;
248 237
249 if (iommu_page == -1) { 238 if (iommu_page == -1) {
@@ -263,16 +252,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
263 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); 252 return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
264} 253}
265 254
266static dma_addr_t
267gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir)
268{
269 dma_addr_t map = dma_map_area(dev, paddr, size, dir);
270
271 flush_gart();
272
273 return map;
274}
275
276/* Map a single area into the IOMMU */ 255/* Map a single area into the IOMMU */
277static dma_addr_t 256static dma_addr_t
278gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) 257gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
@@ -280,12 +259,13 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir)
280 unsigned long bus; 259 unsigned long bus;
281 260
282 if (!dev) 261 if (!dev)
283 dev = &fallback_dev; 262 dev = &x86_dma_fallback_dev;
284 263
285 if (!need_iommu(dev, paddr, size)) 264 if (!need_iommu(dev, paddr, size))
286 return paddr; 265 return paddr;
287 266
288 bus = gart_map_simple(dev, paddr, size, dir); 267 bus = dma_map_area(dev, paddr, size, dir, 0);
268 flush_gart();
289 269
290 return bus; 270 return bus;
291} 271}
@@ -305,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr,
305 return; 285 return;
306 286
307 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; 287 iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT;
308 npages = to_pages(dma_addr, size); 288 npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
309 for (i = 0; i < npages; i++) { 289 for (i = 0; i < npages; i++) {
310 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; 290 iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
311 CLEAR_LEAK(iommu_page + i); 291 CLEAR_LEAK(iommu_page + i);
@@ -344,7 +324,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
344 unsigned long addr = sg_phys(s); 324 unsigned long addr = sg_phys(s);
345 325
346 if (nonforced_iommu(dev, addr, s->length)) { 326 if (nonforced_iommu(dev, addr, s->length)) {
347 addr = dma_map_area(dev, addr, s->length, dir); 327 addr = dma_map_area(dev, addr, s->length, dir, 0);
348 if (addr == bad_dma_address) { 328 if (addr == bad_dma_address) {
349 if (i > 0) 329 if (i > 0)
350 gart_unmap_sg(dev, sg, i, dir); 330 gart_unmap_sg(dev, sg, i, dir);
@@ -366,7 +346,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
366 int nelems, struct scatterlist *sout, 346 int nelems, struct scatterlist *sout,
367 unsigned long pages) 347 unsigned long pages)
368{ 348{
369 unsigned long iommu_start = alloc_iommu(dev, pages); 349 unsigned long iommu_start = alloc_iommu(dev, pages, 0);
370 unsigned long iommu_page = iommu_start; 350 unsigned long iommu_page = iommu_start;
371 struct scatterlist *s; 351 struct scatterlist *s;
372 int i; 352 int i;
@@ -388,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
388 } 368 }
389 369
390 addr = phys_addr; 370 addr = phys_addr;
391 pages = to_pages(s->offset, s->length); 371 pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
392 while (pages--) { 372 while (pages--) {
393 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); 373 iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
394 SET_LEAK(iommu_page); 374 SET_LEAK(iommu_page);
@@ -431,7 +411,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
431 return 0; 411 return 0;
432 412
433 if (!dev) 413 if (!dev)
434 dev = &fallback_dev; 414 dev = &x86_dma_fallback_dev;
435 415
436 out = 0; 416 out = 0;
437 start = 0; 417 start = 0;
@@ -471,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir)
471 451
472 seg_size += s->length; 452 seg_size += s->length;
473 need = nextneed; 453 need = nextneed;
474 pages += to_pages(s->offset, s->length); 454 pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
475 ps = s; 455 ps = s;
476 } 456 }
477 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) 457 if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
@@ -503,6 +483,46 @@ error:
503 return 0; 483 return 0;
504} 484}
505 485
486/* allocate and map a coherent mapping */
487static void *
488gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
489 gfp_t flag)
490{
491 dma_addr_t paddr;
492 unsigned long align_mask;
493 struct page *page;
494
495 if (force_iommu && !(flag & GFP_DMA)) {
496 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
497 page = alloc_pages(flag | __GFP_ZERO, get_order(size));
498 if (!page)
499 return NULL;
500
501 align_mask = (1UL << get_order(size)) - 1;
502 paddr = dma_map_area(dev, page_to_phys(page), size,
503 DMA_BIDIRECTIONAL, align_mask);
504
505 flush_gart();
506 if (paddr != bad_dma_address) {
507 *dma_addr = paddr;
508 return page_address(page);
509 }
510 __free_pages(page, get_order(size));
511 } else
512 return dma_generic_alloc_coherent(dev, size, dma_addr, flag);
513
514 return NULL;
515}
516
517/* free a coherent mapping */
518static void
519gart_free_coherent(struct device *dev, size_t size, void *vaddr,
520 dma_addr_t dma_addr)
521{
522 gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL);
523 free_pages((unsigned long)vaddr, get_order(size));
524}
525
506static int no_agp; 526static int no_agp;
507 527
508static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 528static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -630,7 +650,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
630 struct pci_dev *dev; 650 struct pci_dev *dev;
631 void *gatt; 651 void *gatt;
632 int i, error; 652 int i, error;
633 unsigned long start_pfn, end_pfn;
634 653
635 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 654 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n");
636 aper_size = aper_base = info->aper_size = 0; 655 aper_size = aper_base = info->aper_size = 0;
@@ -654,13 +673,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
654 info->aper_size = aper_size >> 20; 673 info->aper_size = aper_size >> 20;
655 674
656 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); 675 gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32);
657 gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); 676 gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
677 get_order(gatt_size));
658 if (!gatt) 678 if (!gatt)
659 panic("Cannot allocate GATT table"); 679 panic("Cannot allocate GATT table");
660 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) 680 if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT))
661 panic("Could not set GART PTEs to uncacheable pages"); 681 panic("Could not set GART PTEs to uncacheable pages");
662 682
663 memset(gatt, 0, gatt_size);
664 agp_gatt_table = gatt; 683 agp_gatt_table = gatt;
665 684
666 enable_gart_translations(); 685 enable_gart_translations();
@@ -669,19 +688,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
669 if (!error) 688 if (!error)
670 error = sysdev_register(&device_gart); 689 error = sysdev_register(&device_gart);
671 if (error) 690 if (error)
672 panic("Could not register gart_sysdev -- would corrupt data on next suspend"); 691 panic("Could not register gart_sysdev -- "
692 "would corrupt data on next suspend");
673 693
674 flush_gart(); 694 flush_gart();
675 695
676 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 696 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n",
677 aper_base, aper_size>>10); 697 aper_base, aper_size>>10);
678 698
679 /* need to map that range */
680 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
681 if (end_pfn > max_low_pfn_mapped) {
682 start_pfn = (aper_base>>PAGE_SHIFT);
683 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
684 }
685 return 0; 699 return 0;
686 700
687 nommu: 701 nommu:
@@ -691,21 +705,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
691 return -1; 705 return -1;
692} 706}
693 707
694extern int agp_amd64_init(void); 708static struct dma_mapping_ops gart_dma_ops = {
695
696static const struct dma_mapping_ops gart_dma_ops = {
697 .mapping_error = NULL,
698 .map_single = gart_map_single, 709 .map_single = gart_map_single,
699 .map_simple = gart_map_simple,
700 .unmap_single = gart_unmap_single, 710 .unmap_single = gart_unmap_single,
701 .sync_single_for_cpu = NULL,
702 .sync_single_for_device = NULL,
703 .sync_single_range_for_cpu = NULL,
704 .sync_single_range_for_device = NULL,
705 .sync_sg_for_cpu = NULL,
706 .sync_sg_for_device = NULL,
707 .map_sg = gart_map_sg, 711 .map_sg = gart_map_sg,
708 .unmap_sg = gart_unmap_sg, 712 .unmap_sg = gart_unmap_sg,
713 .alloc_coherent = gart_alloc_coherent,
714 .free_coherent = gart_free_coherent,
709}; 715};
710 716
711void gart_iommu_shutdown(void) 717void gart_iommu_shutdown(void)
@@ -732,7 +738,8 @@ void __init gart_iommu_init(void)
732{ 738{
733 struct agp_kern_info info; 739 struct agp_kern_info info;
734 unsigned long iommu_start; 740 unsigned long iommu_start;
735 unsigned long aper_size; 741 unsigned long aper_base, aper_size;
742 unsigned long start_pfn, end_pfn;
736 unsigned long scratch; 743 unsigned long scratch;
737 long i; 744 long i;
738 745
@@ -764,30 +771,35 @@ void __init gart_iommu_init(void)
764 (no_agp && init_k8_gatt(&info) < 0)) { 771 (no_agp && init_k8_gatt(&info) < 0)) {
765 if (max_pfn > MAX_DMA32_PFN) { 772 if (max_pfn > MAX_DMA32_PFN) {
766 printk(KERN_WARNING "More than 4GB of memory " 773 printk(KERN_WARNING "More than 4GB of memory "
767 "but GART IOMMU not available.\n" 774 "but GART IOMMU not available.\n");
768 KERN_WARNING "falling back to iommu=soft.\n"); 775 printk(KERN_WARNING "falling back to iommu=soft.\n");
769 } 776 }
770 return; 777 return;
771 } 778 }
772 779
780 /* need to map that range */
781 aper_size = info.aper_size << 20;
782 aper_base = info.aper_base;
783 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
784 if (end_pfn > max_low_pfn_mapped) {
785 start_pfn = (aper_base>>PAGE_SHIFT);
786 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
787 }
788
773 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 789 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n");
774 aper_size = info.aper_size * 1024 * 1024;
775 iommu_size = check_iommu_size(info.aper_base, aper_size); 790 iommu_size = check_iommu_size(info.aper_base, aper_size);
776 iommu_pages = iommu_size >> PAGE_SHIFT; 791 iommu_pages = iommu_size >> PAGE_SHIFT;
777 792
778 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, 793 iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO,
779 get_order(iommu_pages/8)); 794 get_order(iommu_pages/8));
780 if (!iommu_gart_bitmap) 795 if (!iommu_gart_bitmap)
781 panic("Cannot allocate iommu bitmap\n"); 796 panic("Cannot allocate iommu bitmap\n");
782 memset(iommu_gart_bitmap, 0, iommu_pages/8);
783 797
784#ifdef CONFIG_IOMMU_LEAK 798#ifdef CONFIG_IOMMU_LEAK
785 if (leak_trace) { 799 if (leak_trace) {
786 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, 800 iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
787 get_order(iommu_pages*sizeof(void *))); 801 get_order(iommu_pages*sizeof(void *)));
788 if (iommu_leak_tab) 802 if (!iommu_leak_tab)
789 memset(iommu_leak_tab, 0, iommu_pages * 8);
790 else
791 printk(KERN_DEBUG 803 printk(KERN_DEBUG
792 "PCI-DMA: Cannot allocate leak trace area\n"); 804 "PCI-DMA: Cannot allocate leak trace area\n");
793 } 805 }
@@ -797,7 +809,7 @@ void __init gart_iommu_init(void)
797 * Out of IOMMU space handling. 809 * Out of IOMMU space handling.
798 * Reserve some invalid pages at the beginning of the GART. 810 * Reserve some invalid pages at the beginning of the GART.
799 */ 811 */
800 set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 812 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
801 813
802 agp_memory_reserved = iommu_size; 814 agp_memory_reserved = iommu_size;
803 printk(KERN_INFO 815 printk(KERN_INFO
@@ -855,7 +867,8 @@ void __init gart_parse_options(char *p)
855 if (!strncmp(p, "leak", 4)) { 867 if (!strncmp(p, "leak", 4)) {
856 leak_trace = 1; 868 leak_trace = 1;
857 p += 4; 869 p += 4;
858 if (*p == '=') ++p; 870 if (*p == '=')
871 ++p;
859 if (isdigit(*p) && get_option(&p, &arg)) 872 if (isdigit(*p) && get_option(&p, &arg))
860 iommu_leak_pages = arg; 873 iommu_leak_pages = arg;
861 } 874 }
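
The gart_iommu_init() hunk above replaces allocate-then-memset with a single __GFP_ZERO allocation for both the GART bitmap and the leak-trace table. A minimal sketch of that pattern, kept generic on purpose (the helper name and its bytes parameter are illustrative, not from the patch):

	#include <linux/gfp.h>
	#include <linux/mm.h>

	/* Illustrative helper, not part of the patch: request already-zeroed
	 * pages instead of allocating and then calling memset(..., 0, ...). */
	static void *alloc_zeroed(unsigned long bytes)
	{
		return (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
						get_order(bytes));
	}

The surrounding logic is otherwise unchanged; a failed allocation is still caught by the NULL check that follows each call.
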
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index aec43d56f49c..c70ab5a5d4c8 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -7,14 +7,14 @@
7#include <linux/dma-mapping.h> 7#include <linux/dma-mapping.h>
8#include <linux/scatterlist.h> 8#include <linux/scatterlist.h>
9 9
10#include <asm/gart.h> 10#include <asm/iommu.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13
14static int 14static int
15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) 15check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size)
16{ 16{
17 if (hwdev && bus + size > *hwdev->dma_mask) { 17 if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) {
18 if (*hwdev->dma_mask >= DMA_32BIT_MASK) 18 if (*hwdev->dma_mask >= DMA_32BIT_MASK)
19 printk(KERN_ERR 19 printk(KERN_ERR
20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", 20 "nommu_%s: overflow %Lx+%zu of device mask %Lx\n",
@@ -72,21 +72,17 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg,
72 return nents; 72 return nents;
73} 73}
74 74
75/* Make sure we keep the same behaviour */ 75static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr,
76static int nommu_mapping_error(dma_addr_t dma_addr) 76 dma_addr_t dma_addr)
77{ 77{
78#ifdef CONFIG_X86_32 78 free_pages((unsigned long)vaddr, get_order(size));
79 return 0;
80#else
81 return (dma_addr == bad_dma_address);
82#endif
83} 79}
84 80
85 81struct dma_mapping_ops nommu_dma_ops = {
86const struct dma_mapping_ops nommu_dma_ops = { 82 .alloc_coherent = dma_generic_alloc_coherent,
83 .free_coherent = nommu_free_coherent,
87 .map_single = nommu_map_single, 84 .map_single = nommu_map_single,
88 .map_sg = nommu_map_sg, 85 .map_sg = nommu_map_sg,
89 .mapping_error = nommu_mapping_error,
90 .is_phys = 1, 86 .is_phys = 1,
91}; 87};
92 88
diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c
index 82299cd1d04d..c4ce0332759e 100644
--- a/arch/x86/kernel/pci-swiotlb_64.c
+++ b/arch/x86/kernel/pci-swiotlb_64.c
@@ -5,7 +5,7 @@
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/dma-mapping.h> 6#include <linux/dma-mapping.h>
7 7
8#include <asm/gart.h> 8#include <asm/iommu.h>
9#include <asm/swiotlb.h> 9#include <asm/swiotlb.h>
10#include <asm/dma.h> 10#include <asm/dma.h>
11 11
@@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size,
18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); 18 return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction);
19} 19}
20 20
21const struct dma_mapping_ops swiotlb_dma_ops = { 21struct dma_mapping_ops swiotlb_dma_ops = {
22 .mapping_error = swiotlb_dma_mapping_error, 22 .mapping_error = swiotlb_dma_mapping_error,
23 .alloc_coherent = swiotlb_alloc_coherent, 23 .alloc_coherent = swiotlb_alloc_coherent,
24 .free_coherent = swiotlb_free_coherent, 24 .free_coherent = swiotlb_free_coherent,
diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c
index bc1f2d3ea277..a311ffcaad16 100644
--- a/arch/x86/kernel/pcspeaker.c
+++ b/arch/x86/kernel/pcspeaker.c
@@ -1,20 +1,13 @@
1#include <linux/platform_device.h> 1#include <linux/platform_device.h>
2#include <linux/errno.h> 2#include <linux/err.h>
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5static __init int add_pcspkr(void) 5static __init int add_pcspkr(void)
6{ 6{
7 struct platform_device *pd; 7 struct platform_device *pd;
8 int ret;
9 8
10 pd = platform_device_alloc("pcspkr", -1); 9 pd = platform_device_register_simple("pcspkr", -1, NULL, 0);
11 if (!pd)
12 return -ENOMEM;
13 10
14 ret = platform_device_add(pd); 11 return IS_ERR(pd) ? PTR_ERR(pd) : 0;
15 if (ret)
16 platform_device_put(pd);
17
18 return ret;
19} 12}
20device_initcall(add_pcspkr); 13device_initcall(add_pcspkr);
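
The rewrite above collapses the alloc/add/put sequence into platform_device_register_simple(), which reports failure through an ERR_PTR() rather than NULL. A sketch of the same pattern with a hypothetical device name (not from the patch):

	#include <linux/platform_device.h>
	#include <linux/err.h>
	#include <linux/init.h>

	static __init int add_demo_device(void)
	{
		struct platform_device *pd;

		/* Allocates, names and registers the device in one call;
		 * on failure it returns an ERR_PTR(), never NULL. */
		pd = platform_device_register_simple("demo", -1, NULL, 0);

		return IS_ERR(pd) ? PTR_ERR(pd) : 0;
	}
	device_initcall(add_demo_device);

This is also why the include switches from <linux/errno.h> to <linux/err.h>: the error code is carried in the pointer and extracted with PTR_ERR().
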
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 4d629c62f4f8..c622772744d8 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -184,7 +184,8 @@ static void mwait_idle(void)
184static void poll_idle(void) 184static void poll_idle(void)
185{ 185{
186 local_irq_enable(); 186 local_irq_enable();
187 cpu_relax(); 187 while (!need_resched())
188 cpu_relax();
188} 189}
189 190
190/* 191/*
@@ -199,6 +200,7 @@ static void poll_idle(void)
199 * 200 *
200 * idle=mwait overrides this decision and forces the usage of mwait. 201 * idle=mwait overrides this decision and forces the usage of mwait.
201 */ 202 */
203static int __cpuinitdata force_mwait;
202 204
203#define MWAIT_INFO 0x05 205#define MWAIT_INFO 0x05
204#define MWAIT_ECX_EXTENDED_INFO 0x01 206#define MWAIT_ECX_EXTENDED_INFO 0x01
@@ -244,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
244 return 1; 246 return 1;
245} 247}
246 248
249static cpumask_t c1e_mask = CPU_MASK_NONE;
250static int c1e_detected;
251
252void c1e_remove_cpu(int cpu)
253{
254 cpu_clear(cpu, c1e_mask);
255}
256
247/* 257/*
248 * C1E aware idle routine. We check for C1E active in the interrupt 258 * C1E aware idle routine. We check for C1E active in the interrupt
249 * pending message MSR. If we detect C1E, then we handle it the same 259 * pending message MSR. If we detect C1E, then we handle it the same
@@ -251,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
251 */ 261 */
252static void c1e_idle(void) 262static void c1e_idle(void)
253{ 263{
254 static cpumask_t c1e_mask = CPU_MASK_NONE;
255 static int c1e_detected;
256
257 if (need_resched()) 264 if (need_resched())
258 return; 265 return;
259 266
@@ -263,8 +270,10 @@ static void c1e_idle(void)
263 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 270 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
264 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 271 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
265 c1e_detected = 1; 272 c1e_detected = 1;
266 mark_tsc_unstable("TSC halt in C1E"); 273 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
267 printk(KERN_INFO "System has C1E enabled\n"); 274 mark_tsc_unstable("TSC halt in AMD C1E");
275 printk(KERN_INFO "System has AMD C1E enabled\n");
276 set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
268 } 277 }
269 } 278 }
270 279
@@ -326,6 +335,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
326 335
327static int __init idle_setup(char *str) 336static int __init idle_setup(char *str)
328{ 337{
338 if (!str)
339 return -EINVAL;
340
329 if (!strcmp(str, "poll")) { 341 if (!strcmp(str, "poll")) {
330 printk("using polling idle threads.\n"); 342 printk("using polling idle threads.\n");
331 pm_idle = poll_idle; 343 pm_idle = poll_idle;
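
idle_setup() is registered as an early parameter, and an early parameter given without "=value" hands the handler a NULL string; the new guard above returns -EINVAL instead of letting strcmp() dereference NULL. A sketch of that hardened handler pattern with a hypothetical "foo" parameter (all names here are illustrative, not from the patch):

	#include <linux/init.h>
	#include <linux/string.h>
	#include <linux/errno.h>

	static int foo_mode __initdata;	/* hypothetical flag */

	static int __init foo_setup(char *str)
	{
		if (!str)	/* "foo" on the command line with no "=value" */
			return -EINVAL;

		if (!strcmp(str, "on"))
			foo_mode = 1;

		return 0;
	}
	early_param("foo", foo_setup);
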
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 0c3927accb00..0a1302fe6d45 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -37,6 +37,7 @@
37#include <linux/tick.h> 37#include <linux/tick.h>
38#include <linux/percpu.h> 38#include <linux/percpu.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/dmi.h>
40 41
41#include <asm/uaccess.h> 42#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
@@ -55,6 +56,9 @@
55#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
56#include <asm/cpu.h> 57#include <asm/cpu.h>
57#include <asm/kdebug.h> 58#include <asm/kdebug.h>
59#include <asm/idle.h>
60#include <asm/syscalls.h>
61#include <asm/smp.h>
58 62
59asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 63asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
60 64
@@ -72,47 +76,12 @@ unsigned long thread_saved_pc(struct task_struct *tsk)
72 return ((unsigned long *)tsk->thread.sp)[3]; 76 return ((unsigned long *)tsk->thread.sp)[3];
73} 77}
74 78
75#ifdef CONFIG_HOTPLUG_CPU 79#ifndef CONFIG_SMP
76#include <asm/nmi.h>
77
78static void cpu_exit_clear(void)
79{
80 int cpu = raw_smp_processor_id();
81
82 idle_task_exit();
83
84 cpu_uninit();
85 irq_ctx_exit(cpu);
86
87 cpu_clear(cpu, cpu_callout_map);
88 cpu_clear(cpu, cpu_callin_map);
89
90 numa_remove_cpu(cpu);
91}
92
93/* We don't actually take CPU down, just spin without interrupts. */
94static inline void play_dead(void)
95{
96 /* This must be done before dead CPU ack */
97 cpu_exit_clear();
98 wbinvd();
99 mb();
100 /* Ack it */
101 __get_cpu_var(cpu_state) = CPU_DEAD;
102
103 /*
104 * With physical CPU hotplug, we should halt the cpu
105 */
106 local_irq_disable();
107 while (1)
108 halt();
109}
110#else
111static inline void play_dead(void) 80static inline void play_dead(void)
112{ 81{
113 BUG(); 82 BUG();
114} 83}
115#endif /* CONFIG_HOTPLUG_CPU */ 84#endif
116 85
117/* 86/*
118 * The idle thread. There's no useful work to be 87 * The idle thread. There's no useful work to be
@@ -128,7 +97,7 @@ void cpu_idle(void)
128 97
129 /* endless idle loop with no priority at all */ 98 /* endless idle loop with no priority at all */
130 while (1) { 99 while (1) {
131 tick_nohz_stop_sched_tick(); 100 tick_nohz_stop_sched_tick(1);
132 while (!need_resched()) { 101 while (!need_resched()) {
133 102
134 check_pgt_cache(); 103 check_pgt_cache();
@@ -154,12 +123,13 @@ void cpu_idle(void)
154 } 123 }
155} 124}
156 125
157void __show_registers(struct pt_regs *regs, int all) 126void __show_regs(struct pt_regs *regs, int all)
158{ 127{
159 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; 128 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L;
160 unsigned long d0, d1, d2, d3, d6, d7; 129 unsigned long d0, d1, d2, d3, d6, d7;
161 unsigned long sp; 130 unsigned long sp;
162 unsigned short ss, gs; 131 unsigned short ss, gs;
132 const char *board;
163 133
164 if (user_mode_vm(regs)) { 134 if (user_mode_vm(regs)) {
165 sp = regs->sp; 135 sp = regs->sp;
@@ -172,11 +142,15 @@ void __show_registers(struct pt_regs *regs, int all)
172 } 142 }
173 143
174 printk("\n"); 144 printk("\n");
175 printk("Pid: %d, comm: %s %s (%s %.*s)\n", 145
146 board = dmi_get_system_info(DMI_PRODUCT_NAME);
147 if (!board)
148 board = "";
149 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
176 task_pid_nr(current), current->comm, 150 task_pid_nr(current), current->comm,
177 print_tainted(), init_utsname()->release, 151 print_tainted(), init_utsname()->release,
178 (int)strcspn(init_utsname()->version, " "), 152 (int)strcspn(init_utsname()->version, " "),
179 init_utsname()->version); 153 init_utsname()->version, board);
180 154
181 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", 155 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
182 (u16)regs->cs, regs->ip, regs->flags, 156 (u16)regs->cs, regs->ip, regs->flags,
@@ -215,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all)
215 189
216void show_regs(struct pt_regs *regs) 190void show_regs(struct pt_regs *regs)
217{ 191{
218 __show_registers(regs, 1); 192 __show_regs(regs, 1);
219 show_trace(NULL, regs, &regs->sp, regs->bp); 193 show_trace(NULL, regs, &regs->sp, regs->bp);
220} 194}
221 195
@@ -276,6 +250,14 @@ void exit_thread(void)
276 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; 250 tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET;
277 put_cpu(); 251 put_cpu();
278 } 252 }
253#ifdef CONFIG_X86_DS
254 /* Free any DS contexts that have not been properly released. */
255 if (unlikely(current->thread.ds_ctx)) {
256 /* we clear debugctl to make sure DS is not used. */
257 update_debugctlmsr(0);
258 ds_free(current->thread.ds_ctx);
259 }
260#endif /* CONFIG_X86_DS */
279} 261}
280 262
281void flush_thread(void) 263void flush_thread(void)
@@ -437,6 +419,35 @@ int set_tsc_mode(unsigned int val)
437 return 0; 419 return 0;
438} 420}
439 421
422#ifdef CONFIG_X86_DS
423static int update_debugctl(struct thread_struct *prev,
424 struct thread_struct *next, unsigned long debugctl)
425{
426 unsigned long ds_prev = 0;
427 unsigned long ds_next = 0;
428
429 if (prev->ds_ctx)
430 ds_prev = (unsigned long)prev->ds_ctx->ds;
431 if (next->ds_ctx)
432 ds_next = (unsigned long)next->ds_ctx->ds;
433
434 if (ds_next != ds_prev) {
435 /* we clear debugctl to make sure DS
436 * is not in use when we change it */
437 debugctl = 0;
438 update_debugctlmsr(0);
439 wrmsr(MSR_IA32_DS_AREA, ds_next, 0);
440 }
441 return debugctl;
442}
443#else
444static int update_debugctl(struct thread_struct *prev,
445 struct thread_struct *next, unsigned long debugctl)
446{
447 return debugctl;
448}
449#endif /* CONFIG_X86_DS */
450
440static noinline void 451static noinline void
441__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 452__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
442 struct tss_struct *tss) 453 struct tss_struct *tss)
@@ -447,14 +458,7 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
447 prev = &prev_p->thread; 458 prev = &prev_p->thread;
448 next = &next_p->thread; 459 next = &next_p->thread;
449 460
450 debugctl = prev->debugctlmsr; 461 debugctl = update_debugctl(prev, next, prev->debugctlmsr);
451 if (next->ds_area_msr != prev->ds_area_msr) {
452 /* we clear debugctl to make sure DS
453 * is not in use when we change it */
454 debugctl = 0;
455 update_debugctlmsr(0);
456 wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0);
457 }
458 462
459 if (next->debugctlmsr != debugctl) 463 if (next->debugctlmsr != debugctl)
460 update_debugctlmsr(next->debugctlmsr); 464 update_debugctlmsr(next->debugctlmsr);
@@ -478,13 +482,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
478 hard_enable_TSC(); 482 hard_enable_TSC();
479 } 483 }
480 484
481#ifdef X86_BTS 485#ifdef CONFIG_X86_PTRACE_BTS
482 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 486 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
483 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 487 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
484 488
485 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 489 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
486 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 490 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
487#endif 491#endif /* CONFIG_X86_PTRACE_BTS */
488 492
489 493
490 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { 494 if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index a8e53626ac9a..c958120fb1b6 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -37,11 +37,11 @@
37#include <linux/kdebug.h> 37#include <linux/kdebug.h>
38#include <linux/tick.h> 38#include <linux/tick.h>
39#include <linux/prctl.h> 39#include <linux/prctl.h>
40#include <linux/uaccess.h>
41#include <linux/io.h>
40 42
41#include <asm/uaccess.h>
42#include <asm/pgtable.h> 43#include <asm/pgtable.h>
43#include <asm/system.h> 44#include <asm/system.h>
44#include <asm/io.h>
45#include <asm/processor.h> 45#include <asm/processor.h>
46#include <asm/i387.h> 46#include <asm/i387.h>
47#include <asm/mmu_context.h> 47#include <asm/mmu_context.h>
@@ -51,6 +51,7 @@
51#include <asm/proto.h> 51#include <asm/proto.h>
52#include <asm/ia32.h> 52#include <asm/ia32.h>
53#include <asm/idle.h> 53#include <asm/idle.h>
54#include <asm/syscalls.h>
54 55
55asmlinkage extern void ret_from_fork(void); 56asmlinkage extern void ret_from_fork(void);
56 57
@@ -62,6 +63,13 @@ void idle_notifier_register(struct notifier_block *n)
62{ 63{
63 atomic_notifier_chain_register(&idle_notifier, n); 64 atomic_notifier_chain_register(&idle_notifier, n);
64} 65}
66EXPORT_SYMBOL_GPL(idle_notifier_register);
67
68void idle_notifier_unregister(struct notifier_block *n)
69{
70 atomic_notifier_chain_unregister(&idle_notifier, n);
71}
72EXPORT_SYMBOL_GPL(idle_notifier_unregister);
65 73
66void enter_idle(void) 74void enter_idle(void)
67{ 75{
@@ -85,29 +93,12 @@ void exit_idle(void)
85 __exit_idle(); 93 __exit_idle();
86} 94}
87 95
88#ifdef CONFIG_HOTPLUG_CPU 96#ifndef CONFIG_SMP
89DECLARE_PER_CPU(int, cpu_state);
90
91#include <asm/nmi.h>
92/* We halt the CPU with physical CPU hotplug */
93static inline void play_dead(void)
94{
95 idle_task_exit();
96 wbinvd();
97 mb();
98 /* Ack it */
99 __get_cpu_var(cpu_state) = CPU_DEAD;
100
101 local_irq_disable();
102 while (1)
103 halt();
104}
105#else
106static inline void play_dead(void) 97static inline void play_dead(void)
107{ 98{
108 BUG(); 99 BUG();
109} 100}
110#endif /* CONFIG_HOTPLUG_CPU */ 101#endif
111 102
112/* 103/*
113 * The idle thread. There's no useful work to be 104 * The idle thread. There's no useful work to be
@@ -120,7 +111,7 @@ void cpu_idle(void)
120 current_thread_info()->status |= TS_POLLING; 111 current_thread_info()->status |= TS_POLLING;
121 /* endless idle loop with no priority at all */ 112 /* endless idle loop with no priority at all */
122 while (1) { 113 while (1) {
123 tick_nohz_stop_sched_tick(); 114 tick_nohz_stop_sched_tick(1);
124 while (!need_resched()) { 115 while (!need_resched()) {
125 116
126 rmb(); 117 rmb();
@@ -152,7 +143,7 @@ void cpu_idle(void)
152} 143}
153 144
154/* Prints also some state that isn't saved in the pt_regs */ 145/* Prints also some state that isn't saved in the pt_regs */
155void __show_regs(struct pt_regs * regs) 146void __show_regs(struct pt_regs *regs, int all)
156{ 147{
157 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; 148 unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
158 unsigned long d0, d1, d2, d3, d6, d7; 149 unsigned long d0, d1, d2, d3, d6, d7;
@@ -161,60 +152,65 @@ void __show_regs(struct pt_regs * regs)
161 152
162 printk("\n"); 153 printk("\n");
163 print_modules(); 154 print_modules();
164 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 155 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n",
165 current->pid, current->comm, print_tainted(), 156 current->pid, current->comm, print_tainted(),
166 init_utsname()->release, 157 init_utsname()->release,
167 (int)strcspn(init_utsname()->version, " "), 158 (int)strcspn(init_utsname()->version, " "),
168 init_utsname()->version); 159 init_utsname()->version);
169 printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); 160 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
170 printk_address(regs->ip, 1); 161 printk_address(regs->ip, 1);
171 printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, 162 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
172 regs->flags); 163 regs->sp, regs->flags);
173 printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", 164 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n",
174 regs->ax, regs->bx, regs->cx); 165 regs->ax, regs->bx, regs->cx);
175 printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", 166 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n",
176 regs->dx, regs->si, regs->di); 167 regs->dx, regs->si, regs->di);
177 printk("RBP: %016lx R08: %016lx R09: %016lx\n", 168 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n",
178 regs->bp, regs->r8, regs->r9); 169 regs->bp, regs->r8, regs->r9);
179 printk("R10: %016lx R11: %016lx R12: %016lx\n", 170 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n",
180 regs->r10, regs->r11, regs->r12); 171 regs->r10, regs->r11, regs->r12);
181 printk("R13: %016lx R14: %016lx R15: %016lx\n", 172 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n",
182 regs->r13, regs->r14, regs->r15); 173 regs->r13, regs->r14, regs->r15);
183 174
184 asm("movl %%ds,%0" : "=r" (ds)); 175 asm("movl %%ds,%0" : "=r" (ds));
185 asm("movl %%cs,%0" : "=r" (cs)); 176 asm("movl %%cs,%0" : "=r" (cs));
186 asm("movl %%es,%0" : "=r" (es)); 177 asm("movl %%es,%0" : "=r" (es));
187 asm("movl %%fs,%0" : "=r" (fsindex)); 178 asm("movl %%fs,%0" : "=r" (fsindex));
188 asm("movl %%gs,%0" : "=r" (gsindex)); 179 asm("movl %%gs,%0" : "=r" (gsindex));
189 180
190 rdmsrl(MSR_FS_BASE, fs); 181 rdmsrl(MSR_FS_BASE, fs);
191 rdmsrl(MSR_GS_BASE, gs); 182 rdmsrl(MSR_GS_BASE, gs);
192 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); 183 rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
184
185 if (!all)
186 return;
193 187
194 cr0 = read_cr0(); 188 cr0 = read_cr0();
195 cr2 = read_cr2(); 189 cr2 = read_cr2();
196 cr3 = read_cr3(); 190 cr3 = read_cr3();
197 cr4 = read_cr4(); 191 cr4 = read_cr4();
198 192
199 printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 193 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
200 fs,fsindex,gs,gsindex,shadowgs); 194 fs, fsindex, gs, gsindex, shadowgs);
201 printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); 195 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
202 printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); 196 es, cr0);
197 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
198 cr4);
203 199
204 get_debugreg(d0, 0); 200 get_debugreg(d0, 0);
205 get_debugreg(d1, 1); 201 get_debugreg(d1, 1);
206 get_debugreg(d2, 2); 202 get_debugreg(d2, 2);
207 printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 203 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
208 get_debugreg(d3, 3); 204 get_debugreg(d3, 3);
209 get_debugreg(d6, 6); 205 get_debugreg(d6, 6);
210 get_debugreg(d7, 7); 206 get_debugreg(d7, 7);
211 printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 207 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
212} 208}
213 209
214void show_regs(struct pt_regs *regs) 210void show_regs(struct pt_regs *regs)
215{ 211{
216 printk("CPU %d:", smp_processor_id()); 212 printk(KERN_INFO "CPU %d:", smp_processor_id());
217 __show_regs(regs); 213 __show_regs(regs, 1);
218 show_trace(NULL, regs, (void *)(regs + 1), regs->bp); 214 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
219} 215}
220 216
@@ -239,6 +235,14 @@ void exit_thread(void)
239 t->io_bitmap_max = 0; 235 t->io_bitmap_max = 0;
240 put_cpu(); 236 put_cpu();
241 } 237 }
238#ifdef CONFIG_X86_DS
239 /* Free any DS contexts that have not been properly released. */
240 if (unlikely(t->ds_ctx)) {
241 /* we clear debugctl to make sure DS is not used. */
242 update_debugctlmsr(0);
243 ds_free(t->ds_ctx);
244 }
245#endif /* CONFIG_X86_DS */
242} 246}
243 247
244void flush_thread(void) 248void flush_thread(void)
@@ -314,10 +318,10 @@ void prepare_to_copy(struct task_struct *tsk)
314 318
315int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, 319int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
316 unsigned long unused, 320 unsigned long unused,
317 struct task_struct * p, struct pt_regs * regs) 321 struct task_struct *p, struct pt_regs *regs)
318{ 322{
319 int err; 323 int err;
320 struct pt_regs * childregs; 324 struct pt_regs *childregs;
321 struct task_struct *me = current; 325 struct task_struct *me = current;
322 326
323 childregs = ((struct pt_regs *) 327 childregs = ((struct pt_regs *)
@@ -362,10 +366,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
362 if (test_thread_flag(TIF_IA32)) 366 if (test_thread_flag(TIF_IA32))
363 err = do_set_thread_area(p, -1, 367 err = do_set_thread_area(p, -1,
364 (struct user_desc __user *)childregs->si, 0); 368 (struct user_desc __user *)childregs->si, 0);
365 else 369 else
366#endif 370#endif
367 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); 371 err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8);
368 if (err) 372 if (err)
369 goto out; 373 goto out;
370 } 374 }
371 err = 0; 375 err = 0;
@@ -472,13 +476,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
472 next = &next_p->thread; 476 next = &next_p->thread;
473 477
474 debugctl = prev->debugctlmsr; 478 debugctl = prev->debugctlmsr;
475 if (next->ds_area_msr != prev->ds_area_msr) { 479
476 /* we clear debugctl to make sure DS 480#ifdef CONFIG_X86_DS
477 * is not in use when we change it */ 481 {
478 debugctl = 0; 482 unsigned long ds_prev = 0, ds_next = 0;
479 update_debugctlmsr(0); 483
480 wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); 484 if (prev->ds_ctx)
485 ds_prev = (unsigned long)prev->ds_ctx->ds;
486 if (next->ds_ctx)
487 ds_next = (unsigned long)next->ds_ctx->ds;
488
489 if (ds_next != ds_prev) {
490 /*
491 * We clear debugctl to make sure DS
492 * is not in use when we change it:
493 */
494 debugctl = 0;
495 update_debugctlmsr(0);
496 wrmsrl(MSR_IA32_DS_AREA, ds_next);
497 }
481 } 498 }
499#endif /* CONFIG_X86_DS */
482 500
483 if (next->debugctlmsr != debugctl) 501 if (next->debugctlmsr != debugctl)
484 update_debugctlmsr(next->debugctlmsr); 502 update_debugctlmsr(next->debugctlmsr);
@@ -516,13 +534,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
516 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 534 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
517 } 535 }
518 536
519#ifdef X86_BTS 537#ifdef CONFIG_X86_PTRACE_BTS
520 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) 538 if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS))
521 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); 539 ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS);
522 540
523 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) 541 if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS))
524 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); 542 ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES);
525#endif 543#endif /* CONFIG_X86_PTRACE_BTS */
526} 544}
527 545
528/* 546/*
@@ -537,14 +555,14 @@ static inline void __switch_to_xtra(struct task_struct *prev_p,
537struct task_struct * 555struct task_struct *
538__switch_to(struct task_struct *prev_p, struct task_struct *next_p) 556__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
539{ 557{
540 struct thread_struct *prev = &prev_p->thread, 558 struct thread_struct *prev = &prev_p->thread;
541 *next = &next_p->thread; 559 struct thread_struct *next = &next_p->thread;
542 int cpu = smp_processor_id(); 560 int cpu = smp_processor_id();
543 struct tss_struct *tss = &per_cpu(init_tss, cpu); 561 struct tss_struct *tss = &per_cpu(init_tss, cpu);
544 unsigned fsindex, gsindex; 562 unsigned fsindex, gsindex;
545 563
546 /* we're going to use this soon, after a few expensive things */ 564 /* we're going to use this soon, after a few expensive things */
547 if (next_p->fpu_counter>5) 565 if (next_p->fpu_counter > 5)
548 prefetch(next->xstate); 566 prefetch(next->xstate);
549 567
550 /* 568 /*
@@ -552,13 +570,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
552 */ 570 */
553 load_sp0(tss, next); 571 load_sp0(tss, next);
554 572
555 /* 573 /*
556 * Switch DS and ES. 574 * Switch DS and ES.
557 * This won't pick up thread selector changes, but I guess that is ok. 575 * This won't pick up thread selector changes, but I guess that is ok.
558 */ 576 */
559 savesegment(es, prev->es); 577 savesegment(es, prev->es);
560 if (unlikely(next->es | prev->es)) 578 if (unlikely(next->es | prev->es))
561 loadsegment(es, next->es); 579 loadsegment(es, next->es);
562 580
563 savesegment(ds, prev->ds); 581 savesegment(ds, prev->ds);
564 if (unlikely(next->ds | prev->ds)) 582 if (unlikely(next->ds | prev->ds))
@@ -584,50 +602,50 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
584 */ 602 */
585 arch_leave_lazy_cpu_mode(); 603 arch_leave_lazy_cpu_mode();
586 604
587 /* 605 /*
588 * Switch FS and GS. 606 * Switch FS and GS.
607 *
608 * Segment register != 0 always requires a reload. Also
609 * reload when it has changed. When prev process used 64bit
610 * base always reload to avoid an information leak.
589 */ 611 */
590 { 612 if (unlikely(fsindex | next->fsindex | prev->fs)) {
591 /* segment register != 0 always requires a reload. 613 loadsegment(fs, next->fsindex);
592 also reload when it has changed. 614 /*
593 when prev process used 64bit base always reload 615 * Check if the user used a selector != 0; if yes
594 to avoid an information leak. */ 616 * clear 64bit base, since overloaded base is always
595 if (unlikely(fsindex | next->fsindex | prev->fs)) { 617 * mapped to the Null selector
596 loadsegment(fs, next->fsindex); 618 */
597 /* check if the user used a selector != 0 619 if (fsindex)
598 * if yes clear 64bit base, since overloaded base 620 prev->fs = 0;
599 * is always mapped to the Null selector 621 }
600 */ 622 /* when next process has a 64bit base use it */
601 if (fsindex) 623 if (next->fs)
602 prev->fs = 0; 624 wrmsrl(MSR_FS_BASE, next->fs);
603 } 625 prev->fsindex = fsindex;
604 /* when next process has a 64bit base use it */ 626
605 if (next->fs) 627 if (unlikely(gsindex | next->gsindex | prev->gs)) {
606 wrmsrl(MSR_FS_BASE, next->fs); 628 load_gs_index(next->gsindex);
607 prev->fsindex = fsindex; 629 if (gsindex)
608 630 prev->gs = 0;
609 if (unlikely(gsindex | next->gsindex | prev->gs)) {
610 load_gs_index(next->gsindex);
611 if (gsindex)
612 prev->gs = 0;
613 }
614 if (next->gs)
615 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
616 prev->gsindex = gsindex;
617 } 631 }
632 if (next->gs)
633 wrmsrl(MSR_KERNEL_GS_BASE, next->gs);
634 prev->gsindex = gsindex;
618 635
619 /* Must be after DS reload */ 636 /* Must be after DS reload */
620 unlazy_fpu(prev_p); 637 unlazy_fpu(prev_p);
621 638
622 /* 639 /*
623 * Switch the PDA and FPU contexts. 640 * Switch the PDA and FPU contexts.
624 */ 641 */
625 prev->usersp = read_pda(oldrsp); 642 prev->usersp = read_pda(oldrsp);
626 write_pda(oldrsp, next->usersp); 643 write_pda(oldrsp, next->usersp);
627 write_pda(pcurrent, next_p); 644 write_pda(pcurrent, next_p);
628 645
629 write_pda(kernelstack, 646 write_pda(kernelstack,
630 (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); 647 (unsigned long)task_stack_page(next_p) +
648 THREAD_SIZE - PDA_STACKOFFSET);
631#ifdef CONFIG_CC_STACKPROTECTOR 649#ifdef CONFIG_CC_STACKPROTECTOR
632 write_pda(stack_canary, next_p->stack_canary); 650 write_pda(stack_canary, next_p->stack_canary);
633 /* 651 /*
@@ -664,7 +682,7 @@ long sys_execve(char __user *name, char __user * __user *argv,
664 char __user * __user *envp, struct pt_regs *regs) 682 char __user * __user *envp, struct pt_regs *regs)
665{ 683{
666 long error; 684 long error;
667 char * filename; 685 char *filename;
668 686
669 filename = getname(name); 687 filename = getname(name);
670 error = PTR_ERR(filename); 688 error = PTR_ERR(filename);
@@ -722,55 +740,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs)
722unsigned long get_wchan(struct task_struct *p) 740unsigned long get_wchan(struct task_struct *p)
723{ 741{
724 unsigned long stack; 742 unsigned long stack;
725 u64 fp,ip; 743 u64 fp, ip;
726 int count = 0; 744 int count = 0;
727 745
728 if (!p || p == current || p->state==TASK_RUNNING) 746 if (!p || p == current || p->state == TASK_RUNNING)
729 return 0; 747 return 0;
730 stack = (unsigned long)task_stack_page(p); 748 stack = (unsigned long)task_stack_page(p);
731 if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) 749 if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
732 return 0; 750 return 0;
733 fp = *(u64 *)(p->thread.sp); 751 fp = *(u64 *)(p->thread.sp);
734 do { 752 do {
735 if (fp < (unsigned long)stack || 753 if (fp < (unsigned long)stack ||
736 fp > (unsigned long)stack+THREAD_SIZE) 754 fp >= (unsigned long)stack+THREAD_SIZE)
737 return 0; 755 return 0;
738 ip = *(u64 *)(fp+8); 756 ip = *(u64 *)(fp+8);
739 if (!in_sched_functions(ip)) 757 if (!in_sched_functions(ip))
740 return ip; 758 return ip;
741 fp = *(u64 *)fp; 759 fp = *(u64 *)fp;
742 } while (count++ < 16); 760 } while (count++ < 16);
743 return 0; 761 return 0;
744} 762}
745 763
746long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) 764long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
747{ 765{
748 int ret = 0; 766 int ret = 0;
749 int doit = task == current; 767 int doit = task == current;
750 int cpu; 768 int cpu;
751 769
752 switch (code) { 770 switch (code) {
753 case ARCH_SET_GS: 771 case ARCH_SET_GS:
754 if (addr >= TASK_SIZE_OF(task)) 772 if (addr >= TASK_SIZE_OF(task))
755 return -EPERM; 773 return -EPERM;
756 cpu = get_cpu(); 774 cpu = get_cpu();
757 /* handle small bases via the GDT because that's faster to 775 /* handle small bases via the GDT because that's faster to
758 switch. */ 776 switch. */
759 if (addr <= 0xffffffff) { 777 if (addr <= 0xffffffff) {
760 set_32bit_tls(task, GS_TLS, addr); 778 set_32bit_tls(task, GS_TLS, addr);
761 if (doit) { 779 if (doit) {
762 load_TLS(&task->thread, cpu); 780 load_TLS(&task->thread, cpu);
763 load_gs_index(GS_TLS_SEL); 781 load_gs_index(GS_TLS_SEL);
764 } 782 }
765 task->thread.gsindex = GS_TLS_SEL; 783 task->thread.gsindex = GS_TLS_SEL;
766 task->thread.gs = 0; 784 task->thread.gs = 0;
767 } else { 785 } else {
768 task->thread.gsindex = 0; 786 task->thread.gsindex = 0;
769 task->thread.gs = addr; 787 task->thread.gs = addr;
770 if (doit) { 788 if (doit) {
771 load_gs_index(0); 789 load_gs_index(0);
772 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); 790 ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr);
773 } 791 }
774 } 792 }
775 put_cpu(); 793 put_cpu();
776 break; 794 break;
@@ -824,8 +842,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
824 rdmsrl(MSR_KERNEL_GS_BASE, base); 842 rdmsrl(MSR_KERNEL_GS_BASE, base);
825 else 843 else
826 base = task->thread.gs; 844 base = task->thread.gs;
827 } 845 } else
828 else
829 base = task->thread.gs; 846 base = task->thread.gs;
830 ret = put_user(base, (unsigned long __user *)addr); 847 ret = put_user(base, (unsigned long __user *)addr);
831 break; 848 break;
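
The get_wchan() hunk above also tightens both stack-bound checks from '>' to '>=': an address equal to stack + THREAD_SIZE already points one past the end of the stack, so the valid range is half-open. A tiny sketch expressing that check (the helper itself is hypothetical, not from the patch):

	#include <linux/sched.h>
	#include <linux/types.h>

	/* Hypothetical helper: true iff sp lies inside
	 * [stack, stack + THREAD_SIZE). */
	static bool addr_on_stack(unsigned long sp, unsigned long stack)
	{
		return sp >= stack && sp < stack + THREAD_SIZE;
	}
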
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 77040b6070e1..0a6d8c12e10d 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -14,6 +14,7 @@
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/ptrace.h> 15#include <linux/ptrace.h>
16#include <linux/regset.h> 16#include <linux/regset.h>
17#include <linux/tracehook.h>
17#include <linux/user.h> 18#include <linux/user.h>
18#include <linux/elf.h> 19#include <linux/elf.h>
19#include <linux/security.h> 20#include <linux/security.h>
@@ -39,7 +40,9 @@ enum x86_regset {
39 REGSET_GENERAL, 40 REGSET_GENERAL,
40 REGSET_FP, 41 REGSET_FP,
41 REGSET_XFP, 42 REGSET_XFP,
43 REGSET_IOPERM64 = REGSET_XFP,
42 REGSET_TLS, 44 REGSET_TLS,
45 REGSET_IOPERM32,
43}; 46};
44 47
45/* 48/*
@@ -69,7 +72,7 @@ static inline bool invalid_selector(u16 value)
69 72
70#define FLAG_MASK FLAG_MASK_32 73#define FLAG_MASK FLAG_MASK_32
71 74
72static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) 75static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno)
73{ 76{
74 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); 77 BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0);
75 regno >>= 2; 78 regno >>= 2;
@@ -554,45 +557,138 @@ static int ptrace_set_debugreg(struct task_struct *child,
554 return 0; 557 return 0;
555} 558}
556 559
557#ifdef X86_BTS 560/*
561 * These access the current or another (stopped) task's io permission
562 * bitmap for debugging or core dump.
563 */
564static int ioperm_active(struct task_struct *target,
565 const struct user_regset *regset)
566{
567 return target->thread.io_bitmap_max / regset->size;
568}
558 569
559static int ptrace_bts_get_size(struct task_struct *child) 570static int ioperm_get(struct task_struct *target,
571 const struct user_regset *regset,
572 unsigned int pos, unsigned int count,
573 void *kbuf, void __user *ubuf)
560{ 574{
561 if (!child->thread.ds_area_msr) 575 if (!target->thread.io_bitmap_ptr)
562 return -ENXIO; 576 return -ENXIO;
563 577
564 return ds_get_bts_index((void *)child->thread.ds_area_msr); 578 return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
579 target->thread.io_bitmap_ptr,
580 0, IO_BITMAP_BYTES);
581}
582
583#ifdef CONFIG_X86_PTRACE_BTS
584/*
585 * The configuration for a particular BTS hardware implementation.
586 */
587struct bts_configuration {
588 /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */
589 unsigned char sizeof_bts;
590 /* the size of a field in the BTS record in bytes */
591 unsigned char sizeof_field;
592 /* a bitmask to enable/disable BTS in DEBUGCTL MSR */
593 unsigned long debugctl_mask;
594};
595static struct bts_configuration bts_cfg;
596
597#define BTS_MAX_RECORD_SIZE (8 * 3)
598
599
600/*
601 * Branch Trace Store (BTS) uses the following format. Different
602 * architectures vary in the size of those fields.
603 * - source linear address
604 * - destination linear address
605 * - flags
606 *
607 * Later architectures use 64bit pointers throughout, whereas earlier
608 * architectures use 32bit pointers in 32bit mode.
609 *
610 * We compute the base address for the first 8 fields based on:
611 * - the field size stored in the DS configuration
612 * - the relative field position
613 *
614 * In order to store additional information in the BTS buffer, we use
615 * a special source address to indicate that the record requires
616 * special interpretation.
617 *
618 * Netburst indicated via a bit in the flags field whether the branch
619 * was predicted; this is ignored.
620 */
621
622enum bts_field {
623 bts_from = 0,
624 bts_to,
625 bts_flags,
626
627 bts_escape = (unsigned long)-1,
628 bts_qual = bts_to,
629 bts_jiffies = bts_flags
630};
631
632static inline unsigned long bts_get(const char *base, enum bts_field field)
633{
634 base += (bts_cfg.sizeof_field * field);
635 return *(unsigned long *)base;
565} 636}
566 637
567static int ptrace_bts_read_record(struct task_struct *child, 638static inline void bts_set(char *base, enum bts_field field, unsigned long val)
568 long index, 639{
 640	base += (bts_cfg.sizeof_field * field);
641 (*(unsigned long *)base) = val;
642}
643
644/*
645 * Translate a BTS record from the raw format into the bts_struct format
646 *
647 * out (out): bts_struct interpretation
648 * raw: raw BTS record
649 */
650static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw)
651{
652 memset(out, 0, sizeof(*out));
653 if (bts_get(raw, bts_from) == bts_escape) {
654 out->qualifier = bts_get(raw, bts_qual);
655 out->variant.jiffies = bts_get(raw, bts_jiffies);
656 } else {
657 out->qualifier = BTS_BRANCH;
658 out->variant.lbr.from_ip = bts_get(raw, bts_from);
659 out->variant.lbr.to_ip = bts_get(raw, bts_to);
660 }
661}
662
663static int ptrace_bts_read_record(struct task_struct *child, size_t index,
569 struct bts_struct __user *out) 664 struct bts_struct __user *out)
570{ 665{
571 struct bts_struct ret; 666 struct bts_struct ret;
572 int retval; 667 const void *bts_record;
573 int bts_end; 668 size_t bts_index, bts_end;
574 int bts_index; 669 int error;
575 670
576 if (!child->thread.ds_area_msr) 671 error = ds_get_bts_end(child, &bts_end);
577 return -ENXIO; 672 if (error < 0)
673 return error;
578 674
579 if (index < 0)
580 return -EINVAL;
581
582 bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr);
583 if (bts_end <= index) 675 if (bts_end <= index)
584 return -EINVAL; 676 return -EINVAL;
585 677
678 error = ds_get_bts_index(child, &bts_index);
679 if (error < 0)
680 return error;
681
586 /* translate the ptrace bts index into the ds bts index */ 682 /* translate the ptrace bts index into the ds bts index */
587 bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); 683 bts_index += bts_end - (index + 1);
588 bts_index -= (index + 1); 684 if (bts_end <= bts_index)
589 if (bts_index < 0) 685 bts_index -= bts_end;
590 bts_index += bts_end; 686
687 error = ds_access_bts(child, bts_index, &bts_record);
688 if (error < 0)
689 return error;
591 690
592 retval = ds_read_bts((void *)child->thread.ds_area_msr, 691 ptrace_bts_translate_record(&ret, bts_record);
593 bts_index, &ret);
594 if (retval < 0)
595 return retval;
596 692
597 if (copy_to_user(out, &ret, sizeof(ret))) 693 if (copy_to_user(out, &ret, sizeof(ret)))
598 return -EFAULT; 694 return -EFAULT;
@@ -600,101 +696,106 @@ static int ptrace_bts_read_record(struct task_struct *child,
600 return sizeof(ret); 696 return sizeof(ret);
601} 697}
602 698
603static int ptrace_bts_clear(struct task_struct *child)
604{
605 if (!child->thread.ds_area_msr)
606 return -ENXIO;
607
608 return ds_clear((void *)child->thread.ds_area_msr);
609}
610
611static int ptrace_bts_drain(struct task_struct *child, 699static int ptrace_bts_drain(struct task_struct *child,
612 long size, 700 long size,
613 struct bts_struct __user *out) 701 struct bts_struct __user *out)
614{ 702{
615 int end, i; 703 struct bts_struct ret;
616 void *ds = (void *)child->thread.ds_area_msr; 704 const unsigned char *raw;
617 705 size_t end, i;
618 if (!ds) 706 int error;
619 return -ENXIO;
620 707
621 end = ds_get_bts_index(ds); 708 error = ds_get_bts_index(child, &end);
622 if (end <= 0) 709 if (error < 0)
623 return end; 710 return error;
624 711
625 if (size < (end * sizeof(struct bts_struct))) 712 if (size < (end * sizeof(struct bts_struct)))
626 return -EIO; 713 return -EIO;
627 714
628 for (i = 0; i < end; i++, out++) { 715 error = ds_access_bts(child, 0, (const void **)&raw);
629 struct bts_struct ret; 716 if (error < 0)
630 int retval; 717 return error;
631 718
632 retval = ds_read_bts(ds, i, &ret); 719 for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) {
633 if (retval < 0) 720 ptrace_bts_translate_record(&ret, raw);
634 return retval;
635 721
636 if (copy_to_user(out, &ret, sizeof(ret))) 722 if (copy_to_user(out, &ret, sizeof(ret)))
637 return -EFAULT; 723 return -EFAULT;
638 } 724 }
639 725
640 ds_clear(ds); 726 error = ds_clear_bts(child);
727 if (error < 0)
728 return error;
641 729
642 return end; 730 return end;
643} 731}
644 732
733static void ptrace_bts_ovfl(struct task_struct *child)
734{
735 send_sig(child->thread.bts_ovfl_signal, child, 0);
736}
737
645static int ptrace_bts_config(struct task_struct *child, 738static int ptrace_bts_config(struct task_struct *child,
646 long cfg_size, 739 long cfg_size,
647 const struct ptrace_bts_config __user *ucfg) 740 const struct ptrace_bts_config __user *ucfg)
648{ 741{
649 struct ptrace_bts_config cfg; 742 struct ptrace_bts_config cfg;
650 int bts_size, ret = 0; 743 int error = 0;
651 void *ds;
652 744
745 error = -EOPNOTSUPP;
746 if (!bts_cfg.sizeof_bts)
747 goto errout;
748
749 error = -EIO;
653 if (cfg_size < sizeof(cfg)) 750 if (cfg_size < sizeof(cfg))
654 return -EIO; 751 goto errout;
655 752
753 error = -EFAULT;
656 if (copy_from_user(&cfg, ucfg, sizeof(cfg))) 754 if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
657 return -EFAULT; 755 goto errout;
658 756
659 if ((int)cfg.size < 0) 757 error = -EINVAL;
660 return -EINVAL; 758 if ((cfg.flags & PTRACE_BTS_O_SIGNAL) &&
759 !(cfg.flags & PTRACE_BTS_O_ALLOC))
760 goto errout;
661 761
662 bts_size = 0; 762 if (cfg.flags & PTRACE_BTS_O_ALLOC) {
663 ds = (void *)child->thread.ds_area_msr; 763 ds_ovfl_callback_t ovfl = NULL;
664 if (ds) { 764 unsigned int sig = 0;
665 bts_size = ds_get_bts_size(ds);
666 if (bts_size < 0)
667 return bts_size;
668 }
669 cfg.size = PAGE_ALIGN(cfg.size);
670 765
671 if (bts_size != cfg.size) { 766 /* we ignore the error in case we were not tracing child */
672 ret = ptrace_bts_realloc(child, cfg.size, 767 (void)ds_release_bts(child);
673 cfg.flags & PTRACE_BTS_O_CUT_SIZE); 768
674 if (ret < 0) 769 if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
770 if (!cfg.signal)
771 goto errout;
772
773 sig = cfg.signal;
774 ovfl = ptrace_bts_ovfl;
775 }
776
777 error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl);
778 if (error < 0)
675 goto errout; 779 goto errout;
676 780
677 ds = (void *)child->thread.ds_area_msr; 781 child->thread.bts_ovfl_signal = sig;
678 } 782 }
679 783
680 if (cfg.flags & PTRACE_BTS_O_SIGNAL) 784 error = -EINVAL;
681 ret = ds_set_overflow(ds, DS_O_SIGNAL); 785 if (!child->thread.ds_ctx && cfg.flags)
682 else
683 ret = ds_set_overflow(ds, DS_O_WRAP);
684 if (ret < 0)
685 goto errout; 786 goto errout;
686 787
687 if (cfg.flags & PTRACE_BTS_O_TRACE) 788 if (cfg.flags & PTRACE_BTS_O_TRACE)
688 child->thread.debugctlmsr |= ds_debugctl_mask(); 789 child->thread.debugctlmsr |= bts_cfg.debugctl_mask;
689 else 790 else
690 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 791 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
691 792
692 if (cfg.flags & PTRACE_BTS_O_SCHED) 793 if (cfg.flags & PTRACE_BTS_O_SCHED)
693 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 794 set_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
694 else 795 else
695 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 796 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
696 797
697 ret = sizeof(cfg); 798 error = sizeof(cfg);
698 799
699out: 800out:
700 if (child->thread.debugctlmsr) 801 if (child->thread.debugctlmsr)
@@ -702,10 +803,10 @@ out:
702 else 803 else
703 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 804 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
704 805
705 return ret; 806 return error;
706 807
707errout: 808errout:
708 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 809 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
709 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 810 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
710 goto out; 811 goto out;
711} 812}
@@ -714,29 +815,40 @@ static int ptrace_bts_status(struct task_struct *child,
714 long cfg_size, 815 long cfg_size,
715 struct ptrace_bts_config __user *ucfg) 816 struct ptrace_bts_config __user *ucfg)
716{ 817{
717 void *ds = (void *)child->thread.ds_area_msr;
718 struct ptrace_bts_config cfg; 818 struct ptrace_bts_config cfg;
819 size_t end;
820 const void *base, *max;
821 int error;
719 822
720 if (cfg_size < sizeof(cfg)) 823 if (cfg_size < sizeof(cfg))
721 return -EIO; 824 return -EIO;
722 825
723 memset(&cfg, 0, sizeof(cfg)); 826 error = ds_get_bts_end(child, &end);
827 if (error < 0)
828 return error;
829
830 error = ds_access_bts(child, /* index = */ 0, &base);
831 if (error < 0)
832 return error;
724 833
725 if (ds) { 834 error = ds_access_bts(child, /* index = */ end, &max);
726 cfg.size = ds_get_bts_size(ds); 835 if (error < 0)
836 return error;
727 837
728 if (ds_get_overflow(ds) == DS_O_SIGNAL) 838 memset(&cfg, 0, sizeof(cfg));
729 cfg.flags |= PTRACE_BTS_O_SIGNAL; 839 cfg.size = (max - base);
840 cfg.signal = child->thread.bts_ovfl_signal;
841 cfg.bts_size = sizeof(struct bts_struct);
730 842
731 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && 843 if (cfg.signal)
732 child->thread.debugctlmsr & ds_debugctl_mask()) 844 cfg.flags |= PTRACE_BTS_O_SIGNAL;
733 cfg.flags |= PTRACE_BTS_O_TRACE;
734 845
735 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) 846 if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) &&
736 cfg.flags |= PTRACE_BTS_O_SCHED; 847 child->thread.debugctlmsr & bts_cfg.debugctl_mask)
737 } 848 cfg.flags |= PTRACE_BTS_O_TRACE;
738 849
739 cfg.bts_size = sizeof(struct bts_struct); 850 if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS))
851 cfg.flags |= PTRACE_BTS_O_SCHED;
740 852
741 if (copy_to_user(ucfg, &cfg, sizeof(cfg))) 853 if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
742 return -EFAULT; 854 return -EFAULT;
@@ -744,89 +856,38 @@ static int ptrace_bts_status(struct task_struct *child,
744 return sizeof(cfg); 856 return sizeof(cfg);
745} 857}
746 858
747
748static int ptrace_bts_write_record(struct task_struct *child, 859static int ptrace_bts_write_record(struct task_struct *child,
749 const struct bts_struct *in) 860 const struct bts_struct *in)
750{ 861{
751 int retval; 862 unsigned char bts_record[BTS_MAX_RECORD_SIZE];
752 863
753 if (!child->thread.ds_area_msr) 864 BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts);
754 return -ENXIO;
755 865
756 retval = ds_write_bts((void *)child->thread.ds_area_msr, in); 866 memset(bts_record, 0, bts_cfg.sizeof_bts);
757 if (retval) 867 switch (in->qualifier) {
758 return retval; 868 case BTS_INVALID:
869 break;
759 870
760 return sizeof(*in); 871 case BTS_BRANCH:
761} 872 bts_set(bts_record, bts_from, in->variant.lbr.from_ip);
873 bts_set(bts_record, bts_to, in->variant.lbr.to_ip);
874 break;
762 875
763static int ptrace_bts_realloc(struct task_struct *child, 876 case BTS_TASK_ARRIVES:
764 int size, int reduce_size) 877 case BTS_TASK_DEPARTS:
765{ 878 bts_set(bts_record, bts_from, bts_escape);
766 unsigned long rlim, vm; 879 bts_set(bts_record, bts_qual, in->qualifier);
767 int ret, old_size; 880 bts_set(bts_record, bts_jiffies, in->variant.jiffies);
881 break;
768 882
769 if (size < 0) 883 default:
770 return -EINVAL; 884 return -EINVAL;
771
772 old_size = ds_get_bts_size((void *)child->thread.ds_area_msr);
773 if (old_size < 0)
774 return old_size;
775
776 ret = ds_free((void **)&child->thread.ds_area_msr);
777 if (ret < 0)
778 goto out;
779
780 size >>= PAGE_SHIFT;
781 old_size >>= PAGE_SHIFT;
782
783 current->mm->total_vm -= old_size;
784 current->mm->locked_vm -= old_size;
785
786 if (size == 0)
787 goto out;
788
789 rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
790 vm = current->mm->total_vm + size;
791 if (rlim < vm) {
792 ret = -ENOMEM;
793
794 if (!reduce_size)
795 goto out;
796
797 size = rlim - current->mm->total_vm;
798 if (size <= 0)
799 goto out;
800 } 885 }
801 886
802 rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; 887 /* The writing task will be the switched-to task on a context
803 vm = current->mm->locked_vm + size; 888 * switch. It needs to write into the switched-from task's BTS
804 if (rlim < vm) { 889 * buffer. */
805 ret = -ENOMEM; 890 return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts);
806
807 if (!reduce_size)
808 goto out;
809
810 size = rlim - current->mm->locked_vm;
811 if (size <= 0)
812 goto out;
813 }
814
815 ret = ds_allocate((void **)&child->thread.ds_area_msr,
816 size << PAGE_SHIFT);
817 if (ret < 0)
818 goto out;
819
820 current->mm->total_vm += size;
821 current->mm->locked_vm += size;
822
823out:
824 if (child->thread.ds_area_msr)
825 set_tsk_thread_flag(child, TIF_DS_AREA_MSR);
826 else
827 clear_tsk_thread_flag(child, TIF_DS_AREA_MSR);
828
829 return ret;
830} 891}
831 892
832void ptrace_bts_take_timestamp(struct task_struct *tsk, 893void ptrace_bts_take_timestamp(struct task_struct *tsk,
@@ -839,7 +900,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk,
839 900
840 ptrace_bts_write_record(tsk, &rec); 901 ptrace_bts_write_record(tsk, &rec);
841} 902}
842#endif /* X86_BTS */ 903
904static const struct bts_configuration bts_cfg_netburst = {
905 .sizeof_bts = sizeof(long) * 3,
906 .sizeof_field = sizeof(long),
907 .debugctl_mask = (1<<2)|(1<<3)|(1<<5)
908};
909
910static const struct bts_configuration bts_cfg_pentium_m = {
911 .sizeof_bts = sizeof(long) * 3,
912 .sizeof_field = sizeof(long),
913 .debugctl_mask = (1<<6)|(1<<7)
914};
915
916static const struct bts_configuration bts_cfg_core2 = {
917 .sizeof_bts = 8 * 3,
918 .sizeof_field = 8,
919 .debugctl_mask = (1<<6)|(1<<7)|(1<<9)
920};
921
922static inline void bts_configure(const struct bts_configuration *cfg)
923{
924 bts_cfg = *cfg;
925}
926
927void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c)
928{
929 switch (c->x86) {
930 case 0x6:
931 switch (c->x86_model) {
932 case 0xD:
933 case 0xE: /* Pentium M */
934 bts_configure(&bts_cfg_pentium_m);
935 break;
936 case 0xF: /* Core2 */
937 case 0x1C: /* Atom */
938 bts_configure(&bts_cfg_core2);
939 break;
940 default:
941 /* sorry, don't know about them */
942 break;
943 }
944 break;
945 case 0xF:
946 switch (c->x86_model) {
947 case 0x0:
948 case 0x1:
949 case 0x2: /* Netburst */
950 bts_configure(&bts_cfg_netburst);
951 break;
952 default:
953 /* sorry, don't know about them */
954 break;
955 }
956 break;
957 default:
958 /* sorry, don't know about them */
959 break;
960 }
961}
962#endif /* CONFIG_X86_PTRACE_BTS */
843 963
844/* 964/*
845 * Called by kernel/ptrace.c when detaching.. 965 * Called by kernel/ptrace.c when detaching..
@@ -852,15 +972,15 @@ void ptrace_disable(struct task_struct *child)
852#ifdef TIF_SYSCALL_EMU 972#ifdef TIF_SYSCALL_EMU
853 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); 973 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
854#endif 974#endif
855 if (child->thread.ds_area_msr) { 975#ifdef CONFIG_X86_PTRACE_BTS
856#ifdef X86_BTS 976 (void)ds_release_bts(child);
857 ptrace_bts_realloc(child, 0, 0); 977
858#endif 978 child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask;
859 child->thread.debugctlmsr &= ~ds_debugctl_mask(); 979 if (!child->thread.debugctlmsr)
860 if (!child->thread.debugctlmsr) 980 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
861 clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); 981
862 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); 982 clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS);
863 } 983#endif /* CONFIG_X86_PTRACE_BTS */
864} 984}
865 985
866#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 986#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
@@ -980,7 +1100,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
980 /* 1100 /*
981 * These bits need more cooking - not enabled yet: 1101 * These bits need more cooking - not enabled yet:
982 */ 1102 */
983#ifdef X86_BTS 1103#ifdef CONFIG_X86_PTRACE_BTS
984 case PTRACE_BTS_CONFIG: 1104 case PTRACE_BTS_CONFIG:
985 ret = ptrace_bts_config 1105 ret = ptrace_bts_config
986 (child, data, (struct ptrace_bts_config __user *)addr); 1106 (child, data, (struct ptrace_bts_config __user *)addr);
@@ -992,7 +1112,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
992 break; 1112 break;
993 1113
994 case PTRACE_BTS_SIZE: 1114 case PTRACE_BTS_SIZE:
995 ret = ptrace_bts_get_size(child); 1115 ret = ds_get_bts_index(child, /* pos = */ NULL);
996 break; 1116 break;
997 1117
998 case PTRACE_BTS_GET: 1118 case PTRACE_BTS_GET:
@@ -1001,14 +1121,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
1001 break; 1121 break;
1002 1122
1003 case PTRACE_BTS_CLEAR: 1123 case PTRACE_BTS_CLEAR:
1004 ret = ptrace_bts_clear(child); 1124 ret = ds_clear_bts(child);
1005 break; 1125 break;
1006 1126
1007 case PTRACE_BTS_DRAIN: 1127 case PTRACE_BTS_DRAIN:
1008 ret = ptrace_bts_drain 1128 ret = ptrace_bts_drain
1009 (child, data, (struct bts_struct __user *) addr); 1129 (child, data, (struct bts_struct __user *) addr);
1010 break; 1130 break;
1011#endif 1131#endif /* CONFIG_X86_PTRACE_BTS */
1012 1132
1013 default: 1133 default:
1014 ret = ptrace_request(child, request, addr, data); 1134 ret = ptrace_request(child, request, addr, data);
@@ -1290,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = {
1290 .size = sizeof(long), .align = sizeof(long), 1410 .size = sizeof(long), .align = sizeof(long),
1291 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1411 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1292 }, 1412 },
1413 [REGSET_IOPERM64] = {
1414 .core_note_type = NT_386_IOPERM,
1415 .n = IO_BITMAP_LONGS,
1416 .size = sizeof(long), .align = sizeof(long),
1417 .active = ioperm_active, .get = ioperm_get
1418 },
1293}; 1419};
1294 1420
1295static const struct user_regset_view user_x86_64_view = { 1421static const struct user_regset_view user_x86_64_view = {
@@ -1336,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = {
1336 .active = regset_tls_active, 1462 .active = regset_tls_active,
1337 .get = regset_tls_get, .set = regset_tls_set 1463 .get = regset_tls_get, .set = regset_tls_set
1338 }, 1464 },
1465 [REGSET_IOPERM32] = {
1466 .core_note_type = NT_386_IOPERM,
1467 .n = IO_BITMAP_BYTES / sizeof(u32),
1468 .size = sizeof(u32), .align = sizeof(u32),
1469 .active = ioperm_active, .get = ioperm_get
1470 },
1339}; 1471};
1340 1472
1341static const struct user_regset_view user_x86_32_view = { 1473static const struct user_regset_view user_x86_32_view = {
@@ -1357,9 +1489,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1357#endif 1489#endif
1358} 1490}
1359 1491
1360#ifdef CONFIG_X86_32 1492void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1361 1493 int error_code, int si_code)
1362void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1363{ 1494{
1364 struct siginfo info; 1495 struct siginfo info;
1365 1496
@@ -1368,7 +1499,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1368 1499
1369 memset(&info, 0, sizeof(info)); 1500 memset(&info, 0, sizeof(info));
1370 info.si_signo = SIGTRAP; 1501 info.si_signo = SIGTRAP;
1371 info.si_code = TRAP_BRKPT; 1502 info.si_code = si_code;
1372 1503
1373 /* User-mode ip? */ 1504 /* User-mode ip? */
1374 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1505 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL;
@@ -1377,143 +1508,83 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code)
1377 force_sig_info(SIGTRAP, &info, tsk); 1508 force_sig_info(SIGTRAP, &info, tsk);
1378} 1509}
1379 1510
1380/* notification of system call entry/exit
1381 * - triggered by current->work.syscall_trace
1382 */
1383int do_syscall_trace(struct pt_regs *regs, int entryexit)
1384{
1385 int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU);
1386 /*
1387 * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall
1388 * interception
1389 */
1390 int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP);
1391 int ret = 0;
1392
1393 /* do the secure computing check first */
1394 if (!entryexit)
1395 secure_computing(regs->orig_ax);
1396
1397 if (unlikely(current->audit_context)) {
1398 if (entryexit)
1399 audit_syscall_exit(AUDITSC_RESULT(regs->ax),
1400 regs->ax);
1401 /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only
1402 * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is
1403 * not used, entry.S will call us only on syscall exit, not
1404 * entry; so when TIF_SYSCALL_AUDIT is used we must avoid
1405 * calling send_sigtrap() on syscall entry.
1406 *
1407 * Note that when PTRACE_SYSEMU_SINGLESTEP is used,
1408 * is_singlestep is false, despite his name, so we will still do
1409 * the correct thing.
1410 */
1411 else if (is_singlestep)
1412 goto out;
1413 }
1414
1415 if (!(current->ptrace & PT_PTRACED))
1416 goto out;
1417
1418 /* If a process stops on the 1st tracepoint with SYSCALL_TRACE
1419 * and then is resumed with SYSEMU_SINGLESTEP, it will come in
1420 * here. We have to check this and return */
1421 if (is_sysemu && entryexit)
1422 return 0;
1423
1424 /* Fake a debug trap */
1425 if (is_singlestep)
1426 send_sigtrap(current, regs, 0);
1427
1428 if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu)
1429 goto out;
1430
1431 /* the 0x80 provides a way for the tracing parent to distinguish
1432 between a syscall stop and SIGTRAP delivery */
1433 /* Note that the debugger could change the result of test_thread_flag!*/
1434 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0));
1435 1511
1436 /* 1512#ifdef CONFIG_X86_32
1437 * this isn't the same as continuing with a signal, but it will do 1513# define IS_IA32 1
1438 * for normal use. strace only continues with a signal if the 1514#elif defined CONFIG_IA32_EMULATION
1439 * stopping signal is not SIGTRAP. -brl 1515# define IS_IA32 test_thread_flag(TIF_IA32)
1440 */ 1516#else
1441 if (current->exit_code) { 1517# define IS_IA32 0
1442 send_sig(current->exit_code, current, 1); 1518#endif
1443 current->exit_code = 0;
1444 }
1445 ret = is_sysemu;
1446out:
1447 if (unlikely(current->audit_context) && !entryexit)
1448 audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax,
1449 regs->bx, regs->cx, regs->dx, regs->si);
1450 if (ret == 0)
1451 return 0;
1452
1453 regs->orig_ax = -1; /* force skip of syscall restarting */
1454 if (unlikely(current->audit_context))
1455 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1456 return 1;
1457}
1458
1459#else /* CONFIG_X86_64 */
1460 1519
1461static void syscall_trace(struct pt_regs *regs) 1520/*
1521 * We must return the syscall number to actually look up in the table.
1522 * This can be -1L to skip running any syscall at all.
1523 */
1524asmregparm long syscall_trace_enter(struct pt_regs *regs)
1462{ 1525{
1526 long ret = 0;
1463 1527
1464#if 0
1465 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n",
1466 current->comm,
1467 regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0),
1468 current_thread_info()->flags, current->ptrace);
1469#endif
1470
1471 ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
1472 ? 0x80 : 0));
1473 /* 1528 /*
1474 * this isn't the same as continuing with a signal, but it will do 1529 * If we stepped into a sysenter/syscall insn, it trapped in
1475 * for normal use. strace only continues with a signal if the 1530 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
1476 * stopping signal is not SIGTRAP. -brl 1531 * If user-mode had set TF itself, then it's still clear from
1532 * do_debug() and we need to set it again to restore the user
1533 * state. If we entered on the slow path, TF was already set.
1477 */ 1534 */
1478 if (current->exit_code) { 1535 if (test_thread_flag(TIF_SINGLESTEP))
1479 send_sig(current->exit_code, current, 1); 1536 regs->flags |= X86_EFLAGS_TF;
1480 current->exit_code = 0;
1481 }
1482}
1483 1537
1484asmlinkage void syscall_trace_enter(struct pt_regs *regs)
1485{
1486 /* do the secure computing check first */ 1538 /* do the secure computing check first */
1487 secure_computing(regs->orig_ax); 1539 secure_computing(regs->orig_ax);
1488 1540
1489 if (test_thread_flag(TIF_SYSCALL_TRACE) 1541 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1490 && (current->ptrace & PT_PTRACED)) 1542 ret = -1L;
1491 syscall_trace(regs); 1543
1544 if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
1545 tracehook_report_syscall_entry(regs))
1546 ret = -1L;
1492 1547
1493 if (unlikely(current->audit_context)) { 1548 if (unlikely(current->audit_context)) {
1494 if (test_thread_flag(TIF_IA32)) { 1549 if (IS_IA32)
1495 audit_syscall_entry(AUDIT_ARCH_I386, 1550 audit_syscall_entry(AUDIT_ARCH_I386,
1496 regs->orig_ax, 1551 regs->orig_ax,
1497 regs->bx, regs->cx, 1552 regs->bx, regs->cx,
1498 regs->dx, regs->si); 1553 regs->dx, regs->si);
1499 } else { 1554#ifdef CONFIG_X86_64
1555 else
1500 audit_syscall_entry(AUDIT_ARCH_X86_64, 1556 audit_syscall_entry(AUDIT_ARCH_X86_64,
1501 regs->orig_ax, 1557 regs->orig_ax,
1502 regs->di, regs->si, 1558 regs->di, regs->si,
1503 regs->dx, regs->r10); 1559 regs->dx, regs->r10);
1504 } 1560#endif
1505 } 1561 }
1562
1563 return ret ?: regs->orig_ax;
1506} 1564}
1507 1565
1508asmlinkage void syscall_trace_leave(struct pt_regs *regs) 1566asmregparm void syscall_trace_leave(struct pt_regs *regs)
1509{ 1567{
1510 if (unlikely(current->audit_context)) 1568 if (unlikely(current->audit_context))
1511 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1569 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1512 1570
1513 if ((test_thread_flag(TIF_SYSCALL_TRACE) 1571 if (test_thread_flag(TIF_SYSCALL_TRACE))
1514 || test_thread_flag(TIF_SINGLESTEP)) 1572 tracehook_report_syscall_exit(regs, 0);
1515 && (current->ptrace & PT_PTRACED))
1516 syscall_trace(regs);
1517}
1518 1573
1519#endif /* CONFIG_X86_32 */ 1574 /*
1575 * If TIF_SYSCALL_EMU is set, we only get here because of
1576 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1577 * We already reported this syscall instruction in
1578 * syscall_trace_enter(), so don't do any more now.
1579 */
1580 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1581 return;
1582
1583 /*
1584 * If we are single-stepping, synthesize a trap to follow the
1585 * system call instruction.
1586 */
1587 if (test_thread_flag(TIF_SINGLESTEP) &&
1588 tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
1589 send_sigtrap(current, regs, 0, TRAP_BRKPT);
1590}
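The reworked syscall_trace_enter()/syscall_trace_leave() pair above is what a PTRACE_SYSCALL tracer ultimately exercises: the tracee stops once at syscall entry and once at syscall exit, and orig_ax carries the syscall number at each stop. A minimal user-space sketch of such a tracer (x86-64 assumed, error handling omitted; illustration only, not part of this patch):

#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t child = fork();

	if (child == 0) {
		/* Child: ask to be traced, then run the target program. */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		execlp("true", "true", (char *)NULL);
		return 1;
	}

	waitpid(child, NULL, 0);		/* initial stop after execlp() */

	for (;;) {
		struct user_regs_struct regs;
		int status;

		/* Resume until the next syscall entry or exit stop. */
		ptrace(PTRACE_SYSCALL, child, NULL, NULL);
		waitpid(child, &status, 0);
		if (WIFEXITED(status))
			break;

		/* On x86-64, orig_rax holds the syscall number at both stops. */
		ptrace(PTRACE_GETREGS, child, NULL, &regs);
		printf("syscall stop: orig_rax=%lld\n", (long long)regs.orig_rax);
	}
	return 0;
}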
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 05fbe9a0325a..4f9c55f3a7c0 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
97 return dst->version; 97 return dst->version;
98} 98}
99 99
100unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
101{
102 u64 pv_tsc_khz = 1000000ULL << 32;
103
104 do_div(pv_tsc_khz, src->tsc_to_system_mul);
105 if (src->tsc_shift < 0)
106 pv_tsc_khz <<= -src->tsc_shift;
107 else
108 pv_tsc_khz >>= src->tsc_shift;
109 return pv_tsc_khz;
110}
111
100cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 112cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
101{ 113{
102 struct pvclock_shadow_time shadow; 114 struct pvclock_shadow_time shadow;
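pvclock_tsc_khz() simply inverts the guest clock parameters: tsc_to_system_mul is a 32.32 fixed-point multiplier from (shifted) TSC ticks to nanoseconds, so the TSC frequency in kHz is 10^6 * 2^32 / mul, corrected by tsc_shift. A stand-alone sketch of the same arithmetic, with do_div() replaced by plain 64-bit division (illustration only):

#include <stdint.h>

static unsigned long tsc_khz_from_pvclock(uint32_t tsc_to_system_mul, int8_t tsc_shift)
{
	uint64_t khz = 1000000ULL << 32;	/* 10^6 kHz, scaled by 2^32 */

	khz /= tsc_to_system_mul;		/* undo the tick->ns multiplier */
	if (tsc_shift < 0)			/* undo the pre-shift */
		khz <<= -tsc_shift;
	else
		khz >>= tsc_shift;
	return (unsigned long)khz;
}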
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index d13858818100..67465ed89310 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev)
35 if (!(word & (1 << 13))) { 35 if (!(word & (1 << 13))) {
36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " 36 dev_info(&dev->dev, "Intel E7520/7320/7525 detected; "
37 "disabling irq balancing and affinity\n"); 37 "disabling irq balancing and affinity\n");
38#ifdef CONFIG_IRQBALANCE
39 irqbalance_disable("");
40#endif
41 noirqdebug_setup(""); 38 noirqdebug_setup("");
42#ifdef CONFIG_PROC_FS 39#ifdef CONFIG_PROC_FS
43 no_irq_affinity = 1; 40 no_irq_affinity = 1;
@@ -354,9 +351,27 @@ static void ati_force_hpet_resume(void)
354 printk(KERN_DEBUG "Force enabled HPET at resume\n"); 351 printk(KERN_DEBUG "Force enabled HPET at resume\n");
355} 352}
356 353
354static u32 ati_ixp4x0_rev(struct pci_dev *dev)
355{
356 u32 d;
357 u8 b;
358
359 pci_read_config_byte(dev, 0xac, &b);
360 b &= ~(1<<5);
361 pci_write_config_byte(dev, 0xac, b);
362 pci_read_config_dword(dev, 0x70, &d);
363 d |= 1<<8;
364 pci_write_config_dword(dev, 0x70, d);
365 pci_read_config_dword(dev, 0x8, &d);
366 d &= 0xff;
367 dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d);
368 return d;
369}
370
357static void ati_force_enable_hpet(struct pci_dev *dev) 371static void ati_force_enable_hpet(struct pci_dev *dev)
358{ 372{
359 u32 uninitialized_var(val); 373 u32 d, val;
374 u8 b;
360 375
361 if (hpet_address || force_hpet_address) 376 if (hpet_address || force_hpet_address)
362 return; 377 return;
@@ -366,14 +381,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev)
366 return; 381 return;
367 } 382 }
368 383
384 d = ati_ixp4x0_rev(dev);
385 if (d < 0x82)
386 return;
387
388 /* base address */
369 pci_write_config_dword(dev, 0x14, 0xfed00000); 389 pci_write_config_dword(dev, 0x14, 0xfed00000);
370 pci_read_config_dword(dev, 0x14, &val); 390 pci_read_config_dword(dev, 0x14, &val);
391
392 /* enable interrupt */
393 outb(0x72, 0xcd6); b = inb(0xcd7);
394 b |= 0x1;
395 outb(0x72, 0xcd6); outb(b, 0xcd7);
396 outb(0x72, 0xcd6); b = inb(0xcd7);
397 if (!(b & 0x1))
398 return;
399 pci_read_config_dword(dev, 0x64, &d);
400 d |= (1<<10);
401 pci_write_config_dword(dev, 0x64, d);
402 pci_read_config_dword(dev, 0x64, &d);
403 if (!(d & (1<<10)))
404 return;
405
371 force_hpet_address = val; 406 force_hpet_address = val;
372 force_hpet_resume_type = ATI_FORCE_HPET_RESUME; 407 force_hpet_resume_type = ATI_FORCE_HPET_RESUME;
373 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", 408 dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n",
374 force_hpet_address); 409 force_hpet_address);
375 cached_dev = dev; 410 cached_dev = dev;
376 return;
377} 411}
378DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, 412DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS,
379 ati_force_enable_hpet); 413 ati_force_enable_hpet);
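ati_ixp4x0_rev() and the interrupt-enable sequence above go through the SB4x0 power-management index/data pair at I/O ports 0xcd6/0xcd7: write the register index to 0xcd6, then read or write its value at 0xcd7 (register 0x72, bit 0, in this quirk). A hedged user-space sketch of that access pattern (the helper names are illustrative, not kernel API; needs iopl()/ioperm() privileges):

#include <sys/io.h>	/* inb()/outb() */

#define SB_PM_INDEX	0xcd6
#define SB_PM_DATA	0xcd7

static unsigned char sb_pm_read(unsigned char reg)
{
	outb(reg, SB_PM_INDEX);		/* select the PM register */
	return inb(SB_PM_DATA);		/* read its value */
}

static void sb_pm_write(unsigned char reg, unsigned char val)
{
	outb(reg, SB_PM_INDEX);
	outb(val, SB_PM_DATA);
}

/* The quirk above sets bit 0 of PM register 0x72 and re-reads it to confirm. */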
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f8a62160e151..f4c93f1cfc19 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off);
29 29
30static const struct desc_ptr no_idt = {}; 30static const struct desc_ptr no_idt = {};
31static int reboot_mode; 31static int reboot_mode;
32enum reboot_type reboot_type = BOOT_KBD; 32/*
33 * Keyboard reset and triple fault may result in INIT, not RESET, which
34 * doesn't work when we're in vmx root mode. Try ACPI first.
35 */
36enum reboot_type reboot_type = BOOT_ACPI;
33int reboot_force; 37int reboot_force;
34 38
35#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 39#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -177,6 +181,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
177 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), 181 DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"),
178 }, 182 },
179 }, 183 },
184 { /* Handle problems with rebooting on Dell T5400's */
185 .callback = set_bios_reboot,
186 .ident = "Dell Precision T5400",
187 .matches = {
188 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
189 DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
190 },
191 },
180 { /* Handle problems with rebooting on HP laptops */ 192 { /* Handle problems with rebooting on HP laptops */
181 .callback = set_bios_reboot, 193 .callback = set_bios_reboot,
182 .ident = "HP Compaq Laptop", 194 .ident = "HP Compaq Laptop",
@@ -403,10 +415,9 @@ void native_machine_shutdown(void)
403{ 415{
404 /* Stop the cpus and apics */ 416 /* Stop the cpus and apics */
405#ifdef CONFIG_SMP 417#ifdef CONFIG_SMP
406 int reboot_cpu_id;
407 418
408 /* The boot cpu is always logical cpu 0 */ 419 /* The boot cpu is always logical cpu 0 */
409 reboot_cpu_id = 0; 420 int reboot_cpu_id = 0;
410 421
411#ifdef CONFIG_X86_32 422#ifdef CONFIG_X86_32
412 /* See if there has been given a command line override */ 423 /* See if there has been given a command line override */
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index c30fe25d470d..6f50664b2ba5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -20,11 +20,45 @@
20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) 20#define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY)
21#define PAE_PGD_ATTR (_PAGE_PRESENT) 21#define PAE_PGD_ATTR (_PAGE_PRESENT)
22 22
23/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE
24 * ~ control_page + PAGE_SIZE are used as data storage and stack for
25 * jumping back
26 */
27#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset))
28
29/* Minimal CPU state */
30#define ESP DATA(0x0)
31#define CR0 DATA(0x4)
32#define CR3 DATA(0x8)
33#define CR4 DATA(0xc)
34
35/* other data */
36#define CP_VA_CONTROL_PAGE DATA(0x10)
37#define CP_PA_PGD DATA(0x14)
38#define CP_PA_SWAP_PAGE DATA(0x18)
39#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c)
40
23 .text 41 .text
24 .align PAGE_SIZE 42 .align PAGE_SIZE
25 .globl relocate_kernel 43 .globl relocate_kernel
26relocate_kernel: 44relocate_kernel:
27 movl 8(%esp), %ebp /* list of pages */ 45 /* Save the CPU context, used for jumping back */
46
47 pushl %ebx
48 pushl %esi
49 pushl %edi
50 pushl %ebp
51 pushf
52
53 movl 20+8(%esp), %ebp /* list of pages */
54 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
55 movl %esp, ESP(%edi)
56 movl %cr0, %eax
57 movl %eax, CR0(%edi)
58 movl %cr3, %eax
59 movl %eax, CR3(%edi)
60 movl %cr4, %eax
61 movl %eax, CR4(%edi)
28 62
29#ifdef CONFIG_X86_PAE 63#ifdef CONFIG_X86_PAE
30 /* map the control page at its virtual address */ 64 /* map the control page at its virtual address */
@@ -138,15 +172,25 @@ relocate_kernel:
138 172
139relocate_new_kernel: 173relocate_new_kernel:
140 /* read the arguments and say goodbye to the stack */ 174 /* read the arguments and say goodbye to the stack */
141 movl 4(%esp), %ebx /* page_list */ 175 movl 20+4(%esp), %ebx /* page_list */
142 movl 8(%esp), %ebp /* list of pages */ 176 movl 20+8(%esp), %ebp /* list of pages */
143 movl 12(%esp), %edx /* start address */ 177 movl 20+12(%esp), %edx /* start address */
144 movl 16(%esp), %ecx /* cpu_has_pae */ 178 movl 20+16(%esp), %ecx /* cpu_has_pae */
179 movl 20+20(%esp), %esi /* preserve_context */
145 180
146 /* zero out flags, and disable interrupts */ 181 /* zero out flags, and disable interrupts */
147 pushl $0 182 pushl $0
148 popfl 183 popfl
149 184
185 /* save some information for jumping back */
186 movl PTR(VA_CONTROL_PAGE)(%ebp), %edi
187 movl %edi, CP_VA_CONTROL_PAGE(%edi)
188 movl PTR(PA_PGD)(%ebp), %eax
189 movl %eax, CP_PA_PGD(%edi)
190 movl PTR(PA_SWAP_PAGE)(%ebp), %eax
191 movl %eax, CP_PA_SWAP_PAGE(%edi)
192 movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi)
193
150 /* get physical address of control page now */ 194 /* get physical address of control page now */
151 /* this is impossible after page table switch */ 195 /* this is impossible after page table switch */
152 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi 196 movl PTR(PA_CONTROL_PAGE)(%ebp), %edi
@@ -197,8 +241,90 @@ identity_mapped:
197 xorl %eax, %eax 241 xorl %eax, %eax
198 movl %eax, %cr3 242 movl %eax, %cr3
199 243
244 movl CP_PA_SWAP_PAGE(%edi), %eax
245 pushl %eax
246 pushl %ebx
247 call swap_pages
248 addl $8, %esp
249
250 /* To be certain of avoiding problems with self-modifying code
251 * I need to execute a serializing instruction here.
252 * So I flush the TLB, it's handy, and not processor dependent.
253 */
254 xorl %eax, %eax
255 movl %eax, %cr3
256
257 /* set all of the registers to known values */
258 /* leave %esp alone */
259
260 testl %esi, %esi
261 jnz 1f
262 xorl %edi, %edi
263 xorl %eax, %eax
264 xorl %ebx, %ebx
265 xorl %ecx, %ecx
266 xorl %edx, %edx
267 xorl %esi, %esi
268 xorl %ebp, %ebp
269 ret
2701:
271 popl %edx
272 movl CP_PA_SWAP_PAGE(%edi), %esp
273 addl $PAGE_SIZE, %esp
2742:
275 call *%edx
276
277 /* get the re-entry point of the peer system */
278 movl 0(%esp), %ebp
279 call 1f
2801:
281 popl %ebx
282 subl $(1b - relocate_kernel), %ebx
283 movl CP_VA_CONTROL_PAGE(%ebx), %edi
284 lea PAGE_SIZE(%ebx), %esp
285 movl CP_PA_SWAP_PAGE(%ebx), %eax
286 movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx
287 pushl %eax
288 pushl %edx
289 call swap_pages
290 addl $8, %esp
291 movl CP_PA_PGD(%ebx), %eax
292 movl %eax, %cr3
293 movl %cr0, %eax
294 orl $(1<<31), %eax
295 movl %eax, %cr0
296 lea PAGE_SIZE(%edi), %esp
297 movl %edi, %eax
298 addl $(virtual_mapped - relocate_kernel), %eax
299 pushl %eax
300 ret
301
302virtual_mapped:
303 movl CR4(%edi), %eax
304 movl %eax, %cr4
305 movl CR3(%edi), %eax
306 movl %eax, %cr3
307 movl CR0(%edi), %eax
308 movl %eax, %cr0
309 movl ESP(%edi), %esp
310 movl %ebp, %eax
311
312 popf
313 popl %ebp
314 popl %edi
315 popl %esi
316 popl %ebx
317 ret
318
200 /* Do the copies */ 319 /* Do the copies */
201 movl %ebx, %ecx 320swap_pages:
321 movl 8(%esp), %edx
322 movl 4(%esp), %ecx
323 pushl %ebp
324 pushl %ebx
325 pushl %edi
326 pushl %esi
327 movl %ecx, %ebx
202 jmp 1f 328 jmp 1f
203 329
2040: /* top, read another word from the indirection page */ 3300: /* top, read another word from the indirection page */
@@ -226,27 +352,31 @@ identity_mapped:
226 movl %ecx, %esi /* For every source page do a copy */ 352 movl %ecx, %esi /* For every source page do a copy */
227 andl $0xfffff000, %esi 353 andl $0xfffff000, %esi
228 354
355 movl %edi, %eax
356 movl %esi, %ebp
357
358 movl %edx, %edi
229 movl $1024, %ecx 359 movl $1024, %ecx
230 rep ; movsl 360 rep ; movsl
231 jmp 0b
232
2333:
234 361
235 /* To be certain of avoiding problems with self-modifying code 362 movl %ebp, %edi
236 * I need to execute a serializing instruction here. 363 movl %eax, %esi
237 * So I flush the TLB, it's handy, and not processor dependent. 364 movl $1024, %ecx
238 */ 365 rep ; movsl
239 xorl %eax, %eax
240 movl %eax, %cr3
241 366
242 /* set all of the registers to known values */ 367 movl %eax, %edi
243 /* leave %esp alone */ 368 movl %edx, %esi
369 movl $1024, %ecx
370 rep ; movsl
244 371
245 xorl %eax, %eax 372 lea PAGE_SIZE(%ebp), %esi
246 xorl %ebx, %ebx 373 jmp 0b
247 xorl %ecx, %ecx 3743:
248 xorl %edx, %edx 375 popl %esi
249 xorl %esi, %esi 376 popl %edi
250 xorl %edi, %edi 377 popl %ebx
251 xorl %ebp, %ebp 378 popl %ebp
252 ret 379 ret
380
381 .globl kexec_control_code_size
382.set kexec_control_code_size, . - relocate_kernel
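When preserve_context is in use, swap_pages exchanges each destination page with its source page through the dedicated swap page in three rep movsl copies. In C terms the per-page effect is roughly the following sketch (the real routine runs identity-mapped, with paging and stack constraints the C version ignores):

#include <string.h>

#define PAGE_SIZE	4096

/* Exchange the contents of @dst and @src, using @swap as scratch space. */
static void swap_one_page(void *dst, void *src, void *swap)
{
	memcpy(swap, src, PAGE_SIZE);	/* swap        <- old source      */
	memcpy(src, dst, PAGE_SIZE);	/* source      <- old destination */
	memcpy(dst, swap, PAGE_SIZE);	/* destination <- old source      */
}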
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 05191bbc68b8..dd6f2b71561b 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
52 52
53 cmos_minutes = CMOS_READ(RTC_MINUTES); 53 cmos_minutes = CMOS_READ(RTC_MINUTES);
54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) 54 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
55 BCD_TO_BIN(cmos_minutes); 55 cmos_minutes = bcd2bin(cmos_minutes);
56 56
57 /* 57 /*
58 * since we're only adjusting minutes and seconds, 58 * since we're only adjusting minutes and seconds,
@@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime)
69 69
70 if (abs(real_minutes - cmos_minutes) < 30) { 70 if (abs(real_minutes - cmos_minutes) < 30) {
71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { 71 if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) {
72 BIN_TO_BCD(real_seconds); 72 real_seconds = bin2bcd(real_seconds);
73 BIN_TO_BCD(real_minutes); 73 real_minutes = bin2bcd(real_minutes);
74 } 74 }
75 CMOS_WRITE(real_seconds,RTC_SECONDS); 75 CMOS_WRITE(real_seconds,RTC_SECONDS);
76 CMOS_WRITE(real_minutes,RTC_MINUTES); 76 CMOS_WRITE(real_minutes,RTC_MINUTES);
@@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void)
124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); 124 WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
125 125
126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { 126 if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
127 BCD_TO_BIN(sec); 127 sec = bcd2bin(sec);
128 BCD_TO_BIN(min); 128 min = bcd2bin(min);
129 BCD_TO_BIN(hour); 129 hour = bcd2bin(hour);
130 BCD_TO_BIN(day); 130 day = bcd2bin(day);
131 BCD_TO_BIN(mon); 131 mon = bcd2bin(mon);
132 BCD_TO_BIN(year); 132 year = bcd2bin(year);
133 } 133 }
134 134
135 if (century) { 135 if (century) {
136 BCD_TO_BIN(century); 136 century = bcd2bin(century);
137 year += century * 100; 137 year += century * 100;
138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); 138 printk(KERN_INFO "Extended CMOS year: %d\n", century * 100);
139 } else 139 } else
@@ -223,11 +223,25 @@ static struct platform_device rtc_device = {
223static __init int add_rtc_cmos(void) 223static __init int add_rtc_cmos(void)
224{ 224{
225#ifdef CONFIG_PNP 225#ifdef CONFIG_PNP
226 if (!pnp_platform_devices) 226 static const char *ids[] __initconst =
227 platform_device_register(&rtc_device); 227 { "PNP0b00", "PNP0b01", "PNP0b02", };
228#else 228 struct pnp_dev *dev;
229 struct pnp_id *id;
230 int i;
231
232 pnp_for_each_dev(dev) {
233 for (id = dev->id; id; id = id->next) {
234 for (i = 0; i < ARRAY_SIZE(ids); i++) {
235 if (compare_pnp_id(id, ids[i]) != 0)
236 return 0;
237 }
238 }
239 }
240#endif
241
229 platform_device_register(&rtc_device); 242 platform_device_register(&rtc_device);
230#endif /* CONFIG_PNP */ 243 dev_info(&rtc_device.dev,
244 "registered platform RTC device (no PNP device found)\n");
231 return 0; 245 return 0;
232} 246}
233device_initcall(add_rtc_cmos); 247device_initcall(add_rtc_cmos);
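The BCD_TO_BIN()/BIN_TO_BCD() macros are replaced by bcd2bin()/bin2bcd(), which return the converted value instead of rewriting their argument in place. The conversions amount to the usual nibble arithmetic, roughly (helpers renamed here to avoid clashing with <linux/bcd.h>):

static inline unsigned int my_bcd2bin(unsigned char val)
{
	return (val & 0x0f) + (val >> 4) * 10;	/* low nibble + 10 * high nibble */
}

static inline unsigned char my_bin2bcd(unsigned int val)
{
	return ((val / 10) << 4) + val % 10;	/* tens in high nibble, units in low */
}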
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 531b55b8e81a..0fa6790c1dd3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -57,12 +57,8 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/user.h> 58#include <linux/user.h>
59#include <linux/delay.h> 59#include <linux/delay.h>
60#include <linux/highmem.h>
61 60
62#include <linux/kallsyms.h> 61#include <linux/kallsyms.h>
63#include <linux/edd.h>
64#include <linux/iscsi_ibft.h>
65#include <linux/kexec.h>
66#include <linux/cpufreq.h> 62#include <linux/cpufreq.h>
67#include <linux/dma-mapping.h> 63#include <linux/dma-mapping.h>
68#include <linux/ctype.h> 64#include <linux/ctype.h>
@@ -96,7 +92,7 @@
96#include <asm/smp.h> 92#include <asm/smp.h>
97#include <asm/desc.h> 93#include <asm/desc.h>
98#include <asm/dma.h> 94#include <asm/dma.h>
99#include <asm/gart.h> 95#include <asm/iommu.h>
100#include <asm/mmu_context.h> 96#include <asm/mmu_context.h>
101#include <asm/proto.h> 97#include <asm/proto.h>
102 98
@@ -104,7 +100,6 @@
104#include <asm/paravirt.h> 100#include <asm/paravirt.h>
105 101
106#include <asm/percpu.h> 102#include <asm/percpu.h>
107#include <asm/sections.h>
108#include <asm/topology.h> 103#include <asm/topology.h>
109#include <asm/apicdef.h> 104#include <asm/apicdef.h>
110#ifdef CONFIG_X86_64 105#ifdef CONFIG_X86_64
@@ -228,6 +223,9 @@ unsigned long saved_video_mode;
228#define RAMDISK_LOAD_FLAG 0x4000 223#define RAMDISK_LOAD_FLAG 0x4000
229 224
230static char __initdata command_line[COMMAND_LINE_SIZE]; 225static char __initdata command_line[COMMAND_LINE_SIZE];
226#ifdef CONFIG_CMDLINE_BOOL
227static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE;
228#endif
231 229
232#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) 230#if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE)
233struct edd edd; 231struct edd edd;
@@ -304,7 +302,7 @@ static void __init relocate_initrd(void)
304 if (clen > MAX_MAP_CHUNK-slop) 302 if (clen > MAX_MAP_CHUNK-slop)
305 clen = MAX_MAP_CHUNK-slop; 303 clen = MAX_MAP_CHUNK-slop;
306 mapaddr = ramdisk_image & PAGE_MASK; 304 mapaddr = ramdisk_image & PAGE_MASK;
307 p = early_ioremap(mapaddr, clen+slop); 305 p = early_memremap(mapaddr, clen+slop);
308 memcpy(q, p+slop, clen); 306 memcpy(q, p+slop, clen);
309 early_iounmap(p, clen+slop); 307 early_iounmap(p, clen+slop);
310 q += clen; 308 q += clen;
@@ -381,7 +379,7 @@ static void __init parse_setup_data(void)
381 return; 379 return;
382 pa_data = boot_params.hdr.setup_data; 380 pa_data = boot_params.hdr.setup_data;
383 while (pa_data) { 381 while (pa_data) {
384 data = early_ioremap(pa_data, PAGE_SIZE); 382 data = early_memremap(pa_data, PAGE_SIZE);
385 switch (data->type) { 383 switch (data->type) {
386 case SETUP_E820_EXT: 384 case SETUP_E820_EXT:
387 parse_e820_ext(data, pa_data); 385 parse_e820_ext(data, pa_data);
@@ -404,7 +402,7 @@ static void __init e820_reserve_setup_data(void)
404 return; 402 return;
405 pa_data = boot_params.hdr.setup_data; 403 pa_data = boot_params.hdr.setup_data;
406 while (pa_data) { 404 while (pa_data) {
407 data = early_ioremap(pa_data, sizeof(*data)); 405 data = early_memremap(pa_data, sizeof(*data));
408 e820_update_range(pa_data, sizeof(*data)+data->len, 406 e820_update_range(pa_data, sizeof(*data)+data->len,
409 E820_RAM, E820_RESERVED_KERN); 407 E820_RAM, E820_RESERVED_KERN);
410 found = 1; 408 found = 1;
@@ -430,7 +428,7 @@ static void __init reserve_early_setup_data(void)
430 return; 428 return;
431 pa_data = boot_params.hdr.setup_data; 429 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 430 while (pa_data) {
433 data = early_ioremap(pa_data, sizeof(*data)); 431 data = early_memremap(pa_data, sizeof(*data));
434 sprintf(buf, "setup data %x", data->type); 432 sprintf(buf, "setup data %x", data->type);
435 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 433 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf);
436 pa_data = data->next; 434 pa_data = data->next;
@@ -450,7 +448,7 @@ static void __init reserve_early_setup_data(void)
450 * @size: Size of the crashkernel memory to reserve. 448 * @size: Size of the crashkernel memory to reserve.
451 * Returns the base address on success, and -1ULL on failure. 449 * Returns the base address on success, and -1ULL on failure.
452 */ 450 */
453unsigned long long find_and_reserve_crashkernel(unsigned long long size) 451unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
454{ 452{
455 const unsigned long long alignment = 16<<20; /* 16M */ 453 const unsigned long long alignment = 16<<20; /* 16M */
456 unsigned long long start = 0LL; 454 unsigned long long start = 0LL;
@@ -563,7 +561,13 @@ static void __init reserve_standard_io_resources(void)
563 561
564} 562}
565 563
566#ifdef CONFIG_PROC_VMCORE 564/*
565 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
566 * is_kdump_kernel() to determine if we are booting after a panic. Hence
567 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
568 */
569
570#ifdef CONFIG_CRASH_DUMP
567/* elfcorehdr= specifies the location of elf core header 571/* elfcorehdr= specifies the location of elf core header
568 * stored by the crashed kernel. This option will be passed 572 * stored by the crashed kernel. This option will be passed
569 * by kexec loader to the capture kernel. 573 * by kexec loader to the capture kernel.
@@ -579,6 +583,194 @@ static int __init setup_elfcorehdr(char *arg)
579early_param("elfcorehdr", setup_elfcorehdr); 583early_param("elfcorehdr", setup_elfcorehdr);
580#endif 584#endif
581 585
586static struct x86_quirks default_x86_quirks __initdata;
587
588struct x86_quirks *x86_quirks __initdata = &default_x86_quirks;
589
590/*
591 * Some BIOSes seem to corrupt the low 64k of memory during events
592 * like suspend/resume and unplugging an HDMI cable. Reserve all
593 * remaining free memory in that area and fill it with a distinct
594 * pattern.
595 */
596#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
597#define MAX_SCAN_AREAS 8
598
599static int __read_mostly memory_corruption_check = -1;
600
601static unsigned __read_mostly corruption_check_size = 64*1024;
602static unsigned __read_mostly corruption_check_period = 60; /* seconds */
603
604static struct e820entry scan_areas[MAX_SCAN_AREAS];
605static int num_scan_areas;
606
607
608static int set_corruption_check(char *arg)
609{
610 char *end;
611
612 memory_corruption_check = simple_strtol(arg, &end, 10);
613
614 return (*end == 0) ? 0 : -EINVAL;
615}
616early_param("memory_corruption_check", set_corruption_check);
617
618static int set_corruption_check_period(char *arg)
619{
620 char *end;
621
622 corruption_check_period = simple_strtoul(arg, &end, 10);
623
624 return (*end == 0) ? 0 : -EINVAL;
625}
626early_param("memory_corruption_check_period", set_corruption_check_period);
627
628static int set_corruption_check_size(char *arg)
629{
630 char *end;
631 unsigned size;
632
633 size = memparse(arg, &end);
634
635 if (*end == '\0')
636 corruption_check_size = size;
637
638 return (size == corruption_check_size) ? 0 : -EINVAL;
639}
640early_param("memory_corruption_check_size", set_corruption_check_size);
641
642
643static void __init setup_bios_corruption_check(void)
644{
645 u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */
646
647 if (memory_corruption_check == -1) {
648 memory_corruption_check =
649#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
650 1
651#else
652 0
653#endif
654 ;
655 }
656
657 if (corruption_check_size == 0)
658 memory_corruption_check = 0;
659
660 if (!memory_corruption_check)
661 return;
662
663 corruption_check_size = round_up(corruption_check_size, PAGE_SIZE);
664
665 while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
666 u64 size;
667 addr = find_e820_area_size(addr, &size, PAGE_SIZE);
668
669 if (addr == 0)
670 break;
671
672 if ((addr + size) > corruption_check_size)
673 size = corruption_check_size - addr;
674
675 if (size == 0)
676 break;
677
678 e820_update_range(addr, size, E820_RAM, E820_RESERVED);
679 scan_areas[num_scan_areas].addr = addr;
680 scan_areas[num_scan_areas].size = size;
681 num_scan_areas++;
682
683 /* Assume we've already mapped this early memory */
684 memset(__va(addr), 0, size);
685
686 addr += size;
687 }
688
689 printk(KERN_INFO "Scanning %d areas for low memory corruption\n",
690 num_scan_areas);
691 update_e820();
692}
693
694static struct timer_list periodic_check_timer;
695
696void check_for_bios_corruption(void)
697{
698 int i;
699 int corruption = 0;
700
701 if (!memory_corruption_check)
702 return;
703
704 for(i = 0; i < num_scan_areas; i++) {
705 unsigned long *addr = __va(scan_areas[i].addr);
706 unsigned long size = scan_areas[i].size;
707
708 for(; size; addr++, size -= sizeof(unsigned long)) {
709 if (!*addr)
710 continue;
711 printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n",
712 addr, __pa(addr), *addr);
713 corruption = 1;
714 *addr = 0;
715 }
716 }
717
718 WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n");
719}
720
721static void periodic_check_for_corruption(unsigned long data)
722{
723 check_for_bios_corruption();
724 mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ));
725}
726
727void start_periodic_check_for_corruption(void)
728{
729 if (!memory_corruption_check || corruption_check_period == 0)
730 return;
731
732 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
733 corruption_check_period);
734
735 init_timer(&periodic_check_timer);
736 periodic_check_timer.function = &periodic_check_for_corruption;
737 periodic_check_for_corruption(0);
738}
739#endif
740
741static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
742{
743 printk(KERN_NOTICE
744 "%s detected: BIOS may corrupt low RAM, working it around.\n",
745 d->ident);
746
747 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
748 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
749
750 return 0;
751}
752
753/* List of systems that have known low memory corruption BIOS problems */
754static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
755#ifdef CONFIG_X86_RESERVE_LOW_64K
756 {
757 .callback = dmi_low_memory_corruption,
758 .ident = "AMI BIOS",
759 .matches = {
760 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
761 },
762 },
763 {
764 .callback = dmi_low_memory_corruption,
765 .ident = "Phoenix BIOS",
766 .matches = {
767 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"),
768 },
769 },
770#endif
771 {}
772};
773
582/* 774/*
583 * Determine if we were loaded by an EFI loader. If so, then we have also been 775 * Determine if we were loaded by an EFI loader. If so, then we have also been
584 * passed the efi memmap, systab, etc., so we should use these data structures 776 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -598,11 +790,11 @@ void __init setup_arch(char **cmdline_p)
598 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 790 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
599 visws_early_detect(); 791 visws_early_detect();
600 pre_setup_arch_hook(); 792 pre_setup_arch_hook();
601 early_cpu_init();
602#else 793#else
603 printk(KERN_INFO "Command line: %s\n", boot_command_line); 794 printk(KERN_INFO "Command line: %s\n", boot_command_line);
604#endif 795#endif
605 796
797 early_cpu_init();
606 early_ioremap_init(); 798 early_ioremap_init();
607 799
608 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); 800 ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
@@ -666,14 +858,36 @@ void __init setup_arch(char **cmdline_p)
666 bss_resource.start = virt_to_phys(&__bss_start); 858 bss_resource.start = virt_to_phys(&__bss_start);
667 bss_resource.end = virt_to_phys(&__bss_stop)-1; 859 bss_resource.end = virt_to_phys(&__bss_stop)-1;
668 860
669#ifdef CONFIG_X86_64 861#ifdef CONFIG_CMDLINE_BOOL
670 early_cpu_init(); 862#ifdef CONFIG_CMDLINE_OVERRIDE
863 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
864#else
865 if (builtin_cmdline[0]) {
866 /* append boot loader cmdline to builtin */
867 strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE);
868 strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE);
869 strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE);
870 }
671#endif 871#endif
872#endif
873
672 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 874 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
673 *cmdline_p = command_line; 875 *cmdline_p = command_line;
674 876
675 parse_early_param(); 877 parse_early_param();
676 878
879#ifdef CONFIG_X86_64
880 check_efer();
881#endif
882
883#if defined(CONFIG_VMI) && defined(CONFIG_X86_32)
884 /*
885 * Must be before kernel pagetables are setup
886 * or fixmap area is touched.
887 */
888 vmi_init();
889#endif
890
677 /* after early param, so could get panic from serial */ 891 /* after early param, so could get panic from serial */
678 reserve_early_setup_data(); 892 reserve_early_setup_data();
679 893
@@ -681,7 +895,7 @@ void __init setup_arch(char **cmdline_p)
681#ifdef CONFIG_X86_LOCAL_APIC 895#ifdef CONFIG_X86_LOCAL_APIC
682 disable_apic = 1; 896 disable_apic = 1;
683#endif 897#endif
684 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); 898 setup_clear_cpu_cap(X86_FEATURE_APIC);
685 } 899 }
686 900
687#ifdef CONFIG_PCI 901#ifdef CONFIG_PCI
@@ -691,6 +905,10 @@ void __init setup_arch(char **cmdline_p)
691 905
692 finish_e820_parsing(); 906 finish_e820_parsing();
693 907
908 dmi_scan_machine();
909
910 dmi_check_system(bad_bios_dmi_table);
911
694#ifdef CONFIG_X86_32 912#ifdef CONFIG_X86_32
695 probe_roms(); 913 probe_roms();
696#endif 914#endif
@@ -734,7 +952,8 @@ void __init setup_arch(char **cmdline_p)
734#else 952#else
735 num_physpages = max_pfn; 953 num_physpages = max_pfn;
736 954
737 check_efer(); 955 if (cpu_has_x2apic)
956 check_x2apic();
738 957
739 /* How many end-of-memory variables you have, grandma! */ 958 /* How many end-of-memory variables you have, grandma! */
740 /* need this before calling reserve_initrd */ 959 /* need this before calling reserve_initrd */
@@ -746,6 +965,10 @@ void __init setup_arch(char **cmdline_p)
746 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 965 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
747#endif 966#endif
748 967
968#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
969 setup_bios_corruption_check();
970#endif
971
749 /* max_pfn_mapped is updated here */ 972 /* max_pfn_mapped is updated here */
750 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT); 973 max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<<PAGE_SHIFT);
751 max_pfn_mapped = max_low_pfn_mapped; 974 max_pfn_mapped = max_low_pfn_mapped;
@@ -774,8 +997,6 @@ void __init setup_arch(char **cmdline_p)
774 vsmp_init(); 997 vsmp_init();
775#endif 998#endif
776 999
777 dmi_scan_machine();
778
779 io_delay_init(); 1000 io_delay_init();
780 1001
781 /* 1002 /*
@@ -783,6 +1004,8 @@ void __init setup_arch(char **cmdline_p)
783 */ 1004 */
784 acpi_boot_table_init(); 1005 acpi_boot_table_init();
785 1006
1007 early_acpi_boot_init();
1008
786#ifdef CONFIG_ACPI_NUMA 1009#ifdef CONFIG_ACPI_NUMA
787 /* 1010 /*
788 * Parse SRAT to discover nodes. 1011 * Parse SRAT to discover nodes.
@@ -792,10 +1015,6 @@ void __init setup_arch(char **cmdline_p)
792 1015
793 initmem_init(0, max_pfn); 1016 initmem_init(0, max_pfn);
794 1017
795#ifdef CONFIG_X86_64
796 dma32_reserve_bootmem();
797#endif
798
799#ifdef CONFIG_ACPI_SLEEP 1018#ifdef CONFIG_ACPI_SLEEP
800 /* 1019 /*
801 * Reserve low memory region for sleep support. 1020 * Reserve low memory region for sleep support.
@@ -810,21 +1029,25 @@ void __init setup_arch(char **cmdline_p)
810#endif 1029#endif
811 reserve_crashkernel(); 1030 reserve_crashkernel();
812 1031
1032#ifdef CONFIG_X86_64
1033 /*
1034 * dma32_reserve_bootmem() allocates bootmem which may conflict
1035 * with the crashkernel command line, so do that after
1036 * reserve_crashkernel()
1037 */
1038 dma32_reserve_bootmem();
1039#endif
1040
813 reserve_ibft_region(); 1041 reserve_ibft_region();
814 1042
815#ifdef CONFIG_KVM_CLOCK 1043#ifdef CONFIG_KVM_CLOCK
816 kvmclock_init(); 1044 kvmclock_init();
817#endif 1045#endif
818 1046
819#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) 1047 paravirt_pagetable_setup_start(swapper_pg_dir);
820 /*
821 * Must be after max_low_pfn is determined, and before kernel
822 * pagetables are setup.
823 */
824 vmi_init();
825#endif
826
827 paging_init(); 1048 paging_init();
1049 paravirt_pagetable_setup_done(swapper_pg_dir);
1050 paravirt_post_allocator_init();
828 1051
829#ifdef CONFIG_X86_64 1052#ifdef CONFIG_X86_64
830 map_vsyscall(); 1053 map_vsyscall();
@@ -850,27 +1073,17 @@ void __init setup_arch(char **cmdline_p)
850#endif 1073#endif
851 1074
852 prefill_possible_map(); 1075 prefill_possible_map();
1076
853#ifdef CONFIG_X86_64 1077#ifdef CONFIG_X86_64
854 init_cpu_to_node(); 1078 init_cpu_to_node();
855#endif 1079#endif
856 1080
857#ifdef CONFIG_X86_NUMAQ
858 /*
859 * need to check online nodes num, call it
860 * here before time_init/tsc_init
861 */
862 numaq_tsc_disable();
863#endif
864
865 init_apic_mappings(); 1081 init_apic_mappings();
866 ioapic_init_mappings(); 1082 ioapic_init_mappings();
867 1083
 868#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)	1084	/* need to wait until the io_apic is mapped */
869 if (def_to_bigsmp) 1085 nr_irqs = probe_nr_irqs();
870 printk(KERN_WARNING "More than 8 CPUs detected and " 1086
871 "CONFIG_X86_PC cannot handle it.\nUse "
872 "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n");
873#endif
874 kvm_guest_init(); 1087 kvm_guest_init();
875 1088
876 e820_reserve_resources(); 1089 e820_reserve_resources();
@@ -892,3 +1105,5 @@ void __init setup_arch(char **cmdline_p)
892#endif 1105#endif
893#endif 1106#endif
894} 1107}
1108
1109
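The corruption check added above reserves the free page-sized chunks below corruption_check_size, zero-fills them, and periodically verifies that they are still zero, re-arming any word that was dirtied. A user-space analog of one scan pass (hypothetical area/bytes arguments, illustration only):

#include <stdio.h>

static int scan_for_corruption(unsigned long *area, unsigned long bytes)
{
	int corrupted = 0;

	for (; bytes; area++, bytes -= sizeof(*area)) {
		if (!*area)
			continue;
		printf("corrupted word at %p = %08lx\n", (void *)area, *area);
		*area = 0;	/* re-arm the canary, as check_for_bios_corruption() does */
		corrupted = 1;
	}
	return corrupted;
}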
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index cac68430d31f..ae0c0d3bb770 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -80,24 +80,6 @@ static void __init setup_per_cpu_maps(void)
80#endif 80#endif
81} 81}
82 82
83#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP
84cpumask_t *cpumask_of_cpu_map __read_mostly;
85EXPORT_SYMBOL(cpumask_of_cpu_map);
86
87/* requires nr_cpu_ids to be initialized */
88static void __init setup_cpumask_of_cpu(void)
89{
90 int i;
91
92 /* alloc_bootmem zeroes memory */
93 cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids);
94 for (i = 0; i < nr_cpu_ids; i++)
95 cpu_set(i, cpumask_of_cpu_map[i]);
96}
97#else
98static inline void setup_cpumask_of_cpu(void) { }
99#endif
100
101#ifdef CONFIG_X86_32 83#ifdef CONFIG_X86_32
102/* 84/*
103 * Great future not-so-futuristic plan: make i386 and x86_64 do it 85 * Great future not-so-futuristic plan: make i386 and x86_64 do it
@@ -158,35 +140,47 @@ static void __init setup_cpu_pda_map(void)
158 */ 140 */
159void __init setup_per_cpu_areas(void) 141void __init setup_per_cpu_areas(void)
160{ 142{
161 ssize_t size = PERCPU_ENOUGH_ROOM; 143 ssize_t size, old_size;
162 char *ptr; 144 char *ptr;
163 int cpu; 145 int cpu;
146 unsigned long align = 1;
164 147
165 /* Setup cpu_pda map */ 148 /* Setup cpu_pda map */
166 setup_cpu_pda_map(); 149 setup_cpu_pda_map();
167 150
168 /* Copy section for each CPU (we discard the original) */ 151 /* Copy section for each CPU (we discard the original) */
169 size = PERCPU_ENOUGH_ROOM; 152 old_size = PERCPU_ENOUGH_ROOM;
153 align = max_t(unsigned long, PAGE_SIZE, align);
154 size = roundup(old_size, align);
170 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", 155 printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
171 size); 156 size);
172 157
173 for_each_possible_cpu(cpu) { 158 for_each_possible_cpu(cpu) {
174#ifndef CONFIG_NEED_MULTIPLE_NODES 159#ifndef CONFIG_NEED_MULTIPLE_NODES
175 ptr = alloc_bootmem_pages(size); 160 ptr = __alloc_bootmem(size, align,
161 __pa(MAX_DMA_ADDRESS));
176#else 162#else
177 int node = early_cpu_to_node(cpu); 163 int node = early_cpu_to_node(cpu);
178 if (!node_online(node) || !NODE_DATA(node)) { 164 if (!node_online(node) || !NODE_DATA(node)) {
179 ptr = alloc_bootmem_pages(size); 165 ptr = __alloc_bootmem(size, align,
166 __pa(MAX_DMA_ADDRESS));
180 printk(KERN_INFO 167 printk(KERN_INFO
181 "cpu %d has no node %d or node-local memory\n", 168 "cpu %d has no node %d or node-local memory\n",
182 cpu, node); 169 cpu, node);
170 if (ptr)
171 printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n",
172 cpu, __pa(ptr));
173 }
174 else {
175 ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
176 __pa(MAX_DMA_ADDRESS));
177 if (ptr)
178 printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
179 cpu, node, __pa(ptr));
183 } 180 }
184 else
185 ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
186#endif 181#endif
187 per_cpu_offset(cpu) = ptr - __per_cpu_start; 182 per_cpu_offset(cpu) = ptr - __per_cpu_start;
188 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); 183 memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start);
189
190 } 184 }
191 185
192 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", 186 printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n",
@@ -197,9 +191,6 @@ void __init setup_per_cpu_areas(void)
197 191
198 /* Setup node to cpumask map */ 192 /* Setup node to cpumask map */
199 setup_node_to_cpumask_map(); 193 setup_node_to_cpumask_map();
200
201 /* Setup cpumask_of_cpu map */
202 setup_cpumask_of_cpu();
203} 194}
204 195
205#endif 196#endif
@@ -227,8 +218,8 @@ static void __init setup_node_to_cpumask_map(void)
227 /* allocate the map */ 218 /* allocate the map */
228 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); 219 map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t));
229 220
230 Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", 221 pr_debug("Node to cpumask map at %p for %d nodes\n",
231 map, nr_node_ids); 222 map, nr_node_ids);
232 223
233 /* node_to_cpumask() will now work */ 224 /* node_to_cpumask() will now work */
234 node_to_cpumask_map = map; 225 node_to_cpumask_map = map;
@@ -248,7 +239,7 @@ void __cpuinit numa_set_node(int cpu, int node)
248 per_cpu(x86_cpu_to_node_map, cpu) = node; 239 per_cpu(x86_cpu_to_node_map, cpu) = node;
249 240
250 else 241 else
251 Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); 242 pr_debug("Setting node for non-present cpu %d\n", cpu);
252} 243}
253 244
254void __cpuinit numa_clear_node(int cpu) 245void __cpuinit numa_clear_node(int cpu)
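setup_per_cpu_areas() now rounds the per-cpu section size up to the allocation alignment (at least PAGE_SIZE) before handing it to the bootmem allocator. roundup() is the usual integer round-up to a multiple; as a sketch:

/* Round x up to the next multiple of align. */
#define ROUNDUP(x, align)	((((x) + (align) - 1) / (align)) * (align))

/* e.g. ROUNDUP(45000, 4096) == 45056, i.e. eleven 4 KiB pages. */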
diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h
index 72bbb519d2dc..cc673aa55ce4 100644
--- a/arch/x86/kernel/sigframe.h
+++ b/arch/x86/kernel/sigframe.h
@@ -3,9 +3,18 @@ struct sigframe {
3 char __user *pretcode; 3 char __user *pretcode;
4 int sig; 4 int sig;
5 struct sigcontext sc; 5 struct sigcontext sc;
6 struct _fpstate fpstate; 6 /*
7 * fpstate is unused. fpstate is moved/allocated after
  8 * retcode[] below. This movement allows the FP state and the
  9 * future state extensions (xsave) to stay together.
 10 * At the same time, retaining the unused fpstate keeps the offset of
 11 * extramask[] in the sigframe unchanged and thus prevents any legacy
 12 * application from accessing/modifying it.
13 */
14 struct _fpstate fpstate_unused;
7 unsigned long extramask[_NSIG_WORDS-1]; 15 unsigned long extramask[_NSIG_WORDS-1];
8 char retcode[8]; 16 char retcode[8];
17 /* fp state follows here */
9}; 18};
10 19
11struct rt_sigframe { 20struct rt_sigframe {
@@ -15,13 +24,19 @@ struct rt_sigframe {
15 void __user *puc; 24 void __user *puc;
16 struct siginfo info; 25 struct siginfo info;
17 struct ucontext uc; 26 struct ucontext uc;
18 struct _fpstate fpstate;
19 char retcode[8]; 27 char retcode[8];
28 /* fp state follows here */
20}; 29};
21#else 30#else
22struct rt_sigframe { 31struct rt_sigframe {
23 char __user *pretcode; 32 char __user *pretcode;
24 struct ucontext uc; 33 struct ucontext uc;
25 struct siginfo info; 34 struct siginfo info;
35 /* fp state follows here */
26}; 36};
37
38int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
39 sigset_t *set, struct pt_regs *regs);
40int ia32_setup_frame(int sig, struct k_sigaction *ka,
41 sigset_t *set, struct pt_regs *regs);
27#endif 42#endif
diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c
index d92373630963..d6dd057d0f22 100644
--- a/arch/x86/kernel/signal_32.c
+++ b/arch/x86/kernel/signal_32.c
@@ -17,6 +17,7 @@
17#include <linux/errno.h> 17#include <linux/errno.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/wait.h> 19#include <linux/wait.h>
20#include <linux/tracehook.h>
20#include <linux/elf.h> 21#include <linux/elf.h>
21#include <linux/smp.h> 22#include <linux/smp.h>
22#include <linux/mm.h> 23#include <linux/mm.h>
@@ -26,6 +27,8 @@
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27#include <asm/i387.h> 28#include <asm/i387.h>
28#include <asm/vdso.h> 29#include <asm/vdso.h>
30#include <asm/syscall.h>
31#include <asm/syscalls.h>
29 32
30#include "sigframe.h" 33#include "sigframe.h"
31 34
@@ -110,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx)
110 return do_sigaltstack(uss, uoss, regs->sp); 113 return do_sigaltstack(uss, uoss, regs->sp);
111} 114}
112 115
116#define COPY(x) { \
117 err |= __get_user(regs->x, &sc->x); \
118}
119
120#define COPY_SEG(seg) { \
121 unsigned short tmp; \
122 err |= __get_user(tmp, &sc->seg); \
123 regs->seg = tmp; \
124}
125
126#define COPY_SEG_STRICT(seg) { \
127 unsigned short tmp; \
128 err |= __get_user(tmp, &sc->seg); \
129 regs->seg = tmp | 3; \
130}
131
132#define GET_SEG(seg) { \
133 unsigned short tmp; \
134 err |= __get_user(tmp, &sc->seg); \
135 loadsegment(seg, tmp); \
136}
113 137
114/* 138/*
115 * Do a signal return; undo the signal stack. 139 * Do a signal return; undo the signal stack.
@@ -118,28 +142,13 @@ static int
118restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 142restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
119 unsigned long *pax) 143 unsigned long *pax)
120{ 144{
145 void __user *buf;
146 unsigned int tmpflags;
121 unsigned int err = 0; 147 unsigned int err = 0;
122 148
123 /* Always make any pending restarted system calls return -EINTR */ 149 /* Always make any pending restarted system calls return -EINTR */
124 current_thread_info()->restart_block.fn = do_no_restart_syscall; 150 current_thread_info()->restart_block.fn = do_no_restart_syscall;
125 151
126#define COPY(x) err |= __get_user(regs->x, &sc->x)
127
128#define COPY_SEG(seg) \
129 { unsigned short tmp; \
130 err |= __get_user(tmp, &sc->seg); \
131 regs->seg = tmp; }
132
133#define COPY_SEG_STRICT(seg) \
134 { unsigned short tmp; \
135 err |= __get_user(tmp, &sc->seg); \
136 regs->seg = tmp|3; }
137
138#define GET_SEG(seg) \
139 { unsigned short tmp; \
140 err |= __get_user(tmp, &sc->seg); \
141 loadsegment(seg, tmp); }
142
143 GET_SEG(gs); 152 GET_SEG(gs);
144 COPY_SEG(fs); 153 COPY_SEG(fs);
145 COPY_SEG(es); 154 COPY_SEG(es);
@@ -149,38 +158,15 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
149 COPY_SEG_STRICT(cs); 158 COPY_SEG_STRICT(cs);
150 COPY_SEG_STRICT(ss); 159 COPY_SEG_STRICT(ss);
151 160
152 { 161 err |= __get_user(tmpflags, &sc->flags);
153 unsigned int tmpflags; 162 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
154 163 regs->orig_ax = -1; /* disable syscall checks */
155 err |= __get_user(tmpflags, &sc->flags);
156 regs->flags = (regs->flags & ~FIX_EFLAGS) |
157 (tmpflags & FIX_EFLAGS);
158 regs->orig_ax = -1; /* disable syscall checks */
159 }
160 164
161 { 165 err |= __get_user(buf, &sc->fpstate);
162 struct _fpstate __user *buf; 166 err |= restore_i387_xstate(buf);
163
164 err |= __get_user(buf, &sc->fpstate);
165 if (buf) {
166 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
167 goto badframe;
168 err |= restore_i387(buf);
169 } else {
170 struct task_struct *me = current;
171
172 if (used_math()) {
173 clear_fpu(me);
174 clear_used_math();
175 }
176 }
177 }
178 167
179 err |= __get_user(*pax, &sc->ax); 168 err |= __get_user(*pax, &sc->ax);
180 return err; 169 return err;
181
182badframe:
183 return 1;
184} 170}
185 171
186asmlinkage unsigned long sys_sigreturn(unsigned long __unused) 172asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
@@ -212,7 +198,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused)
212 198
213badframe: 199badframe:
214 if (show_unhandled_signals && printk_ratelimit()) { 200 if (show_unhandled_signals && printk_ratelimit()) {
215 printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" 201 printk("%s%s[%d] bad frame in sigreturn frame:"
216 "%p ip:%lx sp:%lx oeax:%lx", 202 "%p ip:%lx sp:%lx oeax:%lx",
217 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, 203 task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG,
218 current->comm, task_pid_nr(current), frame, regs->ip, 204 current->comm, task_pid_nr(current), frame, regs->ip,
@@ -226,9 +212,8 @@ badframe:
226 return 0; 212 return 0;
227} 213}
228 214
229asmlinkage int sys_rt_sigreturn(unsigned long __unused) 215static long do_rt_sigreturn(struct pt_regs *regs)
230{ 216{
231 struct pt_regs *regs = (struct pt_regs *)&__unused;
232 struct rt_sigframe __user *frame; 217 struct rt_sigframe __user *frame;
233 unsigned long ax; 218 unsigned long ax;
234 sigset_t set; 219 sigset_t set;
@@ -254,15 +239,22 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused)
254 return ax; 239 return ax;
255 240
256badframe: 241badframe:
257 force_sig(SIGSEGV, current); 242 signal_fault(regs, frame, "rt_sigreturn");
258 return 0; 243 return 0;
259} 244}
260 245
246asmlinkage int sys_rt_sigreturn(unsigned long __unused)
247{
248 struct pt_regs *regs = (struct pt_regs *)&__unused;
249
250 return do_rt_sigreturn(regs);
251}
252
261/* 253/*
262 * Set up a signal frame. 254 * Set up a signal frame.
263 */ 255 */
264static int 256static int
265setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, 257setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
266 struct pt_regs *regs, unsigned long mask) 258 struct pt_regs *regs, unsigned long mask)
267{ 259{
268 int tmp, err = 0; 260 int tmp, err = 0;
@@ -289,7 +281,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
289 err |= __put_user(regs->sp, &sc->sp_at_signal); 281 err |= __put_user(regs->sp, &sc->sp_at_signal);
290 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); 282 err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss);
291 283
292 tmp = save_i387(fpstate); 284 tmp = save_i387_xstate(fpstate);
293 if (tmp < 0) 285 if (tmp < 0)
294 err = 1; 286 err = 1;
295 else 287 else
@@ -306,7 +298,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate,
306 * Determine which stack to use.. 298 * Determine which stack to use..
307 */ 299 */
308static inline void __user * 300static inline void __user *
309get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) 301get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,
302 void **fpstate)
310{ 303{
311 unsigned long sp; 304 unsigned long sp;
312 305
@@ -332,6 +325,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
332 sp = (unsigned long) ka->sa.sa_restorer; 325 sp = (unsigned long) ka->sa.sa_restorer;
333 } 326 }
334 327
328 if (used_math()) {
329 sp = sp - sig_xstate_size;
330 *fpstate = (struct _fpstate *) sp;
331 }
332
335 sp -= frame_size; 333 sp -= frame_size;
336 /* 334 /*
337 * Align the stack pointer according to the i386 ABI, 335 * Align the stack pointer according to the i386 ABI,
@@ -343,38 +341,29 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size)
343} 341}
344 342
345static int 343static int
346setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, 344__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
347 struct pt_regs *regs) 345 struct pt_regs *regs)
348{ 346{
349 struct sigframe __user *frame; 347 struct sigframe __user *frame;
350 void __user *restorer; 348 void __user *restorer;
351 int err = 0; 349 int err = 0;
352 int usig; 350 void __user *fpstate = NULL;
353 351
354 frame = get_sigframe(ka, regs, sizeof(*frame)); 352 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
355 353
356 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 354 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
357 goto give_sigsegv; 355 return -EFAULT;
358 356
359 usig = current_thread_info()->exec_domain 357 if (__put_user(sig, &frame->sig))
360 && current_thread_info()->exec_domain->signal_invmap 358 return -EFAULT;
361 && sig < 32
362 ? current_thread_info()->exec_domain->signal_invmap[sig]
363 : sig;
364 359
365 err = __put_user(usig, &frame->sig); 360 if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]))
366 if (err) 361 return -EFAULT;
367 goto give_sigsegv;
368
369 err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]);
370 if (err)
371 goto give_sigsegv;
372 362
373 if (_NSIG_WORDS > 1) { 363 if (_NSIG_WORDS > 1) {
374 err = __copy_to_user(&frame->extramask, &set->sig[1], 364 if (__copy_to_user(&frame->extramask, &set->sig[1],
375 sizeof(frame->extramask)); 365 sizeof(frame->extramask)))
376 if (err) 366 return -EFAULT;
377 goto give_sigsegv;
378 } 367 }
379 368
380 if (current->mm->context.vdso) 369 if (current->mm->context.vdso)
@@ -399,7 +388,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
399 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); 388 err |= __put_user(0x80cd, (short __user *)(frame->retcode+6));
400 389
401 if (err) 390 if (err)
402 goto give_sigsegv; 391 return -EFAULT;
403 392
404 /* Set up registers for signal handler */ 393 /* Set up registers for signal handler */
405 regs->sp = (unsigned long)frame; 394 regs->sp = (unsigned long)frame;
@@ -414,50 +403,43 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set,
414 regs->cs = __USER_CS; 403 regs->cs = __USER_CS;
415 404
416 return 0; 405 return 0;
417
418give_sigsegv:
419 force_sigsegv(sig, current);
420 return -EFAULT;
421} 406}
422 407
423static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 408static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
424 sigset_t *set, struct pt_regs *regs) 409 sigset_t *set, struct pt_regs *regs)
425{ 410{
426 struct rt_sigframe __user *frame; 411 struct rt_sigframe __user *frame;
427 void __user *restorer; 412 void __user *restorer;
428 int err = 0; 413 int err = 0;
429 int usig; 414 void __user *fpstate = NULL;
430 415
431 frame = get_sigframe(ka, regs, sizeof(*frame)); 416 frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate);
432 417
433 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 418 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
434 goto give_sigsegv; 419 return -EFAULT;
435
436 usig = current_thread_info()->exec_domain
437 && current_thread_info()->exec_domain->signal_invmap
438 && sig < 32
439 ? current_thread_info()->exec_domain->signal_invmap[sig]
440 : sig;
441 420
442 err |= __put_user(usig, &frame->sig); 421 err |= __put_user(sig, &frame->sig);
443 err |= __put_user(&frame->info, &frame->pinfo); 422 err |= __put_user(&frame->info, &frame->pinfo);
444 err |= __put_user(&frame->uc, &frame->puc); 423 err |= __put_user(&frame->uc, &frame->puc);
445 err |= copy_siginfo_to_user(&frame->info, info); 424 err |= copy_siginfo_to_user(&frame->info, info);
446 if (err) 425 if (err)
447 goto give_sigsegv; 426 return -EFAULT;
448 427
449 /* Create the ucontext. */ 428 /* Create the ucontext. */
450 err |= __put_user(0, &frame->uc.uc_flags); 429 if (cpu_has_xsave)
430 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
431 else
432 err |= __put_user(0, &frame->uc.uc_flags);
451 err |= __put_user(0, &frame->uc.uc_link); 433 err |= __put_user(0, &frame->uc.uc_link);
452 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 434 err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
453 err |= __put_user(sas_ss_flags(regs->sp), 435 err |= __put_user(sas_ss_flags(regs->sp),
454 &frame->uc.uc_stack.ss_flags); 436 &frame->uc.uc_stack.ss_flags);
455 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); 437 err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size);
456 err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, 438 err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
457 regs, set->sig[0]); 439 regs, set->sig[0]);
458 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 440 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
459 if (err) 441 if (err)
460 goto give_sigsegv; 442 return -EFAULT;
461 443
462 /* Set up to return from userspace. */ 444 /* Set up to return from userspace. */
463 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); 445 restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn);
@@ -477,12 +459,12 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
477 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); 459 err |= __put_user(0x80cd, (short __user *)(frame->retcode+5));
478 460
479 if (err) 461 if (err)
480 goto give_sigsegv; 462 return -EFAULT;
481 463
482 /* Set up registers for signal handler */ 464 /* Set up registers for signal handler */
483 regs->sp = (unsigned long)frame; 465 regs->sp = (unsigned long)frame;
484 regs->ip = (unsigned long)ka->sa.sa_handler; 466 regs->ip = (unsigned long)ka->sa.sa_handler;
485 regs->ax = (unsigned long)usig; 467 regs->ax = (unsigned long)sig;
486 regs->dx = (unsigned long)&frame->info; 468 regs->dx = (unsigned long)&frame->info;
487 regs->cx = (unsigned long)&frame->uc; 469 regs->cx = (unsigned long)&frame->uc;
488 470
@@ -492,15 +474,48 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
492 regs->cs = __USER_CS; 474 regs->cs = __USER_CS;
493 475
494 return 0; 476 return 0;
495
496give_sigsegv:
497 force_sigsegv(sig, current);
498 return -EFAULT;
499} 477}
500 478
501/* 479/*
502 * OK, we're invoking a handler: 480 * OK, we're invoking a handler:
503 */ 481 */
482static int signr_convert(int sig)
483{
484 struct thread_info *info = current_thread_info();
485
486 if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32)
487 return info->exec_domain->signal_invmap[sig];
488 return sig;
489}
490
491#define is_ia32 1
492#define ia32_setup_frame __setup_frame
493#define ia32_setup_rt_frame __setup_rt_frame
494
495static int
496setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
497 sigset_t *set, struct pt_regs *regs)
498{
499 int usig = signr_convert(sig);
500 int ret;
501
502 /* Set up the stack frame */
503 if (is_ia32) {
504 if (ka->sa.sa_flags & SA_SIGINFO)
505 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
506 else
507 ret = ia32_setup_frame(usig, ka, set, regs);
508 } else
509 ret = __setup_rt_frame(sig, ka, info, set, regs);
510
511 if (ret) {
512 force_sigsegv(sig, current);
513 return -EFAULT;
514 }
515
516 return ret;
517}
518
504static int 519static int
505handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 520handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
506 sigset_t *oldset, struct pt_regs *regs) 521 sigset_t *oldset, struct pt_regs *regs)
@@ -508,9 +523,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
508 int ret; 523 int ret;
509 524
510 /* Are we from a system call? */ 525 /* Are we from a system call? */
511 if ((long)regs->orig_ax >= 0) { 526 if (syscall_get_nr(current, regs) >= 0) {
512 /* If so, check system call restarting.. */ 527 /* If so, check system call restarting.. */
513 switch (regs->ax) { 528 switch (syscall_get_error(current, regs)) {
514 case -ERESTART_RESTARTBLOCK: 529 case -ERESTART_RESTARTBLOCK:
515 case -ERESTARTNOHAND: 530 case -ERESTARTNOHAND:
516 regs->ax = -EINTR; 531 regs->ax = -EINTR;
@@ -537,15 +552,20 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
537 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 552 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
538 regs->flags &= ~X86_EFLAGS_TF; 553 regs->flags &= ~X86_EFLAGS_TF;
539 554
540 /* Set up the stack frame */ 555 ret = setup_rt_frame(sig, ka, info, oldset, regs);
541 if (ka->sa.sa_flags & SA_SIGINFO)
542 ret = setup_rt_frame(sig, ka, info, oldset, regs);
543 else
544 ret = setup_frame(sig, ka, oldset, regs);
545 556
546 if (ret) 557 if (ret)
547 return ret; 558 return ret;
548 559
560#ifdef CONFIG_X86_64
561 /*
562 * This has nothing to do with segment registers,
563 * despite the name. This magic affects uaccess.h
564 * macros' behavior. Reset it to the normal setting.
565 */
566 set_fs(USER_DS);
567#endif
568
549 /* 569 /*
550 * Clear the direction flag as per the ABI for function entry. 570 * Clear the direction flag as per the ABI for function entry.
551 */ 571 */
@@ -558,8 +578,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
558 * handler too. 578 * handler too.
559 */ 579 */
560 regs->flags &= ~X86_EFLAGS_TF; 580 regs->flags &= ~X86_EFLAGS_TF;
561 if (test_thread_flag(TIF_SINGLESTEP))
562 ptrace_notify(SIGTRAP);
563 581
564 spin_lock_irq(&current->sighand->siglock); 582 spin_lock_irq(&current->sighand->siglock);
565 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask); 583 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
@@ -568,9 +586,13 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
568 recalc_sigpending(); 586 recalc_sigpending();
569 spin_unlock_irq(&current->sighand->siglock); 587 spin_unlock_irq(&current->sighand->siglock);
570 588
589 tracehook_signal_handler(sig, info, ka, regs,
590 test_thread_flag(TIF_SINGLESTEP));
591
571 return 0; 592 return 0;
572} 593}
573 594
595#define NR_restart_syscall __NR_restart_syscall
574/* 596/*
575 * Note that 'init' is a special process: it doesn't get signals it doesn't 597 * Note that 'init' is a special process: it doesn't get signals it doesn't
576 * want to handle. Thus you cannot kill init even with a SIGKILL even by 598 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -623,9 +645,9 @@ static void do_signal(struct pt_regs *regs)
623 } 645 }
624 646
625 /* Did we come from a system call? */ 647 /* Did we come from a system call? */
626 if ((long)regs->orig_ax >= 0) { 648 if (syscall_get_nr(current, regs) >= 0) {
627 /* Restart the system call - no handlers present */ 649 /* Restart the system call - no handlers present */
628 switch (regs->ax) { 650 switch (syscall_get_error(current, regs)) {
629 case -ERESTARTNOHAND: 651 case -ERESTARTNOHAND:
630 case -ERESTARTSYS: 652 case -ERESTARTSYS:
631 case -ERESTARTNOINTR: 653 case -ERESTARTNOINTR:
@@ -634,7 +656,7 @@ static void do_signal(struct pt_regs *regs)
634 break; 656 break;
635 657
636 case -ERESTART_RESTARTBLOCK: 658 case -ERESTART_RESTARTBLOCK:
637 regs->ax = __NR_restart_syscall; 659 regs->ax = NR_restart_syscall;
638 regs->ip -= 2; 660 regs->ip -= 2;
639 break; 661 break;
640 } 662 }
@@ -657,18 +679,38 @@ static void do_signal(struct pt_regs *regs)
657void 679void
658do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) 680do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
659{ 681{
660 /* Pending single-step? */ 682#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
661 if (thread_info_flags & _TIF_SINGLESTEP) { 683 /* notify userspace of pending MCEs */
662 regs->flags |= X86_EFLAGS_TF; 684 if (thread_info_flags & _TIF_MCE_NOTIFY)
663 clear_thread_flag(TIF_SINGLESTEP); 685 mce_notify_user();
664 } 686#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
665 687
666 /* deal with pending signal delivery */ 688 /* deal with pending signal delivery */
667 if (thread_info_flags & _TIF_SIGPENDING) 689 if (thread_info_flags & _TIF_SIGPENDING)
668 do_signal(regs); 690 do_signal(regs);
669 691
670 if (thread_info_flags & _TIF_HRTICK_RESCHED) 692 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
671 hrtick_resched(); 693 clear_thread_flag(TIF_NOTIFY_RESUME);
694 tracehook_notify_resume(regs);
695 }
672 696
697#ifdef CONFIG_X86_32
673 clear_thread_flag(TIF_IRET); 698 clear_thread_flag(TIF_IRET);
699#endif /* CONFIG_X86_32 */
700}
701
702void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
703{
704 struct task_struct *me = current;
705
706 if (show_unhandled_signals && printk_ratelimit()) {
707 printk(KERN_INFO
708 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
709 me->comm, me->pid, where, frame,
710 regs->ip, regs->sp, regs->orig_ax);
711 print_vma_addr(" in ", regs->ip);
712 printk(KERN_CONT "\n");
713 }
714
715 force_sig(SIGSEGV, me);
674} 716}
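
[Editor's note: a minimal userspace sketch, not kernel code. It illustrates the dispatch shape the signal_32.c diff above introduces — one setup_rt_frame() entry point that converts the signal number via signr_convert(), picks the classic or RT frame builder, and turns any failure into a SIGSEGV-style error path. The struct layouts and the two frame-builder stubs are invented stand-ins, not the kernel implementations.]

/*
 * Sketch of the unified setup_rt_frame() dispatcher added above.
 * __setup_frame()/__setup_rt_frame() are stubs standing in for the
 * real frame builders; the kernel would call force_sigsegv() and
 * return -EFAULT where this sketch returns -1.
 */
#include <stdio.h>

#define SA_SIGINFO 0x4

struct sigaction_stub { unsigned long sa_flags; };
struct k_sigaction    { struct sigaction_stub sa; };

static int __setup_frame(int sig)    { printf("classic frame for signal %d\n", sig); return 0; }
static int __setup_rt_frame(int sig) { printf("rt frame for signal %d\n", sig); return 0; }

static int signr_convert(int sig)
{
	/* On 32-bit, an exec_domain may remap signals < 32; identity here. */
	return sig;
}

static int setup_rt_frame(int sig, struct k_sigaction *ka)
{
	int usig = signr_convert(sig);
	int ret;

	ret = (ka->sa.sa_flags & SA_SIGINFO) ? __setup_rt_frame(usig)
					     : __setup_frame(usig);
	if (ret) {
		/* the kernel raises SIGSEGV here via force_sigsegv() */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct k_sigaction ka = { .sa = { .sa_flags = SA_SIGINFO } };

	return setup_rt_frame(11, &ka);
}
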
diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c
index e53b267662e7..a5c9627f4db9 100644
--- a/arch/x86/kernel/signal_64.c
+++ b/arch/x86/kernel/signal_64.c
@@ -15,17 +15,21 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/wait.h> 16#include <linux/wait.h>
17#include <linux/ptrace.h> 17#include <linux/ptrace.h>
18#include <linux/tracehook.h>
18#include <linux/unistd.h> 19#include <linux/unistd.h>
19#include <linux/stddef.h> 20#include <linux/stddef.h>
20#include <linux/personality.h> 21#include <linux/personality.h>
21#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/uaccess.h>
24
22#include <asm/processor.h> 25#include <asm/processor.h>
23#include <asm/ucontext.h> 26#include <asm/ucontext.h>
24#include <asm/uaccess.h>
25#include <asm/i387.h> 27#include <asm/i387.h>
26#include <asm/proto.h> 28#include <asm/proto.h>
27#include <asm/ia32_unistd.h> 29#include <asm/ia32_unistd.h>
28#include <asm/mce.h> 30#include <asm/mce.h>
31#include <asm/syscall.h>
32#include <asm/syscalls.h>
29#include "sigframe.h" 33#include "sigframe.h"
30 34
31#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) 35#define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
@@ -41,11 +45,6 @@
41# define FIX_EFLAGS __FIX_EFLAGS 45# define FIX_EFLAGS __FIX_EFLAGS
42#endif 46#endif
43 47
44int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
45 sigset_t *set, struct pt_regs * regs);
46int ia32_setup_frame(int sig, struct k_sigaction *ka,
47 sigset_t *set, struct pt_regs * regs);
48
49asmlinkage long 48asmlinkage long
50sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 49sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
51 struct pt_regs *regs) 50 struct pt_regs *regs)
@@ -53,6 +52,15 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
53 return do_sigaltstack(uss, uoss, regs->sp); 52 return do_sigaltstack(uss, uoss, regs->sp);
54} 53}
55 54
55#define COPY(x) { \
56 err |= __get_user(regs->x, &sc->x); \
57}
58
59#define COPY_SEG_STRICT(seg) { \
60 unsigned short tmp; \
61 err |= __get_user(tmp, &sc->seg); \
62 regs->seg = tmp | 3; \
63}
56 64
57/* 65/*
58 * Do a signal return; undo the signal stack. 66 * Do a signal return; undo the signal stack.
@@ -61,13 +69,13 @@ static int
61restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, 69restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
62 unsigned long *pax) 70 unsigned long *pax)
63{ 71{
72 void __user *buf;
73 unsigned int tmpflags;
64 unsigned int err = 0; 74 unsigned int err = 0;
65 75
66 /* Always make any pending restarted system calls return -EINTR */ 76 /* Always make any pending restarted system calls return -EINTR */
67 current_thread_info()->restart_block.fn = do_no_restart_syscall; 77 current_thread_info()->restart_block.fn = do_no_restart_syscall;
68 78
69#define COPY(x) err |= __get_user(regs->x, &sc->x)
70
71 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); 79 COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
72 COPY(dx); COPY(cx); COPY(ip); 80 COPY(dx); COPY(cx); COPY(ip);
73 COPY(r8); 81 COPY(r8);
@@ -82,48 +90,24 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
82 /* Kernel saves and restores only the CS segment register on signals, 90 /* Kernel saves and restores only the CS segment register on signals,
83 * which is the bare minimum needed to allow mixed 32/64-bit code. 91 * which is the bare minimum needed to allow mixed 32/64-bit code.
84 * App's signal handler can save/restore other segments if needed. */ 92 * App's signal handler can save/restore other segments if needed. */
85 { 93 COPY_SEG_STRICT(cs);
86 unsigned cs;
87 err |= __get_user(cs, &sc->cs);
88 regs->cs = cs | 3; /* Force into user mode */
89 }
90 94
91 { 95 err |= __get_user(tmpflags, &sc->flags);
92 unsigned int tmpflags; 96 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
93 err |= __get_user(tmpflags, &sc->flags); 97 regs->orig_ax = -1; /* disable syscall checks */
94 regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
95 regs->orig_ax = -1; /* disable syscall checks */
96 }
97 98
98 { 99 err |= __get_user(buf, &sc->fpstate);
99 struct _fpstate __user * buf; 100 err |= restore_i387_xstate(buf);
100 err |= __get_user(buf, &sc->fpstate);
101
102 if (buf) {
103 if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
104 goto badframe;
105 err |= restore_i387(buf);
106 } else {
107 struct task_struct *me = current;
108 if (used_math()) {
109 clear_fpu(me);
110 clear_used_math();
111 }
112 }
113 }
114 101
115 err |= __get_user(*pax, &sc->ax); 102 err |= __get_user(*pax, &sc->ax);
116 return err; 103 return err;
117
118badframe:
119 return 1;
120} 104}
121 105
122asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) 106static long do_rt_sigreturn(struct pt_regs *regs)
123{ 107{
124 struct rt_sigframe __user *frame; 108 struct rt_sigframe __user *frame;
125 sigset_t set;
126 unsigned long ax; 109 unsigned long ax;
110 sigset_t set;
127 111
128 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); 112 frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
129 if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) 113 if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
@@ -136,7 +120,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
136 current->blocked = set; 120 current->blocked = set;
137 recalc_sigpending(); 121 recalc_sigpending();
138 spin_unlock_irq(&current->sighand->siglock); 122 spin_unlock_irq(&current->sighand->siglock);
139 123
140 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 124 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
141 goto badframe; 125 goto badframe;
142 126
@@ -146,16 +130,22 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
146 return ax; 130 return ax;
147 131
148badframe: 132badframe:
149 signal_fault(regs,frame,"sigreturn"); 133 signal_fault(regs, frame, "rt_sigreturn");
150 return 0; 134 return 0;
151} 135}
136
137asmlinkage long sys_rt_sigreturn(struct pt_regs *regs)
138{
139 return do_rt_sigreturn(regs);
140}
152 141
153/* 142/*
154 * Set up a signal frame. 143 * Set up a signal frame.
155 */ 144 */
156 145
157static inline int 146static inline int
158setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) 147setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs,
148 unsigned long mask, struct task_struct *me)
159{ 149{
160 int err = 0; 150 int err = 0;
161 151
@@ -207,41 +197,40 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size)
207 sp = current->sas_ss_sp + current->sas_ss_size; 197 sp = current->sas_ss_sp + current->sas_ss_size;
208 } 198 }
209 199
210 return (void __user *)round_down(sp - size, 16); 200 return (void __user *)round_down(sp - size, 64);
211} 201}
212 202
213static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 203static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
214 sigset_t *set, struct pt_regs * regs) 204 sigset_t *set, struct pt_regs *regs)
215{ 205{
216 struct rt_sigframe __user *frame; 206 struct rt_sigframe __user *frame;
217 struct _fpstate __user *fp = NULL; 207 void __user *fp = NULL;
218 int err = 0; 208 int err = 0;
219 struct task_struct *me = current; 209 struct task_struct *me = current;
220 210
221 if (used_math()) { 211 if (used_math()) {
222 fp = get_stack(ka, regs, sizeof(struct _fpstate)); 212 fp = get_stack(ka, regs, sig_xstate_size);
223 frame = (void __user *)round_down( 213 frame = (void __user *)round_down(
224 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; 214 (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8;
225 215
226 if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) 216 if (save_i387_xstate(fp) < 0)
227 goto give_sigsegv; 217 return -EFAULT;
228
229 if (save_i387(fp) < 0)
230 err |= -1;
231 } else 218 } else
232 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; 219 frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8;
233 220
234 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) 221 if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame)))
235 goto give_sigsegv; 222 return -EFAULT;
236 223
237 if (ka->sa.sa_flags & SA_SIGINFO) { 224 if (ka->sa.sa_flags & SA_SIGINFO) {
238 err |= copy_siginfo_to_user(&frame->info, info); 225 if (copy_siginfo_to_user(&frame->info, info))
239 if (err) 226 return -EFAULT;
240 goto give_sigsegv;
241 } 227 }
242 228
243 /* Create the ucontext. */ 229 /* Create the ucontext. */
244 err |= __put_user(0, &frame->uc.uc_flags); 230 if (cpu_has_xsave)
231 err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags);
232 else
233 err |= __put_user(0, &frame->uc.uc_flags);
245 err |= __put_user(0, &frame->uc.uc_link); 234 err |= __put_user(0, &frame->uc.uc_link);
246 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); 235 err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp);
247 err |= __put_user(sas_ss_flags(regs->sp), 236 err |= __put_user(sas_ss_flags(regs->sp),
@@ -249,9 +238,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
249 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); 238 err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size);
250 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); 239 err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me);
251 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); 240 err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate);
252 if (sizeof(*set) == 16) { 241 if (sizeof(*set) == 16) {
253 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); 242 __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]);
254 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); 243 __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]);
255 } else 244 } else
256 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); 245 err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
257 246
@@ -262,15 +251,15 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
262 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); 251 err |= __put_user(ka->sa.sa_restorer, &frame->pretcode);
263 } else { 252 } else {
264 /* could use a vstub here */ 253 /* could use a vstub here */
265 goto give_sigsegv; 254 return -EFAULT;
266 } 255 }
267 256
268 if (err) 257 if (err)
269 goto give_sigsegv; 258 return -EFAULT;
270 259
271 /* Set up registers for signal handler */ 260 /* Set up registers for signal handler */
272 regs->di = sig; 261 regs->di = sig;
273 /* In case the signal handler was declared without prototypes */ 262 /* In case the signal handler was declared without prototypes */
274 regs->ax = 0; 263 regs->ax = 0;
275 264
276 /* This also works for non SA_SIGINFO handlers because they expect the 265 /* This also works for non SA_SIGINFO handlers because they expect the
@@ -286,44 +275,45 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
286 regs->cs = __USER_CS; 275 regs->cs = __USER_CS;
287 276
288 return 0; 277 return 0;
289
290give_sigsegv:
291 force_sigsegv(sig, current);
292 return -EFAULT;
293} 278}
294 279
295/* 280/*
296 * Return -1L or the syscall number that @regs is executing. 281 * OK, we're invoking a handler
297 */ 282 */
298static long current_syscall(struct pt_regs *regs) 283static int signr_convert(int sig)
299{ 284{
300 /* 285 return sig;
301 * We always sign-extend a -1 value being set here,
302 * so this is always either -1L or a syscall number.
303 */
304 return regs->orig_ax;
305} 286}
306 287
307/*
308 * Return a value that is -EFOO if the system call in @regs->orig_ax
309 * returned an error. This only works for @regs from @current.
310 */
311static long current_syscall_ret(struct pt_regs *regs)
312{
313#ifdef CONFIG_IA32_EMULATION 288#ifdef CONFIG_IA32_EMULATION
314 if (test_thread_flag(TIF_IA32)) 289#define is_ia32 test_thread_flag(TIF_IA32)
315 /* 290#else
316 * Sign-extend the value so (int)-EFOO becomes (long)-EFOO 291#define is_ia32 0
317 * and will match correctly in comparisons.
318 */
319 return (int) regs->ax;
320#endif 292#endif
321 return regs->ax;
322}
323 293
324/* 294static int
325 * OK, we're invoking a handler 295setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,
326 */ 296 sigset_t *set, struct pt_regs *regs)
297{
298 int usig = signr_convert(sig);
299 int ret;
300
301 /* Set up the stack frame */
302 if (is_ia32) {
303 if (ka->sa.sa_flags & SA_SIGINFO)
304 ret = ia32_setup_rt_frame(usig, ka, info, set, regs);
305 else
306 ret = ia32_setup_frame(usig, ka, set, regs);
307 } else
308 ret = __setup_rt_frame(sig, ka, info, set, regs);
309
310 if (ret) {
311 force_sigsegv(sig, current);
312 return -EFAULT;
313 }
314
315 return ret;
316}
327 317
328static int 318static int
329handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 319handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
@@ -332,9 +322,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
332 int ret; 322 int ret;
333 323
334 /* Are we from a system call? */ 324 /* Are we from a system call? */
335 if (current_syscall(regs) >= 0) { 325 if (syscall_get_nr(current, regs) >= 0) {
336 /* If so, check system call restarting.. */ 326 /* If so, check system call restarting.. */
337 switch (current_syscall_ret(regs)) { 327 switch (syscall_get_error(current, regs)) {
338 case -ERESTART_RESTARTBLOCK: 328 case -ERESTART_RESTARTBLOCK:
339 case -ERESTARTNOHAND: 329 case -ERESTARTNOHAND:
340 regs->ax = -EINTR; 330 regs->ax = -EINTR;
@@ -361,50 +351,48 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
361 likely(test_and_clear_thread_flag(TIF_FORCED_TF))) 351 likely(test_and_clear_thread_flag(TIF_FORCED_TF)))
362 regs->flags &= ~X86_EFLAGS_TF; 352 regs->flags &= ~X86_EFLAGS_TF;
363 353
364#ifdef CONFIG_IA32_EMULATION
365 if (test_thread_flag(TIF_IA32)) {
366 if (ka->sa.sa_flags & SA_SIGINFO)
367 ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs);
368 else
369 ret = ia32_setup_frame(sig, ka, oldset, regs);
370 } else
371#endif
372 ret = setup_rt_frame(sig, ka, info, oldset, regs); 354 ret = setup_rt_frame(sig, ka, info, oldset, regs);
373 355
374 if (ret == 0) { 356 if (ret)
375 /* 357 return ret;
376 * This has nothing to do with segment registers,
377 * despite the name. This magic affects uaccess.h
378 * macros' behavior. Reset it to the normal setting.
379 */
380 set_fs(USER_DS);
381 358
382 /* 359#ifdef CONFIG_X86_64
383 * Clear the direction flag as per the ABI for function entry. 360 /*
384 */ 361 * This has nothing to do with segment registers,
385 regs->flags &= ~X86_EFLAGS_DF; 362 * despite the name. This magic affects uaccess.h
363 * macros' behavior. Reset it to the normal setting.
364 */
365 set_fs(USER_DS);
366#endif
386 367
387 /* 368 /*
388 * Clear TF when entering the signal handler, but 369 * Clear the direction flag as per the ABI for function entry.
389 * notify any tracer that was single-stepping it. 370 */
390 * The tracer may want to single-step inside the 371 regs->flags &= ~X86_EFLAGS_DF;
391 * handler too.
392 */
393 regs->flags &= ~X86_EFLAGS_TF;
394 if (test_thread_flag(TIF_SINGLESTEP))
395 ptrace_notify(SIGTRAP);
396
397 spin_lock_irq(&current->sighand->siglock);
398 sigorsets(&current->blocked,&current->blocked,&ka->sa.sa_mask);
399 if (!(ka->sa.sa_flags & SA_NODEFER))
400 sigaddset(&current->blocked,sig);
401 recalc_sigpending();
402 spin_unlock_irq(&current->sighand->siglock);
403 }
404 372
405 return ret; 373 /*
374 * Clear TF when entering the signal handler, but
375 * notify any tracer that was single-stepping it.
376 * The tracer may want to single-step inside the
377 * handler too.
378 */
379 regs->flags &= ~X86_EFLAGS_TF;
380
381 spin_lock_irq(&current->sighand->siglock);
382 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
383 if (!(ka->sa.sa_flags & SA_NODEFER))
384 sigaddset(&current->blocked, sig);
385 recalc_sigpending();
386 spin_unlock_irq(&current->sighand->siglock);
387
388 tracehook_signal_handler(sig, info, ka, regs,
389 test_thread_flag(TIF_SINGLESTEP));
390
391 return 0;
406} 392}
407 393
394#define NR_restart_syscall \
395 test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall
408/* 396/*
409 * Note that 'init' is a special process: it doesn't get signals it doesn't 397 * Note that 'init' is a special process: it doesn't get signals it doesn't
410 * want to handle. Thus you cannot kill init even with a SIGKILL even by 398 * want to handle. Thus you cannot kill init even with a SIGKILL even by
@@ -434,7 +422,8 @@ static void do_signal(struct pt_regs *regs)
434 422
435 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 423 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
436 if (signr > 0) { 424 if (signr > 0) {
437 /* Re-enable any watchpoints before delivering the 425 /*
426 * Re-enable any watchpoints before delivering the
438 * signal to user space. The processor register will 427 * signal to user space. The processor register will
439 * have been cleared if the watchpoint triggered 428 * have been cleared if the watchpoint triggered
440 * inside the kernel. 429 * inside the kernel.
@@ -442,7 +431,7 @@ static void do_signal(struct pt_regs *regs)
442 if (current->thread.debugreg7) 431 if (current->thread.debugreg7)
443 set_debugreg(current->thread.debugreg7, 7); 432 set_debugreg(current->thread.debugreg7, 7);
444 433
445 /* Whee! Actually deliver the signal. */ 434 /* Whee! Actually deliver the signal. */
446 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 435 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
447 /* 436 /*
448 * A signal was successfully delivered; the saved 437 * A signal was successfully delivered; the saved
@@ -456,19 +445,18 @@ static void do_signal(struct pt_regs *regs)
456 } 445 }
457 446
458 /* Did we come from a system call? */ 447 /* Did we come from a system call? */
459 if (current_syscall(regs) >= 0) { 448 if (syscall_get_nr(current, regs) >= 0) {
460 /* Restart the system call - no handlers present */ 449 /* Restart the system call - no handlers present */
461 switch (current_syscall_ret(regs)) { 450 switch (syscall_get_error(current, regs)) {
462 case -ERESTARTNOHAND: 451 case -ERESTARTNOHAND:
463 case -ERESTARTSYS: 452 case -ERESTARTSYS:
464 case -ERESTARTNOINTR: 453 case -ERESTARTNOINTR:
465 regs->ax = regs->orig_ax; 454 regs->ax = regs->orig_ax;
466 regs->ip -= 2; 455 regs->ip -= 2;
467 break; 456 break;
457
468 case -ERESTART_RESTARTBLOCK: 458 case -ERESTART_RESTARTBLOCK:
469 regs->ax = test_thread_flag(TIF_IA32) ? 459 regs->ax = NR_restart_syscall;
470 __NR_ia32_restart_syscall :
471 __NR_restart_syscall;
472 regs->ip -= 2; 460 regs->ip -= 2;
473 break; 461 break;
474 } 462 }
@@ -484,38 +472,45 @@ static void do_signal(struct pt_regs *regs)
484 } 472 }
485} 473}
486 474
487void do_notify_resume(struct pt_regs *regs, void *unused, 475/*
488 __u32 thread_info_flags) 476 * notification of userspace execution resumption
477 * - triggered by the TIF_WORK_MASK flags
478 */
479void
480do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
489{ 481{
490 /* Pending single-step? */ 482#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE)
491 if (thread_info_flags & _TIF_SINGLESTEP) {
492 regs->flags |= X86_EFLAGS_TF;
493 clear_thread_flag(TIF_SINGLESTEP);
494 }
495
496#ifdef CONFIG_X86_MCE
497 /* notify userspace of pending MCEs */ 483 /* notify userspace of pending MCEs */
498 if (thread_info_flags & _TIF_MCE_NOTIFY) 484 if (thread_info_flags & _TIF_MCE_NOTIFY)
499 mce_notify_user(); 485 mce_notify_user();
500#endif /* CONFIG_X86_MCE */ 486#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
501 487
502 /* deal with pending signal delivery */ 488 /* deal with pending signal delivery */
503 if (thread_info_flags & _TIF_SIGPENDING) 489 if (thread_info_flags & _TIF_SIGPENDING)
504 do_signal(regs); 490 do_signal(regs);
505 491
506 if (thread_info_flags & _TIF_HRTICK_RESCHED) 492 if (thread_info_flags & _TIF_NOTIFY_RESUME) {
507 hrtick_resched(); 493 clear_thread_flag(TIF_NOTIFY_RESUME);
494 tracehook_notify_resume(regs);
495 }
496
497#ifdef CONFIG_X86_32
498 clear_thread_flag(TIF_IRET);
499#endif /* CONFIG_X86_32 */
508} 500}
509 501
510void signal_fault(struct pt_regs *regs, void __user *frame, char *where) 502void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
511{ 503{
512 struct task_struct *me = current; 504 struct task_struct *me = current;
505
513 if (show_unhandled_signals && printk_ratelimit()) { 506 if (show_unhandled_signals && printk_ratelimit()) {
514 printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", 507 printk(KERN_INFO
515 me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); 508 "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx",
509 me->comm, me->pid, where, frame,
510 regs->ip, regs->sp, regs->orig_ax);
516 print_vma_addr(" in ", regs->ip); 511 print_vma_addr(" in ", regs->ip);
517 printk("\n"); 512 printk(KERN_CONT "\n");
518 } 513 }
519 514
520 force_sig(SIGSEGV, me); 515 force_sig(SIGSEGV, me);
521} 516}
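
[Editor's note: a small userspace sketch, under the assumption that it only mirrors the error-accumulation style of the new COPY()/COPY_SEG_STRICT() macros in the signal_64.c diff above: every copy ORs its status into err so one check at the end catches any faulting access, and the CS selector is forced to user-mode RPL. get_user_stub() and the struct layouts are hypothetical stand-ins, not the kernel helpers.]

/*
 * Error-accumulation pattern of the COPY()/COPY_SEG_STRICT() macros.
 * get_user_stub() pretends the user-space access always succeeds.
 */
#include <stdio.h>

struct sigcontext_sketch { unsigned long di, si, ip; unsigned short cs; };
struct regs_sketch       { unsigned long di, si, ip, cs; };

static int get_user_stub(unsigned long *dst, const unsigned long *src)
{
	*dst = *src;		/* a real __get_user() could fault and return nonzero */
	return 0;
}

#define COPY(x) { err |= get_user_stub(&regs->x, &sc->x); }

static int restore_sketch(struct regs_sketch *regs,
			  const struct sigcontext_sketch *sc)
{
	int err = 0;

	COPY(di); COPY(si); COPY(ip);
	regs->cs = sc->cs | 3;	/* COPY_SEG_STRICT: force user-mode RPL */
	return err;
}

int main(void)
{
	struct sigcontext_sketch sc = { .di = 1, .si = 2, .ip = 3, .cs = 0x30 };
	struct regs_sketch regs = { 0 };

	printf("err=%d cs=%#lx\n", restore_sketch(&regs, &sc), regs.cs);
	return 0;
}
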
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 361b7a4c640c..18f9b19f5f8f 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
214struct smp_ops smp_ops = { 214struct smp_ops smp_ops = {
215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, 215 .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
216 .smp_prepare_cpus = native_smp_prepare_cpus, 216 .smp_prepare_cpus = native_smp_prepare_cpus,
217 .cpu_up = native_cpu_up,
218 .smp_cpus_done = native_smp_cpus_done, 217 .smp_cpus_done = native_smp_cpus_done,
219 218
220 .smp_send_stop = native_smp_send_stop, 219 .smp_send_stop = native_smp_send_stop,
221 .smp_send_reschedule = native_smp_send_reschedule, 220 .smp_send_reschedule = native_smp_send_reschedule,
222 221
222 .cpu_up = native_cpu_up,
223 .cpu_die = native_cpu_die,
224 .cpu_disable = native_cpu_disable,
225 .play_dead = native_play_dead,
226
223 .send_call_func_ipi = native_send_call_func_ipi, 227 .send_call_func_ipi = native_send_call_func_ipi,
224 .send_call_func_single_ipi = native_send_call_func_single_ipi, 228 .send_call_func_single_ipi = native_send_call_func_single_ipi,
225}; 229};
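
[Editor's note: an illustrative sketch only. The smp.c hunk above regroups the CPU hotplug callbacks (cpu_up, cpu_die, cpu_disable, play_dead) inside struct smp_ops; the shape below shows that ops-table pattern in plain C with invented stub functions, so an alternative backend (e.g. a paravirt one) could override the hooks.]

/*
 * Ops-table shape after the smp_ops change: hotplug hooks grouped together.
 * The native_*_stub functions are stand-ins for the kernel's native_* ones.
 */
#include <stdio.h>

struct smp_ops_sketch {
	int  (*cpu_up)(unsigned int cpu);
	int  (*cpu_disable)(void);
	void (*cpu_die)(unsigned int cpu);
	void (*play_dead)(void);
};

static int  native_cpu_up_stub(unsigned int cpu)      { printf("bring up CPU%u\n", cpu); return 0; }
static int  native_cpu_disable_stub(void)             { return 0; }
static void native_cpu_die_stub(unsigned int cpu)     { printf("CPU%u is dead\n", cpu); }
static void native_play_dead_stub(void)               { }

static struct smp_ops_sketch smp_ops = {
	.cpu_up      = native_cpu_up_stub,
	.cpu_disable = native_cpu_disable_stub,
	.cpu_die     = native_cpu_die_stub,
	.play_dead   = native_play_dead_stub,
};

int main(void)
{
	smp_ops.cpu_up(1);	/* a different backend could swap in its own hooks */
	smp_ops.cpu_die(1);
	return 0;
}
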
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 687376ab07e8..7b1093397319 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -52,6 +52,7 @@
52#include <asm/desc.h> 52#include <asm/desc.h>
53#include <asm/nmi.h> 53#include <asm/nmi.h>
54#include <asm/irq.h> 54#include <asm/irq.h>
55#include <asm/idle.h>
55#include <asm/smp.h> 56#include <asm/smp.h>
56#include <asm/trampoline.h> 57#include <asm/trampoline.h>
57#include <asm/cpu.h> 58#include <asm/cpu.h>
@@ -88,7 +89,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
88#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) 89#define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x))
89#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) 90#define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p))
90#else 91#else
91struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; 92static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ;
92#define get_idle_for_cpu(x) (idle_thread_array[(x)]) 93#define get_idle_for_cpu(x) (idle_thread_array[(x)])
93#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) 94#define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p))
94#endif 95#endif
@@ -123,13 +124,12 @@ EXPORT_PER_CPU_SYMBOL(cpu_info);
123 124
124static atomic_t init_deasserted; 125static atomic_t init_deasserted;
125 126
126static int boot_cpu_logical_apicid;
127 127
128/* representing cpus for which sibling maps can be computed */ 128/* representing cpus for which sibling maps can be computed */
129static cpumask_t cpu_sibling_setup_map; 129static cpumask_t cpu_sibling_setup_map;
130 130
131/* Set if we find a B stepping CPU */ 131/* Set if we find a B stepping CPU */
132int __cpuinitdata smp_b_stepping; 132static int __cpuinitdata smp_b_stepping;
133 133
134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) 134#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
135 135
@@ -165,6 +165,8 @@ static void unmap_cpu_to_node(int cpu)
165#endif 165#endif
166 166
167#ifdef CONFIG_X86_32 167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
168u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = 170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
169 { [0 ... NR_CPUS-1] = BAD_APICID }; 171 { [0 ... NR_CPUS-1] = BAD_APICID };
170 172
@@ -210,13 +212,13 @@ static void __cpuinit smp_callin(void)
210 /* 212 /*
211 * (This works even if the APIC is not enabled.) 213 * (This works even if the APIC is not enabled.)
212 */ 214 */
213 phys_id = GET_APIC_ID(read_apic_id()); 215 phys_id = read_apic_id();
214 cpuid = smp_processor_id(); 216 cpuid = smp_processor_id();
215 if (cpu_isset(cpuid, cpu_callin_map)) { 217 if (cpu_isset(cpuid, cpu_callin_map)) {
216 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, 218 panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__,
217 phys_id, cpuid); 219 phys_id, cpuid);
218 } 220 }
219 Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); 221 pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id);
220 222
221 /* 223 /*
222 * STARTUP IPIs are fragile beasts as they might sometimes 224 * STARTUP IPIs are fragile beasts as they might sometimes
@@ -251,12 +253,13 @@ static void __cpuinit smp_callin(void)
251 * boards) 253 * boards)
252 */ 254 */
253 255
254 Dprintk("CALLIN, before setup_local_APIC().\n"); 256 pr_debug("CALLIN, before setup_local_APIC().\n");
255 smp_callin_clear_local_apic(); 257 smp_callin_clear_local_apic();
256 setup_local_APIC(); 258 setup_local_APIC();
257 end_local_APIC_setup(); 259 end_local_APIC_setup();
258 map_cpu_to_logical_apicid(); 260 map_cpu_to_logical_apicid();
259 261
262 notify_cpu_starting(cpuid);
260 /* 263 /*
261 * Get our bogomips. 264 * Get our bogomips.
262 * 265 *
@@ -266,7 +269,7 @@ static void __cpuinit smp_callin(void)
266 local_irq_enable(); 269 local_irq_enable();
267 calibrate_delay(); 270 calibrate_delay();
268 local_irq_disable(); 271 local_irq_disable();
269 Dprintk("Stack at about %p\n", &cpuid); 272 pr_debug("Stack at about %p\n", &cpuid);
270 273
271 /* 274 /*
272 * Save our processor parameters 275 * Save our processor parameters
@@ -279,6 +282,8 @@ static void __cpuinit smp_callin(void)
279 cpu_set(cpuid, cpu_callin_map); 282 cpu_set(cpuid, cpu_callin_map);
280} 283}
281 284
285static int __cpuinitdata unsafe_smp;
286
282/* 287/*
283 * Activate a secondary processor. 288 * Activate a secondary processor.
284 */ 289 */
@@ -326,15 +331,22 @@ static void __cpuinit start_secondary(void *unused)
326 * for which cpus receive the IPI. Holding this 331 * for which cpus receive the IPI. Holding this
327 * lock helps us to not include this cpu in a currently in progress 332 * lock helps us to not include this cpu in a currently in progress
328 * smp_call_function(). 333 * smp_call_function().
334 *
 335 * We need to hold vector_lock so that the set of online cpus
 336 * does not change while we are assigning vectors to cpus. Holding
 337 * this lock ensures we don't half assign or remove an irq from a cpu.
329 */ 338 */
330 ipi_call_lock_irq(); 339 ipi_call_lock();
331#ifdef CONFIG_X86_IO_APIC 340 lock_vector_lock();
332 setup_vector_irq(smp_processor_id()); 341 __setup_vector_irq(smp_processor_id());
333#endif
334 cpu_set(smp_processor_id(), cpu_online_map); 342 cpu_set(smp_processor_id(), cpu_online_map);
335 ipi_call_unlock_irq(); 343 unlock_vector_lock();
344 ipi_call_unlock();
336 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 345 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
337 346
347 /* enable local interrupts */
348 local_irq_enable();
349
338 setup_secondary_clock(); 350 setup_secondary_clock();
339 351
340 wmb(); 352 wmb();
@@ -387,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c)
387 goto valid_k7; 399 goto valid_k7;
388 400
389 /* If we get here, not a certified SMP capable AMD system. */ 401 /* If we get here, not a certified SMP capable AMD system. */
390 add_taint(TAINT_UNSAFE_SMP); 402 unsafe_smp = 1;
391 } 403 }
392 404
393valid_k7: 405valid_k7:
@@ -404,12 +416,10 @@ static void __cpuinit smp_checks(void)
404 * Don't taint if we are running SMP kernel on a single non-MP 416 * Don't taint if we are running SMP kernel on a single non-MP
405 * approved Athlon 417 * approved Athlon
406 */ 418 */
407 if (tainted & TAINT_UNSAFE_SMP) { 419 if (unsafe_smp && num_online_cpus() > 1) {
408 if (num_online_cpus()) 420 printk(KERN_INFO "WARNING: This combination of AMD"
409 printk(KERN_INFO "WARNING: This combination of AMD" 421 "processors is not suitable for SMP.\n");
410 "processors is not suitable for SMP.\n"); 422 add_taint(TAINT_UNSAFE_SMP);
411 else
412 tainted &= ~TAINT_UNSAFE_SMP;
413 } 423 }
414} 424}
415 425
@@ -438,7 +448,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
438 cpu_set(cpu, cpu_sibling_setup_map); 448 cpu_set(cpu, cpu_sibling_setup_map);
439 449
440 if (smp_num_siblings > 1) { 450 if (smp_num_siblings > 1) {
441 for_each_cpu_mask(i, cpu_sibling_setup_map) { 451 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
442 if (c->phys_proc_id == cpu_data(i).phys_proc_id && 452 if (c->phys_proc_id == cpu_data(i).phys_proc_id &&
443 c->cpu_core_id == cpu_data(i).cpu_core_id) { 453 c->cpu_core_id == cpu_data(i).cpu_core_id) {
444 cpu_set(i, per_cpu(cpu_sibling_map, cpu)); 454 cpu_set(i, per_cpu(cpu_sibling_map, cpu));
@@ -461,7 +471,7 @@ void __cpuinit set_cpu_sibling_map(int cpu)
461 return; 471 return;
462 } 472 }
463 473
464 for_each_cpu_mask(i, cpu_sibling_setup_map) { 474 for_each_cpu_mask_nr(i, cpu_sibling_setup_map) {
465 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 475 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
466 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 476 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
467 cpu_set(i, c->llc_shared_map); 477 cpu_set(i, c->llc_shared_map);
@@ -513,7 +523,7 @@ static void impress_friends(void)
513 /* 523 /*
514 * Allow the user to impress friends. 524 * Allow the user to impress friends.
515 */ 525 */
516 Dprintk("Before bogomips.\n"); 526 pr_debug("Before bogomips.\n");
517 for_each_possible_cpu(cpu) 527 for_each_possible_cpu(cpu)
518 if (cpu_isset(cpu, cpu_callout_map)) 528 if (cpu_isset(cpu, cpu_callout_map))
519 bogosum += cpu_data(cpu).loops_per_jiffy; 529 bogosum += cpu_data(cpu).loops_per_jiffy;
@@ -523,7 +533,7 @@ static void impress_friends(void)
523 bogosum/(500000/HZ), 533 bogosum/(500000/HZ),
524 (bogosum/(5000/HZ))%100); 534 (bogosum/(5000/HZ))%100);
525 535
526 Dprintk("Before bogocount - setting activated=1.\n"); 536 pr_debug("Before bogocount - setting activated=1.\n");
527} 537}
528 538
529static inline void __inquire_remote_apic(int apicid) 539static inline void __inquire_remote_apic(int apicid)
@@ -533,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid)
533 int timeout; 543 int timeout;
534 u32 status; 544 u32 status;
535 545
536 printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); 546 printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid);
537 547
538 for (i = 0; i < ARRAY_SIZE(regs); i++) { 548 for (i = 0; i < ARRAY_SIZE(regs); i++) {
539 printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); 549 printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]);
540 550
541 /* 551 /*
542 * Wait for idle. 552 * Wait for idle.
@@ -546,8 +556,7 @@ static inline void __inquire_remote_apic(int apicid)
546 printk(KERN_CONT 556 printk(KERN_CONT
547 "a previous APIC delivery may have failed\n"); 557 "a previous APIC delivery may have failed\n");
548 558
549 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); 559 apic_icr_write(APIC_DM_REMRD | regs[i], apicid);
550 apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]);
551 560
552 timeout = 0; 561 timeout = 0;
553 do { 562 do {
@@ -579,29 +588,24 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip)
579 int maxlvt; 588 int maxlvt;
580 589
581 /* Target chip */ 590 /* Target chip */
582 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid));
583
584 /* Boot on the stack */ 591 /* Boot on the stack */
585 /* Kick the second */ 592 /* Kick the second */
586 apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); 593 apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid);
587 594
588 Dprintk("Waiting for send to finish...\n"); 595 pr_debug("Waiting for send to finish...\n");
589 send_status = safe_apic_wait_icr_idle(); 596 send_status = safe_apic_wait_icr_idle();
590 597
591 /* 598 /*
592 * Give the other CPU some time to accept the IPI. 599 * Give the other CPU some time to accept the IPI.
593 */ 600 */
594 udelay(200); 601 udelay(200);
595 /* 602 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
596 * Due to the Pentium erratum 3AP. 603 maxlvt = lapic_get_maxlvt();
597 */ 604 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
598 maxlvt = lapic_get_maxlvt(); 605 apic_write(APIC_ESR, 0);
599 if (maxlvt > 3) { 606 accept_status = (apic_read(APIC_ESR) & 0xEF);
600 apic_read_around(APIC_SPIV);
601 apic_write(APIC_ESR, 0);
602 } 607 }
603 accept_status = (apic_read(APIC_ESR) & 0xEF); 608 pr_debug("NMI sent.\n");
604 Dprintk("NMI sent.\n");
605 609
606 if (send_status) 610 if (send_status)
607 printk(KERN_ERR "APIC never delivered???\n"); 611 printk(KERN_ERR "APIC never delivered???\n");
@@ -625,42 +629,40 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
625 return send_status; 629 return send_status;
626 } 630 }
627 631
632 maxlvt = lapic_get_maxlvt();
633
628 /* 634 /*
629 * Be paranoid about clearing APIC errors. 635 * Be paranoid about clearing APIC errors.
630 */ 636 */
631 if (APIC_INTEGRATED(apic_version[phys_apicid])) { 637 if (APIC_INTEGRATED(apic_version[phys_apicid])) {
632 apic_read_around(APIC_SPIV); 638 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
633 apic_write(APIC_ESR, 0); 639 apic_write(APIC_ESR, 0);
634 apic_read(APIC_ESR); 640 apic_read(APIC_ESR);
635 } 641 }
636 642
637 Dprintk("Asserting INIT.\n"); 643 pr_debug("Asserting INIT.\n");
638 644
639 /* 645 /*
640 * Turn INIT on target chip 646 * Turn INIT on target chip
641 */ 647 */
642 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
643
644 /* 648 /*
645 * Send IPI 649 * Send IPI
646 */ 650 */
647 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT 651 apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT,
648 | APIC_DM_INIT); 652 phys_apicid);
649 653
650 Dprintk("Waiting for send to finish...\n"); 654 pr_debug("Waiting for send to finish...\n");
651 send_status = safe_apic_wait_icr_idle(); 655 send_status = safe_apic_wait_icr_idle();
652 656
653 mdelay(10); 657 mdelay(10);
654 658
655 Dprintk("Deasserting INIT.\n"); 659 pr_debug("Deasserting INIT.\n");
656 660
657 /* Target chip */ 661 /* Target chip */
658 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
659
660 /* Send IPI */ 662 /* Send IPI */
661 apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); 663 apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid);
662 664
663 Dprintk("Waiting for send to finish...\n"); 665 pr_debug("Waiting for send to finish...\n");
664 send_status = safe_apic_wait_icr_idle(); 666 send_status = safe_apic_wait_icr_idle();
665 667
666 mb(); 668 mb();
@@ -687,55 +689,46 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip)
687 /* 689 /*
688 * Run STARTUP IPI loop. 690 * Run STARTUP IPI loop.
689 */ 691 */
690 Dprintk("#startup loops: %d.\n", num_starts); 692 pr_debug("#startup loops: %d.\n", num_starts);
691
692 maxlvt = lapic_get_maxlvt();
693 693
694 for (j = 1; j <= num_starts; j++) { 694 for (j = 1; j <= num_starts; j++) {
695 Dprintk("Sending STARTUP #%d.\n", j); 695 pr_debug("Sending STARTUP #%d.\n", j);
696 apic_read_around(APIC_SPIV); 696 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
697 apic_write(APIC_ESR, 0); 697 apic_write(APIC_ESR, 0);
698 apic_read(APIC_ESR); 698 apic_read(APIC_ESR);
699 Dprintk("After apic_write.\n"); 699 pr_debug("After apic_write.\n");
700 700
701 /* 701 /*
702 * STARTUP IPI 702 * STARTUP IPI
703 */ 703 */
704 704
705 /* Target chip */ 705 /* Target chip */
706 apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid));
707
708 /* Boot on the stack */ 706 /* Boot on the stack */
709 /* Kick the second */ 707 /* Kick the second */
710 apic_write_around(APIC_ICR, APIC_DM_STARTUP 708 apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12),
711 | (start_eip >> 12)); 709 phys_apicid);
712 710
713 /* 711 /*
714 * Give the other CPU some time to accept the IPI. 712 * Give the other CPU some time to accept the IPI.
715 */ 713 */
716 udelay(300); 714 udelay(300);
717 715
718 Dprintk("Startup point 1.\n"); 716 pr_debug("Startup point 1.\n");
719 717
720 Dprintk("Waiting for send to finish...\n"); 718 pr_debug("Waiting for send to finish...\n");
721 send_status = safe_apic_wait_icr_idle(); 719 send_status = safe_apic_wait_icr_idle();
722 720
723 /* 721 /*
724 * Give the other CPU some time to accept the IPI. 722 * Give the other CPU some time to accept the IPI.
725 */ 723 */
726 udelay(200); 724 udelay(200);
727 /* 725 if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */
728 * Due to the Pentium erratum 3AP.
729 */
730 if (maxlvt > 3) {
731 apic_read_around(APIC_SPIV);
732 apic_write(APIC_ESR, 0); 726 apic_write(APIC_ESR, 0);
733 }
734 accept_status = (apic_read(APIC_ESR) & 0xEF); 727 accept_status = (apic_read(APIC_ESR) & 0xEF);
735 if (send_status || accept_status) 728 if (send_status || accept_status)
736 break; 729 break;
737 } 730 }
738 Dprintk("After Startup.\n"); 731 pr_debug("After Startup.\n");
739 732
740 if (send_status) 733 if (send_status)
741 printk(KERN_ERR "APIC never delivered???\n"); 734 printk(KERN_ERR "APIC never delivered???\n");
@@ -763,12 +756,20 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
763} 756}
764 757
765#ifdef CONFIG_X86_64 758#ifdef CONFIG_X86_64
759
760/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */
761static void __ref free_bootmem_pda(struct x8664_pda *oldpda)
762{
763 if (!after_bootmem)
764 free_bootmem((unsigned long)oldpda, sizeof(*oldpda));
765}
766
766/* 767/*
767 * Allocate node local memory for the AP pda. 768 * Allocate node local memory for the AP pda.
768 * 769 *
769 * Must be called after the _cpu_pda pointer table is initialized. 770 * Must be called after the _cpu_pda pointer table is initialized.
770 */ 771 */
771static int __cpuinit get_local_pda(int cpu) 772int __cpuinit get_local_pda(int cpu)
772{ 773{
773 struct x8664_pda *oldpda, *newpda; 774 struct x8664_pda *oldpda, *newpda;
774 unsigned long size = sizeof(struct x8664_pda); 775 unsigned long size = sizeof(struct x8664_pda);
@@ -791,8 +792,7 @@ static int __cpuinit get_local_pda(int cpu)
791 792
792 if (oldpda) { 793 if (oldpda) {
793 memcpy(newpda, oldpda, size); 794 memcpy(newpda, oldpda, size);
794 if (!after_bootmem) 795 free_bootmem_pda(oldpda);
795 free_bootmem((unsigned long)oldpda, size);
796 } 796 }
797 797
798 newpda->in_bootmem = 0; 798 newpda->in_bootmem = 0;
@@ -874,7 +874,7 @@ do_rest:
874 start_ip = setup_trampoline(); 874 start_ip = setup_trampoline();
875 875
876 /* So we see what's up */ 876 /* So we see what's up */
877 printk(KERN_INFO "Booting processor %d/%d ip %lx\n", 877 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n",
878 cpu, apicid, start_ip); 878 cpu, apicid, start_ip);
879 879
880 /* 880 /*
@@ -886,16 +886,18 @@ do_rest:
886 886
887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 887 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
888 888
889 Dprintk("Setting warm reset code and vector.\n"); 889 pr_debug("Setting warm reset code and vector.\n");
890 890
891 store_NMI_vector(&nmi_high, &nmi_low); 891 store_NMI_vector(&nmi_high, &nmi_low);
892 892
893 smpboot_setup_warm_reset_vector(start_ip); 893 smpboot_setup_warm_reset_vector(start_ip);
894 /* 894 /*
895 * Be paranoid about clearing APIC errors. 895 * Be paranoid about clearing APIC errors.
896 */ 896 */
897 apic_write(APIC_ESR, 0); 897 if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) {
898 apic_read(APIC_ESR); 898 apic_write(APIC_ESR, 0);
899 apic_read(APIC_ESR);
900 }
899 } 901 }
900 902
901 /* 903 /*
@@ -907,9 +909,9 @@ do_rest:
907 /* 909 /*
908 * allow APs to start initializing. 910 * allow APs to start initializing.
909 */ 911 */
910 Dprintk("Before Callout %d.\n", cpu); 912 pr_debug("Before Callout %d.\n", cpu);
911 cpu_set(cpu, cpu_callout_map); 913 cpu_set(cpu, cpu_callout_map);
912 Dprintk("After Callout %d.\n", cpu); 914 pr_debug("After Callout %d.\n", cpu);
913 915
914 /* 916 /*
915 * Wait 5s total for a response 917 * Wait 5s total for a response
@@ -922,10 +924,10 @@ do_rest:
922 924
923 if (cpu_isset(cpu, cpu_callin_map)) { 925 if (cpu_isset(cpu, cpu_callin_map)) {
924 /* number CPUs logically, starting from 1 (BSP is 0) */ 926 /* number CPUs logically, starting from 1 (BSP is 0) */
925 Dprintk("OK.\n"); 927 pr_debug("OK.\n");
926 printk(KERN_INFO "CPU%d: ", cpu); 928 printk(KERN_INFO "CPU%d: ", cpu);
927 print_cpu_info(&cpu_data(cpu)); 929 print_cpu_info(&cpu_data(cpu));
928 Dprintk("CPU has booted.\n"); 930 pr_debug("CPU has booted.\n");
929 } else { 931 } else {
930 boot_error = 1; 932 boot_error = 1;
931 if (*((volatile unsigned char *)trampoline_base) 933 if (*((volatile unsigned char *)trampoline_base)
@@ -970,7 +972,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
970 972
971 WARN_ON(irqs_disabled()); 973 WARN_ON(irqs_disabled());
972 974
973 Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); 975 pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu);
974 976
975 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || 977 if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid ||
976 !physid_isset(apicid, phys_cpu_present_map)) { 978 !physid_isset(apicid, phys_cpu_present_map)) {
@@ -982,7 +984,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
982 * Already booted CPU? 984 * Already booted CPU?
983 */ 985 */
984 if (cpu_isset(cpu, cpu_callin_map)) { 986 if (cpu_isset(cpu, cpu_callin_map)) {
985 Dprintk("do_boot_cpu %d Already started\n", cpu); 987 pr_debug("do_boot_cpu %d Already started\n", cpu);
986 return -ENOSYS; 988 return -ENOSYS;
987 } 989 }
988 990
@@ -1009,7 +1011,7 @@ int __cpuinit native_cpu_up(unsigned int cpu)
1009 err = do_boot_cpu(apicid, cpu); 1011 err = do_boot_cpu(apicid, cpu);
1010#endif 1012#endif
1011 if (err) { 1013 if (err) {
1012 Dprintk("do_boot_cpu failed %d\n", err); 1014 pr_debug("do_boot_cpu failed %d\n", err);
1013 return -EIO; 1015 return -EIO;
1014 } 1016 }
1015 1017
@@ -1055,6 +1057,34 @@ static __init void disable_smp(void)
1055static int __init smp_sanity_check(unsigned max_cpus) 1057static int __init smp_sanity_check(unsigned max_cpus)
1056{ 1058{
1057 preempt_disable(); 1059 preempt_disable();
1060
1061#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32)
1062 if (def_to_bigsmp && nr_cpu_ids > 8) {
1063 unsigned int cpu;
1064 unsigned nr;
1065
1066 printk(KERN_WARNING
1067 "More than 8 CPUs detected - skipping them.\n"
1068 "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n");
1069
1070 nr = 0;
1071 for_each_present_cpu(cpu) {
1072 if (nr >= 8)
1073 cpu_clear(cpu, cpu_present_map);
1074 nr++;
1075 }
1076
1077 nr = 0;
1078 for_each_possible_cpu(cpu) {
1079 if (nr >= 8)
1080 cpu_clear(cpu, cpu_possible_map);
1081 nr++;
1082 }
1083
1084 nr_cpu_ids = 8;
1085 }
1086#endif
1087
1058 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { 1088 if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) {
1059 printk(KERN_WARNING "weird, boot CPU (#%d) not listed" 1089 printk(KERN_WARNING "weird, boot CPU (#%d) not listed"
1060 "by the BIOS.\n", hard_smp_processor_id()); 1090 "by the BIOS.\n", hard_smp_processor_id());
@@ -1147,10 +1177,17 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1147 * Setup boot CPU information 1177 * Setup boot CPU information
1148 */ 1178 */
1149 smp_store_cpu_info(0); /* Final full version of the data */ 1179 smp_store_cpu_info(0); /* Final full version of the data */
1180#ifdef CONFIG_X86_32
1150 boot_cpu_logical_apicid = logical_smp_processor_id(); 1181 boot_cpu_logical_apicid = logical_smp_processor_id();
1182#endif
1151 current_thread_info()->cpu = 0; /* needed? */ 1183 current_thread_info()->cpu = 0; /* needed? */
1152 set_cpu_sibling_map(0); 1184 set_cpu_sibling_map(0);
1153 1185
1186#ifdef CONFIG_X86_64
1187 enable_IR_x2apic();
1188 setup_apic_routing();
1189#endif
1190
1154 if (smp_sanity_check(max_cpus) < 0) { 1191 if (smp_sanity_check(max_cpus) < 0) {
1155 printk(KERN_INFO "SMP disabled\n"); 1192 printk(KERN_INFO "SMP disabled\n");
1156 disable_smp(); 1193 disable_smp();
@@ -1158,9 +1195,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1158 } 1195 }
1159 1196
1160 preempt_disable(); 1197 preempt_disable();
1161 if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { 1198 if (read_apic_id() != boot_cpu_physical_apicid) {
1162 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1199 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
1163 GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); 1200 read_apic_id(), boot_cpu_physical_apicid);
1164 /* Or can we switch back to PIC here? */ 1201 /* Or can we switch back to PIC here? */
1165 } 1202 }
1166 preempt_enable(); 1203 preempt_enable();
@@ -1193,6 +1230,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1193 printk(KERN_INFO "CPU%d: ", 0); 1230 printk(KERN_INFO "CPU%d: ", 0);
1194 print_cpu_info(&cpu_data(0)); 1231 print_cpu_info(&cpu_data(0));
1195 setup_boot_clock(); 1232 setup_boot_clock();
1233
1234 if (is_uv_system())
1235 uv_system_init();
1196out: 1236out:
1197 preempt_enable(); 1237 preempt_enable();
1198} 1238}
@@ -1213,7 +1253,7 @@ void __init native_smp_prepare_boot_cpu(void)
1213 1253
1214void __init native_smp_cpus_done(unsigned int max_cpus) 1254void __init native_smp_cpus_done(unsigned int max_cpus)
1215{ 1255{
1216 Dprintk("Boot done.\n"); 1256 pr_debug("Boot done.\n");
1217 1257
1218 impress_friends(); 1258 impress_friends();
1219 smp_checks(); 1259 smp_checks();
@@ -1223,39 +1263,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1223 check_nmi_watchdog(); 1263 check_nmi_watchdog();
1224} 1264}
1225 1265
1226#ifdef CONFIG_HOTPLUG_CPU
1227
1228static void remove_siblinginfo(int cpu)
1229{
1230 int sibling;
1231 struct cpuinfo_x86 *c = &cpu_data(cpu);
1232
1233 for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) {
1234 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1235                /*
1236 * last thread sibling in this cpu core going down
1237 */
1238 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1239 cpu_data(sibling).booted_cores--;
1240 }
1241
1242 for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu))
1243 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1244 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1245 cpus_clear(per_cpu(cpu_core_map, cpu));
1246 c->phys_proc_id = 0;
1247 c->cpu_core_id = 0;
1248 cpu_clear(cpu, cpu_sibling_setup_map);
1249}
1250
1251static int additional_cpus __initdata = -1;
1252
1253static __init int setup_additional_cpus(char *s)
1254{
1255 return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL;
1256}
1257early_param("additional_cpus", setup_additional_cpus);
1258
1259/* 1266/*
1260 * cpu_possible_map should be static, it cannot change as cpu's        1267 * cpu_possible_map should be static, it cannot change as cpu's
1261 * are onlined, or offlined. The reason is per-cpu data-structures        1268 * are onlined, or offlined. The reason is per-cpu data-structures
@@ -1275,24 +1282,13 @@ early_param("additional_cpus", setup_additional_cpus);
1275 */ 1282 */
1276__init void prefill_possible_map(void) 1283__init void prefill_possible_map(void)
1277{ 1284{
1278 int i; 1285 int i, possible;
1279 int possible;
1280 1286
1281 /* no processor from mptable or madt */ 1287 /* no processor from mptable or madt */
1282 if (!num_processors) 1288 if (!num_processors)
1283 num_processors = 1; 1289 num_processors = 1;
1284 1290
1285#ifdef CONFIG_HOTPLUG_CPU 1291 possible = num_processors + disabled_cpus;
1286 if (additional_cpus == -1) {
1287 if (disabled_cpus > 0)
1288 additional_cpus = disabled_cpus;
1289 else
1290 additional_cpus = 0;
1291 }
1292#else
1293 additional_cpus = 0;
1294#endif
1295 possible = num_processors + additional_cpus;
1296 if (possible > NR_CPUS) 1292 if (possible > NR_CPUS)
1297 possible = NR_CPUS; 1293 possible = NR_CPUS;
1298 1294
@@ -1305,17 +1301,64 @@ __init void prefill_possible_map(void)
1305 nr_cpu_ids = possible; 1301 nr_cpu_ids = possible;
1306} 1302}
1307 1303
1304#ifdef CONFIG_HOTPLUG_CPU
1305
1306static void remove_siblinginfo(int cpu)
1307{
1308 int sibling;
1309 struct cpuinfo_x86 *c = &cpu_data(cpu);
1310
1311 for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) {
1312 cpu_clear(cpu, per_cpu(cpu_core_map, sibling));
1313                /*
1314 * last thread sibling in this cpu core going down
1315 */
1316 if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1)
1317 cpu_data(sibling).booted_cores--;
1318 }
1319
1320 for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu))
1321 cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling));
1322 cpus_clear(per_cpu(cpu_sibling_map, cpu));
1323 cpus_clear(per_cpu(cpu_core_map, cpu));
1324 c->phys_proc_id = 0;
1325 c->cpu_core_id = 0;
1326 cpu_clear(cpu, cpu_sibling_setup_map);
1327}
1328
1308static void __ref remove_cpu_from_maps(int cpu) 1329static void __ref remove_cpu_from_maps(int cpu)
1309{ 1330{
1310 cpu_clear(cpu, cpu_online_map); 1331 cpu_clear(cpu, cpu_online_map);
1311 cpu_clear(cpu, cpu_callout_map); 1332 cpu_clear(cpu, cpu_callout_map);
1312 cpu_clear(cpu, cpu_callin_map); 1333 cpu_clear(cpu, cpu_callin_map);
1313 /* was set by cpu_init() */ 1334 /* was set by cpu_init() */
1314 clear_bit(cpu, (unsigned long *)&cpu_initialized); 1335 cpu_clear(cpu, cpu_initialized);
1315 numa_remove_cpu(cpu); 1336 numa_remove_cpu(cpu);
1316} 1337}
1317 1338
1318int __cpu_disable(void) 1339void cpu_disable_common(void)
1340{
1341 int cpu = smp_processor_id();
1342 /*
1343 * HACK:
1344 * Allow any queued timer interrupts to get serviced
1345         * This is only a temporary solution until we clean up
1346 * fixup_irqs as we do for IA64.
1347 */
1348 local_irq_enable();
1349 mdelay(1);
1350
1351 local_irq_disable();
1352 remove_siblinginfo(cpu);
1353
1354 /* It's now safe to remove this processor from the online map */
1355 lock_vector_lock();
1356 remove_cpu_from_maps(cpu);
1357 unlock_vector_lock();
1358 fixup_irqs(cpu_online_map);
1359}
1360
1361int native_cpu_disable(void)
1319{ 1362{
1320 int cpu = smp_processor_id(); 1363 int cpu = smp_processor_id();
1321 1364
@@ -1334,25 +1377,11 @@ int __cpu_disable(void)
1334 stop_apic_nmi_watchdog(NULL); 1377 stop_apic_nmi_watchdog(NULL);
1335 clear_local_APIC(); 1378 clear_local_APIC();
1336 1379
1337 /* 1380 cpu_disable_common();
1338 * HACK:
1339 * Allow any queued timer interrupts to get serviced
1340 * This is only a temporary solution until we cleanup
1341 * fixup_irqs as we do for IA64.
1342 */
1343 local_irq_enable();
1344 mdelay(1);
1345
1346 local_irq_disable();
1347 remove_siblinginfo(cpu);
1348
1349 /* It's now safe to remove this processor from the online map */
1350 remove_cpu_from_maps(cpu);
1351 fixup_irqs(cpu_online_map);
1352 return 0; 1381 return 0;
1353} 1382}
1354 1383
1355void __cpu_die(unsigned int cpu) 1384void native_cpu_die(unsigned int cpu)
1356{ 1385{
1357 /* We don't do anything here: idle task is faking death itself. */ 1386 /* We don't do anything here: idle task is faking death itself. */
1358 unsigned int i; 1387 unsigned int i;
@@ -1369,28 +1398,45 @@ void __cpu_die(unsigned int cpu)
1369 } 1398 }
1370 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1399 printk(KERN_ERR "CPU %u didn't die...\n", cpu);
1371} 1400}
1401
1402void play_dead_common(void)
1403{
1404 idle_task_exit();
1405 reset_lazy_tlbstate();
1406 irq_ctx_exit(raw_smp_processor_id());
1407 c1e_remove_cpu(raw_smp_processor_id());
1408
1409 mb();
1410 /* Ack it */
1411 __get_cpu_var(cpu_state) = CPU_DEAD;
1412
1413 /*
1414 * With physical CPU hotplug, we should halt the cpu
1415 */
1416 local_irq_disable();
1417}
1418
1419void native_play_dead(void)
1420{
1421 play_dead_common();
1422 wbinvd_halt();
1423}
1424
1372#else /* ... !CONFIG_HOTPLUG_CPU */ 1425#else /* ... !CONFIG_HOTPLUG_CPU */
1373int __cpu_disable(void) 1426int native_cpu_disable(void)
1374{ 1427{
1375 return -ENOSYS; 1428 return -ENOSYS;
1376} 1429}
1377 1430
1378void __cpu_die(unsigned int cpu) 1431void native_cpu_die(unsigned int cpu)
1379{ 1432{
1380 /* We said "no" in __cpu_disable */ 1433 /* We said "no" in __cpu_disable */
1381 BUG(); 1434 BUG();
1382} 1435}
1383#endif
1384 1436
1385/* 1437void native_play_dead(void)
1386 * If the BIOS enumerates physical processors before logical,
1387 * maxcpus=N at enumeration-time can be used to disable HT.
1388 */
1389static int __init parse_maxcpus(char *arg)
1390{ 1438{
1391 extern unsigned int maxcpus; 1439 BUG();
1392
1393 maxcpus = simple_strtoul(arg, NULL, 0);
1394 return 0;
1395} 1440}
1396early_param("maxcpus", parse_maxcpus); 1441
1442#endif
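The reworked prefill_possible_map() above drops the additional_cpus= handling and sizes cpu_possible_map with a fixed rule: detected processors plus firmware-disabled ones, clamped to NR_CPUS. A minimal stand-alone sketch of that sizing rule, in user-space C rather than kernel code (the NR_CPUS value here is purely illustrative):

#include <stdio.h>

#define NR_CPUS 8	/* illustrative build-time limit, not the kernel's */

static int possible_cpus(int num_processors, int disabled_cpus)
{
	int possible;

	if (!num_processors)		/* no processor from MP table or MADT */
		num_processors = 1;

	possible = num_processors + disabled_cpus;
	if (possible > NR_CPUS)
		possible = NR_CPUS;

	return possible;
}

int main(void)
{
	printf("4 present, 2 disabled  -> %d possible\n", possible_cpus(4, 2));
	printf("12 present, 0 disabled -> %d possible\n", possible_cpus(12, 0));
	return 0;
}

Sizing the map once at boot keeps the per-cpu data structures compact while still leaving room for CPUs the firmware reported as disabled.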
diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c
index 99941b37eca0..397e309839dd 100644
--- a/arch/x86/kernel/smpcommon.c
+++ b/arch/x86/kernel/smpcommon.c
@@ -8,18 +8,21 @@
8DEFINE_PER_CPU(unsigned long, this_cpu_off); 8DEFINE_PER_CPU(unsigned long, this_cpu_off);
9EXPORT_PER_CPU_SYMBOL(this_cpu_off); 9EXPORT_PER_CPU_SYMBOL(this_cpu_off);
10 10
11/* Initialize the CPU's GDT. This is either the boot CPU doing itself 11/*
12 (still using the master per-cpu area), or a CPU doing it for a 12 * Initialize the CPU's GDT. This is either the boot CPU doing itself
13 secondary which will soon come up. */ 13 * (still using the master per-cpu area), or a CPU doing it for a
14 * secondary which will soon come up.
15 */
14__cpuinit void init_gdt(int cpu) 16__cpuinit void init_gdt(int cpu)
15{ 17{
16 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 18 struct desc_struct gdt;
17 19
18 pack_descriptor(&gdt[GDT_ENTRY_PERCPU], 20 pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF,
19 __per_cpu_offset[cpu], 0xFFFFF,
20 0x2 | DESCTYPE_S, 0x8); 21 0x2 | DESCTYPE_S, 0x8);
22 gdt.s = 1;
21 23
22 gdt[GDT_ENTRY_PERCPU].s = 1; 24 write_gdt_entry(get_cpu_gdt_table(cpu),
25 GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S);
23 26
24 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; 27 per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu];
25 per_cpu(cpu_number, cpu) = cpu; 28 per_cpu(cpu_number, cpu) = cpu;
diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c
deleted file mode 100644
index 8b137891791f..000000000000
--- a/arch/x86/kernel/smpcommon_32.c
+++ /dev/null
@@ -1 +0,0 @@
1
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 92c20fee6781..e8b9863ef8c4 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -105,6 +105,20 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
105static int enable_single_step(struct task_struct *child) 105static int enable_single_step(struct task_struct *child)
106{ 106{
107 struct pt_regs *regs = task_pt_regs(child); 107 struct pt_regs *regs = task_pt_regs(child);
108 unsigned long oflags;
109
110 /*
111 * If we stepped into a sysenter/syscall insn, it trapped in
112 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
113 * If user-mode had set TF itself, then it's still clear from
114 * do_debug() and we need to set it again to restore the user
115 * state so we don't wrongly set TIF_FORCED_TF below.
116 * If enable_single_step() was used last and that is what
117 * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are
118 * already set and our bookkeeping is fine.
119 */
120 if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP)))
121 regs->flags |= X86_EFLAGS_TF;
108 122
109 /* 123 /*
110 * Always set TIF_SINGLESTEP - this guarantees that 124 * Always set TIF_SINGLESTEP - this guarantees that
@@ -113,11 +127,7 @@ static int enable_single_step(struct task_struct *child)
113 */ 127 */
114 set_tsk_thread_flag(child, TIF_SINGLESTEP); 128 set_tsk_thread_flag(child, TIF_SINGLESTEP);
115 129
116 /* 130 oflags = regs->flags;
117 * If TF was already set, don't do anything else
118 */
119 if (regs->flags & X86_EFLAGS_TF)
120 return 0;
121 131
122 /* Set TF on the kernel stack.. */ 132 /* Set TF on the kernel stack.. */
123 regs->flags |= X86_EFLAGS_TF; 133 regs->flags |= X86_EFLAGS_TF;
@@ -126,9 +136,22 @@ static int enable_single_step(struct task_struct *child)
126 * ..but if TF is changed by the instruction we will trace, 136 * ..but if TF is changed by the instruction we will trace,
127 * don't mark it as being "us" that set it, so that we 137 * don't mark it as being "us" that set it, so that we
128 * won't clear it by hand later. 138 * won't clear it by hand later.
139 *
140 * Note that if we don't actually execute the popf because
141 * of a signal arriving right now or suchlike, we will lose
142 * track of the fact that it really was "us" that set it.
129 */ 143 */
130 if (is_setting_trap_flag(child, regs)) 144 if (is_setting_trap_flag(child, regs)) {
145 clear_tsk_thread_flag(child, TIF_FORCED_TF);
131 return 0; 146 return 0;
147 }
148
149 /*
150 * If TF was already set, check whether it was us who set it.
151 * If not, we should never attempt a block step.
152 */
153 if (oflags & X86_EFLAGS_TF)
154 return test_tsk_thread_flag(child, TIF_FORCED_TF);
132 155
133 set_tsk_thread_flag(child, TIF_FORCED_TF); 156 set_tsk_thread_flag(child, TIF_FORCED_TF);
134 157
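The enable_single_step() rework above distinguishes three cases: the traced instruction itself rewrites TF (so TIF_FORCED_TF must be dropped), TF was already set before we touched it (a block step is safe only if it was ours), or TF was clear and the kernel now owns it. A small user-space model of that decision; it assumes the caller uses the return value to decide whether block stepping may be enabled, which happens outside this hunk:

#include <stdbool.h>
#include <stdio.h>

struct step_state {
	bool tf;		/* X86_EFLAGS_TF in regs->flags */
	bool forced_tf;		/* TIF_FORCED_TF bookkeeping    */
};

/* Returns nonzero when block stepping may be used afterwards. */
static int enable_step(struct step_state *s, bool insn_sets_tf)
{
	bool oflags_tf = s->tf;

	s->tf = true;			/* always arm TF for the step    */

	if (insn_sets_tf) {		/* popf/iret will rewrite TF,    */
		s->forced_tf = false;	/* so it is not "ours" any more  */
		return 0;
	}
	if (oflags_tf)			/* TF already set: block-step    */
		return s->forced_tf;	/* only if we had set it earlier */

	s->forced_tf = true;		/* we set TF, remember that      */
	return 1;
}

int main(void)
{
	struct step_state user_owned = { .tf = true,  .forced_tf = false };
	struct step_state fresh      = { .tf = false, .forced_tf = false };

	printf("user already had TF -> block step %sallowed\n",
	       enable_step(&user_owned, false) ? "" : "not ");
	printf("TF was clear        -> block step %sallowed\n",
	       enable_step(&fresh, false) ? "" : "not ");
	return 0;
}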
diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c
index d67ce5f044ba..7b987852e876 100644
--- a/arch/x86/kernel/summit_32.c
+++ b/arch/x86/kernel/summit_32.c
@@ -30,7 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <asm/io.h> 31#include <asm/io.h>
32#include <asm/bios_ebda.h> 32#include <asm/bios_ebda.h>
33#include <asm/mach-summit/mach_mpparse.h> 33#include <asm/summit/mpparse.h>
34 34
35static struct rio_table_hdr *rio_table_hdr __initdata; 35static struct rio_table_hdr *rio_table_hdr __initdata;
36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; 36static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata;
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 7066cb855a60..1884a8d12bfa 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -22,6 +22,8 @@
22#include <linux/uaccess.h> 22#include <linux/uaccess.h>
23#include <linux/unistd.h> 23#include <linux/unistd.h>
24 24
25#include <asm/syscalls.h>
26
25asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, 27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
26 unsigned long prot, unsigned long flags, 28 unsigned long prot, unsigned long flags,
27 unsigned long fd, unsigned long pgoff) 29 unsigned long fd, unsigned long pgoff)
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 3b360ef33817..6bc211accf08 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -13,15 +13,17 @@
13#include <linux/utsname.h> 13#include <linux/utsname.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/random.h> 15#include <linux/random.h>
16#include <linux/uaccess.h>
16 17
17#include <asm/uaccess.h>
18#include <asm/ia32.h> 18#include <asm/ia32.h>
19#include <asm/syscalls.h>
19 20
20asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, 21asmlinkage long sys_mmap(unsigned long addr, unsigned long len,
21 unsigned long fd, unsigned long off) 22 unsigned long prot, unsigned long flags,
23 unsigned long fd, unsigned long off)
22{ 24{
23 long error; 25 long error;
24 struct file * file; 26 struct file *file;
25 27
26 error = -EINVAL; 28 error = -EINVAL;
27 if (off & ~PAGE_MASK) 29 if (off & ~PAGE_MASK)
@@ -56,9 +58,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
56 unmapped base down for this case. This can give 58 unmapped base down for this case. This can give
57 conflicts with the heap, but we assume that glibc 59 conflicts with the heap, but we assume that glibc
58 malloc knows how to fall back to mmap. Give it 1GB 60 malloc knows how to fall back to mmap. Give it 1GB
59 of playground for now. -AK */ 61 of playground for now. -AK */
60 *begin = 0x40000000; 62 *begin = 0x40000000;
61 *end = 0x80000000; 63 *end = 0x80000000;
62 if (current->flags & PF_RANDOMIZE) { 64 if (current->flags & PF_RANDOMIZE) {
63 new_begin = randomize_range(*begin, *begin + 0x02000000, 0); 65 new_begin = randomize_range(*begin, *begin + 0x02000000, 0);
64 if (new_begin) 66 if (new_begin)
@@ -66,9 +68,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin,
66 } 68 }
67 } else { 69 } else {
68 *begin = TASK_UNMAPPED_BASE; 70 *begin = TASK_UNMAPPED_BASE;
69 *end = TASK_SIZE; 71 *end = TASK_SIZE;
70 } 72 }
71} 73}
72 74
73unsigned long 75unsigned long
74arch_get_unmapped_area(struct file *filp, unsigned long addr, 76arch_get_unmapped_area(struct file *filp, unsigned long addr,
@@ -78,11 +80,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
78 struct vm_area_struct *vma; 80 struct vm_area_struct *vma;
79 unsigned long start_addr; 81 unsigned long start_addr;
80 unsigned long begin, end; 82 unsigned long begin, end;
81 83
82 if (flags & MAP_FIXED) 84 if (flags & MAP_FIXED)
83 return addr; 85 return addr;
84 86
85 find_start_end(flags, &begin, &end); 87 find_start_end(flags, &begin, &end);
86 88
87 if (len > end) 89 if (len > end)
88 return -ENOMEM; 90 return -ENOMEM;
@@ -96,12 +98,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
96 } 98 }
97 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) 99 if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32))
98 && len <= mm->cached_hole_size) { 100 && len <= mm->cached_hole_size) {
99 mm->cached_hole_size = 0; 101 mm->cached_hole_size = 0;
100 mm->free_area_cache = begin; 102 mm->free_area_cache = begin;
101 } 103 }
102 addr = mm->free_area_cache; 104 addr = mm->free_area_cache;
103 if (addr < begin) 105 if (addr < begin)
104 addr = begin; 106 addr = begin;
105 start_addr = addr; 107 start_addr = addr;
106 108
107full_search: 109full_search:
@@ -127,7 +129,7 @@ full_search:
127 return addr; 129 return addr;
128 } 130 }
129 if (addr + mm->cached_hole_size < vma->vm_start) 131 if (addr + mm->cached_hole_size < vma->vm_start)
130 mm->cached_hole_size = vma->vm_start - addr; 132 mm->cached_hole_size = vma->vm_start - addr;
131 133
132 addr = vma->vm_end; 134 addr = vma->vm_end;
133 } 135 }
@@ -177,7 +179,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
177 vma = find_vma(mm, addr-len); 179 vma = find_vma(mm, addr-len);
178 if (!vma || addr <= vma->vm_start) 180 if (!vma || addr <= vma->vm_start)
179 /* remember the address as a hint for next time */ 181 /* remember the address as a hint for next time */
180 return (mm->free_area_cache = addr-len); 182 return mm->free_area_cache = addr-len;
181 } 183 }
182 184
183 if (mm->mmap_base < len) 185 if (mm->mmap_base < len)
@@ -194,7 +196,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
194 vma = find_vma(mm, addr); 196 vma = find_vma(mm, addr);
195 if (!vma || addr+len <= vma->vm_start) 197 if (!vma || addr+len <= vma->vm_start)
196 /* remember the address as a hint for next time */ 198 /* remember the address as a hint for next time */
197 return (mm->free_area_cache = addr); 199 return mm->free_area_cache = addr;
198 200
199 /* remember the largest hole we saw so far */ 201 /* remember the largest hole we saw so far */
200 if (addr + mm->cached_hole_size < vma->vm_start) 202 if (addr + mm->cached_hole_size < vma->vm_start)
@@ -224,13 +226,13 @@ bottomup:
224} 226}
225 227
226 228
227asmlinkage long sys_uname(struct new_utsname __user * name) 229asmlinkage long sys_uname(struct new_utsname __user *name)
228{ 230{
229 int err; 231 int err;
230 down_read(&uts_sem); 232 down_read(&uts_sem);
231 err = copy_to_user(name, utsname(), sizeof (*name)); 233 err = copy_to_user(name, utsname(), sizeof(*name));
232 up_read(&uts_sem); 234 up_read(&uts_sem);
233 if (personality(current->personality) == PER_LINUX32) 235 if (personality(current->personality) == PER_LINUX32)
234 err |= copy_to_user(&name->machine, "i686", 5); 236 err |= copy_to_user(&name->machine, "i686", 5);
235 return err ? -EFAULT : 0; 237 return err ? -EFAULT : 0;
236} 238}
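find_start_end() above selects the mmap search window: MAP_32BIT requests (and 32-bit tasks) get the legacy 1 GiB region starting at 0x40000000, optionally with the start randomized within the first 32 MiB, while everything else spans TASK_UNMAPPED_BASE..TASK_SIZE. A rough user-space sketch of that selection; the TASK_* values and the random source here are placeholders, not what the kernel uses:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

/* Placeholder constants for illustration only. */
#define TASK_UNMAPPED_BASE	0x7f0000000000UL
#define TASK_SIZE		0x800000000000UL

static void find_window(bool legacy_32bit, bool randomize,
			unsigned long *begin, unsigned long *end)
{
	if (legacy_32bit) {
		*begin = 0x40000000UL;		/* 1 GiB playground ...    */
		*end   = 0x80000000UL;		/* ... ending at 2 GiB     */
		if (randomize)			/* shift start by < 32 MiB */
			*begin += (unsigned long)(rand() % 0x02000000);
	} else {
		*begin = TASK_UNMAPPED_BASE;
		*end   = TASK_SIZE;
	}
}

int main(void)
{
	unsigned long b, e;

	find_window(true, true, &b, &e);
	printf("legacy window: %#lx..%#lx\n", b, e);
	find_window(false, false, &b, &e);
	printf("64-bit window: %#lx..%#lx\n", b, e);
	return 0;
}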
diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c
index 170d43c17487..de87d6008295 100644
--- a/arch/x86/kernel/syscall_64.c
+++ b/arch/x86/kernel/syscall_64.c
@@ -8,12 +8,12 @@
8#define __NO_STUBS 8#define __NO_STUBS
9 9
10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; 10#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ;
11#undef _ASM_X86_64_UNISTD_H_ 11#undef _ASM_X86_UNISTD_64_H
12#include <asm/unistd_64.h> 12#include <asm/unistd_64.h>
13 13
14#undef __SYSCALL 14#undef __SYSCALL
15#define __SYSCALL(nr, sym) [nr] = sym, 15#define __SYSCALL(nr, sym) [nr] = sym,
16#undef _ASM_X86_64_UNISTD_H_ 16#undef _ASM_X86_UNISTD_64_H
17 17
18typedef void (*sys_call_ptr_t)(void); 18typedef void (*sys_call_ptr_t)(void);
19 19
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index adff5562f5fd..d44395ff34c3 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -326,3 +326,9 @@ ENTRY(sys_call_table)
326 .long sys_fallocate 326 .long sys_fallocate
327 .long sys_timerfd_settime /* 325 */ 327 .long sys_timerfd_settime /* 325 */
328 .long sys_timerfd_gettime 328 .long sys_timerfd_gettime
329 .long sys_signalfd4
330 .long sys_eventfd2
331 .long sys_epoll_create1
332 .long sys_dup3 /* 330 */
333 .long sys_pipe2
334 .long sys_inotify_init1
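The six entries appended to the 32-bit table above, sys_signalfd4 through sys_inotify_init1 at numbers 327 to 332, are the flags-taking variants of existing system calls. A quick user-space check of two of them through the usual glibc wrappers (assuming a glibc new enough to expose pipe2() and dup3()):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];

	if (pipe2(fds, O_CLOEXEC)) {		/* sys_pipe2, nr 331 on i386 */
		perror("pipe2");
		return 1;
	}
	if (dup3(fds[0], 10, O_CLOEXEC) < 0) {	/* sys_dup3,  nr 330 on i386 */
		perror("dup3");
		return 1;
	}
	printf("pipe2 gave %d,%d; dup3 copied %d to 10 with O_CLOEXEC set\n",
	       fds[0], fds[1], fds[0]);
	return 0;
}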
diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c
index 059ca6ee59b4..77b400f06ea2 100644
--- a/arch/x86/kernel/time_32.c
+++ b/arch/x86/kernel/time_32.c
@@ -36,6 +36,7 @@
36#include <asm/arch_hooks.h> 36#include <asm/arch_hooks.h>
37#include <asm/hpet.h> 37#include <asm/hpet.h>
38#include <asm/time.h> 38#include <asm/time.h>
39#include <asm/timer.h>
39 40
40#include "do_timer.h" 41#include "do_timer.h"
41 42
@@ -46,10 +47,9 @@ unsigned long profile_pc(struct pt_regs *regs)
46 unsigned long pc = instruction_pointer(regs); 47 unsigned long pc = instruction_pointer(regs);
47 48
48#ifdef CONFIG_SMP 49#ifdef CONFIG_SMP
49 if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && 50 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
50 in_lock_functions(pc)) {
51#ifdef CONFIG_FRAME_POINTER 51#ifdef CONFIG_FRAME_POINTER
52 return *(unsigned long *)(regs->bp + 4); 52 return *(unsigned long *)(regs->bp + sizeof(long));
53#else 53#else
54 unsigned long *sp = (unsigned long *)&regs->sp; 54 unsigned long *sp = (unsigned long *)&regs->sp;
55 55
@@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
94 94
95 do_timer_interrupt_hook(); 95 do_timer_interrupt_hook();
96 96
97#ifdef CONFIG_MCA
97 if (MCA_bus) { 98 if (MCA_bus) {
98 /* The PS/2 uses level-triggered interrupts. You can't 99 /* The PS/2 uses level-triggered interrupts. You can't
99 turn them off, nor would you want to (any attempt to 100 turn them off, nor would you want to (any attempt to
@@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id)
107 u8 irq_v = inb_p( 0x61 ); /* read the current state */ 108 u8 irq_v = inb_p( 0x61 ); /* read the current state */
108 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ 109 outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */
109 } 110 }
111#endif
110 112
111 return IRQ_HANDLED; 113 return IRQ_HANDLED;
112} 114}
@@ -129,6 +131,7 @@ void __init hpet_time_init(void)
129 */ 131 */
130void __init time_init(void) 132void __init time_init(void)
131{ 133{
134 pre_time_init_hook();
132 tsc_init(); 135 tsc_init();
133 late_time_init = choose_time_init(); 136 late_time_init = choose_time_init();
134} 137}
diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c
index e3d49c553af2..cb19d650c216 100644
--- a/arch/x86/kernel/time_64.c
+++ b/arch/x86/kernel/time_64.c
@@ -16,6 +16,7 @@
16#include <linux/interrupt.h> 16#include <linux/interrupt.h>
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/time.h> 18#include <linux/time.h>
19#include <linux/mca.h>
19 20
20#include <asm/i8253.h> 21#include <asm/i8253.h>
21#include <asm/hpet.h> 22#include <asm/hpet.h>
@@ -33,23 +34,34 @@ unsigned long profile_pc(struct pt_regs *regs)
33 /* Assume the lock function has either no stack frame or a copy 34 /* Assume the lock function has either no stack frame or a copy
34 of flags from PUSHF 35 of flags from PUSHF
35 Eflags always has bits 22 and up cleared unlike kernel addresses. */ 36 Eflags always has bits 22 and up cleared unlike kernel addresses. */
36 if (!user_mode(regs) && in_lock_functions(pc)) { 37 if (!user_mode_vm(regs) && in_lock_functions(pc)) {
38#ifdef CONFIG_FRAME_POINTER
39 return *(unsigned long *)(regs->bp + sizeof(long));
40#else
37 unsigned long *sp = (unsigned long *)regs->sp; 41 unsigned long *sp = (unsigned long *)regs->sp;
38 if (sp[0] >> 22) 42 if (sp[0] >> 22)
39 return sp[0]; 43 return sp[0];
40 if (sp[1] >> 22) 44 if (sp[1] >> 22)
41 return sp[1]; 45 return sp[1];
46#endif
42 } 47 }
43 return pc; 48 return pc;
44} 49}
45EXPORT_SYMBOL(profile_pc); 50EXPORT_SYMBOL(profile_pc);
46 51
47static irqreturn_t timer_event_interrupt(int irq, void *dev_id) 52irqreturn_t timer_interrupt(int irq, void *dev_id)
48{ 53{
49 add_pda(irq0_irqs, 1); 54 add_pda(irq0_irqs, 1);
50 55
51 global_clock_event->event_handler(global_clock_event); 56 global_clock_event->event_handler(global_clock_event);
52 57
58#ifdef CONFIG_MCA
59 if (MCA_bus) {
60 u8 irq_v = inb_p(0x61); /* read the current state */
61 outb_p(irq_v|0x80, 0x61); /* reset the IRQ */
62 }
63#endif
64
53 return IRQ_HANDLED; 65 return IRQ_HANDLED;
54} 66}
55 67
@@ -100,7 +112,7 @@ unsigned long __init calibrate_cpu(void)
100} 112}
101 113
102static struct irqaction irq0 = { 114static struct irqaction irq0 = {
103 .handler = timer_event_interrupt, 115 .handler = timer_interrupt,
104 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, 116 .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING,
105 .mask = CPU_MASK_NONE, 117 .mask = CPU_MASK_NONE,
106 .name = "timer" 118 .name = "timer"
@@ -111,16 +123,13 @@ void __init hpet_time_init(void)
111 if (!hpet_enable()) 123 if (!hpet_enable())
112 setup_pit_timer(); 124 setup_pit_timer();
113 125
126 irq0.mask = cpumask_of_cpu(0);
114 setup_irq(0, &irq0); 127 setup_irq(0, &irq0);
115} 128}
116 129
117void __init time_init(void) 130void __init time_init(void)
118{ 131{
119 tsc_init(); 132 tsc_init();
120 if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP))
121 vgetcpu_mode = VGETCPU_RDTSCP;
122 else
123 vgetcpu_mode = VGETCPU_LSL;
124 133
125 late_time_init = choose_time_init(); 134 late_time_init = choose_time_init();
126} 135}
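profile_pc() above now shares the 32-bit frame-pointer shortcut on 64-bit: with CONFIG_FRAME_POINTER, the return address into the lock function's caller sits one word above the saved frame pointer, at bp + sizeof(long). A user-space illustration of that layout, assuming a build with -O0 -fno-omit-frame-pointer (the same assumption the CONFIG_FRAME_POINTER guard encodes):

#include <stdio.h>

static void show_return_address(void)
{
	unsigned long *bp = __builtin_frame_address(0);

	/*
	 * bp[0] is the caller's saved frame pointer, bp[1] the return
	 * address back into the caller, the same word profile_pc() reads.
	 */
	printf("via frame pointer: %p\n", (void *)bp[1]);
	printf("compiler builtin : %p\n", __builtin_return_address(0));
}

int main(void)
{
	show_return_address();	/* both lines should print the same address */
	return 0;
}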
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index fec1ecedc9b7..e00534b33534 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -241,3 +241,11 @@ void flush_tlb_all(void)
241 on_each_cpu(do_flush_tlb_all, NULL, 1); 241 on_each_cpu(do_flush_tlb_all, NULL, 1);
242} 242}
243 243
244void reset_lazy_tlbstate(void)
245{
246 int cpu = raw_smp_processor_id();
247
248 per_cpu(cpu_tlbstate, cpu).state = 0;
249 per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm;
250}
251
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index d0fbb7712ab0..04431f34fd16 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -6,7 +6,7 @@
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
8 */ 8 */
9#include <linux/mc146818rtc.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12 12
@@ -17,6 +17,7 @@
17#include <asm/genapic.h> 17#include <asm/genapic.h>
18#include <asm/idle.h> 18#include <asm/idle.h>
19#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/irq_vectors.h>
20 21
21#include <mach_apic.h> 22#include <mach_apic.h>
22 23
@@ -783,7 +784,7 @@ static int __init uv_bau_init(void)
783 uv_init_blade(blade, node, cur_cpu); 784 uv_init_blade(blade, node, cur_cpu);
784 cur_cpu += uv_blade_nr_possible_cpus(blade); 785 cur_cpu += uv_blade_nr_possible_cpus(blade);
785 } 786 }
786 set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); 787 alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
787 uv_enable_timeouts(); 788 uv_enable_timeouts();
788 789
789 return 0; 790 return 0;
diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c
index ab6bf375a307..6bb7b8579e70 100644
--- a/arch/x86/kernel/tls.c
+++ b/arch/x86/kernel/tls.c
@@ -10,6 +10,7 @@
10#include <asm/ldt.h> 10#include <asm/ldt.h>
11#include <asm/processor.h> 11#include <asm/processor.h>
12#include <asm/proto.h> 12#include <asm/proto.h>
13#include <asm/syscalls.h>
13 14
14#include "tls.h" 15#include "tls.h"
15 16
diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps.c
index 8a768973c4f0..04d242ab0161 100644
--- a/arch/x86/kernel/traps_32.c
+++ b/arch/x86/kernel/traps.c
@@ -7,13 +7,11 @@
7 */ 7 */
8 8
9/* 9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some 10 * Handle hardware traps and faults.
11 * state in 'asm.s'.
12 */ 11 */
13#include <linux/interrupt.h> 12#include <linux/interrupt.h>
14#include <linux/kallsyms.h> 13#include <linux/kallsyms.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/highmem.h>
17#include <linux/kprobes.h> 15#include <linux/kprobes.h>
18#include <linux/uaccess.h> 16#include <linux/uaccess.h>
19#include <linux/utsname.h> 17#include <linux/utsname.h>
@@ -32,6 +30,8 @@
32#include <linux/bug.h> 30#include <linux/bug.h>
33#include <linux/nmi.h> 31#include <linux/nmi.h>
34#include <linux/mm.h> 32#include <linux/mm.h>
33#include <linux/smp.h>
34#include <linux/io.h>
35 35
36#ifdef CONFIG_EISA 36#ifdef CONFIG_EISA
37#include <linux/ioport.h> 37#include <linux/ioport.h>
@@ -46,20 +46,31 @@
46#include <linux/edac.h> 46#include <linux/edac.h>
47#endif 47#endif
48 48
49#include <asm/arch_hooks.h>
50#include <asm/stacktrace.h> 49#include <asm/stacktrace.h>
51#include <asm/processor.h> 50#include <asm/processor.h>
52#include <asm/debugreg.h> 51#include <asm/debugreg.h>
53#include <asm/atomic.h> 52#include <asm/atomic.h>
54#include <asm/system.h> 53#include <asm/system.h>
55#include <asm/unwind.h> 54#include <asm/unwind.h>
55#include <asm/traps.h>
56#include <asm/desc.h> 56#include <asm/desc.h>
57#include <asm/i387.h> 57#include <asm/i387.h>
58
59#include <mach_traps.h>
60
61#ifdef CONFIG_X86_64
62#include <asm/pgalloc.h>
63#include <asm/proto.h>
64#include <asm/pda.h>
65#else
66#include <asm/processor-flags.h>
67#include <asm/arch_hooks.h>
58#include <asm/nmi.h> 68#include <asm/nmi.h>
59#include <asm/smp.h> 69#include <asm/smp.h>
60#include <asm/io.h> 70#include <asm/io.h>
71#include <asm/traps.h>
61 72
62#include "mach_traps.h" 73#include "cpu/mcheck/mce.h"
63 74
64DECLARE_BITMAP(used_vectors, NR_VECTORS); 75DECLARE_BITMAP(used_vectors, NR_VECTORS);
65EXPORT_SYMBOL_GPL(used_vectors); 76EXPORT_SYMBOL_GPL(used_vectors);
@@ -76,431 +87,104 @@ char ignore_fpu_irq;
76 */ 87 */
77gate_desc idt_table[256] 88gate_desc idt_table[256]
78 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; 89 __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, };
79
80asmlinkage void divide_error(void);
81asmlinkage void debug(void);
82asmlinkage void nmi(void);
83asmlinkage void int3(void);
84asmlinkage void overflow(void);
85asmlinkage void bounds(void);
86asmlinkage void invalid_op(void);
87asmlinkage void device_not_available(void);
88asmlinkage void coprocessor_segment_overrun(void);
89asmlinkage void invalid_TSS(void);
90asmlinkage void segment_not_present(void);
91asmlinkage void stack_segment(void);
92asmlinkage void general_protection(void);
93asmlinkage void page_fault(void);
94asmlinkage void coprocessor_error(void);
95asmlinkage void simd_coprocessor_error(void);
96asmlinkage void alignment_check(void);
97asmlinkage void spurious_interrupt_bug(void);
98asmlinkage void machine_check(void);
99
100int panic_on_unrecovered_nmi;
101int kstack_depth_to_print = 24;
102static unsigned int code_bytes = 64;
103static int ignore_nmis;
104static int die_counter;
105
106void printk_address(unsigned long address, int reliable)
107{
108#ifdef CONFIG_KALLSYMS
109 unsigned long offset = 0;
110 unsigned long symsize;
111 const char *symname;
112 char *modname;
113 char *delim = ":";
114 char namebuf[KSYM_NAME_LEN];
115 char reliab[4] = "";
116
117 symname = kallsyms_lookup(address, &symsize, &offset,
118 &modname, namebuf);
119 if (!symname) {
120 printk(" [<%08lx>]\n", address);
121 return;
122 }
123 if (!reliable)
124 strcpy(reliab, "? ");
125
126 if (!modname)
127 modname = delim = "";
128 printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n",
129 address, reliab, delim, modname, delim, symname, offset, symsize);
130#else
131 printk(" [<%08lx>]\n", address);
132#endif 90#endif
133}
134
135static inline int valid_stack_ptr(struct thread_info *tinfo,
136 void *p, unsigned int size)
137{
138 void *t = tinfo;
139 return p > t && p <= t + THREAD_SIZE - size;
140}
141
142/* The form of the top of the frame on the stack */
143struct stack_frame {
144 struct stack_frame *next_frame;
145 unsigned long return_address;
146};
147 91
148static inline unsigned long 92static int ignore_nmis;
149print_context_stack(struct thread_info *tinfo,
150 unsigned long *stack, unsigned long bp,
151 const struct stacktrace_ops *ops, void *data)
152{
153 struct stack_frame *frame = (struct stack_frame *)bp;
154
155 while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) {
156 unsigned long addr;
157
158 addr = *stack;
159 if (__kernel_text_address(addr)) {
160 if ((unsigned long) stack == bp + 4) {
161 ops->address(data, addr, 1);
162 frame = frame->next_frame;
163 bp = (unsigned long) frame;
164 } else {
165 ops->address(data, addr, bp == 0);
166 }
167 }
168 stack++;
169 }
170 return bp;
171}
172
173void dump_trace(struct task_struct *task, struct pt_regs *regs,
174 unsigned long *stack, unsigned long bp,
175 const struct stacktrace_ops *ops, void *data)
176{
177 if (!task)
178 task = current;
179
180 if (!stack) {
181 unsigned long dummy;
182 stack = &dummy;
183 if (task != current)
184 stack = (unsigned long *)task->thread.sp;
185 }
186
187#ifdef CONFIG_FRAME_POINTER
188 if (!bp) {
189 if (task == current) {
190 /* Grab bp right from our regs */
191 asm("movl %%ebp, %0" : "=r" (bp) :);
192 } else {
193 /* bp is the last reg pushed by switch_to */
194 bp = *(unsigned long *) task->thread.sp;
195 }
196 }
197#endif
198
199 for (;;) {
200 struct thread_info *context;
201
202 context = (struct thread_info *)
203 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
204 bp = print_context_stack(context, stack, bp, ops, data);
205 /*
206 * Should be after the line below, but somewhere
207 * in early boot context comes out corrupted and we
208 * can't reference it:
209 */
210 if (ops->stack(data, "IRQ") < 0)
211 break;
212 stack = (unsigned long *)context->previous_esp;
213 if (!stack)
214 break;
215 touch_nmi_watchdog();
216 }
217}
218EXPORT_SYMBOL(dump_trace);
219
220static void
221print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
222{
223 printk(data);
224 print_symbol(msg, symbol);
225 printk("\n");
226}
227
228static void print_trace_warning(void *data, char *msg)
229{
230 printk("%s%s\n", (char *)data, msg);
231}
232
233static int print_trace_stack(void *data, char *name)
234{
235 return 0;
236}
237
238/*
239 * Print one address/symbol entries per line.
240 */
241static void print_trace_address(void *data, unsigned long addr, int reliable)
242{
243 printk("%s [<%08lx>] ", (char *)data, addr);
244 if (!reliable)
245 printk("? ");
246 print_symbol("%s\n", addr);
247 touch_nmi_watchdog();
248}
249
250static const struct stacktrace_ops print_trace_ops = {
251 .warning = print_trace_warning,
252 .warning_symbol = print_trace_warning_symbol,
253 .stack = print_trace_stack,
254 .address = print_trace_address,
255};
256 93
257static void 94static inline void conditional_sti(struct pt_regs *regs)
258show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
259 unsigned long *stack, unsigned long bp, char *log_lvl)
260{ 95{
261 dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); 96 if (regs->flags & X86_EFLAGS_IF)
262 printk("%s =======================\n", log_lvl); 97 local_irq_enable();
263} 98}
264 99
265void show_trace(struct task_struct *task, struct pt_regs *regs, 100static inline void preempt_conditional_sti(struct pt_regs *regs)
266 unsigned long *stack, unsigned long bp)
267{ 101{
268 show_trace_log_lvl(task, regs, stack, bp, ""); 102 inc_preempt_count();
103 if (regs->flags & X86_EFLAGS_IF)
104 local_irq_enable();
269} 105}
270 106
271static void 107static inline void preempt_conditional_cli(struct pt_regs *regs)
272show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
273 unsigned long *sp, unsigned long bp, char *log_lvl)
274{ 108{
275 unsigned long *stack; 109 if (regs->flags & X86_EFLAGS_IF)
276 int i; 110 local_irq_disable();
277 111 dec_preempt_count();
278 if (sp == NULL) {
279 if (task)
280 sp = (unsigned long *)task->thread.sp;
281 else
282 sp = (unsigned long *)&sp;
283 }
284
285 stack = sp;
286 for (i = 0; i < kstack_depth_to_print; i++) {
287 if (kstack_end(stack))
288 break;
289 if (i && ((i % 8) == 0))
290 printk("\n%s ", log_lvl);
291 printk("%08lx ", *stack++);
292 }
293 printk("\n%sCall Trace:\n", log_lvl);
294
295 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
296} 112}
297 113
298void show_stack(struct task_struct *task, unsigned long *sp) 114#ifdef CONFIG_X86_32
115static inline void
116die_if_kernel(const char *str, struct pt_regs *regs, long err)
299{ 117{
300 printk(" "); 118 if (!user_mode_vm(regs))
301 show_stack_log_lvl(task, NULL, sp, 0, ""); 119 die(str, regs, err);
302} 120}
303 121
304/* 122/*
305 * The architecture-independent dump_stack generator 123 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
124 * invalid offset set (the LAZY one) and the faulting thread has
125 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS,
126 * we set the offset field correctly and return 1.
306 */ 127 */
307void dump_stack(void) 128static int lazy_iobitmap_copy(void)
308{
309 unsigned long bp = 0;
310 unsigned long stack;
311
312#ifdef CONFIG_FRAME_POINTER
313 if (!bp)
314 asm("movl %%ebp, %0" : "=r" (bp):);
315#endif
316
317 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
318 current->pid, current->comm, print_tainted(),
319 init_utsname()->release,
320 (int)strcspn(init_utsname()->version, " "),
321 init_utsname()->version);
322
323 show_trace(current, NULL, &stack, bp);
324}
325
326EXPORT_SYMBOL(dump_stack);
327
328void show_registers(struct pt_regs *regs)
329{ 129{
330 int i; 130 struct thread_struct *thread;
131 struct tss_struct *tss;
132 int cpu;
331 133
332 print_modules(); 134 cpu = get_cpu();
333 __show_registers(regs, 0); 135 tss = &per_cpu(init_tss, cpu);
136 thread = &current->thread;
334 137
335 printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", 138 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
336 TASK_COMM_LEN, current->comm, task_pid_nr(current), 139 thread->io_bitmap_ptr) {
337 current_thread_info(), current, task_thread_info(current)); 140 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
338 /* 141 thread->io_bitmap_max);
339 * When in-kernel, we also print out the stack and code at the 142 /*
340 * time of the fault.. 143 * If the previously set map was extending to higher ports
341 */ 144 * than the current one, pad extra space with 0xff (no access).
342 if (!user_mode_vm(regs)) { 145 */
343 unsigned int code_prologue = code_bytes * 43 / 64; 146 if (thread->io_bitmap_max < tss->io_bitmap_max) {
344 unsigned int code_len = code_bytes; 147 memset((char *) tss->io_bitmap +
345 unsigned char c; 148 thread->io_bitmap_max, 0xff,
346 u8 *ip; 149 tss->io_bitmap_max - thread->io_bitmap_max);
347
348 printk("\n" KERN_EMERG "Stack: ");
349 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
350
351 printk(KERN_EMERG "Code: ");
352
353 ip = (u8 *)regs->ip - code_prologue;
354 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
355 /* try starting at EIP */
356 ip = (u8 *)regs->ip;
357 code_len = code_len - code_prologue + 1;
358 }
359 for (i = 0; i < code_len; i++, ip++) {
360 if (ip < (u8 *)PAGE_OFFSET ||
361 probe_kernel_address(ip, c)) {
362 printk(" Bad EIP value.");
363 break;
364 }
365 if (ip == (u8 *)regs->ip)
366 printk("<%02x> ", c);
367 else
368 printk("%02x ", c);
369 } 150 }
370 } 151 tss->io_bitmap_max = thread->io_bitmap_max;
371 printk("\n"); 152 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
372} 153 tss->io_bitmap_owner = thread;
373 154 put_cpu();
374int is_valid_bugaddr(unsigned long ip)
375{
376 unsigned short ud2;
377
378 if (ip < PAGE_OFFSET)
379 return 0;
380 if (probe_kernel_address((unsigned short *)ip, ud2))
381 return 0;
382
383 return ud2 == 0x0b0f;
384}
385
386int __kprobes __die(const char *str, struct pt_regs *regs, long err)
387{
388 unsigned short ss;
389 unsigned long sp;
390 155
391 printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter);
392#ifdef CONFIG_PREEMPT
393 printk("PREEMPT ");
394#endif
395#ifdef CONFIG_SMP
396 printk("SMP ");
397#endif
398#ifdef CONFIG_DEBUG_PAGEALLOC
399 printk("DEBUG_PAGEALLOC");
400#endif
401 printk("\n");
402 if (notify_die(DIE_OOPS, str, regs, err,
403 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
404 return 1; 156 return 1;
405
406 show_registers(regs);
407 /* Executive summary in case the oops scrolled away */
408 sp = (unsigned long) (&regs->sp);
409 savesegment(ss, ss);
410 if (user_mode(regs)) {
411 sp = regs->sp;
412 ss = regs->ss & 0xffff;
413 } 157 }
414 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 158 put_cpu();
415 print_symbol("%s", regs->ip);
416 printk(" SS:ESP %04x:%08lx\n", ss, sp);
417 return 0;
418}
419
420/*
421 * This is gone through when something in the kernel has done something bad
422 * and is about to be terminated:
423 */
424void die(const char *str, struct pt_regs *regs, long err)
425{
426 static struct {
427 raw_spinlock_t lock;
428 u32 lock_owner;
429 int lock_owner_depth;
430 } die = {
431 .lock = __RAW_SPIN_LOCK_UNLOCKED,
432 .lock_owner = -1,
433 .lock_owner_depth = 0
434 };
435 unsigned long flags;
436
437 oops_enter();
438
439 if (die.lock_owner != raw_smp_processor_id()) {
440 console_verbose();
441 raw_local_irq_save(flags);
442 __raw_spin_lock(&die.lock);
443 die.lock_owner = smp_processor_id();
444 die.lock_owner_depth = 0;
445 bust_spinlocks(1);
446 } else {
447 raw_local_irq_save(flags);
448 }
449
450 if (++die.lock_owner_depth < 3) {
451 report_bug(regs->ip, regs);
452
453 if (__die(str, regs, err))
454 regs = NULL;
455 } else {
456 printk(KERN_EMERG "Recursive die() failure, output suppressed\n");
457 }
458
459 bust_spinlocks(0);
460 die.lock_owner = -1;
461 add_taint(TAINT_DIE);
462 __raw_spin_unlock(&die.lock);
463 raw_local_irq_restore(flags);
464
465 if (!regs)
466 return;
467
468 if (kexec_should_crash(current))
469 crash_kexec(regs);
470
471 if (in_interrupt())
472 panic("Fatal exception in interrupt");
473
474 if (panic_on_oops)
475 panic("Fatal exception");
476
477 oops_exit();
478 do_exit(SIGSEGV);
479}
480 159
481static inline void 160 return 0;
482die_if_kernel(const char *str, struct pt_regs *regs, long err)
483{
484 if (!user_mode_vm(regs))
485 die(str, regs, err);
486} 161}
162#endif
487 163
488static void __kprobes 164static void __kprobes
489do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, 165do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
490 long error_code, siginfo_t *info) 166 long error_code, siginfo_t *info)
491{ 167{
492 struct task_struct *tsk = current; 168 struct task_struct *tsk = current;
493 169
170#ifdef CONFIG_X86_32
494 if (regs->flags & X86_VM_MASK) { 171 if (regs->flags & X86_VM_MASK) {
495 if (vm86) 172 /*
173 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
174 * On nmi (interrupt 2), do_trap should not be called.
175 */
176 if (trapnr < 6)
496 goto vm86_trap; 177 goto vm86_trap;
497 goto trap_signal; 178 goto trap_signal;
498 } 179 }
180#endif
499 181
500 if (!user_mode(regs)) 182 if (!user_mode(regs))
501 goto kernel_trap; 183 goto kernel_trap;
502 184
185#ifdef CONFIG_X86_32
503trap_signal: 186trap_signal:
187#endif
504 /* 188 /*
505 * We want error_code and trap_no set for userspace faults and 189 * We want error_code and trap_no set for userspace faults and
506 * kernelspace faults which result in die(), but not 190 * kernelspace faults which result in die(), but not
@@ -513,6 +197,18 @@ trap_signal:
513 tsk->thread.error_code = error_code; 197 tsk->thread.error_code = error_code;
514 tsk->thread.trap_no = trapnr; 198 tsk->thread.trap_no = trapnr;
515 199
200#ifdef CONFIG_X86_64
201 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
202 printk_ratelimit()) {
203 printk(KERN_INFO
204 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
205 tsk->comm, tsk->pid, str,
206 regs->ip, regs->sp, error_code);
207 print_vma_addr(" in ", regs->ip);
208 printk("\n");
209 }
210#endif
211
516 if (info) 212 if (info)
517 force_sig_info(signr, info, tsk); 213 force_sig_info(signr, info, tsk);
518 else 214 else
@@ -527,120 +223,98 @@ kernel_trap:
527 } 223 }
528 return; 224 return;
529 225
226#ifdef CONFIG_X86_32
530vm86_trap: 227vm86_trap:
531 if (handle_vm86_trap((struct kernel_vm86_regs *) regs, 228 if (handle_vm86_trap((struct kernel_vm86_regs *) regs,
532 error_code, trapnr)) 229 error_code, trapnr))
533 goto trap_signal; 230 goto trap_signal;
534 return; 231 return;
232#endif
535} 233}
536 234
537#define DO_ERROR(trapnr, signr, str, name) \ 235#define DO_ERROR(trapnr, signr, str, name) \
538void do_##name(struct pt_regs *regs, long error_code) \ 236dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
539{ \
540 trace_hardirqs_fixup(); \
541 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
542 == NOTIFY_STOP) \
543 return; \
544 do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \
545}
546
547#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \
548void do_##name(struct pt_regs *regs, long error_code) \
549{ \
550 siginfo_t info; \
551 if (irq) \
552 local_irq_enable(); \
553 info.si_signo = signr; \
554 info.si_errno = 0; \
555 info.si_code = sicode; \
556 info.si_addr = (void __user *)siaddr; \
557 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
558 == NOTIFY_STOP) \
559 return; \
560 do_trap(trapnr, signr, str, 0, regs, error_code, &info); \
561}
562
563#define DO_VM86_ERROR(trapnr, signr, str, name) \
564void do_##name(struct pt_regs *regs, long error_code) \
565{ \ 237{ \
566 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 238 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
567 == NOTIFY_STOP) \ 239 == NOTIFY_STOP) \
568 return; \ 240 return; \
569 do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ 241 conditional_sti(regs); \
242 do_trap(trapnr, signr, str, regs, error_code, NULL); \
570} 243}
571 244
572#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ 245#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
573void do_##name(struct pt_regs *regs, long error_code) \ 246dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \
574{ \ 247{ \
575 siginfo_t info; \ 248 siginfo_t info; \
576 info.si_signo = signr; \ 249 info.si_signo = signr; \
577 info.si_errno = 0; \ 250 info.si_errno = 0; \
578 info.si_code = sicode; \ 251 info.si_code = sicode; \
579 info.si_addr = (void __user *)siaddr; \ 252 info.si_addr = (void __user *)siaddr; \
580 trace_hardirqs_fixup(); \
581 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ 253 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
582 == NOTIFY_STOP) \ 254 == NOTIFY_STOP) \
583 return; \ 255 return; \
584 do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ 256 conditional_sti(regs); \
257 do_trap(trapnr, signr, str, regs, error_code, &info); \
585} 258}
586 259
587DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) 260DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
588#ifndef CONFIG_KPROBES 261DO_ERROR(4, SIGSEGV, "overflow", overflow)
589DO_VM86_ERROR(3, SIGTRAP, "int3", int3) 262DO_ERROR(5, SIGSEGV, "bounds", bounds)
590#endif 263DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
591DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow)
592DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds)
593DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0)
594DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) 264DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
595DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) 265DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
596DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) 266DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
267#ifdef CONFIG_X86_32
597DO_ERROR(12, SIGBUS, "stack segment", stack_segment) 268DO_ERROR(12, SIGBUS, "stack segment", stack_segment)
598DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) 269#endif
599DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) 270DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
271
272#ifdef CONFIG_X86_64
273/* Runs on IST stack */
274dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)
275{
276 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
277 12, SIGBUS) == NOTIFY_STOP)
278 return;
279 preempt_conditional_sti(regs);
280 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
281 preempt_conditional_cli(regs);
282}
600 283
601void __kprobes 284dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
285{
286 static const char str[] = "double fault";
287 struct task_struct *tsk = current;
288
289        /* Return not checked because a double fault cannot be ignored */
290 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
291
292 tsk->thread.error_code = error_code;
293 tsk->thread.trap_no = 8;
294
295 /* This is always a kernel trap and never fixable (and thus must
296 never return). */
297 for (;;)
298 die(str, regs, error_code);
299}
300#endif
301
302dotraplinkage void __kprobes
602do_general_protection(struct pt_regs *regs, long error_code) 303do_general_protection(struct pt_regs *regs, long error_code)
603{ 304{
604 struct task_struct *tsk; 305 struct task_struct *tsk;
605 struct thread_struct *thread;
606 struct tss_struct *tss;
607 int cpu;
608 306
609 cpu = get_cpu(); 307 conditional_sti(regs);
610 tss = &per_cpu(init_tss, cpu);
611 thread = &current->thread;
612
613 /*
614 * Perform the lazy TSS's I/O bitmap copy. If the TSS has an
615 * invalid offset set (the LAZY one) and the faulting thread has
616 * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS
617 * and we set the offset field correctly. Then we let the CPU to
618 * restart the faulting instruction.
619 */
620 if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY &&
621 thread->io_bitmap_ptr) {
622 memcpy(tss->io_bitmap, thread->io_bitmap_ptr,
623 thread->io_bitmap_max);
624 /*
625 * If the previously set map was extending to higher ports
626 * than the current one, pad extra space with 0xff (no access).
627 */
628 if (thread->io_bitmap_max < tss->io_bitmap_max) {
629 memset((char *) tss->io_bitmap +
630 thread->io_bitmap_max, 0xff,
631 tss->io_bitmap_max - thread->io_bitmap_max);
632 }
633 tss->io_bitmap_max = thread->io_bitmap_max;
634 tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET;
635 tss->io_bitmap_owner = thread;
636 put_cpu();
637 308
309#ifdef CONFIG_X86_32
310 if (lazy_iobitmap_copy()) {
311 /* restart the faulting instruction */
638 return; 312 return;
639 } 313 }
640 put_cpu();
641 314
642 if (regs->flags & X86_VM_MASK) 315 if (regs->flags & X86_VM_MASK)
643 goto gp_in_vm86; 316 goto gp_in_vm86;
317#endif
644 318
645 tsk = current; 319 tsk = current;
646 if (!user_mode(regs)) 320 if (!user_mode(regs))
@@ -662,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
662 force_sig(SIGSEGV, tsk); 336 force_sig(SIGSEGV, tsk);
663 return; 337 return;
664 338
339#ifdef CONFIG_X86_32
665gp_in_vm86: 340gp_in_vm86:
666 local_irq_enable(); 341 local_irq_enable();
667 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); 342 handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
668 return; 343 return;
344#endif
669 345
670gp_in_kernel: 346gp_in_kernel:
671 if (fixup_exception(regs)) 347 if (fixup_exception(regs))
@@ -702,7 +378,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
702 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 378 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
703 379
704 /* Clear and disable the memory parity error line. */ 380 /* Clear and disable the memory parity error line. */
705 clear_mem_error(reason); 381 reason = (reason & 0xf) | 4;
382 outb(reason, 0x61);
706} 383}
707 384
708static notrace __kprobes void 385static notrace __kprobes void
@@ -728,7 +405,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
728static notrace __kprobes void 405static notrace __kprobes void
729unknown_nmi_error(unsigned char reason, struct pt_regs *regs) 406unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
730{ 407{
731 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 408 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) ==
409 NOTIFY_STOP)
732 return; 410 return;
733#ifdef CONFIG_MCA 411#ifdef CONFIG_MCA
734 /* 412 /*
@@ -751,41 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
751 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 429 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
752} 430}
753 431
754static DEFINE_SPINLOCK(nmi_print_lock);
755
756void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic)
757{
758 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
759 return;
760
761 spin_lock(&nmi_print_lock);
762 /*
763 * We are in trouble anyway, lets at least try
764 * to get a message out:
765 */
766 bust_spinlocks(1);
767 printk(KERN_EMERG "%s", str);
768 printk(" on CPU%d, ip %08lx, registers:\n",
769 smp_processor_id(), regs->ip);
770 show_registers(regs);
771 if (do_panic)
772 panic("Non maskable interrupt");
773 console_silent();
774 spin_unlock(&nmi_print_lock);
775 bust_spinlocks(0);
776
777 /*
778 * If we are in kernel we are probably nested up pretty bad
779         * and might as well get out now while we still can:
780 */
781 if (!user_mode_vm(regs)) {
782 current->thread.trap_no = 2;
783 crash_kexec(regs);
784 }
785
786 do_exit(SIGSEGV);
787}
788
789static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 432static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
790{ 433{
791 unsigned char reason = 0; 434 unsigned char reason = 0;
@@ -824,22 +467,25 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
824 mem_parity_error(reason, regs); 467 mem_parity_error(reason, regs);
825 if (reason & 0x40) 468 if (reason & 0x40)
826 io_check_error(reason, regs); 469 io_check_error(reason, regs);
470#ifdef CONFIG_X86_32
827 /* 471 /*
828 * Reassert NMI in case it became active meanwhile 472 * Reassert NMI in case it became active meanwhile
829 * as it's edge-triggered: 473 * as it's edge-triggered:
830 */ 474 */
831 reassert_nmi(); 475 reassert_nmi();
476#endif
832} 477}
833 478
834notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) 479dotraplinkage notrace __kprobes void
480do_nmi(struct pt_regs *regs, long error_code)
835{ 481{
836 int cpu;
837
838 nmi_enter(); 482 nmi_enter();
839 483
840 cpu = smp_processor_id(); 484#ifdef CONFIG_X86_32
841 485 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); }
842 ++nmi_count(cpu); 486#else
487 add_pda(__nmi_count, 1);
488#endif
843 489
844 if (!ignore_nmis) 490 if (!ignore_nmis)
845 default_do_nmi(regs); 491 default_do_nmi(regs);
@@ -859,21 +505,44 @@ void restart_nmi(void)
859 acpi_nmi_enable(); 505 acpi_nmi_enable();
860} 506}
861 507
862#ifdef CONFIG_KPROBES 508/* May run on IST stack. */
863void __kprobes do_int3(struct pt_regs *regs, long error_code) 509dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
864{ 510{
865 trace_hardirqs_fixup(); 511#ifdef CONFIG_KPROBES
866
867 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) 512 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
868 == NOTIFY_STOP) 513 == NOTIFY_STOP)
869 return; 514 return;
870 /* 515#else
871 * This is an interrupt gate, because kprobes wants interrupts 516 if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP)
872 * disabled. Normal trap handlers don't. 517 == NOTIFY_STOP)
873 */ 518 return;
874 restore_interrupts(regs); 519#endif
875 520
876 do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); 521 preempt_conditional_sti(regs);
522 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
523 preempt_conditional_cli(regs);
524}
525
526#ifdef CONFIG_X86_64
527/* Help handler running on IST stack to switch back to user stack
528 for scheduling or signal handling. The actual stack switch is done in
529 entry.S */
530asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
531{
532 struct pt_regs *regs = eregs;
533 /* Did already sync */
534 if (eregs == (struct pt_regs *)eregs->sp)
535 ;
536 /* Exception from user space */
537 else if (user_mode(eregs))
538 regs = task_pt_regs(current);
539 /* Exception from kernel and interrupts are enabled. Move to
540 kernel process stack. */
541 else if (eregs->flags & X86_EFLAGS_IF)
542 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
543 if (eregs != regs)
544 *regs = *eregs;
545 return regs;
877} 546}
878#endif 547#endif
879 548
@@ -898,13 +567,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code)
898 * about restoring all the debug state, and ptrace doesn't have to 567 * about restoring all the debug state, and ptrace doesn't have to
899 * find every occurrence of the TF bit that could be saved away even 568 * find every occurrence of the TF bit that could be saved away even
900 * by user code) 569 * by user code)
570 *
571 * May run on IST stack.
901 */ 572 */
902void __kprobes do_debug(struct pt_regs *regs, long error_code) 573dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
903{ 574{
904 struct task_struct *tsk = current; 575 struct task_struct *tsk = current;
905 unsigned int condition; 576 unsigned long condition;
906 577 int si_code;
907 trace_hardirqs_fixup();
908 578
909 get_debugreg(condition, 6); 579 get_debugreg(condition, 6);
910 580
@@ -917,9 +587,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
917 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 587 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
918 SIGTRAP) == NOTIFY_STOP) 588 SIGTRAP) == NOTIFY_STOP)
919 return; 589 return;
590
920 /* It's safe to allow irq's after DR6 has been saved */ 591 /* It's safe to allow irq's after DR6 has been saved */
921 if (regs->flags & X86_EFLAGS_IF) 592 preempt_conditional_sti(regs);
922 local_irq_enable();
923 593
924 /* Mask out spurious debug traps due to lazy DR7 setting */ 594 /* Mask out spurious debug traps due to lazy DR7 setting */
925 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 595 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
@@ -927,8 +597,10 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
927 goto clear_dr7; 597 goto clear_dr7;
928 } 598 }
929 599
600#ifdef CONFIG_X86_32
930 if (regs->flags & X86_VM_MASK) 601 if (regs->flags & X86_VM_MASK)
931 goto debug_vm86; 602 goto debug_vm86;
603#endif
932 604
933 /* Save debug status register where ptrace can see it */ 605 /* Save debug status register where ptrace can see it */
934 tsk->thread.debugreg6 = condition; 606 tsk->thread.debugreg6 = condition;
@@ -938,17 +610,13 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
938 * kernel space (but re-enable TF when returning to user mode). 610 * kernel space (but re-enable TF when returning to user mode).
939 */ 611 */
940 if (condition & DR_STEP) { 612 if (condition & DR_STEP) {
941 /*
942 * We already checked v86 mode above, so we can
943 * check for kernel mode by just checking the CPL
944 * of CS.
945 */
946 if (!user_mode(regs)) 613 if (!user_mode(regs))
947 goto clear_TF_reenable; 614 goto clear_TF_reenable;
948 } 615 }
949 616
617 si_code = get_si_code(condition);
950 /* Ok, finally something we can handle */ 618 /* Ok, finally something we can handle */
951 send_sigtrap(tsk, regs, error_code); 619 send_sigtrap(tsk, regs, error_code, si_code);
952 620
953 /* 621 /*
954 * Disable additional traps. They'll be re-enabled when 622 * Disable additional traps. They'll be re-enabled when
@@ -956,18 +624,37 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code)
956 */ 624 */
957clear_dr7: 625clear_dr7:
958 set_debugreg(0, 7); 626 set_debugreg(0, 7);
627 preempt_conditional_cli(regs);
959 return; 628 return;
960 629
630#ifdef CONFIG_X86_32
961debug_vm86: 631debug_vm86:
962 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); 632 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
633 preempt_conditional_cli(regs);
963 return; 634 return;
635#endif
964 636
965clear_TF_reenable: 637clear_TF_reenable:
966 set_tsk_thread_flag(tsk, TIF_SINGLESTEP); 638 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
967 regs->flags &= ~X86_EFLAGS_TF; 639 regs->flags &= ~X86_EFLAGS_TF;
640 preempt_conditional_cli(regs);
968 return; 641 return;
969} 642}
970 643
644#ifdef CONFIG_X86_64
645static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
646{
647 if (fixup_exception(regs))
648 return 1;
649
650 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
651 /* Illegal floating point operation in the kernel */
652 current->thread.trap_no = trapnr;
653 die(str, regs, 0);
654 return 0;
655}
656#endif
657
971/* 658/*
972 * Note that we play around with the 'TS' bit in an attempt to get 659 * Note that we play around with the 'TS' bit in an attempt to get
973 * the correct behaviour even in the presence of the asynchronous 660 * the correct behaviour even in the presence of the asynchronous
@@ -1004,7 +691,9 @@ void math_error(void __user *ip)
1004 swd = get_fpu_swd(task); 691 swd = get_fpu_swd(task);
1005 switch (swd & ~cwd & 0x3f) { 692 switch (swd & ~cwd & 0x3f) {
1006 case 0x000: /* No unmasked exception */ 693 case 0x000: /* No unmasked exception */
694#ifdef CONFIG_X86_32
1007 return; 695 return;
696#endif
1008 default: /* Multiple exceptions */ 697 default: /* Multiple exceptions */
1009 break; 698 break;
1010 case 0x001: /* Invalid Op */ 699 case 0x001: /* Invalid Op */
@@ -1032,9 +721,18 @@ void math_error(void __user *ip)
1032 force_sig_info(SIGFPE, &info, task); 721 force_sig_info(SIGFPE, &info, task);
1033} 722}
1034 723
1035void do_coprocessor_error(struct pt_regs *regs, long error_code) 724dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
1036{ 725{
726 conditional_sti(regs);
727
728#ifdef CONFIG_X86_32
1037 ignore_fpu_irq = 1; 729 ignore_fpu_irq = 1;
730#else
731 if (!user_mode(regs) &&
732 kernel_math_error(regs, "kernel x87 math error", 16))
733 return;
734#endif
735
1038 math_error((void __user *)regs->ip); 736 math_error((void __user *)regs->ip);
1039} 737}
1040 738
@@ -1086,8 +784,12 @@ static void simd_math_error(void __user *ip)
1086 force_sig_info(SIGFPE, &info, task); 784 force_sig_info(SIGFPE, &info, task);
1087} 785}
1088 786
1089void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) 787dotraplinkage void
788do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1090{ 789{
790 conditional_sti(regs);
791
792#ifdef CONFIG_X86_32
1091 if (cpu_has_xmm) { 793 if (cpu_has_xmm) {
1092 /* Handle SIMD FPU exceptions on PIII+ processors. */ 794 /* Handle SIMD FPU exceptions on PIII+ processors. */
1093 ignore_fpu_irq = 1; 795 ignore_fpu_irq = 1;
@@ -1106,16 +808,25 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
1106 current->thread.error_code = error_code; 808 current->thread.error_code = error_code;
1107 die_if_kernel("cache flush denied", regs, error_code); 809 die_if_kernel("cache flush denied", regs, error_code);
1108 force_sig(SIGSEGV, current); 810 force_sig(SIGSEGV, current);
811#else
812 if (!user_mode(regs) &&
813 kernel_math_error(regs, "kernel simd math error", 19))
814 return;
815 simd_math_error((void __user *)regs->ip);
816#endif
1109} 817}
1110 818
1111void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) 819dotraplinkage void
820do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
1112{ 821{
822 conditional_sti(regs);
1113#if 0 823#if 0
1114 /* No need to warn about this any longer. */ 824 /* No need to warn about this any longer. */
1115 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); 825 printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n");
1116#endif 826#endif
1117} 827}
1118 828
829#ifdef CONFIG_X86_32
1119unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) 830unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1120{ 831{
1121 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); 832 struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id());
@@ -1134,6 +845,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp)
1134 845
1135 return new_kesp; 846 return new_kesp;
1136} 847}
848#else
849asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
850{
851}
852
853asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
854{
855}
856#endif
1137 857
1138/* 858/*
1139 * 'math_state_restore()' saves the current math information in the 859 * 'math_state_restore()' saves the current math information in the
@@ -1166,14 +886,24 @@ asmlinkage void math_state_restore(void)
1166 } 886 }
1167 887
1168 clts(); /* Allow maths ops (or we recurse) */ 888 clts(); /* Allow maths ops (or we recurse) */
889#ifdef CONFIG_X86_32
1169 restore_fpu(tsk); 890 restore_fpu(tsk);
891#else
892 /*
893 * Paranoid restore. send a SIGSEGV if we fail to restore the state.
894 */
895 if (unlikely(restore_fpu_checking(tsk))) {
896 stts();
897 force_sig(SIGSEGV, tsk);
898 return;
899 }
900#endif
1170 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ 901 thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */
1171 tsk->fpu_counter++; 902 tsk->fpu_counter++;
1172} 903}
1173EXPORT_SYMBOL_GPL(math_state_restore); 904EXPORT_SYMBOL_GPL(math_state_restore);
1174 905
1175#ifndef CONFIG_MATH_EMULATION 906#ifndef CONFIG_MATH_EMULATION
1176
1177asmlinkage void math_emulate(long arg) 907asmlinkage void math_emulate(long arg)
1178{ 908{
1179 printk(KERN_EMERG 909 printk(KERN_EMERG
@@ -1182,12 +912,46 @@ asmlinkage void math_emulate(long arg)
1182 force_sig(SIGFPE, current); 912 force_sig(SIGFPE, current);
1183 schedule(); 913 schedule();
1184} 914}
1185
1186#endif /* CONFIG_MATH_EMULATION */ 915#endif /* CONFIG_MATH_EMULATION */
1187 916
917dotraplinkage void __kprobes
918do_device_not_available(struct pt_regs *regs, long error)
919{
920#ifdef CONFIG_X86_32
921 if (read_cr0() & X86_CR0_EM) {
922 conditional_sti(regs);
923 math_emulate(0);
924 } else {
925 math_state_restore(); /* interrupts still off */
926 conditional_sti(regs);
927 }
928#else
929 math_state_restore();
930#endif
931}
932
933#ifdef CONFIG_X86_32
934dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
935{
936 siginfo_t info;
937 local_irq_enable();
938
939 info.si_signo = SIGILL;
940 info.si_errno = 0;
941 info.si_code = ILL_BADSTK;
942 info.si_addr = 0;
943 if (notify_die(DIE_TRAP, "iret exception",
944 regs, error_code, 32, SIGILL) == NOTIFY_STOP)
945 return;
946 do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
947}
948#endif
949
1188void __init trap_init(void) 950void __init trap_init(void)
1189{ 951{
952#ifdef CONFIG_X86_32
1190 int i; 953 int i;
954#endif
1191 955
1192#ifdef CONFIG_EISA 956#ifdef CONFIG_EISA
1193 void __iomem *p = early_ioremap(0x0FFFD9, 4); 957 void __iomem *p = early_ioremap(0x0FFFD9, 4);
@@ -1197,29 +961,40 @@ void __init trap_init(void)
1197 early_iounmap(p, 4); 961 early_iounmap(p, 4);
1198#endif 962#endif
1199 963
1200 set_trap_gate(0, &divide_error); 964 set_intr_gate(0, &divide_error);
1201 set_intr_gate(1, &debug); 965 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1202 set_intr_gate(2, &nmi); 966 set_intr_gate_ist(2, &nmi, NMI_STACK);
1203 set_system_intr_gate(3, &int3); /* int3 can be called from all */ 967 /* int3 can be called from all */
1204 set_system_gate(4, &overflow); /* int4 can be called from all */ 968 set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
1205 set_trap_gate(5, &bounds); 969 /* int4 can be called from all */
1206 set_trap_gate(6, &invalid_op); 970 set_system_intr_gate(4, &overflow);
1207 set_trap_gate(7, &device_not_available); 971 set_intr_gate(5, &bounds);
972 set_intr_gate(6, &invalid_op);
973 set_intr_gate(7, &device_not_available);
974#ifdef CONFIG_X86_32
1208 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); 975 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS);
1209 set_trap_gate(9, &coprocessor_segment_overrun); 976#else
1210 set_trap_gate(10, &invalid_TSS); 977 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1211 set_trap_gate(11, &segment_not_present); 978#endif
1212 set_trap_gate(12, &stack_segment); 979 set_intr_gate(9, &coprocessor_segment_overrun);
1213 set_trap_gate(13, &general_protection); 980 set_intr_gate(10, &invalid_TSS);
981 set_intr_gate(11, &segment_not_present);
982 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
983 set_intr_gate(13, &general_protection);
1214 set_intr_gate(14, &page_fault); 984 set_intr_gate(14, &page_fault);
1215 set_trap_gate(15, &spurious_interrupt_bug); 985 set_intr_gate(15, &spurious_interrupt_bug);
1216 set_trap_gate(16, &coprocessor_error); 986 set_intr_gate(16, &coprocessor_error);
1217 set_trap_gate(17, &alignment_check); 987 set_intr_gate(17, &alignment_check);
1218#ifdef CONFIG_X86_MCE 988#ifdef CONFIG_X86_MCE
1219 set_trap_gate(18, &machine_check); 989 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1220#endif 990#endif
1221 set_trap_gate(19, &simd_coprocessor_error); 991 set_intr_gate(19, &simd_coprocessor_error);
1222 992
993#ifdef CONFIG_IA32_EMULATION
994 set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
995#endif
996
997#ifdef CONFIG_X86_32
1223 if (cpu_has_fxsr) { 998 if (cpu_has_fxsr) {
1224 printk(KERN_INFO "Enabling fast FPU save and restore... "); 999 printk(KERN_INFO "Enabling fast FPU save and restore... ");
1225 set_in_cr4(X86_CR4_OSFXSR); 1000 set_in_cr4(X86_CR4_OSFXSR);
@@ -1232,37 +1007,20 @@ void __init trap_init(void)
1232 printk("done.\n"); 1007 printk("done.\n");
1233 } 1008 }
1234 1009
1235 set_system_gate(SYSCALL_VECTOR, &system_call); 1010 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
1236 1011
1237 /* Reserve all the builtin and the syscall vector: */ 1012 /* Reserve all the builtin and the syscall vector: */
1238 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) 1013 for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
1239 set_bit(i, used_vectors); 1014 set_bit(i, used_vectors);
1240 1015
1241 set_bit(SYSCALL_VECTOR, used_vectors); 1016 set_bit(SYSCALL_VECTOR, used_vectors);
1242 1017#endif
1243 init_thread_xstate();
1244 /* 1018 /*
1245 * Should be a barrier for any external CPU state: 1019 * Should be a barrier for any external CPU state:
1246 */ 1020 */
1247 cpu_init(); 1021 cpu_init();
1248 1022
1023#ifdef CONFIG_X86_32
1249 trap_init_hook(); 1024 trap_init_hook();
1025#endif
1250} 1026}
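
The gate-setup changes in this hunk are easier to follow with the 64-bit IDT entry layout in mind: an interrupt gate clears IF on entry (hence the conditional_sti() calls in the C handlers), a trap gate leaves IF alone, the DPL must be 3 for the set_system_* variants so int3, into and int 0x80 can be issued from user mode, and on 64-bit a non-zero IST index makes the CPU switch to a known-good stack. A sketch of one 64-bit descriptor, with illustrative field names rather than the kernel's gate_struct:

#include <stdint.h>

/* Sketch of an x86-64 IDT entry; not the kernel's struct gate_struct. */
struct idt_gate64 {
        uint16_t offset_low;
        uint16_t selector;          /* kernel code segment selector          */
        uint16_t ist   : 3,         /* 0 = stay on current stack, 1..7 = IST */
                 zero0 : 5,
                 type  : 4,         /* 0xE interrupt gate, 0xF trap gate     */
                 zero1 : 1,
                 dpl   : 2,         /* 3 for the set_system_* variants       */
                 p     : 1;         /* present                               */
        uint16_t offset_middle;
        uint32_t offset_high;
        uint32_t reserved;
};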
1251
1252static int __init kstack_setup(char *s)
1253{
1254 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1255
1256 return 1;
1257}
1258__setup("kstack=", kstack_setup);
1259
1260static int __init code_bytes_setup(char *s)
1261{
1262 code_bytes = simple_strtoul(s, NULL, 0);
1263 if (code_bytes > 8192)
1264 code_bytes = 8192;
1265
1266 return 1;
1267}
1268__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c
deleted file mode 100644
index 2696a6837782..000000000000
--- a/arch/x86/kernel/traps_64.c
+++ /dev/null
@@ -1,1217 +0,0 @@
1/*
2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
4 *
5 * Pentium III FXSR, SSE support
6 * Gareth Hughes <gareth@valinux.com>, May 2000
7 */
8
9/*
10 * 'Traps.c' handles hardware traps and faults after we have saved some
11 * state in 'entry.S'.
12 */
13#include <linux/moduleparam.h>
14#include <linux/interrupt.h>
15#include <linux/kallsyms.h>
16#include <linux/spinlock.h>
17#include <linux/kprobes.h>
18#include <linux/uaccess.h>
19#include <linux/utsname.h>
20#include <linux/kdebug.h>
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/ptrace.h>
24#include <linux/string.h>
25#include <linux/unwind.h>
26#include <linux/delay.h>
27#include <linux/errno.h>
28#include <linux/kexec.h>
29#include <linux/sched.h>
30#include <linux/timer.h>
31#include <linux/init.h>
32#include <linux/bug.h>
33#include <linux/nmi.h>
34#include <linux/mm.h>
35
36#if defined(CONFIG_EDAC)
37#include <linux/edac.h>
38#endif
39
40#include <asm/stacktrace.h>
41#include <asm/processor.h>
42#include <asm/debugreg.h>
43#include <asm/atomic.h>
44#include <asm/system.h>
45#include <asm/unwind.h>
46#include <asm/desc.h>
47#include <asm/i387.h>
48#include <asm/nmi.h>
49#include <asm/smp.h>
50#include <asm/io.h>
51#include <asm/pgalloc.h>
52#include <asm/proto.h>
53#include <asm/pda.h>
54
55#include <mach_traps.h>
56
57asmlinkage void divide_error(void);
58asmlinkage void debug(void);
59asmlinkage void nmi(void);
60asmlinkage void int3(void);
61asmlinkage void overflow(void);
62asmlinkage void bounds(void);
63asmlinkage void invalid_op(void);
64asmlinkage void device_not_available(void);
65asmlinkage void double_fault(void);
66asmlinkage void coprocessor_segment_overrun(void);
67asmlinkage void invalid_TSS(void);
68asmlinkage void segment_not_present(void);
69asmlinkage void stack_segment(void);
70asmlinkage void general_protection(void);
71asmlinkage void page_fault(void);
72asmlinkage void coprocessor_error(void);
73asmlinkage void simd_coprocessor_error(void);
74asmlinkage void alignment_check(void);
75asmlinkage void spurious_interrupt_bug(void);
76asmlinkage void machine_check(void);
77
78int panic_on_unrecovered_nmi;
79int kstack_depth_to_print = 12;
80static unsigned int code_bytes = 64;
81static int ignore_nmis;
82static int die_counter;
83
84static inline void conditional_sti(struct pt_regs *regs)
85{
86 if (regs->flags & X86_EFLAGS_IF)
87 local_irq_enable();
88}
89
90static inline void preempt_conditional_sti(struct pt_regs *regs)
91{
92 inc_preempt_count();
93 if (regs->flags & X86_EFLAGS_IF)
94 local_irq_enable();
95}
96
97static inline void preempt_conditional_cli(struct pt_regs *regs)
98{
99 if (regs->flags & X86_EFLAGS_IF)
100 local_irq_disable();
101 /* Make sure to not schedule here because we could be running
102 on an exception stack. */
103 dec_preempt_count();
104}
105
106void printk_address(unsigned long address, int reliable)
107{
108 printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address);
109}
110
111static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
112 unsigned *usedp, char **idp)
113{
114 static char ids[][8] = {
115 [DEBUG_STACK - 1] = "#DB",
116 [NMI_STACK - 1] = "NMI",
117 [DOUBLEFAULT_STACK - 1] = "#DF",
118 [STACKFAULT_STACK - 1] = "#SS",
119 [MCE_STACK - 1] = "#MC",
120#if DEBUG_STKSZ > EXCEPTION_STKSZ
121 [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]"
122#endif
123 };
124 unsigned k;
125
126 /*
127 * Iterate over all exception stacks, and figure out whether
128 * 'stack' is in one of them:
129 */
130 for (k = 0; k < N_EXCEPTION_STACKS; k++) {
131 unsigned long end = per_cpu(orig_ist, cpu).ist[k];
132 /*
133 * Is 'stack' above this exception frame's end?
134 * If yes then skip to the next frame.
135 */
136 if (stack >= end)
137 continue;
138 /*
139 * Is 'stack' above this exception frame's start address?
140 * If yes then we found the right frame.
141 */
142 if (stack >= end - EXCEPTION_STKSZ) {
143 /*
144 * Make sure we only iterate through an exception
145 * stack once. If it comes up for the second time
146 * then there's something wrong going on - just
147 * break out and return NULL:
148 */
149 if (*usedp & (1U << k))
150 break;
151 *usedp |= 1U << k;
152 *idp = ids[k];
153 return (unsigned long *)end;
154 }
155 /*
156 * If this is a debug stack, and if it has a larger size than
157 * the usual exception stacks, then 'stack' might still
158 * be within the lower portion of the debug stack:
159 */
160#if DEBUG_STKSZ > EXCEPTION_STKSZ
161 if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) {
162 unsigned j = N_EXCEPTION_STACKS - 1;
163
164 /*
165 * Black magic. A large debug stack is composed of
166 * multiple exception stack entries, which we
167 * iterate through now. Don't look:
168 */
169 do {
170 ++j;
171 end -= EXCEPTION_STKSZ;
172 ids[j][4] = '1' + (j - N_EXCEPTION_STACKS);
173 } while (stack < end - EXCEPTION_STKSZ);
174 if (*usedp & (1U << j))
175 break;
176 *usedp |= 1U << j;
177 *idp = ids[j];
178 return (unsigned long *)end;
179 }
180#endif
181 }
182 return NULL;
183}
184
185/*
186 * x86-64 can have up to three kernel stacks:
187 * process stack
188 * interrupt stack
189 * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
190 */
191
192static inline int valid_stack_ptr(struct thread_info *tinfo,
193 void *p, unsigned int size, void *end)
194{
195 void *t = tinfo;
196 if (end) {
197 if (p < end && p >= (end-THREAD_SIZE))
198 return 1;
199 else
200 return 0;
201 }
202 return p > t && p < t + THREAD_SIZE - size;
203}
204
205/* The form of the top of the frame on the stack */
206struct stack_frame {
207 struct stack_frame *next_frame;
208 unsigned long return_address;
209};
210
211static inline unsigned long
212print_context_stack(struct thread_info *tinfo,
213 unsigned long *stack, unsigned long bp,
214 const struct stacktrace_ops *ops, void *data,
215 unsigned long *end)
216{
217 struct stack_frame *frame = (struct stack_frame *)bp;
218
219 while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) {
220 unsigned long addr;
221
222 addr = *stack;
223 if (__kernel_text_address(addr)) {
224 if ((unsigned long) stack == bp + 8) {
225 ops->address(data, addr, 1);
226 frame = frame->next_frame;
227 bp = (unsigned long) frame;
228 } else {
229 ops->address(data, addr, bp == 0);
230 }
231 }
232 stack++;
233 }
234 return bp;
235}
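
The "bp + 8" test above is the frame-pointer heuristic: when a word on the stack sits exactly one slot above a saved RBP, it is treated as a real return address and the walker follows the saved-RBP chain described by struct stack_frame. A minimal user-space sketch of the same walk, assuming the code is built with frame pointers (-fno-omit-frame-pointer); a real unwinder would also validate every pointer it follows:

#include <stdio.h>

struct stack_frame {
        struct stack_frame *next_frame;   /* saved caller RBP               */
        unsigned long return_address;     /* pushed by the call instruction */
};

void backtrace_sketch(void)
{
        struct stack_frame *frame = __builtin_frame_address(0);
        int depth;

        /* Bounded walk; stops at whatever the C runtime left in RBP. */
        for (depth = 0; frame && depth < 16; depth++) {
                printf("  [<%016lx>]\n", frame->return_address);
                frame = frame->next_frame;
        }
}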
236
237void dump_trace(struct task_struct *task, struct pt_regs *regs,
238 unsigned long *stack, unsigned long bp,
239 const struct stacktrace_ops *ops, void *data)
240{
241 const unsigned cpu = get_cpu();
242 unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr;
243 unsigned used = 0;
244 struct thread_info *tinfo;
245
246 if (!task)
247 task = current;
248
249 if (!stack) {
250 unsigned long dummy;
251 stack = &dummy;
252 if (task && task != current)
253 stack = (unsigned long *)task->thread.sp;
254 }
255
256#ifdef CONFIG_FRAME_POINTER
257 if (!bp) {
258 if (task == current) {
259 /* Grab bp right from our regs */
260 asm("movq %%rbp, %0" : "=r" (bp) :);
261 } else {
262 /* bp is the last reg pushed by switch_to */
263 bp = *(unsigned long *) task->thread.sp;
264 }
265 }
266#endif
267
268 /*
269 * Print function call entries in all stacks, starting at the
270 * current stack address. If the stacks consist of nested
271 * exceptions
272 */
273 tinfo = task_thread_info(task);
274 for (;;) {
275 char *id;
276 unsigned long *estack_end;
277 estack_end = in_exception_stack(cpu, (unsigned long)stack,
278 &used, &id);
279
280 if (estack_end) {
281 if (ops->stack(data, id) < 0)
282 break;
283
284 bp = print_context_stack(tinfo, stack, bp, ops,
285 data, estack_end);
286 ops->stack(data, "<EOE>");
287 /*
288 * We link to the next stack via the
289 * second-to-last pointer (index -2 to end) in the
290 * exception stack:
291 */
292 stack = (unsigned long *) estack_end[-2];
293 continue;
294 }
295 if (irqstack_end) {
296 unsigned long *irqstack;
297 irqstack = irqstack_end -
298 (IRQSTACKSIZE - 64) / sizeof(*irqstack);
299
300 if (stack >= irqstack && stack < irqstack_end) {
301 if (ops->stack(data, "IRQ") < 0)
302 break;
303 bp = print_context_stack(tinfo, stack, bp,
304 ops, data, irqstack_end);
305 /*
306 * We link to the next stack (which would be
307 * the process stack normally) the last
308 * pointer (index -1 to end) in the IRQ stack:
309 */
310 stack = (unsigned long *) (irqstack_end[-1]);
311 irqstack_end = NULL;
312 ops->stack(data, "EOI");
313 continue;
314 }
315 }
316 break;
317 }
318
319 /*
320 * This handles the process stack:
321 */
322 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL);
323 put_cpu();
324}
325EXPORT_SYMBOL(dump_trace);
326
327static void
328print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
329{
330 print_symbol(msg, symbol);
331 printk("\n");
332}
333
334static void print_trace_warning(void *data, char *msg)
335{
336 printk("%s\n", msg);
337}
338
339static int print_trace_stack(void *data, char *name)
340{
341 printk(" <%s> ", name);
342 return 0;
343}
344
345static void print_trace_address(void *data, unsigned long addr, int reliable)
346{
347 touch_nmi_watchdog();
348 printk_address(addr, reliable);
349}
350
351static const struct stacktrace_ops print_trace_ops = {
352 .warning = print_trace_warning,
353 .warning_symbol = print_trace_warning_symbol,
354 .stack = print_trace_stack,
355 .address = print_trace_address,
356};
357
358void show_trace(struct task_struct *task, struct pt_regs *regs,
359 unsigned long *stack, unsigned long bp)
360{
361 printk("\nCall Trace:\n");
362 dump_trace(task, regs, stack, bp, &print_trace_ops, NULL);
363 printk("\n");
364}
365
366static void
367_show_stack(struct task_struct *task, struct pt_regs *regs,
368 unsigned long *sp, unsigned long bp)
369{
370 unsigned long *stack;
371 int i;
372 const int cpu = smp_processor_id();
373 unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr);
374 unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE);
375
376 // debugging aid: "show_stack(NULL, NULL);" prints the
377 // back trace for this cpu.
378
379 if (sp == NULL) {
380 if (task)
381 sp = (unsigned long *)task->thread.sp;
382 else
383 sp = (unsigned long *)&sp;
384 }
385
386 stack = sp;
387 for (i = 0; i < kstack_depth_to_print; i++) {
388 if (stack >= irqstack && stack <= irqstack_end) {
389 if (stack == irqstack_end) {
390 stack = (unsigned long *) (irqstack_end[-1]);
391 printk(" <EOI> ");
392 }
393 } else {
394 if (((long) stack & (THREAD_SIZE-1)) == 0)
395 break;
396 }
397 if (i && ((i % 4) == 0))
398 printk("\n");
399 printk(" %016lx", *stack++);
400 touch_nmi_watchdog();
401 }
402 show_trace(task, regs, sp, bp);
403}
404
405void show_stack(struct task_struct *task, unsigned long *sp)
406{
407 _show_stack(task, NULL, sp, 0);
408}
409
410/*
411 * The architecture-independent dump_stack generator
412 */
413void dump_stack(void)
414{
415 unsigned long bp = 0;
416 unsigned long stack;
417
418#ifdef CONFIG_FRAME_POINTER
419 if (!bp)
420 asm("movq %%rbp, %0" : "=r" (bp):);
421#endif
422
423 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
424 current->pid, current->comm, print_tainted(),
425 init_utsname()->release,
426 (int)strcspn(init_utsname()->version, " "),
427 init_utsname()->version);
428 show_trace(NULL, NULL, &stack, bp);
429}
430
431EXPORT_SYMBOL(dump_stack);
432
433void show_registers(struct pt_regs *regs)
434{
435 int i;
436 unsigned long sp;
437 const int cpu = smp_processor_id();
438 struct task_struct *cur = cpu_pda(cpu)->pcurrent;
439
440 sp = regs->sp;
441 printk("CPU %d ", cpu);
442 __show_regs(regs);
443 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
444 cur->comm, cur->pid, task_thread_info(cur), cur);
445
446 /*
447 * When in-kernel, we also print out the stack and code at the
448 * time of the fault..
449 */
450 if (!user_mode(regs)) {
451 unsigned int code_prologue = code_bytes * 43 / 64;
452 unsigned int code_len = code_bytes;
453 unsigned char c;
454 u8 *ip;
455
456 printk("Stack: ");
457 _show_stack(NULL, regs, (unsigned long *)sp, regs->bp);
458 printk("\n");
459
460 printk(KERN_EMERG "Code: ");
461
462 ip = (u8 *)regs->ip - code_prologue;
463 if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) {
464 /* try starting at RIP */
465 ip = (u8 *)regs->ip;
466 code_len = code_len - code_prologue + 1;
467 }
468 for (i = 0; i < code_len; i++, ip++) {
469 if (ip < (u8 *)PAGE_OFFSET ||
470 probe_kernel_address(ip, c)) {
471 printk(" Bad RIP value.");
472 break;
473 }
474 if (ip == (u8 *)regs->ip)
475 printk("<%02x> ", c);
476 else
477 printk("%02x ", c);
478 }
479 }
480 printk("\n");
481}
482
483int is_valid_bugaddr(unsigned long ip)
484{
485 unsigned short ud2;
486
487 if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2)))
488 return 0;
489
490 return ud2 == 0x0b0f;
491}
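
The 0x0b0f constant above is just the two-byte ud2 opcode (0x0f 0x0b) read as a little-endian 16-bit value; a tiny check:

#include <assert.h>
#include <string.h>

int main(void)
{
        unsigned char ud2[2] = { 0x0f, 0x0b };   /* the ud2 instruction bytes */
        unsigned short v;

        memcpy(&v, ud2, sizeof(v));              /* x86 loads little-endian    */
        assert(v == 0x0b0f);                     /* what is_valid_bugaddr() expects */
        return 0;
}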
492
493static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED;
494static int die_owner = -1;
495static unsigned int die_nest_count;
496
497unsigned __kprobes long oops_begin(void)
498{
499 int cpu;
500 unsigned long flags;
501
502 oops_enter();
503
504 /* racy, but better than risking deadlock. */
505 raw_local_irq_save(flags);
506 cpu = smp_processor_id();
507 if (!__raw_spin_trylock(&die_lock)) {
508 if (cpu == die_owner)
509 /* nested oops. should stop eventually */;
510 else
511 __raw_spin_lock(&die_lock);
512 }
513 die_nest_count++;
514 die_owner = cpu;
515 console_verbose();
516 bust_spinlocks(1);
517 return flags;
518}
519
520void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
521{
522 die_owner = -1;
523 bust_spinlocks(0);
524 die_nest_count--;
525 if (!die_nest_count)
526 /* Nest count reaches zero, release the lock. */
527 __raw_spin_unlock(&die_lock);
528 raw_local_irq_restore(flags);
529 if (!regs) {
530 oops_exit();
531 return;
532 }
533 if (panic_on_oops)
534 panic("Fatal exception");
535 oops_exit();
536 do_exit(signr);
537}
538
539int __kprobes __die(const char *str, struct pt_regs *regs, long err)
540{
541 printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter);
542#ifdef CONFIG_PREEMPT
543 printk("PREEMPT ");
544#endif
545#ifdef CONFIG_SMP
546 printk("SMP ");
547#endif
548#ifdef CONFIG_DEBUG_PAGEALLOC
549 printk("DEBUG_PAGEALLOC");
550#endif
551 printk("\n");
552 if (notify_die(DIE_OOPS, str, regs, err,
553 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
554 return 1;
555
556 show_registers(regs);
557 add_taint(TAINT_DIE);
558 /* Executive summary in case the oops scrolled away */
559 printk(KERN_ALERT "RIP ");
560 printk_address(regs->ip, 1);
561 printk(" RSP <%016lx>\n", regs->sp);
562 if (kexec_should_crash(current))
563 crash_kexec(regs);
564 return 0;
565}
566
567void die(const char *str, struct pt_regs *regs, long err)
568{
569 unsigned long flags = oops_begin();
570
571 if (!user_mode(regs))
572 report_bug(regs->ip, regs);
573
574 if (__die(str, regs, err))
575 regs = NULL;
576 oops_end(flags, regs, SIGSEGV);
577}
578
579notrace __kprobes void
580die_nmi(char *str, struct pt_regs *regs, int do_panic)
581{
582 unsigned long flags;
583
584 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
585 return;
586
587 flags = oops_begin();
588 /*
589 * We are in trouble anyway, lets at least try
590 * to get a message out.
591 */
592 printk(KERN_EMERG "%s", str);
593 printk(" on CPU%d, ip %08lx, registers:\n",
594 smp_processor_id(), regs->ip);
595 show_registers(regs);
596 if (kexec_should_crash(current))
597 crash_kexec(regs);
598 if (do_panic || panic_on_oops)
599 panic("Non maskable interrupt");
600 oops_end(flags, NULL, SIGBUS);
601 nmi_exit();
602 local_irq_enable();
603 do_exit(SIGBUS);
604}
605
606static void __kprobes
607do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
608 long error_code, siginfo_t *info)
609{
610 struct task_struct *tsk = current;
611
612 if (!user_mode(regs))
613 goto kernel_trap;
614
615 /*
616 * We want error_code and trap_no set for userspace faults and
617 * kernelspace faults which result in die(), but not
618 * kernelspace faults which are fixed up. die() gives the
619 * process no chance to handle the signal and notice the
620 * kernel fault information, so that won't result in polluting
621 * the information about previously queued, but not yet
622 * delivered, faults. See also do_general_protection below.
623 */
624 tsk->thread.error_code = error_code;
625 tsk->thread.trap_no = trapnr;
626
627 if (show_unhandled_signals && unhandled_signal(tsk, signr) &&
628 printk_ratelimit()) {
629 printk(KERN_INFO
630 "%s[%d] trap %s ip:%lx sp:%lx error:%lx",
631 tsk->comm, tsk->pid, str,
632 regs->ip, regs->sp, error_code);
633 print_vma_addr(" in ", regs->ip);
634 printk("\n");
635 }
636
637 if (info)
638 force_sig_info(signr, info, tsk);
639 else
640 force_sig(signr, tsk);
641 return;
642
643kernel_trap:
644 if (!fixup_exception(regs)) {
645 tsk->thread.error_code = error_code;
646 tsk->thread.trap_no = trapnr;
647 die(str, regs, error_code);
648 }
649 return;
650}
651
652#define DO_ERROR(trapnr, signr, str, name) \
653asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
654{ \
655 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
656 == NOTIFY_STOP) \
657 return; \
658 conditional_sti(regs); \
659 do_trap(trapnr, signr, str, regs, error_code, NULL); \
660}
661
662#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \
663asmlinkage void do_##name(struct pt_regs * regs, long error_code) \
664{ \
665 siginfo_t info; \
666 info.si_signo = signr; \
667 info.si_errno = 0; \
668 info.si_code = sicode; \
669 info.si_addr = (void __user *)siaddr; \
670 trace_hardirqs_fixup(); \
671 if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \
672 == NOTIFY_STOP) \
673 return; \
674 conditional_sti(regs); \
675 do_trap(trapnr, signr, str, regs, error_code, &info); \
676}
677
678DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip)
679DO_ERROR(4, SIGSEGV, "overflow", overflow)
680DO_ERROR(5, SIGSEGV, "bounds", bounds)
681DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip)
682DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun)
683DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS)
684DO_ERROR(11, SIGBUS, "segment not present", segment_not_present)
685DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0)
686
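
Expanding one instance by hand shows what the two macros above generate; for example, DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) becomes roughly:

asmlinkage void do_invalid_TSS(struct pt_regs *regs, long error_code)
{
        if (notify_die(DIE_TRAP, "invalid TSS", regs, error_code, 10, SIGSEGV)
                        == NOTIFY_STOP)
                return;
        conditional_sti(regs);
        do_trap(10, SIGSEGV, "invalid TSS", regs, error_code, NULL);
}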
687/* Runs on IST stack */
688asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code)
689{
690 if (notify_die(DIE_TRAP, "stack segment", regs, error_code,
691 12, SIGBUS) == NOTIFY_STOP)
692 return;
693 preempt_conditional_sti(regs);
694 do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL);
695 preempt_conditional_cli(regs);
696}
697
698asmlinkage void do_double_fault(struct pt_regs * regs, long error_code)
699{
700 static const char str[] = "double fault";
701 struct task_struct *tsk = current;
702
703 /* Return not checked because a double fault cannot be ignored */
704 notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV);
705
706 tsk->thread.error_code = error_code;
707 tsk->thread.trap_no = 8;
708
709 /* This is always a kernel trap and never fixable (and thus must
710 never return). */
711 for (;;)
712 die(str, regs, error_code);
713}
714
715asmlinkage void __kprobes
716do_general_protection(struct pt_regs *regs, long error_code)
717{
718 struct task_struct *tsk;
719
720 conditional_sti(regs);
721
722 tsk = current;
723 if (!user_mode(regs))
724 goto gp_in_kernel;
725
726 tsk->thread.error_code = error_code;
727 tsk->thread.trap_no = 13;
728
729 if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
730 printk_ratelimit()) {
731 printk(KERN_INFO
732 "%s[%d] general protection ip:%lx sp:%lx error:%lx",
733 tsk->comm, tsk->pid,
734 regs->ip, regs->sp, error_code);
735 print_vma_addr(" in ", regs->ip);
736 printk("\n");
737 }
738
739 force_sig(SIGSEGV, tsk);
740 return;
741
742gp_in_kernel:
743 if (fixup_exception(regs))
744 return;
745
746 tsk->thread.error_code = error_code;
747 tsk->thread.trap_no = 13;
748 if (notify_die(DIE_GPF, "general protection fault", regs,
749 error_code, 13, SIGSEGV) == NOTIFY_STOP)
750 return;
751 die("general protection fault", regs, error_code);
752}
753
754static notrace __kprobes void
755mem_parity_error(unsigned char reason, struct pt_regs *regs)
756{
757 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
758 reason);
759 printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n");
760
761#if defined(CONFIG_EDAC)
762 if (edac_handler_set()) {
763 edac_atomic_assert_error();
764 return;
765 }
766#endif
767
768 if (panic_on_unrecovered_nmi)
769 panic("NMI: Not continuing");
770
771 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
772
773 /* Clear and disable the memory parity error line. */
774 reason = (reason & 0xf) | 4;
775 outb(reason, 0x61);
776}
777
778static notrace __kprobes void
779io_check_error(unsigned char reason, struct pt_regs *regs)
780{
781 printk("NMI: IOCK error (debug interrupt?)\n");
782 show_registers(regs);
783
784 /* Re-enable the IOCK line, wait for a few seconds */
785 reason = (reason & 0xf) | 8;
786 outb(reason, 0x61);
787 mdelay(2000);
788 reason &= ~8;
789 outb(reason, 0x61);
790}
791
792static notrace __kprobes void
793unknown_nmi_error(unsigned char reason, struct pt_regs * regs)
794{
795 if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
796 return;
797 printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n",
798 reason);
799 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n");
800
801 if (panic_on_unrecovered_nmi)
802 panic("NMI: Not continuing");
803
804 printk(KERN_EMERG "Dazed and confused, but trying to continue\n");
805}
806
807/* Runs on IST stack. This code must keep interrupts off all the time.
808 Nested NMIs are prevented by the CPU. */
809asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs)
810{
811 unsigned char reason = 0;
812 int cpu;
813
814 cpu = smp_processor_id();
815
816 /* Only the BSP gets external NMIs from the system. */
817 if (!cpu)
818 reason = get_nmi_reason();
819
820 if (!(reason & 0xc0)) {
821 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
822 == NOTIFY_STOP)
823 return;
824 /*
825 * Ok, so this is none of the documented NMI sources,
826 * so it must be the NMI watchdog.
827 */
828 if (nmi_watchdog_tick(regs, reason))
829 return;
830 if (!do_nmi_callback(regs, cpu))
831 unknown_nmi_error(reason, regs);
832
833 return;
834 }
835 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP)
836 return;
837
838 /* AK: following checks seem to be broken on modern chipsets. FIXME */
839 if (reason & 0x80)
840 mem_parity_error(reason, regs);
841 if (reason & 0x40)
842 io_check_error(reason, regs);
843}
844
845asmlinkage notrace __kprobes void
846do_nmi(struct pt_regs *regs, long error_code)
847{
848 nmi_enter();
849
850 add_pda(__nmi_count, 1);
851
852 if (!ignore_nmis)
853 default_do_nmi(regs);
854
855 nmi_exit();
856}
857
858void stop_nmi(void)
859{
860 acpi_nmi_disable();
861 ignore_nmis++;
862}
863
864void restart_nmi(void)
865{
866 ignore_nmis--;
867 acpi_nmi_enable();
868}
869
870/* runs on IST stack. */
871asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
872{
873 trace_hardirqs_fixup();
874
875 if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
876 == NOTIFY_STOP)
877 return;
878
879 preempt_conditional_sti(regs);
880 do_trap(3, SIGTRAP, "int3", regs, error_code, NULL);
881 preempt_conditional_cli(regs);
882}
883
884/* Help handler running on IST stack to switch back to user stack
885 for scheduling or signal handling. The actual stack switch is done in
886 entry.S */
887asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
888{
889 struct pt_regs *regs = eregs;
890 /* Did already sync */
891 if (eregs == (struct pt_regs *)eregs->sp)
892 ;
893 /* Exception from user space */
894 else if (user_mode(eregs))
895 regs = task_pt_regs(current);
896 /* Exception from kernel and interrupts are enabled. Move to
897 kernel process stack. */
898 else if (eregs->flags & X86_EFLAGS_IF)
899 regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs));
900 if (eregs != regs)
901 *regs = *eregs;
902 return regs;
903}
904
905/* runs on IST stack. */
906asmlinkage void __kprobes do_debug(struct pt_regs * regs,
907 unsigned long error_code)
908{
909 struct task_struct *tsk = current;
910 unsigned long condition;
911 siginfo_t info;
912
913 trace_hardirqs_fixup();
914
915 get_debugreg(condition, 6);
916
917 /*
918 * The processor cleared BTF, so don't mark that we need it set.
919 */
920 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
921 tsk->thread.debugctlmsr = 0;
922
923 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code,
924 SIGTRAP) == NOTIFY_STOP)
925 return;
926
927 preempt_conditional_sti(regs);
928
929 /* Mask out spurious debug traps due to lazy DR7 setting */
930 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
931 if (!tsk->thread.debugreg7)
932 goto clear_dr7;
933 }
934
935 tsk->thread.debugreg6 = condition;
936
937 /*
938 * Single-stepping through TF: make sure we ignore any events in
939 * kernel space (but re-enable TF when returning to user mode).
940 */
941 if (condition & DR_STEP) {
942 if (!user_mode(regs))
943 goto clear_TF_reenable;
944 }
945
946 /* Ok, finally something we can handle */
947 tsk->thread.trap_no = 1;
948 tsk->thread.error_code = error_code;
949 info.si_signo = SIGTRAP;
950 info.si_errno = 0;
951 info.si_code = TRAP_BRKPT;
952 info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
953 force_sig_info(SIGTRAP, &info, tsk);
954
955clear_dr7:
956 set_debugreg(0, 7);
957 preempt_conditional_cli(regs);
958 return;
959
960clear_TF_reenable:
961 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
962 regs->flags &= ~X86_EFLAGS_TF;
963 preempt_conditional_cli(regs);
964 return;
965}
966
967static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
968{
969 if (fixup_exception(regs))
970 return 1;
971
972 notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
973 /* Illegal floating point operation in the kernel */
974 current->thread.trap_no = trapnr;
975 die(str, regs, 0);
976 return 0;
977}
978
979/*
980 * Note that we play around with the 'TS' bit in an attempt to get
981 * the correct behaviour even in the presence of the asynchronous
982 * IRQ13 behaviour
983 */
984asmlinkage void do_coprocessor_error(struct pt_regs *regs)
985{
986 void __user *ip = (void __user *)(regs->ip);
987 struct task_struct *task;
988 siginfo_t info;
989 unsigned short cwd, swd;
990
991 conditional_sti(regs);
992 if (!user_mode(regs) &&
993 kernel_math_error(regs, "kernel x87 math error", 16))
994 return;
995
996 /*
997 * Save the info for the exception handler and clear the error.
998 */
999 task = current;
1000 save_init_fpu(task);
1001 task->thread.trap_no = 16;
1002 task->thread.error_code = 0;
1003 info.si_signo = SIGFPE;
1004 info.si_errno = 0;
1005 info.si_code = __SI_FAULT;
1006 info.si_addr = ip;
1007 /*
1008 * (~cwd & swd) will mask out exceptions that are not set to unmasked
1009 * status. 0x3f is the exception bits in these regs, 0x200 is the
1010 * C1 reg you need in case of a stack fault, 0x040 is the stack
1011 * fault bit. We should only be taking one exception at a time,
1012 * so if this combination doesn't produce any single exception,
1013 * then we have a bad program that isn't synchronizing its FPU usage
1014 * and it will suffer the consequences since we won't be able to
1015 * fully reproduce the context of the exception
1016 */
1017 cwd = get_fpu_cwd(task);
1018 swd = get_fpu_swd(task);
1019 switch (swd & ~cwd & 0x3f) {
1020 case 0x000: /* No unmasked exception */
1021 default: /* Multiple exceptions */
1022 break;
1023 case 0x001: /* Invalid Op */
1024 /*
1025 * swd & 0x240 == 0x040: Stack Underflow
1026 * swd & 0x240 == 0x240: Stack Overflow
1027 * User must clear the SF bit (0x40) if set
1028 */
1029 info.si_code = FPE_FLTINV;
1030 break;
1031 case 0x002: /* Denormalize */
1032 case 0x010: /* Underflow */
1033 info.si_code = FPE_FLTUND;
1034 break;
1035 case 0x004: /* Zero Divide */
1036 info.si_code = FPE_FLTDIV;
1037 break;
1038 case 0x008: /* Overflow */
1039 info.si_code = FPE_FLTOVF;
1040 break;
1041 case 0x020: /* Precision */
1042 info.si_code = FPE_FLTRES;
1043 break;
1044 }
1045 force_sig_info(SIGFPE, &info, task);
1046}
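
The control/status word logic above boils down to "keep only the exception flags that are set in SWD and not masked in CWD, then map that bit to a si_code". A small user-space sketch of the same decode; the FPE_* codes come from <signal.h>, and the default case returns 0 where the kernel leaves __SI_FAULT in place:

#include <signal.h>

/* Map x87 control/status words to the si_code the handler would choose. */
int x87_si_code(unsigned short cwd, unsigned short swd)
{
        switch (swd & ~cwd & 0x3f) {     /* flagged and unmasked bits only */
        case 0x001: return FPE_FLTINV;   /* invalid op / x87 stack fault   */
        case 0x002:                      /* denormal operand               */
        case 0x010: return FPE_FLTUND;   /* underflow                      */
        case 0x004: return FPE_FLTDIV;   /* divide by zero                 */
        case 0x008: return FPE_FLTOVF;   /* overflow                       */
        case 0x020: return FPE_FLTRES;   /* inexact result (precision)     */
        default:    return 0;            /* none, or several at once       */
        }
}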
1047
1048asmlinkage void bad_intr(void)
1049{
1050 printk("bad interrupt");
1051}
1052
1053asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs)
1054{
1055 void __user *ip = (void __user *)(regs->ip);
1056 struct task_struct *task;
1057 siginfo_t info;
1058 unsigned short mxcsr;
1059
1060 conditional_sti(regs);
1061 if (!user_mode(regs) &&
1062 kernel_math_error(regs, "kernel simd math error", 19))
1063 return;
1064
1065 /*
1066 * Save the info for the exception handler and clear the error.
1067 */
1068 task = current;
1069 save_init_fpu(task);
1070 task->thread.trap_no = 19;
1071 task->thread.error_code = 0;
1072 info.si_signo = SIGFPE;
1073 info.si_errno = 0;
1074 info.si_code = __SI_FAULT;
1075 info.si_addr = ip;
1076 /*
1077 * The SIMD FPU exceptions are handled a little differently, as there
1078 * is only a single status/control register. Thus, to determine which
1079 * unmasked exception was caught we must mask the exception mask bits
1080 * at 0x1f80, and then use these to mask the exception bits at 0x3f.
1081 */
1082 mxcsr = get_fpu_mxcsr(task);
1083 switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
1084 case 0x000:
1085 default:
1086 break;
1087 case 0x001: /* Invalid Op */
1088 info.si_code = FPE_FLTINV;
1089 break;
1090 case 0x002: /* Denormalize */
1091 case 0x010: /* Underflow */
1092 info.si_code = FPE_FLTUND;
1093 break;
1094 case 0x004: /* Zero Divide */
1095 info.si_code = FPE_FLTDIV;
1096 break;
1097 case 0x008: /* Overflow */
1098 info.si_code = FPE_FLTOVF;
1099 break;
1100 case 0x020: /* Precision */
1101 info.si_code = FPE_FLTRES;
1102 break;
1103 }
1104 force_sig_info(SIGFPE, &info, task);
1105}
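
The MXCSR expression above works because SSE keeps both the exception flags (bits 0-5, mask 0x3f) and their mask bits (bits 7-12, mask 0x1f80) in the one register, so shifting the mask bits right by 7 lines them up with the flags. A one-function sketch:

/* Which SIMD exceptions are both flagged and unmasked in MXCSR? */
unsigned int simd_unmasked(unsigned int mxcsr)
{
        unsigned int flags = mxcsr & 0x3f;            /* IE DE ZE OE UE PE */
        unsigned int masks = (mxcsr & 0x1f80) >> 7;   /* IM DM ZM OM UM PM */

        return ~masks & flags;   /* the switch selector used above */
}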
1106
1107asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs)
1108{
1109}
1110
1111asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void)
1112{
1113}
1114
1115asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void)
1116{
1117}
1118
1119/*
1120 * 'math_state_restore()' saves the current math information in the
1121 * old math state array, and gets the new ones from the current task
1122 *
1123 * Careful.. There are problems with IBM-designed IRQ13 behaviour.
1124 * Don't touch unless you *really* know how it works.
1125 */
1126asmlinkage void math_state_restore(void)
1127{
1128 struct task_struct *me = current;
1129
1130 if (!used_math()) {
1131 local_irq_enable();
1132 /*
1133 * does a slab alloc which can sleep
1134 */
1135 if (init_fpu(me)) {
1136 /*
1137 * ran out of memory!
1138 */
1139 do_group_exit(SIGKILL);
1140 return;
1141 }
1142 local_irq_disable();
1143 }
1144
1145 clts(); /* Allow maths ops (or we recurse) */
1146 restore_fpu_checking(&me->thread.xstate->fxsave);
1147 task_thread_info(me)->status |= TS_USEDFPU;
1148 me->fpu_counter++;
1149}
1150EXPORT_SYMBOL_GPL(math_state_restore);
1151
1152void __init trap_init(void)
1153{
1154 set_intr_gate(0, &divide_error);
1155 set_intr_gate_ist(1, &debug, DEBUG_STACK);
1156 set_intr_gate_ist(2, &nmi, NMI_STACK);
1157 set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */
1158 set_system_gate(4, &overflow); /* int4 can be called from all */
1159 set_intr_gate(5, &bounds);
1160 set_intr_gate(6, &invalid_op);
1161 set_intr_gate(7, &device_not_available);
1162 set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK);
1163 set_intr_gate(9, &coprocessor_segment_overrun);
1164 set_intr_gate(10, &invalid_TSS);
1165 set_intr_gate(11, &segment_not_present);
1166 set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
1167 set_intr_gate(13, &general_protection);
1168 set_intr_gate(14, &page_fault);
1169 set_intr_gate(15, &spurious_interrupt_bug);
1170 set_intr_gate(16, &coprocessor_error);
1171 set_intr_gate(17, &alignment_check);
1172#ifdef CONFIG_X86_MCE
1173 set_intr_gate_ist(18, &machine_check, MCE_STACK);
1174#endif
1175 set_intr_gate(19, &simd_coprocessor_error);
1176
1177#ifdef CONFIG_IA32_EMULATION
1178 set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall);
1179#endif
1180 /*
1181 * initialize the per thread extended state:
1182 */
1183 init_thread_xstate();
1184 /*
1185 * Should be a barrier for any external CPU state:
1186 */
1187 cpu_init();
1188}
1189
1190static int __init oops_setup(char *s)
1191{
1192 if (!s)
1193 return -EINVAL;
1194 if (!strcmp(s, "panic"))
1195 panic_on_oops = 1;
1196 return 0;
1197}
1198early_param("oops", oops_setup);
1199
1200static int __init kstack_setup(char *s)
1201{
1202 if (!s)
1203 return -EINVAL;
1204 kstack_depth_to_print = simple_strtoul(s, NULL, 0);
1205 return 0;
1206}
1207early_param("kstack", kstack_setup);
1208
1209static int __init code_bytes_setup(char *s)
1210{
1211 code_bytes = simple_strtoul(s, NULL, 0);
1212 if (code_bytes > 8192)
1213 code_bytes = 8192;
1214
1215 return 1;
1216}
1217__setup("code_bytes=", code_bytes_setup);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 7603c0553909..161bb850fc47 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,7 +104,7 @@ __setup("notsc", notsc_setup);
104/* 104/*
105 * Read TSC and the reference counters. Take care of SMI disturbance 105 * Read TSC and the reference counters. Take care of SMI disturbance
106 */ 106 */
107static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) 107static u64 tsc_read_refs(u64 *p, int hpet)
108{ 108{
109 u64 t1, t2; 109 u64 t1, t2;
110 int i; 110 int i;
@@ -112,9 +112,9 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
112 for (i = 0; i < MAX_RETRIES; i++) { 112 for (i = 0; i < MAX_RETRIES; i++) {
113 t1 = get_cycles(); 113 t1 = get_cycles();
114 if (hpet) 114 if (hpet)
115 *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; 115 *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF;
116 else 116 else
117 *pm = acpi_pm_read_early(); 117 *p = acpi_pm_read_early();
118 t2 = get_cycles(); 118 t2 = get_cycles();
119 if ((t2 - t1) < SMI_TRESHOLD) 119 if ((t2 - t1) < SMI_TRESHOLD)
120 return t2; 120 return t2;
@@ -122,80 +122,390 @@ static u64 __init tsc_read_refs(u64 *pm, u64 *hpet)
122 return ULLONG_MAX; 122 return ULLONG_MAX;
123} 123}
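
tsc_read_refs() brackets the slow reference-clock read between two TSC reads and throws the sample away when the pair took suspiciously long, on the theory that an SMI fired in between. A user-space sketch of the same pattern; read_reference() and the threshold value are placeholders, not anything defined by this patch:

#include <stdint.h>
#include <x86intrin.h>                  /* __rdtsc()                          */

#define MAX_RETRIES    5
#define SMI_THRESHOLD  50000ULL         /* placeholder cycle budget per read  */

extern uint64_t read_reference(void);   /* hypothetical slow reference clock  */

/* Return the TSC timestamp of an undisturbed reference read, or UINT64_MAX. */
uint64_t read_ref_bracketed(uint64_t *ref)
{
        int i;

        for (i = 0; i < MAX_RETRIES; i++) {
                uint64_t t1 = __rdtsc();

                *ref = read_reference();
                uint64_t t2 = __rdtsc();

                if (t2 - t1 < SMI_THRESHOLD)
                        return t2;      /* quick enough: assume no SMI hit    */
        }
        return UINT64_MAX;              /* every attempt looked disturbed     */
}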
124 124
125/** 125/*
126 * native_calibrate_tsc - calibrate the tsc on boot 126 * Calculate the TSC frequency from HPET reference
127 */ 127 */
128unsigned long native_calibrate_tsc(void) 128static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2)
129{ 129{
130 unsigned long flags; 130 u64 tmp;
131 u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2;
132 int hpet = is_hpet_enabled();
133 unsigned int tsc_khz_val = 0;
134 131
135 local_irq_save(flags); 132 if (hpet2 < hpet1)
133 hpet2 += 0x100000000ULL;
134 hpet2 -= hpet1;
135 tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
136 do_div(tmp, 1000000);
137 do_div(deltatsc, tmp);
138
139 return (unsigned long) deltatsc;
140}
141
142/*
143 * Calculate the TSC frequency from PMTimer reference
144 */
145static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2)
146{
147 u64 tmp;
136 148
137 tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); 149 if (!pm1 && !pm2)
150 return ULONG_MAX;
151
152 if (pm2 < pm1)
153 pm2 += (u64)ACPI_PM_OVRRUN;
154 pm2 -= pm1;
155 tmp = pm2 * 1000000000LL;
156 do_div(tmp, PMTMR_TICKS_PER_SEC);
157 do_div(deltatsc, tmp);
158
159 return (unsigned long) deltatsc;
160}
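
Both helpers are doing the same unit conversion. The HPET period register counts femtoseconds per tick, so hpet2 * HPET_PERIOD / 10^6 is the elapsed time in nanoseconds; the ACPI PM timer runs at PMTMR_TICKS_PER_SEC (roughly 3.58 MHz, and only 24 bits wide, which is what the ACPI_PM_OVRRUN wrap handling is for), so pm2 * 10^9 / PMTMR_TICKS_PER_SEC is nanoseconds as well. Dividing the TSC delta by a nanosecond count only comes out in kHz because, in the mainline form of this change, the caller multiplies the raw TSC delta by 10^6 before calling these helpers; that caller is not visible in this excerpt, so treat the scaling factor as an assumption. As a rough check: a 2 GHz TSC over a 50 ms window gives deltatsc = 10^8 * 10^6 and an elapsed time of 5 * 10^7 ns, and the division yields 2 * 10^6, i.e. 2,000,000 kHz.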
161
162#define CAL_MS 10
163#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS))
164#define CAL_PIT_LOOPS 1000
165
166#define CAL2_MS 50
167#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS))
168#define CAL2_PIT_LOOPS 5000
169
170
171/*
172 * Try to calibrate the TSC against the Programmable
173 * Interrupt Timer and return the frequency of the TSC
174 * in kHz.
175 *
176 * Return ULONG_MAX on failure to calibrate.
177 */
178static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin)
179{
180 u64 tsc, t1, t2, delta;
181 unsigned long tscmin, tscmax;
182 int pitcnt;
138 183
184 /* Set the Gate high, disable speaker */
139 outb((inb(0x61) & ~0x02) | 0x01, 0x61); 185 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
140 186
187 /*
188 * Setup CTC channel 2* for mode 0, (interrupt on terminal
189 * count mode), binary count. Set the latch register to 50ms
190 * (LSB then MSB) to begin countdown.
191 */
141 outb(0xb0, 0x43); 192 outb(0xb0, 0x43);
142 outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); 193 outb(latch & 0xff, 0x42);
143 outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); 194 outb(latch >> 8, 0x42);
144 tr1 = get_cycles(); 195
145 while ((inb(0x61) & 0x20) == 0); 196 tsc = t1 = t2 = get_cycles();
146 tr2 = get_cycles(); 197
198 pitcnt = 0;
199 tscmax = 0;
200 tscmin = ULONG_MAX;
201 while ((inb(0x61) & 0x20) == 0) {
202 t2 = get_cycles();
203 delta = t2 - tsc;
204 tsc = t2;
205 if ((unsigned long) delta < tscmin)
206 tscmin = (unsigned int) delta;
207 if ((unsigned long) delta > tscmax)
208 tscmax = (unsigned int) delta;
209 pitcnt++;
210 }
211
212 /*
213 * Sanity checks:
214 *
215 * If we were not able to read the PIT more than loopmin
216 * times, then we have been hit by a massive SMI
217 *
218 * If the maximum is 10 times larger than the minimum,
219 * then we got hit by an SMI as well.
220 */
221 if (pitcnt < loopmin || tscmax > 10 * tscmin)
222 return ULONG_MAX;
223
224 /* Calculate the PIT value */
225 delta = t2 - t1;
226 do_div(delta, ms);
227 return delta;
228}
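
A quick sanity check on the constants above: the PIT input clock (CLOCK_TICK_RATE on x86) is 1193182 Hz, so CAL_LATCH = 1193182 / (1000 / 10) = 11931 ticks, about 10 ms, and CAL2_LATCH = 1193182 / 20 = 59659 ticks, about 50 ms. Because pit_calibrate_tsc() divides the measured TSC delta by the wait time in milliseconds, the value it returns is already in kHz (TSC cycles per millisecond).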
147 229
148 tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); 230/*
231 * This reads the current MSB of the PIT counter, and
232 * checks if we are running on sufficiently fast and
233 * non-virtualized hardware.
234 *
235 * Our expectations are:
236 *
237 * - the PIT is running at roughly 1.19MHz
238 *
239 * - each IO is going to take about 1us on real hardware,
240 * but we allow it to be much faster (by a factor of 10) or
241 * _slightly_ slower (ie we allow up to a 2us read+counter
242 * update - anything else implies a unacceptably slow CPU
243 * or PIT for the fast calibration to work.
244 *
245 * - with 256 PIT ticks to read the value, we have 214us to
246 * see the same MSB (and overhead like doing a single TSC
247 * read per MSB value etc).
248 *
249 * - We're doing 2 reads per loop (LSB, MSB), and we expect
250 * them each to take about a microsecond on real hardware.
251 * So we expect a count value of around 100. But we'll be
252 * generous, and accept anything over 50.
253 *
254 * - if the PIT is stuck, and we see *many* more reads, we
255 * return early (and the next caller of pit_expect_msb()
 256 * then considers it a failure when it doesn't see the
257 * next expected value).
258 *
259 * These expectations mean that we know that we have seen the
260 * transition from one expected value to another with a fairly
261 * high accuracy, and we didn't miss any events. We can thus
262 * use the TSC value at the transitions to calculate a pretty
 263 * good value for the TSC frequency.
264 */
265static inline int pit_expect_msb(unsigned char val)
266{
267 int count = 0;
149 268
269 for (count = 0; count < 50000; count++) {
270 /* Ignore LSB */
271 inb(0x42);
272 if (inb(0x42) != val)
273 break;
274 }
275 return count > 50;
276}
277
278/*
279 * How many MSB values do we want to see? We aim for a
280 * 15ms calibration, which assuming a 2us counter read
281 * error should give us roughly 150 ppm precision for
282 * the calibration.
283 */
284#define QUICK_PIT_MS 15
285#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256)
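
The 150 ppm figure in the comment above follows from the numbers: a worst-case 2 us read error over a roughly 15 ms window is 2 / 15000, about 133 ppm, rounded up. With PIT_TICK_RATE = 1193182 Hz, QUICK_PIT_ITERATIONS works out to 15 * 1193182 / 1000 / 256 = 69 with integer division, i.e. 69 observed MSB decrements of 256 PIT ticks each, or roughly 14.8 ms of counting.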
286
287static unsigned long quick_pit_calibrate(void)
288{
289 /* Set the Gate high, disable speaker */
290 outb((inb(0x61) & ~0x02) | 0x01, 0x61);
291
292 /*
293 * Counter 2, mode 0 (one-shot), binary count
294 *
295 * NOTE! Mode 2 decrements by two (and then the
296 * output is flipped each time, giving the same
297 * final output frequency as a decrement-by-one),
298 * so mode 0 is much better when looking at the
299 * individual counts.
300 */
301 outb(0xb0, 0x43);
302
303 /* Start at 0xffff */
304 outb(0xff, 0x42);
305 outb(0xff, 0x42);
306
307 if (pit_expect_msb(0xff)) {
308 int i;
309 u64 t1, t2, delta;
310 unsigned char expect = 0xfe;
311
312 t1 = get_cycles();
313 for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) {
314 if (!pit_expect_msb(expect))
315 goto failed;
316 }
317 t2 = get_cycles();
318
319 /*
320 * Make sure we can rely on the second TSC timestamp:
321 */
322 if (!pit_expect_msb(expect))
323 goto failed;
324
325 /*
326 * Ok, if we get here, then we've seen the
327 * MSB of the PIT decrement QUICK_PIT_ITERATIONS
328 * times, and each MSB had many hits, so we never
329 * had any sudden jumps.
330 *
331 * As a result, we can depend on there not being
332 * any odd delays anywhere, and the TSC reads are
333 * reliable.
334 *
335 * kHz = ticks / time-in-seconds / 1000;
336 * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000
337 * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000)
338 */
339 delta = (t2 - t1)*PIT_TICK_RATE;
340 do_div(delta, QUICK_PIT_ITERATIONS*256*1000);
341 printk("Fast TSC calibration using PIT\n");
342 return delta;
343 }
344failed:
345 return 0;
346}
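A standalone check of the kHz formula from the comment above, using an assumed 2 GHz TSC, the standard 1193182 Hz PIT rate and 69 iterations (all illustrative values, not taken from this diff):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t pit_tick_rate = 1193182;	/* assumed PIT input clock, Hz */
	const uint64_t iterations = 69;		/* QUICK_PIT_ITERATIONS for a 15 ms run */

	/* A 2 GHz TSC advances ~29.6M cycles while the PIT MSB steps 69 times */
	uint64_t cycles = 29612800ULL;

	uint64_t khz = cycles * pit_tick_rate;
	khz /= iterations * 256 * 1000;		/* kHz = (t2 - t1) * PIT_TICK_RATE / (QPI * 256 * 1000) */

	printf("TSC ~= %llu kHz\n", (unsigned long long)khz);	/* ~2000000 */
	return 0;
}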
347
348/**
349 * native_calibrate_tsc - calibrate the tsc on boot
350 */
351unsigned long native_calibrate_tsc(void)
352{
353 u64 tsc1, tsc2, delta, ref1, ref2;
354 unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
355 unsigned long flags, latch, ms, fast_calibrate;
356 int hpet = is_hpet_enabled(), i, loopmin;
357
358 local_irq_save(flags);
359 fast_calibrate = quick_pit_calibrate();
150 local_irq_restore(flags); 360 local_irq_restore(flags);
361 if (fast_calibrate)
362 return fast_calibrate;
151 363
152 /* 364 /*
153 * Preset the result with the raw and inaccurate PIT 365 * Run 3 calibration loops to get the lowest frequency value
154 * calibration value 366 * (the best estimate). We use two different calibration modes
367 * here:
368 *
369 * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and
370 * load a timeout of 50ms. We read the time right after we
371 * started the timer and wait until the PIT count down reaches
372 * zero. In each wait loop iteration we read the TSC and check
373 * the delta to the previous read. We keep track of the min
374 * and max values of that delta. The delta is mostly defined
375 * by the IO time of the PIT access, so we can detect when a
376 * SMI/SMM disturbance happened between the two reads. If the
377 * maximum time is significantly larger than the minimum time,
378 * then we discard the result and have another try.
379 *
380 * 2) Reference counter. If available we use the HPET or the
381 * PMTIMER as a reference to check the sanity of that value.
382 * We use separate TSC readouts and check inside of the
383 * reference read for an SMI/SMM disturbance. We discard
384 * disturbed values here as well. We do that around the PIT
385 * calibration delay loop as we have to wait for a certain
386 * amount of time anyway.
155 */ 387 */
156 delta = (tr2 - tr1); 388
157 do_div(delta, 50); 389 /* Preset PIT loop values */
158 tsc_khz_val = delta; 390 latch = CAL_LATCH;
159 391 ms = CAL_MS;
160 /* hpet or pmtimer available ? */ 392 loopmin = CAL_PIT_LOOPS;
161 if (!hpet && !pm1 && !pm2) { 393
162 printk(KERN_INFO "TSC calibrated against PIT\n"); 394 for (i = 0; i < 3; i++) {
163 goto out; 395 unsigned long tsc_pit_khz;
396
397 /*
398 * Read the start value and the reference count of
399 * hpet/pmtimer when available. Then do the PIT
400 * calibration, which will take at least 50ms, and
401 * read the end value.
402 */
403 local_irq_save(flags);
404 tsc1 = tsc_read_refs(&ref1, hpet);
405 tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin);
406 tsc2 = tsc_read_refs(&ref2, hpet);
407 local_irq_restore(flags);
408
409 /* Pick the lowest PIT TSC calibration so far */
410 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
411
412 /* hpet or pmtimer available ? */
413 if (!hpet && !ref1 && !ref2)
414 continue;
415
416 /* Check whether the sampling was disturbed by an SMI */
417 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX)
418 continue;
419
420 tsc2 = (tsc2 - tsc1) * 1000000LL;
421 if (hpet)
422 tsc2 = calc_hpet_ref(tsc2, ref1, ref2);
423 else
424 tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2);
425
426 tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2);
427
428 /* Check the reference deviation */
429 delta = ((u64) tsc_pit_min) * 100;
430 do_div(delta, tsc_ref_min);
431
432 /*
433 * If both calibration results are inside a 10% window
434 * then we can be sure that the calibration
435 * succeeded. We break out of the loop right away. We
436 * use the reference value, as it is more precise.
437 */
438 if (delta >= 90 && delta <= 110) {
439 printk(KERN_INFO
440 "TSC: PIT calibration matches %s. %d loops\n",
441 hpet ? "HPET" : "PMTIMER", i + 1);
442 return tsc_ref_min;
443 }
444
445 /*
446 * Check whether PIT failed more than once. This
447 * happens in virtualized environments. We need to
448 * give the virtual PC a slightly longer timeframe for
449 * the HPET/PMTIMER to make the result precise.
450 */
451 if (i == 1 && tsc_pit_min == ULONG_MAX) {
452 latch = CAL2_LATCH;
453 ms = CAL2_MS;
454 loopmin = CAL2_PIT_LOOPS;
455 }
164 } 456 }
165 457
166 /* Check, whether the sampling was disturbed by an SMI */ 458 /*
167 if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { 459 * Now check the results.
168 printk(KERN_WARNING "TSC calibration disturbed by SMI, " 460 */
169 "using PIT calibration result\n"); 461 if (tsc_pit_min == ULONG_MAX) {
170 goto out; 462 /* PIT gave no useful value */
463 printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n");
464
465 /* We don't have an alternative source, disable TSC */
466 if (!hpet && !ref1 && !ref2) {
467 printk("TSC: No reference (HPET/PMTIMER) available\n");
468 return 0;
469 }
470
471 /* The alternative source failed as well, disable TSC */
472 if (tsc_ref_min == ULONG_MAX) {
473 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration "
474 "failed.\n");
475 return 0;
476 }
477
478 /* Use the alternative source */
479 printk(KERN_INFO "TSC: using %s reference calibration\n",
480 hpet ? "HPET" : "PMTIMER");
481
482 return tsc_ref_min;
171 } 483 }
172 484
173 tsc2 = (tsc2 - tsc1) * 1000000LL; 485 /* We don't have an alternative source, use the PIT calibration value */
174 486 if (!hpet && !ref1 && !ref2) {
175 if (hpet) { 487 printk(KERN_INFO "TSC: Using PIT calibration value\n");
176 printk(KERN_INFO "TSC calibrated against HPET\n"); 488 return tsc_pit_min;
177 if (hpet2 < hpet1)
178 hpet2 += 0x100000000ULL;
179 hpet2 -= hpet1;
180 tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD));
181 do_div(tsc1, 1000000);
182 } else {
183 printk(KERN_INFO "TSC calibrated against PM_TIMER\n");
184 if (pm2 < pm1)
185 pm2 += (u64)ACPI_PM_OVRRUN;
186 pm2 -= pm1;
187 tsc1 = pm2 * 1000000000LL;
188 do_div(tsc1, PMTMR_TICKS_PER_SEC);
189 } 489 }
190 490
191 do_div(tsc2, tsc1); 491 /* The alternative source failed, use the PIT calibration value */
192 tsc_khz_val = tsc2; 492 if (tsc_ref_min == ULONG_MAX) {
493 printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. "
494 "Using PIT calibration\n");
495 return tsc_pit_min;
496 }
193 497
194out: 498 /*
195 return tsc_khz_val; 499 * The calibration values differ too much. In doubt, we use
500 * the PIT value as we know that there are PMTIMERs around
501 * running at double speed. At least we let the user know:
502 */
503 printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n",
504 hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min);
505 printk(KERN_INFO "TSC: Using PIT calibration value\n");
506 return tsc_pit_min;
196} 507}
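The 10% cross-check above expresses the PIT result as a percentage of the reference result and accepts it when that percentage falls in the 90..110 window. A standalone sketch with made-up sample frequencies:

#include <stdio.h>

int main(void)
{
	unsigned long tsc_pit_min = 1995000;	/* kHz from the PIT loop (example) */
	unsigned long tsc_ref_min = 2001000;	/* kHz from HPET/PMTIMER (example) */

	unsigned long long delta = (unsigned long long)tsc_pit_min * 100;
	delta /= tsc_ref_min;			/* 99: inside the 90..110 window */

	if (delta >= 90 && delta <= 110)
		printf("calibrations agree, using the reference value: %lu kHz\n",
		       tsc_ref_min);
	else
		printf("calibrations disagree, falling back to PIT: %lu kHz\n",
		       tsc_pit_min);
	return 0;
}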
197 508
198
199#ifdef CONFIG_X86_32 509#ifdef CONFIG_X86_32
200/* Only called from the Powernow K7 cpu freq driver */ 510/* Only called from the Powernow K7 cpu freq driver */
201int recalibrate_cpu_khz(void) 511int recalibrate_cpu_khz(void)
@@ -314,7 +624,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
314 mark_tsc_unstable("cpufreq changes"); 624 mark_tsc_unstable("cpufreq changes");
315 } 625 }
316 626
317 set_cyc2ns_scale(tsc_khz_ref, freq->cpu); 627 set_cyc2ns_scale(tsc_khz, freq->cpu);
318 628
319 return 0; 629 return 0;
320} 630}
@@ -325,6 +635,10 @@ static struct notifier_block time_cpufreq_notifier_block = {
325 635
326static int __init cpufreq_tsc(void) 636static int __init cpufreq_tsc(void)
327{ 637{
638 if (!cpu_has_tsc)
639 return 0;
640 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
641 return 0;
328 cpufreq_register_notifier(&time_cpufreq_notifier_block, 642 cpufreq_register_notifier(&time_cpufreq_notifier_block,
329 CPUFREQ_TRANSITION_NOTIFIER); 643 CPUFREQ_TRANSITION_NOTIFIER);
330 return 0; 644 return 0;
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index 0577825cf89b..9ffb01c31c40 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void)
88 __raw_spin_unlock(&sync_lock); 88 __raw_spin_unlock(&sync_lock);
89 } 89 }
90 } 90 }
91 if (!(now-start)) { 91 WARN(!(now-start),
92 printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", 92 "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
93 now-start, end-start); 93 now-start, end-start);
94 WARN_ON(1);
95 }
96} 94}
97 95
98/* 96/*
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
new file mode 100644
index 000000000000..aeef529917e4
--- /dev/null
+++ b/arch/x86/kernel/uv_irq.c
@@ -0,0 +1,79 @@
1/*
2 * This file is subject to the terms and conditions of the GNU General Public
3 * License. See the file "COPYING" in the main directory of this archive
4 * for more details.
5 *
6 * SGI UV IRQ functions
7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved.
9 */
10
11#include <linux/module.h>
12#include <linux/irq.h>
13
14#include <asm/apic.h>
15#include <asm/uv/uv_irq.h>
16
17static void uv_noop(unsigned int irq)
18{
19}
20
21static unsigned int uv_noop_ret(unsigned int irq)
22{
23 return 0;
24}
25
26static void uv_ack_apic(unsigned int irq)
27{
28 ack_APIC_irq();
29}
30
31struct irq_chip uv_irq_chip = {
32 .name = "UV-CORE",
33 .startup = uv_noop_ret,
34 .shutdown = uv_noop,
35 .enable = uv_noop,
36 .disable = uv_noop,
37 .ack = uv_noop,
38 .mask = uv_noop,
39 .unmask = uv_noop,
40 .eoi = uv_ack_apic,
41 .end = uv_noop,
42};
43
44/*
45 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised.
48 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset)
51{
52 int irq;
53 int ret;
54
55 irq = create_irq();
56 if (irq <= 0)
57 return -EBUSY;
58
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset);
60 if (ret != irq)
61 destroy_irq(irq);
62
63 return ret;
64}
65EXPORT_SYMBOL_GPL(uv_setup_irq);
66
67/*
68 * Tear down a mapping of an irq and vector, and disable the specified MMR that
69 * defined the MSI that was to be sent to the specified CPU when an interrupt
70 * was raised.
71 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset)
75{
76 arch_disable_uv_irq(mmr_blade, mmr_offset);
77 destroy_irq(irq);
78}
79EXPORT_SYMBOL_GPL(uv_teardown_irq);
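A hypothetical caller of the two exports above, to show how they are meant to pair up; the device name, blade number and MMR offset are placeholders, and the fragment assumes the kernel context these functions live in rather than being buildable on its own:

/* Hypothetical driver fragment (illustrative only, names are placeholders) */
static int example_bind_uv_irq(int cpu, int mmr_blade, unsigned long mmr_offset)
{
	int irq = uv_setup_irq("uv-example", cpu, mmr_blade, mmr_offset);

	if (irq < 0)
		return irq;	/* -EBUSY or the arch_enable_uv_irq() error */

	/* ... request_irq() and device-specific MMR programming would go here ... */

	/* Undo with the same blade/offset when the device goes away: */
	uv_teardown_irq(irq, mmr_blade, mmr_offset);
	return 0;
}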
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
new file mode 100644
index 000000000000..67f9b9dbf800
--- /dev/null
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -0,0 +1,72 @@
1/*
2 * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV.
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson
20 */
21
22#include <linux/sysdev.h>
23#include <asm/uv/bios.h>
24
25struct kobject *sgi_uv_kobj;
26
27static ssize_t partition_id_show(struct kobject *kobj,
28 struct kobj_attribute *attr, char *buf)
29{
30 return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id);
31}
32
33static ssize_t coherence_id_show(struct kobject *kobj,
34 struct kobj_attribute *attr, char *buf)
35{
36 return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id());
37}
38
39static struct kobj_attribute partition_id_attr =
40 __ATTR(partition_id, S_IRUGO, partition_id_show, NULL);
41
42static struct kobj_attribute coherence_id_attr =
43 __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL);
44
45
46static int __init sgi_uv_sysfs_init(void)
47{
48 unsigned long ret;
49
50 if (!sgi_uv_kobj)
51 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
52 if (!sgi_uv_kobj) {
53 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n");
54 return -EINVAL;
55 }
56
57 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
58 if (ret) {
59 printk(KERN_WARNING "sysfs_create_file partition_id failed \n");
60 return ret;
61 }
62
63 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
64 if (ret) {
65 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n");
66 return ret;
67 }
68
69 return 0;
70}
71
72device_initcall(sgi_uv_sysfs_init);
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index e94bdb6add1d..0c9667f0752a 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -25,45 +25,31 @@
25#include <asm/visws/cobalt.h> 25#include <asm/visws/cobalt.h>
26#include <asm/visws/piix4.h> 26#include <asm/visws/piix4.h>
27#include <asm/arch_hooks.h> 27#include <asm/arch_hooks.h>
28#include <asm/io_apic.h>
28#include <asm/fixmap.h> 29#include <asm/fixmap.h>
29#include <asm/reboot.h> 30#include <asm/reboot.h>
30#include <asm/setup.h> 31#include <asm/setup.h>
31#include <asm/e820.h> 32#include <asm/e820.h>
32#include <asm/smp.h>
33#include <asm/io.h> 33#include <asm/io.h>
34 34
35#include <mach_ipi.h> 35#include <mach_ipi.h>
36 36
37#include "mach_apic.h" 37#include "mach_apic.h"
38 38
39#include <linux/init.h>
40#include <linux/smp.h>
41
42#include <linux/kernel_stat.h> 39#include <linux/kernel_stat.h>
43#include <linux/interrupt.h>
44#include <linux/init.h>
45 40
46#include <asm/io.h>
47#include <asm/apic.h>
48#include <asm/i8259.h> 41#include <asm/i8259.h>
49#include <asm/irq_vectors.h> 42#include <asm/irq_vectors.h>
50#include <asm/visws/cobalt.h>
51#include <asm/visws/lithium.h> 43#include <asm/visws/lithium.h>
52#include <asm/visws/piix4.h>
53 44
54#include <linux/sched.h> 45#include <linux/sched.h>
55#include <linux/kernel.h> 46#include <linux/kernel.h>
56#include <linux/init.h>
57#include <linux/pci.h> 47#include <linux/pci.h>
58#include <linux/pci_ids.h> 48#include <linux/pci_ids.h>
59 49
60extern int no_broadcast; 50extern int no_broadcast;
61 51
62#include <asm/io.h>
63#include <asm/apic.h> 52#include <asm/apic.h>
64#include <asm/arch_hooks.h>
65#include <asm/visws/cobalt.h>
66#include <asm/visws/lithium.h>
67 53
68char visws_board_type = -1; 54char visws_board_type = -1;
69char visws_board_rev = -1; 55char visws_board_rev = -1;
@@ -73,7 +59,7 @@ int is_visws_box(void)
73 return visws_board_type >= 0; 59 return visws_board_type >= 0;
74} 60}
75 61
76static int __init visws_time_init_quirk(void) 62static int __init visws_time_init(void)
77{ 63{
78 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 64 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
79 65
@@ -93,7 +79,7 @@ static int __init visws_time_init_quirk(void)
93 return 0; 79 return 0;
94} 80}
95 81
96static int __init visws_pre_intr_init_quirk(void) 82static int __init visws_pre_intr_init(void)
97{ 83{
98 init_VISWS_APIC_irqs(); 84 init_VISWS_APIC_irqs();
99 85
@@ -114,7 +100,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size);
114 100
115long long mem_size __initdata = 0; 101long long mem_size __initdata = 0;
116 102
117static char * __init visws_memory_setup_quirk(void) 103static char * __init visws_memory_setup(void)
118{ 104{
119 long long gfx_mem_size = 8 * MB; 105 long long gfx_mem_size = 8 * MB;
120 106
@@ -176,7 +162,7 @@ static void visws_machine_power_off(void)
176 outl(PIIX_SPECIAL_STOP, 0xCFC); 162 outl(PIIX_SPECIAL_STOP, 0xCFC);
177} 163}
178 164
179static int __init visws_get_smp_config_quirk(unsigned int early) 165static int __init visws_get_smp_config(unsigned int early)
180{ 166{
181 /* 167 /*
182 * Prevent MP-table parsing by the generic code: 168 * Prevent MP-table parsing by the generic code:
@@ -184,15 +170,13 @@ static int __init visws_get_smp_config_quirk(unsigned int early)
184 return 1; 170 return 1;
185} 171}
186 172
187extern unsigned int __cpuinitdata maxcpus;
188
189/* 173/*
190 * The Visual Workstation is Intel MP compliant in the hardware 174 * The Visual Workstation is Intel MP compliant in the hardware
191 * sense, but it doesn't have a BIOS(-configuration table). 175 * sense, but it doesn't have a BIOS(-configuration table).
192 * No problem for Linux. 176 * No problem for Linux.
193 */ 177 */
194 178
195static void __init MP_processor_info (struct mpc_config_processor *m) 179static void __init MP_processor_info(struct mpc_config_processor *m)
196{ 180{
197 int ver, logical_apicid; 181 int ver, logical_apicid;
198 physid_mask_t apic_cpus; 182 physid_mask_t apic_cpus;
@@ -232,7 +216,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m)
232 apic_version[m->mpc_apicid] = ver; 216 apic_version[m->mpc_apicid] = ver;
233} 217}
234 218
235int __init visws_find_smp_config_quirk(unsigned int reserve) 219static int __init visws_find_smp_config(unsigned int reserve)
236{ 220{
237 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); 221 struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS);
238 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 222 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -244,8 +228,8 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
244 ncpus = CO_CPU_MAX; 228 ncpus = CO_CPU_MAX;
245 } 229 }
246 230
247 if (ncpus > maxcpus) 231 if (ncpus > setup_max_cpus)
248 ncpus = maxcpus; 232 ncpus = setup_max_cpus;
249 233
250#ifdef CONFIG_X86_LOCAL_APIC 234#ifdef CONFIG_X86_LOCAL_APIC
251 smp_found_config = 1; 235 smp_found_config = 1;
@@ -258,7 +242,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve)
258 return 1; 242 return 1;
259} 243}
260 244
261extern int visws_trap_init_quirk(void); 245static int visws_trap_init(void);
246
247static struct x86_quirks visws_x86_quirks __initdata = {
248 .arch_time_init = visws_time_init,
249 .arch_pre_intr_init = visws_pre_intr_init,
250 .arch_memory_setup = visws_memory_setup,
251 .arch_intr_init = NULL,
252 .arch_trap_init = visws_trap_init,
253 .mach_get_smp_config = visws_get_smp_config,
254 .mach_find_smp_config = visws_find_smp_config,
255};
262 256
263void __init visws_early_detect(void) 257void __init visws_early_detect(void)
264{ 258{
@@ -272,16 +266,10 @@ void __init visws_early_detect(void)
272 266
273 /* 267 /*
274 * Install special quirks for timer, interrupt and memory setup: 268 * Install special quirks for timer, interrupt and memory setup:
275 */
276 arch_time_init_quirk = visws_time_init_quirk;
277 arch_pre_intr_init_quirk = visws_pre_intr_init_quirk;
278 arch_memory_setup_quirk = visws_memory_setup_quirk;
279
280 /*
281 * Fall back to generic behavior for traps: 269 * Fall back to generic behavior for traps:
270 * Override generic MP-table parsing:
282 */ 271 */
283 arch_intr_init_quirk = NULL; 272 x86_quirks = &visws_x86_quirks;
284 arch_trap_init_quirk = visws_trap_init_quirk;
285 273
286 /* 274 /*
287 * Install reboot quirks: 275 * Install reboot quirks:
@@ -294,12 +282,6 @@ void __init visws_early_detect(void)
294 */ 282 */
295 no_broadcast = 0; 283 no_broadcast = 0;
296 284
297 /*
298 * Override generic MP-table parsing:
299 */
300 mach_get_smp_config_quirk = visws_get_smp_config_quirk;
301 mach_find_smp_config_quirk = visws_find_smp_config_quirk;
302
303#ifdef CONFIG_X86_IO_APIC 285#ifdef CONFIG_X86_IO_APIC
304 /* 286 /*
305 * Turn off IO-APIC detection and initialization: 287 * Turn off IO-APIC detection and initialization:
@@ -426,7 +408,7 @@ static __init void cobalt_init(void)
426 co_apic_read(CO_APIC_ID)); 408 co_apic_read(CO_APIC_ID));
427} 409}
428 410
429int __init visws_trap_init_quirk(void) 411static int __init visws_trap_init(void)
430{ 412{
431 lithium_init(); 413 lithium_init();
432 cobalt_init(); 414 cobalt_init();
@@ -502,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq)
502static unsigned int startup_cobalt_irq(unsigned int irq) 484static unsigned int startup_cobalt_irq(unsigned int irq)
503{ 485{
504 unsigned long flags; 486 unsigned long flags;
487 struct irq_desc *desc = irq_to_desc(irq);
505 488
506 spin_lock_irqsave(&cobalt_lock, flags); 489 spin_lock_irqsave(&cobalt_lock, flags);
507 if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 490 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING)))
508 irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); 491 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
509 enable_cobalt_irq(irq); 492 enable_cobalt_irq(irq);
510 spin_unlock_irqrestore(&cobalt_lock, flags); 493 spin_unlock_irqrestore(&cobalt_lock, flags);
511 return 0; 494 return 0;
@@ -524,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq)
524static void end_cobalt_irq(unsigned int irq) 507static void end_cobalt_irq(unsigned int irq)
525{ 508{
526 unsigned long flags; 509 unsigned long flags;
510 struct irq_desc *desc = irq_to_desc(irq);
527 511
528 spin_lock_irqsave(&cobalt_lock, flags); 512 spin_lock_irqsave(&cobalt_lock, flags);
529 if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) 513 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
530 enable_cobalt_irq(irq); 514 enable_cobalt_irq(irq);
531 spin_unlock_irqrestore(&cobalt_lock, flags); 515 spin_unlock_irqrestore(&cobalt_lock, flags);
532} 516}
@@ -644,12 +628,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
644 628
645 spin_unlock_irqrestore(&i8259A_lock, flags); 629 spin_unlock_irqrestore(&i8259A_lock, flags);
646 630
647 desc = irq_desc + realirq; 631 desc = irq_to_desc(realirq);
648 632
649 /* 633 /*
650 * handle this 'virtual interrupt' as a Cobalt one now. 634 * handle this 'virtual interrupt' as a Cobalt one now.
651 */ 635 */
652 kstat_cpu(smp_processor_id()).irqs[realirq]++; 636 kstat_incr_irqs_this_cpu(realirq, desc);
653 637
654 if (likely(desc->action != NULL)) 638 if (likely(desc->action != NULL))
655 handle_IRQ_event(realirq, desc->action); 639 handle_IRQ_event(realirq, desc->action);
@@ -680,27 +664,29 @@ void init_VISWS_APIC_irqs(void)
680 int i; 664 int i;
681 665
682 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 666 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
683 irq_desc[i].status = IRQ_DISABLED; 667 struct irq_desc *desc = irq_to_desc(i);
684 irq_desc[i].action = 0; 668
685 irq_desc[i].depth = 1; 669 desc->status = IRQ_DISABLED;
670 desc->action = 0;
671 desc->depth = 1;
686 672
687 if (i == 0) { 673 if (i == 0) {
688 irq_desc[i].chip = &cobalt_irq_type; 674 desc->chip = &cobalt_irq_type;
689 } 675 }
690 else if (i == CO_IRQ_IDE0) { 676 else if (i == CO_IRQ_IDE0) {
691 irq_desc[i].chip = &cobalt_irq_type; 677 desc->chip = &cobalt_irq_type;
692 } 678 }
693 else if (i == CO_IRQ_IDE1) { 679 else if (i == CO_IRQ_IDE1) {
694 irq_desc[i].chip = &cobalt_irq_type; 680 desc->chip = &cobalt_irq_type;
695 } 681 }
696 else if (i == CO_IRQ_8259) { 682 else if (i == CO_IRQ_8259) {
697 irq_desc[i].chip = &piix4_master_irq_type; 683 desc->chip = &piix4_master_irq_type;
698 } 684 }
699 else if (i < CO_IRQ_APIC0) { 685 else if (i < CO_IRQ_APIC0) {
700 irq_desc[i].chip = &piix4_virtual_irq_type; 686 desc->chip = &piix4_virtual_irq_type;
701 } 687 }
702 else if (IS_CO_APIC(i)) { 688 else if (IS_CO_APIC(i)) {
703 irq_desc[i].chip = &cobalt_irq_type; 689 desc->chip = &cobalt_irq_type;
704 } 690 }
705 } 691 }
706 692
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 38f566fa27d2..4eeb5cf9720d 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -46,6 +46,7 @@
46#include <asm/io.h> 46#include <asm/io.h>
47#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/irq.h> 48#include <asm/irq.h>
49#include <asm/syscalls.h>
49 50
50/* 51/*
51 * Known problems: 52 * Known problems:
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index b15346092b7b..8b6c393ab9fd 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -37,6 +37,7 @@
37#include <asm/timer.h> 37#include <asm/timer.h>
38#include <asm/vmi_time.h> 38#include <asm/vmi_time.h>
39#include <asm/kmap_types.h> 39#include <asm/kmap_types.h>
40#include <asm/setup.h>
40 41
41/* Convenient for calling VMI functions indirectly in the ROM */ 42/* Convenient for calling VMI functions indirectly in the ROM */
42typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); 43typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
@@ -234,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
234 const void *desc) 235 const void *desc)
235{ 236{
236 u32 *ldt_entry = (u32 *)desc; 237 u32 *ldt_entry = (u32 *)desc;
237 vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); 238 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
238} 239}
239 240
240static void vmi_load_sp0(struct tss_struct *tss, 241static void vmi_load_sp0(struct tss_struct *tss,
@@ -392,13 +393,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
392} 393}
393#endif 394#endif
394 395
395static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) 396static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
396{ 397{
397 vmi_set_page_type(pfn, VMI_PAGE_L1); 398 vmi_set_page_type(pfn, VMI_PAGE_L1);
398 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 399 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
399} 400}
400 401
401static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) 402static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
402{ 403{
403 /* 404 /*
404 * This call comes in very early, before mem_map is setup. 405 * This call comes in very early, before mem_map is setup.
@@ -409,20 +410,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn)
409 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); 410 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
410} 411}
411 412
412static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) 413static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
413{ 414{
414 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); 415 vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE);
415 vmi_check_page_type(clonepfn, VMI_PAGE_L2); 416 vmi_check_page_type(clonepfn, VMI_PAGE_L2);
416 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); 417 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
417} 418}
418 419
419static void vmi_release_pte(u32 pfn) 420static void vmi_release_pte(unsigned long pfn)
420{ 421{
421 vmi_ops.release_page(pfn, VMI_PAGE_L1); 422 vmi_ops.release_page(pfn, VMI_PAGE_L1);
422 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 423 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
423} 424}
424 425
425static void vmi_release_pmd(u32 pfn) 426static void vmi_release_pmd(unsigned long pfn)
426{ 427{
427 vmi_ops.release_page(pfn, VMI_PAGE_L2); 428 vmi_ops.release_page(pfn, VMI_PAGE_L2);
428 vmi_set_page_type(pfn, VMI_PAGE_NORMAL); 429 vmi_set_page_type(pfn, VMI_PAGE_NORMAL);
@@ -683,7 +684,7 @@ void vmi_bringup(void)
683{ 684{
684 /* We must establish the lowmem mapping for MMU ops to work */ 685 /* We must establish the lowmem mapping for MMU ops to work */
685 if (vmi_ops.set_linear_mapping) 686 if (vmi_ops.set_linear_mapping)
686 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); 687 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
687} 688}
688 689
689/* 690/*
@@ -904,9 +905,8 @@ static inline int __init activate_vmi(void)
904#endif 905#endif
905 906
906#ifdef CONFIG_X86_LOCAL_APIC 907#ifdef CONFIG_X86_LOCAL_APIC
907 para_fill(pv_apic_ops.apic_read, APICRead); 908 para_fill(apic_ops->read, APICRead);
908 para_fill(pv_apic_ops.apic_write, APICWrite); 909 para_fill(apic_ops->write, APICWrite);
909 para_fill(pv_apic_ops.apic_write_atomic, APICWrite);
910#endif 910#endif
911 911
912 /* 912 /*
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 6953859fe289..254ee07f8635 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void)
235 235
236void __init vmi_time_init(void) 236void __init vmi_time_init(void)
237{ 237{
238 unsigned int cpu;
238 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ 239 /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */
239 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ 240 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
240 241
241 vmi_time_init_clockevent(); 242 vmi_time_init_clockevent();
242 setup_irq(0, &vmi_clock_action); 243 setup_irq(0, &vmi_clock_action);
244 for_each_possible_cpu(cpu)
245 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
243} 246}
244 247
245#ifdef CONFIG_X86_LOCAL_APIC 248#ifdef CONFIG_X86_LOCAL_APIC
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
index cdb2363697d2..a9b8560adbc2 100644
--- a/arch/x86/kernel/vmlinux_32.lds.S
+++ b/arch/x86/kernel/vmlinux_32.lds.S
@@ -140,10 +140,10 @@ SECTIONS
140 *(.con_initcall.init) 140 *(.con_initcall.init)
141 __con_initcall_end = .; 141 __con_initcall_end = .;
142 } 142 }
143 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 143 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
144 __x86cpuvendor_start = .; 144 __x86_cpu_dev_start = .;
145 *(.x86cpuvendor.init) 145 *(.x86_cpu_dev.init)
146 __x86cpuvendor_end = .; 146 __x86_cpu_dev_end = .;
147 } 147 }
148 SECURITY_INIT 148 SECURITY_INIT
149 . = ALIGN(4); 149 . = ALIGN(4);
@@ -180,6 +180,7 @@ SECTIONS
180 . = ALIGN(PAGE_SIZE); 180 . = ALIGN(PAGE_SIZE);
181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { 181 .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) {
182 __per_cpu_start = .; 182 __per_cpu_start = .;
183 *(.data.percpu.page_aligned)
183 *(.data.percpu) 184 *(.data.percpu)
184 *(.data.percpu.shared_aligned) 185 *(.data.percpu.shared_aligned)
185 __per_cpu_end = .; 186 __per_cpu_end = .;
@@ -209,3 +210,11 @@ SECTIONS
209 210
210 DWARF_DEBUG 211 DWARF_DEBUG
211} 212}
213
214#ifdef CONFIG_KEXEC
215/* Link time checks */
216#include <asm/kexec.h>
217
218ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
219 "kexec control code size is too big")
220#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
index 63e5c1a22e88..46e05447405b 100644
--- a/arch/x86/kernel/vmlinux_64.lds.S
+++ b/arch/x86/kernel/vmlinux_64.lds.S
@@ -168,12 +168,11 @@ SECTIONS
168 *(.con_initcall.init) 168 *(.con_initcall.init)
169 } 169 }
170 __con_initcall_end = .; 170 __con_initcall_end = .;
171 . = ALIGN(16); 171 __x86_cpu_dev_start = .;
172 __x86cpuvendor_start = .; 172 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
173 .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { 173 *(.x86_cpu_dev.init)
174 *(.x86cpuvendor.init)
175 } 174 }
176 __x86cpuvendor_end = .; 175 __x86_cpu_dev_end = .;
177 SECURITY_INIT 176 SECURITY_INIT
178 177
179 . = ALIGN(8); 178 . = ALIGN(8);
diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c
index 0c029e8959c7..7766d36983fc 100644
--- a/arch/x86/kernel/vsmp_64.c
+++ b/arch/x86/kernel/vsmp_64.c
@@ -61,7 +61,7 @@ static void vsmp_irq_enable(void)
61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); 61 native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC));
62} 62}
63 63
64static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, 64static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf,
65 unsigned long addr, unsigned len) 65 unsigned long addr, unsigned len)
66{ 66{
67 switch (type) { 67 switch (type) {
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
new file mode 100644
index 000000000000..b13acb75e822
--- /dev/null
+++ b/arch/x86/kernel/xsave.c
@@ -0,0 +1,345 @@
1/*
2 * xsave/xrstor support.
3 *
4 * Author: Suresh Siddha <suresh.b.siddha@intel.com>
5 */
6#include <linux/bootmem.h>
7#include <linux/compat.h>
8#include <asm/i387.h>
9#ifdef CONFIG_IA32_EMULATION
10#include <asm/sigcontext32.h>
11#endif
12#include <asm/xcr.h>
13
14/*
15 * Supported feature mask by the CPU and the kernel.
16 */
17u64 pcntxt_mask;
18
19struct _fpx_sw_bytes fx_sw_reserved;
20#ifdef CONFIG_IA32_EMULATION
21struct _fpx_sw_bytes fx_sw_reserved_ia32;
22#endif
23
24/*
25 * Check for the presence of extended state information in the
26 * user fpstate pointer in the sigcontext.
27 */
28int check_for_xstate(struct i387_fxsave_struct __user *buf,
29 void __user *fpstate,
30 struct _fpx_sw_bytes *fx_sw_user)
31{
32 int min_xstate_size = sizeof(struct i387_fxsave_struct) +
33 sizeof(struct xsave_hdr_struct);
34 unsigned int magic2;
35 int err;
36
37 err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
38 sizeof(struct _fpx_sw_bytes));
39
40 if (err)
41 return err;
42
43 /*
44 * First Magic check failed.
45 */
46 if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
47 return -1;
48
49 /*
50 * Check for error scenarios.
51 */
52 if (fx_sw_user->xstate_size < min_xstate_size ||
53 fx_sw_user->xstate_size > xstate_size ||
54 fx_sw_user->xstate_size > fx_sw_user->extended_size)
55 return -1;
56
57 err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
58 fx_sw_user->extended_size -
59 FP_XSTATE_MAGIC2_SIZE));
60 /*
61 * Check for the presence of second magic word at the end of memory
62 * layout. This detects the case where the user just copied the legacy
63 * fpstate layout without copying the extended state information
64 * in the memory layout.
65 */
66 if (err || magic2 != FP_XSTATE_MAGIC2)
67 return -1;
68
69 return 0;
70}
71
72#ifdef CONFIG_X86_64
73/*
74 * Signal frame handlers.
75 */
76
77int save_i387_xstate(void __user *buf)
78{
79 struct task_struct *tsk = current;
80 int err = 0;
81
82 if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size))
83 return -EACCES;
84
85 BUG_ON(sig_xstate_size < xstate_size);
86
87 if ((unsigned long)buf % 64)
88 printk("save_i387_xstate: bad fpstate %p\n", buf);
89
90 if (!used_math())
91 return 0;
92 clear_used_math(); /* trigger finit */
93 if (task_thread_info(tsk)->status & TS_USEDFPU) {
94 /*
95 * Start with clearing the user buffer. This will present a
96 * clean context for the bytes not touched by the fxsave/xsave.
97 */
98 err = __clear_user(buf, sig_xstate_size);
99 if (err)
100 return err;
101
102 if (task_thread_info(tsk)->status & TS_XSAVE)
103 err = xsave_user(buf);
104 else
105 err = fxsave_user(buf);
106
107 if (err)
108 return err;
109 task_thread_info(tsk)->status &= ~TS_USEDFPU;
110 stts();
111 } else {
112 if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
113 xstate_size))
114 return -1;
115 }
116
117 if (task_thread_info(tsk)->status & TS_XSAVE) {
118 struct _fpstate __user *fx = buf;
119 struct _xstate __user *x = buf;
120 u64 xstate_bv;
121
122 err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved,
123 sizeof(struct _fpx_sw_bytes));
124
125 err |= __put_user(FP_XSTATE_MAGIC2,
126 (__u32 __user *) (buf + sig_xstate_size
127 - FP_XSTATE_MAGIC2_SIZE));
128
129 /*
130 * Read the xstate_bv which we copied (directly from the cpu or
131 * from the state in task struct) to the user buffers and
132 * set the FP/SSE bits.
133 */
134 err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv);
135
136 /*
137 * For legacy compatibility, we always set the FP/SSE bits in the bit
138 * vector while saving the state to the user context. This will
139 * enable us to capture any changes (during sigreturn) to
140 * the FP/SSE bits by the legacy applications which don't touch
141 * xstate_bv in the xsave header.
142 *
143 * xsave aware apps can change the xstate_bv in the xsave
144 * header as well as change any contents in the memory layout.
145 * xrestore as part of sigreturn will capture all the changes.
146 */
147 xstate_bv |= XSTATE_FPSSE;
148
149 err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv);
150
151 if (err)
152 return err;
153 }
154
155 return 1;
156}
157
158/*
159 * Restore the extended state if present. Otherwise, restore the FP/SSE
160 * state.
161 */
162int restore_user_xstate(void __user *buf)
163{
164 struct _fpx_sw_bytes fx_sw_user;
165 u64 mask;
166 int err;
167
168 if (((unsigned long)buf % 64) ||
169 check_for_xstate(buf, buf, &fx_sw_user))
170 goto fx_only;
171
172 mask = fx_sw_user.xstate_bv;
173
174 /*
175 * restore the state passed by the user.
176 */
177 err = xrestore_user(buf, mask);
178 if (err)
179 return err;
180
181 /*
182 * init the state skipped by the user.
183 */
184 mask = pcntxt_mask & ~mask;
185
186 xrstor_state(init_xstate_buf, mask);
187
188 return 0;
189
190fx_only:
191 /*
192 * couldn't find the extended state information in the
193 * memory layout. Restore just the FP/SSE and init all
194 * the other extended state.
195 */
196 xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE);
197 return fxrstor_checking((__force struct i387_fxsave_struct *)buf);
198}
199
200/*
201 * This restores directly out of user space. Exceptions are handled.
202 */
203int restore_i387_xstate(void __user *buf)
204{
205 struct task_struct *tsk = current;
206 int err = 0;
207
208 if (!buf) {
209 if (used_math())
210 goto clear;
211 return 0;
212 } else
213 if (!access_ok(VERIFY_READ, buf, sig_xstate_size))
214 return -EACCES;
215
216 if (!used_math()) {
217 err = init_fpu(tsk);
218 if (err)
219 return err;
220 }
221
222 if (!(task_thread_info(current)->status & TS_USEDFPU)) {
223 clts();
224 task_thread_info(current)->status |= TS_USEDFPU;
225 }
226 if (task_thread_info(tsk)->status & TS_XSAVE)
227 err = restore_user_xstate(buf);
228 else
229 err = fxrstor_checking((__force struct i387_fxsave_struct *)
230 buf);
231 if (unlikely(err)) {
232 /*
233 * Encountered an error while doing the restore from the
234 * user buffer, clear the fpu state.
235 */
236clear:
237 clear_fpu(tsk);
238 clear_used_math();
239 }
240 return err;
241}
242#endif
243
244/*
245 * Prepare the SW reserved portion of the fxsave memory layout, indicating
246 * the presence of the extended state information in the memory layout
247 * pointed by the fpstate pointer in the sigcontext.
248 * This will be saved whenever the FP and extended state context is
249 * saved on the user stack during the signal handler delivery to the user.
250 */
251static void prepare_fx_sw_frame(void)
252{
253 int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) +
254 FP_XSTATE_MAGIC2_SIZE;
255
256 sig_xstate_size = sizeof(struct _fpstate) + size_extended;
257
258#ifdef CONFIG_IA32_EMULATION
259 sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended;
260#endif
261
262 memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved));
263
264 fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1;
265 fx_sw_reserved.extended_size = sig_xstate_size;
266 fx_sw_reserved.xstate_bv = pcntxt_mask;
267 fx_sw_reserved.xstate_size = xstate_size;
268#ifdef CONFIG_IA32_EMULATION
269 memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved,
270 sizeof(struct _fpx_sw_bytes));
271 fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size;
272#endif
273}
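A worked example of the size computation above, under the common assumptions that the legacy fxsave image (struct i387_fxsave_struct, struct _fpstate) is 512 bytes, the xsave header is 64 bytes and FP_XSTATE_MAGIC2_SIZE is 4 bytes; the numbers are illustrative, not extracted from this diff:

#include <stdio.h>

int main(void)
{
	const int fxsave_bytes = 512;	/* assumed sizeof(struct i387_fxsave_struct) */
	const int fpstate_bytes = 512;	/* assumed sizeof(struct _fpstate) on 64-bit */
	const int xstate_size = 576;	/* assumed: fxsave area + 64-byte xsave header, FP/SSE only */
	const int magic2_bytes = 4;	/* assumed FP_XSTATE_MAGIC2_SIZE */

	int size_extended = (xstate_size - fxsave_bytes) + magic2_bytes;	/* 68 */
	int sig_xstate_size = fpstate_bytes + size_extended;			/* 580 */

	printf("extended area: %d bytes, signal fpstate: %d bytes\n",
	       size_extended, sig_xstate_size);
	return 0;
}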
274
275/*
276 * Represents init state for the supported extended state.
277 */
278struct xsave_struct *init_xstate_buf;
279
280#ifdef CONFIG_X86_64
281unsigned int sig_xstate_size = sizeof(struct _fpstate);
282#endif
283
284/*
285 * Enable the extended processor state save/restore feature
286 */
287void __cpuinit xsave_init(void)
288{
289 if (!cpu_has_xsave)
290 return;
291
292 set_in_cr4(X86_CR4_OSXSAVE);
293
294 /*
295 * Enable all the features that the HW is capable of
296 * and the Linux kernel is aware of.
297 */
298 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
299}
300
301/*
302 * setup the xstate image representing the init state
303 */
304static void __init setup_xstate_init(void)
305{
306 init_xstate_buf = alloc_bootmem(xstate_size);
307 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
308}
309
310/*
311 * Enable and initialize the xsave feature.
312 */
313void __init xsave_cntxt_init(void)
314{
315 unsigned int eax, ebx, ecx, edx;
316
317 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
318 pcntxt_mask = eax + ((u64)edx << 32);
319
320 if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
321 printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n",
322 pcntxt_mask);
323 BUG();
324 }
325
326 /*
327 * for now the OS knows only about FP/SSE
328 */
329 pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
330 xsave_init();
331
332 /*
333 * Recompute the context size for enabled features
334 */
335 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
336 xstate_size = ebx;
337
338 prepare_fx_sw_frame();
339
340 setup_xstate_init();
341
342 printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, "
343 "cntxt size 0x%x\n",
344 pcntxt_mask, xstate_size);
345}
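The same CPUID leaf 0xd, sub-leaf 0 query can be reproduced from user space with the compiler's cpuid.h helper. This standalone sketch only mirrors how the feature mask and state size are derived above; it makes no claim about the kernel's internal bookkeeping:

#include <stdio.h>
#include <stdint.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx)) {
		fprintf(stderr, "CPUID leaf 0xd not supported\n");
		return 1;
	}

	uint64_t features = eax | ((uint64_t)edx << 32);	/* supported xstate feature bits */

	printf("xstate features:                      0x%llx\n",
	       (unsigned long long)features);
	printf("size for currently enabled features: %u bytes\n", ebx);
	printf("size for all supported features:     %u bytes\n", ecx);
	return 0;
}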
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 8d45fabc5f3b..ce3251ce5504 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -21,6 +21,7 @@ config KVM
21 tristate "Kernel-based Virtual Machine (KVM) support" 21 tristate "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select MMU_NOTIFIER
24 select ANON_INODES 25 select ANON_INODES
25 ---help--- 26 ---help---
26 Support hosting fully virtualized guest machines using hardware 27 Support hosting fully virtualized guest machines using hardware
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index c97d35c218db..c02343594b4d 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -2,10 +2,14 @@
2# Makefile for Kernel-based Virtual Machine module 2# Makefile for Kernel-based Virtual Machine module
3# 3#
4 4
5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) 5common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
6 coalesced_mmio.o irq_comm.o)
6ifeq ($(CONFIG_KVM_TRACE),y) 7ifeq ($(CONFIG_KVM_TRACE),y)
7common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) 8common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o)
8endif 9endif
10ifeq ($(CONFIG_DMAR),y)
11common-objs += $(addprefix ../../../virt/kvm/, vtd.o)
12endif
9 13
10EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 14EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm
11 15
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 3829aa7b663f..11c6725fb798 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val)
91 c->gate = val; 91 c->gate = val;
92} 92}
93 93
94int pit_get_gate(struct kvm *kvm, int channel) 94static int pit_get_gate(struct kvm *kvm, int channel)
95{ 95{
96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); 96 WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock));
97 97
@@ -193,23 +193,21 @@ static void pit_latch_status(struct kvm *kvm, int channel)
193 } 193 }
194} 194}
195 195
196int __pit_timer_fn(struct kvm_kpit_state *ps) 196static int __pit_timer_fn(struct kvm_kpit_state *ps)
197{ 197{
198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; 198 struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0];
199 struct kvm_kpit_timer *pt = &ps->pit_timer; 199 struct kvm_kpit_timer *pt = &ps->pit_timer;
200 200
201 atomic_inc(&pt->pending); 201 if (!atomic_inc_and_test(&pt->pending))
202 smp_mb__after_atomic_inc();
203 if (vcpu0) {
204 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); 202 set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests);
205 if (waitqueue_active(&vcpu0->wq)) {
206 vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE;
207 wake_up_interruptible(&vcpu0->wq);
208 }
209 }
210 203
211 pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); 204 if (vcpu0 && waitqueue_active(&vcpu0->wq))
212 pt->scheduled = ktime_to_ns(pt->timer.expires); 205 wake_up_interruptible(&vcpu0->wq);
206
207 hrtimer_add_expires_ns(&pt->timer, pt->period);
208 pt->scheduled = hrtimer_get_expires_ns(&pt->timer);
209 if (pt->period)
210 ps->channels[0].count_load_time = hrtimer_get_expires(&pt->timer);
213 211
214 return (pt->period == 0 ? 0 : 1); 212 return (pt->period == 0 ? 0 : 1);
215} 213}
@@ -218,12 +216,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu)
218{ 216{
219 struct kvm_pit *pit = vcpu->kvm->arch.vpit; 217 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
220 218
221 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) 219 if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack)
222 return atomic_read(&pit->pit_state.pit_timer.pending); 220 return atomic_read(&pit->pit_state.pit_timer.pending);
223
224 return 0; 221 return 0;
225} 222}
226 223
224static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
225{
226 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
227 irq_ack_notifier);
228 spin_lock(&ps->inject_lock);
229 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
230 atomic_inc(&ps->pit_timer.pending);
231 ps->irq_ack = 1;
232 spin_unlock(&ps->inject_lock);
233}
234
227static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) 235static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
228{ 236{
229 struct kvm_kpit_state *ps; 237 struct kvm_kpit_state *ps;
@@ -249,7 +257,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
249 257
250 timer = &pit->pit_state.pit_timer.timer; 258 timer = &pit->pit_state.pit_timer.timer;
251 if (hrtimer_cancel(timer)) 259 if (hrtimer_cancel(timer))
252 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 260 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
253} 261}
254 262
255static void destroy_pit_timer(struct kvm_kpit_timer *pt) 263static void destroy_pit_timer(struct kvm_kpit_timer *pt)
@@ -258,8 +266,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt)
258 hrtimer_cancel(&pt->timer); 266 hrtimer_cancel(&pt->timer);
259} 267}
260 268
261static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) 269static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
262{ 270{
271 struct kvm_kpit_timer *pt = &ps->pit_timer;
263 s64 interval; 272 s64 interval;
264 273
265 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 274 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
@@ -271,6 +280,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period)
271 pt->period = (is_period == 0) ? 0 : interval; 280 pt->period = (is_period == 0) ? 0 : interval;
272 pt->timer.function = pit_timer_fn; 281 pt->timer.function = pit_timer_fn;
273 atomic_set(&pt->pending, 0); 282 atomic_set(&pt->pending, 0);
283 ps->irq_ack = 1;
274 284
275 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), 285 hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval),
276 HRTIMER_MODE_ABS); 286 HRTIMER_MODE_ABS);
@@ -305,10 +315,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
305 case 1: 315 case 1:
306 /* FIXME: enhance mode 4 precision */ 316 /* FIXME: enhance mode 4 precision */
307 case 4: 317 case 4:
308 create_pit_timer(&ps->pit_timer, val, 0); 318 create_pit_timer(ps, val, 0);
309 break; 319 break;
310 case 2: 320 case 2:
311 create_pit_timer(&ps->pit_timer, val, 1); 321 case 3:
322 create_pit_timer(ps, val, 1);
312 break; 323 break;
313 default: 324 default:
314 destroy_pit_timer(&ps->pit_timer); 325 destroy_pit_timer(&ps->pit_timer);
@@ -459,7 +470,8 @@ static void pit_ioport_read(struct kvm_io_device *this,
459 mutex_unlock(&pit_state->lock); 470 mutex_unlock(&pit_state->lock);
460} 471}
461 472
462static int pit_in_range(struct kvm_io_device *this, gpa_t addr) 473static int pit_in_range(struct kvm_io_device *this, gpa_t addr,
474 int len, int is_write)
463{ 475{
464 return ((addr >= KVM_PIT_BASE_ADDRESS) && 476 return ((addr >= KVM_PIT_BASE_ADDRESS) &&
465 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); 477 (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH));
@@ -500,7 +512,8 @@ static void speaker_ioport_read(struct kvm_io_device *this,
500 mutex_unlock(&pit_state->lock); 512 mutex_unlock(&pit_state->lock);
501} 513}
502 514
503static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) 515static int speaker_in_range(struct kvm_io_device *this, gpa_t addr,
516 int len, int is_write)
504{ 517{
505 return (addr == KVM_SPEAKER_BASE_ADDRESS); 518 return (addr == KVM_SPEAKER_BASE_ADDRESS);
506} 519}
@@ -520,7 +533,7 @@ void kvm_pit_reset(struct kvm_pit *pit)
520 mutex_unlock(&pit->pit_state.lock); 533 mutex_unlock(&pit->pit_state.lock);
521 534
522 atomic_set(&pit->pit_state.pit_timer.pending, 0); 535 atomic_set(&pit->pit_state.pit_timer.pending, 0);
523 pit->pit_state.inject_pending = 1; 536 pit->pit_state.irq_ack = 1;
524} 537}
525 538
526struct kvm_pit *kvm_create_pit(struct kvm *kvm) 539struct kvm_pit *kvm_create_pit(struct kvm *kvm)
@@ -534,6 +547,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
534 547
535 mutex_init(&pit->pit_state.lock); 548 mutex_init(&pit->pit_state.lock);
536 mutex_lock(&pit->pit_state.lock); 549 mutex_lock(&pit->pit_state.lock);
550 spin_lock_init(&pit->pit_state.inject_lock);
537 551
538 /* Initialize PIO device */ 552 /* Initialize PIO device */
539 pit->dev.read = pit_ioport_read; 553 pit->dev.read = pit_ioport_read;
@@ -555,6 +569,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm)
555 pit_state->pit = pit; 569 pit_state->pit = pit;
556 hrtimer_init(&pit_state->pit_timer.timer, 570 hrtimer_init(&pit_state->pit_timer.timer,
557 CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 571 CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
572 pit_state->irq_ack_notifier.gsi = 0;
573 pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq;
574 kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
558 mutex_unlock(&pit->pit_state.lock); 575 mutex_unlock(&pit->pit_state.lock);
559 576
560 kvm_pit_reset(pit); 577 kvm_pit_reset(pit);
@@ -575,13 +592,11 @@ void kvm_free_pit(struct kvm *kvm)
575 } 592 }
576} 593}
577 594
578void __inject_pit_timer_intr(struct kvm *kvm) 595static void __inject_pit_timer_intr(struct kvm *kvm)
579{ 596{
580 mutex_lock(&kvm->lock); 597 mutex_lock(&kvm->lock);
581 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); 598 kvm_set_irq(kvm, 0, 1);
582 kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); 599 kvm_set_irq(kvm, 0, 0);
583 kvm_pic_set_irq(pic_irqchip(kvm), 0, 1);
584 kvm_pic_set_irq(pic_irqchip(kvm), 0, 0);
585 mutex_unlock(&kvm->lock); 600 mutex_unlock(&kvm->lock);
586} 601}
587 602
@@ -592,37 +607,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
592 struct kvm_kpit_state *ps; 607 struct kvm_kpit_state *ps;
593 608
594 if (vcpu && pit) { 609 if (vcpu && pit) {
610 int inject = 0;
595 ps = &pit->pit_state; 611 ps = &pit->pit_state;
596 612
597 /* Try to inject pending interrupts when: 613 /* Try to inject pending interrupts when
598 * 1. Pending exists 614 * last one has been acked.
599 * 2. Last interrupt was accepted or waited for too long time*/ 615 */
600 if (atomic_read(&ps->pit_timer.pending) && 616 spin_lock(&ps->inject_lock);
601 (ps->inject_pending || 617 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
602 (jiffies - ps->last_injected_time 618 ps->irq_ack = 0;
603 >= KVM_MAX_PIT_INTR_INTERVAL))) { 619 inject = 1;
604 ps->inject_pending = 0;
605 __inject_pit_timer_intr(kvm);
606 ps->last_injected_time = jiffies;
607 }
608 }
609}
610
611void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
612{
613 struct kvm_arch *arch = &vcpu->kvm->arch;
614 struct kvm_kpit_state *ps;
615
616 if (vcpu && arch->vpit) {
617 ps = &arch->vpit->pit_state;
618 if (atomic_read(&ps->pit_timer.pending) &&
619 (((arch->vpic->pics[0].imr & 1) == 0 &&
620 arch->vpic->pics[0].irq_base == vec) ||
621 (arch->vioapic->redirtbl[0].fields.vector == vec &&
622 arch->vioapic->redirtbl[0].fields.mask != 1))) {
623 ps->inject_pending = 1;
624 atomic_dec(&ps->pit_timer.pending);
625 ps->channels[0].count_load_time = ktime_get();
626 } 620 }
621 spin_unlock(&ps->inject_lock);
622 if (inject)
623 __inject_pit_timer_intr(kvm);
627 } 624 }
628} 625}
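The reinjection scheme introduced in this hunk can be summarized as a small producer/consumer handshake around pit_timer.pending and irq_ack. The sketch below is a simplified restatement of that control flow only (the real code uses an atomic counter and holds inject_lock), not a drop-in implementation:

/* Simplified restatement of the ack-based reinjection (not kernel code) */
struct pit_sketch {
	int pending;	/* ticks owed to the guest; atomic_t in the real code */
	int irq_ack;	/* the previously injected tick has been acked */
};

static void sketch_inject_irq0(void)
{
	/* stands in for __inject_pit_timer_intr(): kvm_set_irq(kvm, 0, 1) then (kvm, 0, 0) */
}

static void sketch_timer_fires(struct pit_sketch *ps)
{
	ps->pending++;				/* hrtimer callback accumulates owed ticks */
}

static void sketch_vcpu_entry(struct pit_sketch *ps)
{
	if (ps->pending && ps->irq_ack) {	/* inject only after the previous ack */
		ps->irq_ack = 0;
		sketch_inject_irq0();
	}
}

static void sketch_irq_acked(struct pit_sketch *ps)
{
	if (ps->pending > 0)			/* kvm_pit_ack_irq(): consume one owed tick */
		ps->pending--;
	ps->irq_ack = 1;			/* allow the next injection */
}

int main(void)
{
	struct pit_sketch ps = { .pending = 0, .irq_ack = 1 };

	sketch_timer_fires(&ps);	/* one tick owed */
	sketch_vcpu_entry(&ps);		/* injected, irq_ack cleared */
	sketch_vcpu_entry(&ps);		/* nothing: still waiting for the guest ack */
	sketch_irq_acked(&ps);		/* guest EOI: pending consumed, ack set again */
	return 0;
}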
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index db25c2a6c8c4..e436d4983aa1 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -8,7 +8,6 @@ struct kvm_kpit_timer {
8 int irq; 8 int irq;
9 s64 period; /* unit: ns */ 9 s64 period; /* unit: ns */
10 s64 scheduled; 10 s64 scheduled;
11 ktime_t last_update;
12 atomic_t pending; 11 atomic_t pending;
13}; 12};
14 13
@@ -34,8 +33,9 @@ struct kvm_kpit_state {
34 u32 speaker_data_on; 33 u32 speaker_data_on;
35 struct mutex lock; 34 struct mutex lock;
36 struct kvm_pit *pit; 35 struct kvm_pit *pit;
37 bool inject_pending; /* if inject pending interrupts */ 36 spinlock_t inject_lock;
38 unsigned long last_injected_time; 37 unsigned long irq_ack;
38 struct kvm_irq_ack_notifier irq_ack_notifier;
39}; 39};
40 40
41struct kvm_pit { 41struct kvm_pit {
@@ -54,7 +54,6 @@ struct kvm_pit {
54#define KVM_PIT_CHANNEL_MASK 0x3 54#define KVM_PIT_CHANNEL_MASK 0x3
55 55
56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); 56void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
57void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec);
58void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); 57void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val);
59struct kvm_pit *kvm_create_pit(struct kvm *kvm); 58struct kvm_pit *kvm_create_pit(struct kvm *kvm);
60void kvm_free_pit(struct kvm *kvm); 59void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index ab29cf2def47..17e41e165f1a 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -30,6 +30,19 @@
30 30
31#include <linux/kvm_host.h> 31#include <linux/kvm_host.h>
32 32
33static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
34{
35 s->isr &= ~(1 << irq);
36 s->isr_ack |= (1 << irq);
37}
38
39void kvm_pic_clear_isr_ack(struct kvm *kvm)
40{
41 struct kvm_pic *s = pic_irqchip(kvm);
42 s->pics[0].isr_ack = 0xff;
43 s->pics[1].isr_ack = 0xff;
44}
45
33/* 46/*
34 * set irq level. If an edge is detected, then the IRR is set to 1 47 * set irq level. If an edge is detected, then the IRR is set to 1
35 */ 48 */
@@ -130,8 +143,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
130{ 143{
131 struct kvm_pic *s = opaque; 144 struct kvm_pic *s = opaque;
132 145
133 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 146 if (irq >= 0 && irq < PIC_NUM_PINS) {
134 pic_update_irq(s); 147 pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
148 pic_update_irq(s);
149 }
135} 150}
136 151
137/* 152/*
@@ -139,11 +154,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level)
139 */ 154 */
140static inline void pic_intack(struct kvm_kpic_state *s, int irq) 155static inline void pic_intack(struct kvm_kpic_state *s, int irq)
141{ 156{
157 s->isr |= 1 << irq;
142 if (s->auto_eoi) { 158 if (s->auto_eoi) {
143 if (s->rotate_on_auto_eoi) 159 if (s->rotate_on_auto_eoi)
144 s->priority_add = (irq + 1) & 7; 160 s->priority_add = (irq + 1) & 7;
145 } else 161 pic_clear_isr(s, irq);
146 s->isr |= (1 << irq); 162 }
147 /* 163 /*
148 * We don't clear a level sensitive interrupt here 164 * We don't clear a level sensitive interrupt here
149 */ 165 */
@@ -151,9 +167,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq)
151 s->irr &= ~(1 << irq); 167 s->irr &= ~(1 << irq);
152} 168}
153 169
154int kvm_pic_read_irq(struct kvm_pic *s) 170int kvm_pic_read_irq(struct kvm *kvm)
155{ 171{
156 int irq, irq2, intno; 172 int irq, irq2, intno;
173 struct kvm_pic *s = pic_irqchip(kvm);
157 174
158 irq = pic_get_irq(&s->pics[0]); 175 irq = pic_get_irq(&s->pics[0]);
159 if (irq >= 0) { 176 if (irq >= 0) {
@@ -179,16 +196,32 @@ int kvm_pic_read_irq(struct kvm_pic *s)
179 intno = s->pics[0].irq_base + irq; 196 intno = s->pics[0].irq_base + irq;
180 } 197 }
181 pic_update_irq(s); 198 pic_update_irq(s);
199 kvm_notify_acked_irq(kvm, irq);
182 200
183 return intno; 201 return intno;
184} 202}
185 203
186void kvm_pic_reset(struct kvm_kpic_state *s) 204void kvm_pic_reset(struct kvm_kpic_state *s)
187{ 205{
206 int irq, irqbase;
207 struct kvm *kvm = s->pics_state->irq_request_opaque;
208 struct kvm_vcpu *vcpu0 = kvm->vcpus[0];
209
210 if (s == &s->pics_state->pics[0])
211 irqbase = 0;
212 else
213 irqbase = 8;
214
215 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
216 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
217 if (s->irr & (1 << irq) || s->isr & (1 << irq))
218 kvm_notify_acked_irq(kvm, irq+irqbase);
219 }
188 s->last_irr = 0; 220 s->last_irr = 0;
189 s->irr = 0; 221 s->irr = 0;
190 s->imr = 0; 222 s->imr = 0;
191 s->isr = 0; 223 s->isr = 0;
224 s->isr_ack = 0xff;
192 s->priority_add = 0; 225 s->priority_add = 0;
193 s->irq_base = 0; 226 s->irq_base = 0;
194 s->read_reg_select = 0; 227 s->read_reg_select = 0;
@@ -241,7 +274,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
241 priority = get_priority(s, s->isr); 274 priority = get_priority(s, s->isr);
242 if (priority != 8) { 275 if (priority != 8) {
243 irq = (priority + s->priority_add) & 7; 276 irq = (priority + s->priority_add) & 7;
244 s->isr &= ~(1 << irq); 277 pic_clear_isr(s, irq);
245 if (cmd == 5) 278 if (cmd == 5)
246 s->priority_add = (irq + 1) & 7; 279 s->priority_add = (irq + 1) & 7;
247 pic_update_irq(s->pics_state); 280 pic_update_irq(s->pics_state);
@@ -249,7 +282,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
249 break; 282 break;
250 case 3: 283 case 3:
251 irq = val & 7; 284 irq = val & 7;
252 s->isr &= ~(1 << irq); 285 pic_clear_isr(s, irq);
253 pic_update_irq(s->pics_state); 286 pic_update_irq(s->pics_state);
254 break; 287 break;
255 case 6: 288 case 6:
@@ -258,8 +291,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
258 break; 291 break;
259 case 7: 292 case 7:
260 irq = val & 7; 293 irq = val & 7;
261 s->isr &= ~(1 << irq);
262 s->priority_add = (irq + 1) & 7; 294 s->priority_add = (irq + 1) & 7;
295 pic_clear_isr(s, irq);
263 pic_update_irq(s->pics_state); 296 pic_update_irq(s->pics_state);
264 break; 297 break;
265 default: 298 default:
@@ -301,7 +334,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1)
301 s->pics_state->pics[0].irr &= ~(1 << 2); 334 s->pics_state->pics[0].irr &= ~(1 << 2);
302 } 335 }
303 s->irr &= ~(1 << ret); 336 s->irr &= ~(1 << ret);
304 s->isr &= ~(1 << ret); 337 pic_clear_isr(s, ret);
305 if (addr1 >> 7 || ret != 2) 338 if (addr1 >> 7 || ret != 2)
306 pic_update_irq(s->pics_state); 339 pic_update_irq(s->pics_state);
307 } else { 340 } else {
@@ -346,7 +379,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1)
346 return s->elcr; 379 return s->elcr;
347} 380}
348 381
349static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) 382static int picdev_in_range(struct kvm_io_device *this, gpa_t addr,
383 int len, int is_write)
350{ 384{
351 switch (addr) { 385 switch (addr) {
352 case 0x20: 386 case 0x20:
@@ -419,10 +453,14 @@ static void pic_irq_request(void *opaque, int level)
419{ 453{
420 struct kvm *kvm = opaque; 454 struct kvm *kvm = opaque;
421 struct kvm_vcpu *vcpu = kvm->vcpus[0]; 455 struct kvm_vcpu *vcpu = kvm->vcpus[0];
456 struct kvm_pic *s = pic_irqchip(kvm);
457 int irq = pic_get_irq(&s->pics[0]);
422 458
423 pic_irqchip(kvm)->output = level; 459 s->output = level;
424 if (vcpu) 460 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
461 s->pics[0].isr_ack &= ~(1 << irq);
425 kvm_vcpu_kick(vcpu); 462 kvm_vcpu_kick(vcpu);
463 }
426} 464}
427 465
428struct kvm_pic *kvm_create_pic(struct kvm *kvm) 466struct kvm_pic *kvm_create_pic(struct kvm *kvm)
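
The i8259 changes above introduce pic_clear_isr() and an isr_ack mask so that pic_irq_request() only kicks the vcpu when the pending line is not still in service. A small userspace model of that bookkeeping (illustrative only, helper names invented) behaves like this:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

struct pic_model {
	uint8_t isr;		/* lines currently in service */
	uint8_t isr_ack;	/* lines whose last service was EOIed */
};

static void model_intack(struct pic_model *s, int irq)
{
	s->isr |= 1u << irq;		/* interrupt enters service */
}

static void model_eoi(struct pic_model *s, int irq)
{
	s->isr &= ~(1u << irq);		/* mirrors pic_clear_isr() */
	s->isr_ack |= 1u << irq;
}

/* Returns true when the CPU actually needs a kick for this line. */
static bool model_irq_request(struct pic_model *s, int irq)
{
	if (!(s->isr_ack & (1u << irq)))
		return false;		/* previous instance not EOIed yet */
	s->isr_ack &= ~(1u << irq);
	return true;
}

int main(void)
{
	struct pic_model s = { .isr = 0, .isr_ack = 0xff };

	printf("%d\n", model_irq_request(&s, 3));	/* 1: first kick */
	model_intack(&s, 3);
	printf("%d\n", model_irq_request(&s, 3));	/* 0: line still in service */
	model_eoi(&s, 3);
	printf("%d\n", model_irq_request(&s, 3));	/* 1: kick again after EOI */
	return 0;
}
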
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 76d736b5f664..c019b8edcdb7 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
72 if (kvm_apic_accept_pic_intr(v)) { 72 if (kvm_apic_accept_pic_intr(v)) {
73 s = pic_irqchip(v->kvm); 73 s = pic_irqchip(v->kvm);
74 s->output = 0; /* PIC */ 74 s->output = 0; /* PIC */
75 vector = kvm_pic_read_irq(s); 75 vector = kvm_pic_read_irq(v->kvm);
76 } 76 }
77 } 77 }
78 return vector; 78 return vector;
@@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs);
90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) 90void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec)
91{ 91{
92 kvm_apic_timer_intr_post(vcpu, vec); 92 kvm_apic_timer_intr_post(vcpu, vec);
93 kvm_pit_timer_intr_post(vcpu, vec);
94 /* TODO: PIT, RTC etc. */ 93 /* TODO: PIT, RTC etc. */
95} 94}
96EXPORT_SYMBOL_GPL(kvm_timer_intr_post); 95EXPORT_SYMBOL_GPL(kvm_timer_intr_post);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 2a15be2275c0..f17c8f5bbf31 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -30,6 +30,8 @@
30#include "ioapic.h" 30#include "ioapic.h"
31#include "lapic.h" 31#include "lapic.h"
32 32
33#define PIC_NUM_PINS 16
34
33struct kvm; 35struct kvm;
34struct kvm_vcpu; 36struct kvm_vcpu;
35 37
@@ -40,6 +42,7 @@ struct kvm_kpic_state {
40 u8 irr; /* interrupt request register */ 42 u8 irr; /* interrupt request register */
41 u8 imr; /* interrupt mask register */ 43 u8 imr; /* interrupt mask register */
42 u8 isr; /* interrupt service register */ 44 u8 isr; /* interrupt service register */
45 u8 isr_ack; /* interrupt ack detection */
43 u8 priority_add; /* highest irq priority */ 46 u8 priority_add; /* highest irq priority */
44 u8 irq_base; 47 u8 irq_base;
45 u8 read_reg_select; 48 u8 read_reg_select;
@@ -61,12 +64,13 @@ struct kvm_pic {
61 void *irq_request_opaque; 64 void *irq_request_opaque;
62 int output; /* intr from master PIC */ 65 int output; /* intr from master PIC */
63 struct kvm_io_device dev; 66 struct kvm_io_device dev;
67 void (*ack_notifier)(void *opaque, int irq);
64}; 68};
65 69
66struct kvm_pic *kvm_create_pic(struct kvm *kvm); 70struct kvm_pic *kvm_create_pic(struct kvm *kvm);
67void kvm_pic_set_irq(void *opaque, int irq, int level); 71int kvm_pic_read_irq(struct kvm *kvm);
68int kvm_pic_read_irq(struct kvm_pic *s);
69void kvm_pic_update_irq(struct kvm_pic *s); 72void kvm_pic_update_irq(struct kvm_pic *s);
73void kvm_pic_clear_isr_ack(struct kvm *kvm);
70 74
71static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 75static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
72{ 76{
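
PIC_NUM_PINS, added above, bounds the new range check in kvm_pic_set_irq(); a valid pin is then split into a chip index (irq >> 3, master or slave) and a line on that chip (irq & 7). A throwaway program makes the mapping explicit:

#include <stdio.h>

#define PIC_NUM_PINS 16

int main(void)
{
	for (int irq = 0; irq < PIC_NUM_PINS; irq++)
		printf("irq %2d -> pic %d, line %d\n", irq, irq >> 3, irq & 7);
	return 0;
}
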
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
new file mode 100644
index 000000000000..1ff819dce7d3
--- /dev/null
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -0,0 +1,32 @@
1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H
3
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg)
6{
7 if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail))
8 kvm_x86_ops->cache_reg(vcpu, reg);
9
10 return vcpu->arch.regs[reg];
11}
12
13static inline void kvm_register_write(struct kvm_vcpu *vcpu,
14 enum kvm_reg reg,
15 unsigned long val)
16{
17 vcpu->arch.regs[reg] = val;
18 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty);
19 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
20}
21
22static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu)
23{
24 return kvm_register_read(vcpu, VCPU_REGS_RIP);
25}
26
27static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val)
28{
29 kvm_register_write(vcpu, VCPU_REGS_RIP, val);
30}
31
32#endif
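
kvm_cache_regs.h, added above, reads a register from the vendor code only on first access (regs_avail) and marks written registers for write-back (regs_dirty). A self-contained userspace sketch of the same caching pattern, with a fake backing register file standing in for the VMCS/VMCB, might look like this:

#include <stdio.h>
#include <stdbool.h>

enum model_reg { REG_RIP, REG_RSP, NR_REGS };

static unsigned long hw_regs[NR_REGS] = { 0x1000, 0x8000 };	/* the "hardware" copy */

struct vcpu_model {
	unsigned long regs[NR_REGS];
	bool avail[NR_REGS];
	bool dirty[NR_REGS];
};

static unsigned long reg_read(struct vcpu_model *v, enum model_reg r)
{
	if (!v->avail[r]) {			/* first access: fetch lazily */
		v->regs[r] = hw_regs[r];
		v->avail[r] = true;
	}
	return v->regs[r];
}

static void reg_write(struct vcpu_model *v, enum model_reg r, unsigned long val)
{
	v->regs[r] = val;
	v->avail[r] = true;
	v->dirty[r] = true;			/* needs write-back before re-entry */
}

static void flush_dirty(struct vcpu_model *v)
{
	for (int r = 0; r < NR_REGS; r++)
		if (v->dirty[r]) {
			hw_regs[r] = v->regs[r];
			v->dirty[r] = false;
		}
}

int main(void)
{
	struct vcpu_model v = { { 0 }, { false }, { false } };

	printf("rip = %#lx\n", reg_read(&v, REG_RIP));	/* fetched on demand */
	reg_write(&v, REG_RIP, 0x2000);			/* cached and marked dirty */
	flush_dirty(&v);
	printf("hw rip = %#lx\n", hw_regs[REG_RIP]);	/* 0x2000 written back */
	return 0;
}
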
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index ebc03f5ae162..0fc3cab48943 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -32,6 +32,7 @@
32#include <asm/current.h> 32#include <asm/current.h>
33#include <asm/apicdef.h> 33#include <asm/apicdef.h>
34#include <asm/atomic.h> 34#include <asm/atomic.h>
35#include "kvm_cache_regs.h"
35#include "irq.h" 36#include "irq.h"
36 37
37#define PRId64 "d" 38#define PRId64 "d"
@@ -338,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
338 } else 339 } else
339 apic_clear_vector(vector, apic->regs + APIC_TMR); 340 apic_clear_vector(vector, apic->regs + APIC_TMR);
340 341
341 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 342 kvm_vcpu_kick(vcpu);
342 kvm_vcpu_kick(vcpu);
343 else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) {
344 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
345 if (waitqueue_active(&vcpu->wq))
346 wake_up_interruptible(&vcpu->wq);
347 }
348 343
349 result = (orig_irr == 0); 344 result = (orig_irr == 0);
350 break; 345 break;
@@ -356,8 +351,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
356 case APIC_DM_SMI: 351 case APIC_DM_SMI:
357 printk(KERN_DEBUG "Ignoring guest SMI\n"); 352 printk(KERN_DEBUG "Ignoring guest SMI\n");
358 break; 353 break;
354
359 case APIC_DM_NMI: 355 case APIC_DM_NMI:
360 printk(KERN_DEBUG "Ignoring guest NMI\n"); 356 kvm_inject_nmi(vcpu);
361 break; 357 break;
362 358
363 case APIC_DM_INIT: 359 case APIC_DM_INIT:
@@ -369,21 +365,18 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
369 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 365 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
370 kvm_vcpu_kick(vcpu); 366 kvm_vcpu_kick(vcpu);
371 } else { 367 } else {
372 printk(KERN_DEBUG 368 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
373 "Ignoring de-assert INIT to vcpu %d\n", 369 vcpu->vcpu_id);
374 vcpu->vcpu_id);
375 } 370 }
376
377 break; 371 break;
378 372
379 case APIC_DM_STARTUP: 373 case APIC_DM_STARTUP:
380 printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", 374 apic_debug("SIPI to vcpu %d vector 0x%02x\n",
381 vcpu->vcpu_id, vector); 375 vcpu->vcpu_id, vector);
382 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { 376 if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
383 vcpu->arch.sipi_vector = vector; 377 vcpu->arch.sipi_vector = vector;
384 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 378 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
385 if (waitqueue_active(&vcpu->wq)) 379 kvm_vcpu_kick(vcpu);
386 wake_up_interruptible(&vcpu->wq);
387 } 380 }
388 break; 381 break;
389 382
@@ -437,7 +430,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector,
437static void apic_set_eoi(struct kvm_lapic *apic) 430static void apic_set_eoi(struct kvm_lapic *apic)
438{ 431{
439 int vector = apic_find_highest_isr(apic); 432 int vector = apic_find_highest_isr(apic);
440 433 int trigger_mode;
441 /* 434 /*
442 * Not every write EOI will has corresponding ISR, 435 * Not every write EOI will has corresponding ISR,
443 * one example is when Kernel check timer on setup_IO_APIC 436 * one example is when Kernel check timer on setup_IO_APIC
@@ -449,7 +442,10 @@ static void apic_set_eoi(struct kvm_lapic *apic)
449 apic_update_ppr(apic); 442 apic_update_ppr(apic);
450 443
451 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) 444 if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR))
452 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); 445 trigger_mode = IOAPIC_LEVEL_TRIG;
446 else
447 trigger_mode = IOAPIC_EDGE_TRIG;
448 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
453} 449}
454 450
455static void apic_send_ipi(struct kvm_lapic *apic) 451static void apic_send_ipi(struct kvm_lapic *apic)
@@ -557,8 +553,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write)
557 struct kvm_run *run = vcpu->run; 553 struct kvm_run *run = vcpu->run;
558 554
559 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); 555 set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests);
560 kvm_x86_ops->cache_regs(vcpu); 556 run->tpr_access.rip = kvm_rip_read(vcpu);
561 run->tpr_access.rip = vcpu->arch.rip;
562 run->tpr_access.is_write = write; 557 run->tpr_access.is_write = write;
563} 558}
564 559
@@ -572,6 +567,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
572{ 567{
573 u32 val = 0; 568 u32 val = 0;
574 569
570 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
571
575 if (offset >= LAPIC_MMIO_LENGTH) 572 if (offset >= LAPIC_MMIO_LENGTH)
576 return 0; 573 return 0;
577 574
@@ -680,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this,
680 * Refer SDM 8.4.1 677 * Refer SDM 8.4.1
681 */ 678 */
682 if (len != 4 || alignment) { 679 if (len != 4 || alignment) {
683 if (printk_ratelimit()) 680 /* Don't shout loud, $infamous_os would cause only noise. */
684 printk(KERN_ERR "apic write: bad size=%d %lx\n", 681 apic_debug("apic write: bad size=%d %lx\n",
685 len, (long)address); 682 len, (long)address);
686 return; 683 return;
687 } 684 }
688 685
@@ -695,6 +692,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
695 692
696 offset &= 0xff0; 693 offset &= 0xff0;
697 694
695 KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler);
696
698 switch (offset) { 697 switch (offset) {
699 case APIC_ID: /* Local APIC ID */ 698 case APIC_ID: /* Local APIC ID */
700 apic_set_reg(apic, APIC_ID, val); 699 apic_set_reg(apic, APIC_ID, val);
@@ -780,7 +779,8 @@ static void apic_mmio_write(struct kvm_io_device *this,
780 779
781} 780}
782 781
783static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) 782static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr,
783 int len, int size)
784{ 784{
785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private; 785 struct kvm_lapic *apic = (struct kvm_lapic *)this->private;
786 int ret = 0; 786 int ret = 0;
@@ -939,17 +939,14 @@ static int __apic_timer_fn(struct kvm_lapic *apic)
939 int result = 0; 939 int result = 0;
940 wait_queue_head_t *q = &apic->vcpu->wq; 940 wait_queue_head_t *q = &apic->vcpu->wq;
941 941
942 atomic_inc(&apic->timer.pending); 942 if(!atomic_inc_and_test(&apic->timer.pending))
943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); 943 set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests);
944 if (waitqueue_active(q)) { 944 if (waitqueue_active(q))
945 apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
946 wake_up_interruptible(q); 945 wake_up_interruptible(q);
947 } 946
948 if (apic_lvtt_period(apic)) { 947 if (apic_lvtt_period(apic)) {
949 result = 1; 948 result = 1;
950 apic->timer.dev.expires = ktime_add_ns( 949 hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period);
951 apic->timer.dev.expires,
952 apic->timer.period);
953 } 950 }
954 return result; 951 return result;
955} 952}
@@ -1118,7 +1115,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
1118 1115
1119 timer = &apic->timer.dev; 1116 timer = &apic->timer.dev;
1120 if (hrtimer_cancel(timer)) 1117 if (hrtimer_cancel(timer))
1121 hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); 1118 hrtimer_start_expires(timer, HRTIMER_MODE_ABS);
1122} 1119}
1123 1120
1124void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) 1121void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu)
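
Among the lapic.c changes above, the periodic timer now re-arms with hrtimer_add_expires_ns(), so the next expiry is anchored at the previous expiry rather than at the (possibly late) callback time. The toy calculation below, with made-up numbers, shows why that avoids long-term drift:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const int64_t period  = 1000000;	/* 1 ms between ticks */
	const int64_t latency = 300000;		/* each callback fires 0.3 ms late */
	int64_t anchored = 0;			/* expiry += period (patched behaviour) */
	int64_t naive = 0;			/* expiry = late "now" + period */

	for (int tick = 0; tick < 5; tick++) {
		anchored += period;
		naive = (naive + latency) + period;
	}
	printf("after 5 periods: anchored = %lld ns, naive = %lld ns\n",
	       (long long)anchored, (long long)naive);
	return 0;
}
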
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 676c396c9cee..81858881287e 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu);
31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); 31u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); 32void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8);
33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); 33void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value);
34u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu);
34 35
35int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); 36int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest);
36int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); 37int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 7e7c3969f7a2..99c239c5c0ac 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -66,9 +66,13 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
66#endif 66#endif
67 67
68#if defined(MMU_DEBUG) || defined(AUDIT) 68#if defined(MMU_DEBUG) || defined(AUDIT)
69static int dbg = 1; 69static int dbg = 0;
70module_param(dbg, bool, 0644);
70#endif 71#endif
71 72
73static int oos_shadow = 1;
74module_param(oos_shadow, bool, 0644);
75
72#ifndef MMU_DEBUG 76#ifndef MMU_DEBUG
73#define ASSERT(x) do { } while (0) 77#define ASSERT(x) do { } while (0)
74#else 78#else
@@ -134,18 +138,24 @@ static int dbg = 1;
134#define ACC_USER_MASK PT_USER_MASK 138#define ACC_USER_MASK PT_USER_MASK
135#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 139#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
136 140
137struct kvm_pv_mmu_op_buffer { 141#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
138 void *ptr;
139 unsigned len;
140 unsigned processed;
141 char buf[512] __aligned(sizeof(long));
142};
143 142
144struct kvm_rmap_desc { 143struct kvm_rmap_desc {
145 u64 *shadow_ptes[RMAP_EXT]; 144 u64 *shadow_ptes[RMAP_EXT];
146 struct kvm_rmap_desc *more; 145 struct kvm_rmap_desc *more;
147}; 146};
148 147
148struct kvm_shadow_walk {
149 int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu,
150 u64 addr, u64 *spte, int level);
151};
152
153struct kvm_unsync_walk {
154 int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk);
155};
156
157typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp);
158
149static struct kmem_cache *pte_chain_cache; 159static struct kmem_cache *pte_chain_cache;
150static struct kmem_cache *rmap_desc_cache; 160static struct kmem_cache *rmap_desc_cache;
151static struct kmem_cache *mmu_page_header_cache; 161static struct kmem_cache *mmu_page_header_cache;
@@ -404,16 +414,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn)
404{ 414{
405 struct vm_area_struct *vma; 415 struct vm_area_struct *vma;
406 unsigned long addr; 416 unsigned long addr;
417 int ret = 0;
407 418
408 addr = gfn_to_hva(kvm, gfn); 419 addr = gfn_to_hva(kvm, gfn);
409 if (kvm_is_error_hva(addr)) 420 if (kvm_is_error_hva(addr))
410 return 0; 421 return ret;
411 422
423 down_read(&current->mm->mmap_sem);
412 vma = find_vma(current->mm, addr); 424 vma = find_vma(current->mm, addr);
413 if (vma && is_vm_hugetlb_page(vma)) 425 if (vma && is_vm_hugetlb_page(vma))
414 return 1; 426 ret = 1;
427 up_read(&current->mm->mmap_sem);
415 428
416 return 0; 429 return ret;
417} 430}
418 431
419static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) 432static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn)
@@ -648,8 +661,88 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn)
648 661
649 if (write_protected) 662 if (write_protected)
650 kvm_flush_remote_tlbs(kvm); 663 kvm_flush_remote_tlbs(kvm);
664}
665
666static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp)
667{
668 u64 *spte;
669 int need_tlb_flush = 0;
670
671 while ((spte = rmap_next(kvm, rmapp, NULL))) {
672 BUG_ON(!(*spte & PT_PRESENT_MASK));
673 rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
674 rmap_remove(kvm, spte);
675 set_shadow_pte(spte, shadow_trap_nonpresent_pte);
676 need_tlb_flush = 1;
677 }
678 return need_tlb_flush;
679}
680
681static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
682 int (*handler)(struct kvm *kvm, unsigned long *rmapp))
683{
684 int i;
685 int retval = 0;
651 686
652 account_shadowed(kvm, gfn); 687 /*
688 * If mmap_sem isn't taken, we can look the memslots with only
689 * the mmu_lock by skipping over the slots with userspace_addr == 0.
690 */
691 for (i = 0; i < kvm->nmemslots; i++) {
692 struct kvm_memory_slot *memslot = &kvm->memslots[i];
693 unsigned long start = memslot->userspace_addr;
694 unsigned long end;
695
696 /* mmu_lock protects userspace_addr */
697 if (!start)
698 continue;
699
700 end = start + (memslot->npages << PAGE_SHIFT);
701 if (hva >= start && hva < end) {
702 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
703 retval |= handler(kvm, &memslot->rmap[gfn_offset]);
704 retval |= handler(kvm,
705 &memslot->lpage_info[
706 gfn_offset /
707 KVM_PAGES_PER_HPAGE].rmap_pde);
708 }
709 }
710
711 return retval;
712}
713
714int kvm_unmap_hva(struct kvm *kvm, unsigned long hva)
715{
716 return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp);
717}
718
719static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp)
720{
721 u64 *spte;
722 int young = 0;
723
724 /* always return old for EPT */
725 if (!shadow_accessed_mask)
726 return 0;
727
728 spte = rmap_next(kvm, rmapp, NULL);
729 while (spte) {
730 int _young;
731 u64 _spte = *spte;
732 BUG_ON(!(_spte & PT_PRESENT_MASK));
733 _young = _spte & PT_ACCESSED_MASK;
734 if (_young) {
735 young = 1;
736 clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
737 }
738 spte = rmap_next(kvm, rmapp, spte);
739 }
740 return young;
741}
742
743int kvm_age_hva(struct kvm *kvm, unsigned long hva)
744{
745 return kvm_handle_hva(kvm, hva, kvm_age_rmapp);
653} 746}
654 747
655#ifdef MMU_DEBUG 748#ifdef MMU_DEBUG
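
kvm_handle_hva(), added above, locates the memslot containing a host virtual address and converts it to an index into the slot's rmap arrays; the arithmetic is simply (hva - start) >> PAGE_SHIFT. The sketch below walks through it with invented slot numbers:

#include <stdio.h>

#define PAGE_SHIFT 12
#define KVM_PAGES_PER_HPAGE 512		/* assumption: 2 MB large pages, 4 KB base pages */

int main(void)
{
	unsigned long long slot_start = 0x7f0000000000ULL;	/* hva of the slot's first page */
	unsigned long long slot_npages = 4096;
	unsigned long long hva = slot_start + 0x123456;

	unsigned long long end = slot_start + (slot_npages << PAGE_SHIFT);

	if (hva >= slot_start && hva < end) {
		unsigned long long gfn_offset = (hva - slot_start) >> PAGE_SHIFT;

		printf("rmap index       : %llu\n", gfn_offset);
		printf("lpage rmap index : %llu\n", gfn_offset / KVM_PAGES_PER_HPAGE);
	}
	return 0;
}
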
@@ -776,6 +869,138 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
776 BUG(); 869 BUG();
777} 870}
778 871
872
873static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
874 mmu_parent_walk_fn fn)
875{
876 struct kvm_pte_chain *pte_chain;
877 struct hlist_node *node;
878 struct kvm_mmu_page *parent_sp;
879 int i;
880
881 if (!sp->multimapped && sp->parent_pte) {
882 parent_sp = page_header(__pa(sp->parent_pte));
883 fn(vcpu, parent_sp);
884 mmu_parent_walk(vcpu, parent_sp, fn);
885 return;
886 }
887 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
888 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
889 if (!pte_chain->parent_ptes[i])
890 break;
891 parent_sp = page_header(__pa(pte_chain->parent_ptes[i]));
892 fn(vcpu, parent_sp);
893 mmu_parent_walk(vcpu, parent_sp, fn);
894 }
895}
896
897static void kvm_mmu_update_unsync_bitmap(u64 *spte)
898{
899 unsigned int index;
900 struct kvm_mmu_page *sp = page_header(__pa(spte));
901
902 index = spte - sp->spt;
903 __set_bit(index, sp->unsync_child_bitmap);
904 sp->unsync_children = 1;
905}
906
907static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp)
908{
909 struct kvm_pte_chain *pte_chain;
910 struct hlist_node *node;
911 int i;
912
913 if (!sp->parent_pte)
914 return;
915
916 if (!sp->multimapped) {
917 kvm_mmu_update_unsync_bitmap(sp->parent_pte);
918 return;
919 }
920
921 hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link)
922 for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) {
923 if (!pte_chain->parent_ptes[i])
924 break;
925 kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]);
926 }
927}
928
929static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
930{
931 sp->unsync_children = 1;
932 kvm_mmu_update_parents_unsync(sp);
933 return 1;
934}
935
936static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu,
937 struct kvm_mmu_page *sp)
938{
939 mmu_parent_walk(vcpu, sp, unsync_walk_fn);
940 kvm_mmu_update_parents_unsync(sp);
941}
942
943static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
944 struct kvm_mmu_page *sp)
945{
946 int i;
947
948 for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
949 sp->spt[i] = shadow_trap_nonpresent_pte;
950}
951
952static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
953 struct kvm_mmu_page *sp)
954{
955 return 1;
956}
957
958static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
959{
960}
961
962#define for_each_unsync_children(bitmap, idx) \
963 for (idx = find_first_bit(bitmap, 512); \
964 idx < 512; \
965 idx = find_next_bit(bitmap, 512, idx+1))
966
967static int mmu_unsync_walk(struct kvm_mmu_page *sp,
968 struct kvm_unsync_walk *walker)
969{
970 int i, ret;
971
972 if (!sp->unsync_children)
973 return 0;
974
975 for_each_unsync_children(sp->unsync_child_bitmap, i) {
976 u64 ent = sp->spt[i];
977
978 if (is_shadow_present_pte(ent)) {
979 struct kvm_mmu_page *child;
980 child = page_header(ent & PT64_BASE_ADDR_MASK);
981
982 if (child->unsync_children) {
983 ret = mmu_unsync_walk(child, walker);
984 if (ret)
985 return ret;
986 __clear_bit(i, sp->unsync_child_bitmap);
987 }
988
989 if (child->unsync) {
990 ret = walker->entry(child, walker);
991 __clear_bit(i, sp->unsync_child_bitmap);
992 if (ret)
993 return ret;
994 }
995 }
996 }
997
998 if (find_first_bit(sp->unsync_child_bitmap, 512) == 512)
999 sp->unsync_children = 0;
1000
1001 return 0;
1002}
1003
779static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) 1004static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
780{ 1005{
 781 unsigned index; 1006 unsigned index;
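
mmu_unsync_walk() above iterates the 512-entry unsync_child_bitmap, visiting each marked child and clearing unsync_children once the bitmap drains. A compact userspace approximation of that bitmap protocol (512 slots assumed, helper names invented) is:

#include <stdio.h>
#include <stdbool.h>

#define NR_ENTRIES 512
#define BITS_PER_LONG (8 * sizeof(unsigned long))

struct sp_model {
	unsigned long unsync_child_bitmap[NR_ENTRIES / BITS_PER_LONG];
	bool unsync_children;
};

static void mark_unsync(struct sp_model *sp, int idx)
{
	sp->unsync_child_bitmap[idx / BITS_PER_LONG] |= 1UL << (idx % BITS_PER_LONG);
	sp->unsync_children = true;
}

static int find_next(struct sp_model *sp, int from)
{
	for (int i = from; i < NR_ENTRIES; i++)
		if (sp->unsync_child_bitmap[i / BITS_PER_LONG] &
		    (1UL << (i % BITS_PER_LONG)))
			return i;
	return NR_ENTRIES;
}

int main(void)
{
	struct sp_model sp = { { 0 }, false };

	mark_unsync(&sp, 7);
	mark_unsync(&sp, 300);

	for (int i = find_next(&sp, 0); i < NR_ENTRIES; i = find_next(&sp, i + 1)) {
		printf("sync child at index %d\n", i);
		sp.unsync_child_bitmap[i / BITS_PER_LONG] &=
			~(1UL << (i % BITS_PER_LONG));	/* clear once handled */
	}
	if (find_next(&sp, 0) == NR_ENTRIES)
		sp.unsync_children = false;		/* bitmap drained */

	printf("unsync_children = %d\n", sp.unsync_children);
	return 0;
}
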
@@ -796,6 +1021,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn)
796 return NULL; 1021 return NULL;
797} 1022}
798 1023
1024static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1025{
1026 WARN_ON(!sp->unsync);
1027 sp->unsync = 0;
1028 --kvm->stat.mmu_unsync;
1029}
1030
1031static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp);
1032
1033static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1034{
1035 if (sp->role.glevels != vcpu->arch.mmu.root_level) {
1036 kvm_mmu_zap_page(vcpu->kvm, sp);
1037 return 1;
1038 }
1039
1040 rmap_write_protect(vcpu->kvm, sp->gfn);
1041 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1042 kvm_mmu_zap_page(vcpu->kvm, sp);
1043 return 1;
1044 }
1045
1046 kvm_mmu_flush_tlb(vcpu);
1047 kvm_unlink_unsync_page(vcpu->kvm, sp);
1048 return 0;
1049}
1050
1051struct sync_walker {
1052 struct kvm_vcpu *vcpu;
1053 struct kvm_unsync_walk walker;
1054};
1055
1056static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1057{
1058 struct sync_walker *sync_walk = container_of(walk, struct sync_walker,
1059 walker);
1060 struct kvm_vcpu *vcpu = sync_walk->vcpu;
1061
1062 kvm_sync_page(vcpu, sp);
1063 return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock));
1064}
1065
1066static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1067{
1068 struct sync_walker walker = {
1069 .walker = { .entry = mmu_sync_fn, },
1070 .vcpu = vcpu,
1071 };
1072
1073 while (mmu_unsync_walk(sp, &walker.walker))
1074 cond_resched_lock(&vcpu->kvm->mmu_lock);
1075}
1076
799static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, 1077static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
800 gfn_t gfn, 1078 gfn_t gfn,
801 gva_t gaddr, 1079 gva_t gaddr,
@@ -809,7 +1087,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
809 unsigned quadrant; 1087 unsigned quadrant;
810 struct hlist_head *bucket; 1088 struct hlist_head *bucket;
811 struct kvm_mmu_page *sp; 1089 struct kvm_mmu_page *sp;
812 struct hlist_node *node; 1090 struct hlist_node *node, *tmp;
813 1091
814 role.word = 0; 1092 role.word = 0;
815 role.glevels = vcpu->arch.mmu.root_level; 1093 role.glevels = vcpu->arch.mmu.root_level;
@@ -825,9 +1103,20 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
825 gfn, role.word); 1103 gfn, role.word);
826 index = kvm_page_table_hashfn(gfn); 1104 index = kvm_page_table_hashfn(gfn);
827 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 1105 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
828 hlist_for_each_entry(sp, node, bucket, hash_link) 1106 hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link)
829 if (sp->gfn == gfn && sp->role.word == role.word) { 1107 if (sp->gfn == gfn) {
1108 if (sp->unsync)
1109 if (kvm_sync_page(vcpu, sp))
1110 continue;
1111
1112 if (sp->role.word != role.word)
1113 continue;
1114
830 mmu_page_add_parent_pte(vcpu, sp, parent_pte); 1115 mmu_page_add_parent_pte(vcpu, sp, parent_pte);
1116 if (sp->unsync_children) {
1117 set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests);
1118 kvm_mmu_mark_parents_unsync(vcpu, sp);
1119 }
831 pgprintk("%s: found\n", __func__); 1120 pgprintk("%s: found\n", __func__);
832 return sp; 1121 return sp;
833 } 1122 }
@@ -839,12 +1128,46 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
839 sp->gfn = gfn; 1128 sp->gfn = gfn;
840 sp->role = role; 1129 sp->role = role;
841 hlist_add_head(&sp->hash_link, bucket); 1130 hlist_add_head(&sp->hash_link, bucket);
842 if (!metaphysical) 1131 if (!metaphysical) {
843 rmap_write_protect(vcpu->kvm, gfn); 1132 rmap_write_protect(vcpu->kvm, gfn);
844 vcpu->arch.mmu.prefetch_page(vcpu, sp); 1133 account_shadowed(vcpu->kvm, gfn);
1134 }
1135 if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte)
1136 vcpu->arch.mmu.prefetch_page(vcpu, sp);
1137 else
1138 nonpaging_prefetch_page(vcpu, sp);
845 return sp; 1139 return sp;
846} 1140}
847 1141
1142static int walk_shadow(struct kvm_shadow_walk *walker,
1143 struct kvm_vcpu *vcpu, u64 addr)
1144{
1145 hpa_t shadow_addr;
1146 int level;
1147 int r;
1148 u64 *sptep;
1149 unsigned index;
1150
1151 shadow_addr = vcpu->arch.mmu.root_hpa;
1152 level = vcpu->arch.mmu.shadow_root_level;
1153 if (level == PT32E_ROOT_LEVEL) {
1154 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
1155 shadow_addr &= PT64_BASE_ADDR_MASK;
1156 --level;
1157 }
1158
1159 while (level >= PT_PAGE_TABLE_LEVEL) {
1160 index = SHADOW_PT_INDEX(addr, level);
1161 sptep = ((u64 *)__va(shadow_addr)) + index;
1162 r = walker->entry(walker, vcpu, addr, sptep, level);
1163 if (r)
1164 return r;
1165 shadow_addr = *sptep & PT64_BASE_ADDR_MASK;
1166 --level;
1167 }
1168 return 0;
1169}
1170
848static void kvm_mmu_page_unlink_children(struct kvm *kvm, 1171static void kvm_mmu_page_unlink_children(struct kvm *kvm,
849 struct kvm_mmu_page *sp) 1172 struct kvm_mmu_page *sp)
850{ 1173{
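
walk_shadow(), added above, descends the shadow page table one level at a time, taking a 9-bit index from the address at each level via SHADOW_PT_INDEX()/PT64_INDEX(). The index extraction on its own is easy to reproduce:

#include <stdio.h>
#include <stdint.h>

#define PT64_LEVEL_BITS 9
#define PAGE_SHIFT 12

static unsigned pt64_index(uint64_t addr, int level)
{
	return (addr >> (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)) & 0x1ff;
}

int main(void)
{
	uint64_t addr = 0x00007f1234567000ULL;	/* arbitrary example address */

	for (int level = 4; level >= 1; level--)
		printf("level %d index: %u\n", level, pt64_index(addr, level));
	return 0;
}
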
@@ -860,7 +1183,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
860 rmap_remove(kvm, &pt[i]); 1183 rmap_remove(kvm, &pt[i]);
861 pt[i] = shadow_trap_nonpresent_pte; 1184 pt[i] = shadow_trap_nonpresent_pte;
862 } 1185 }
863 kvm_flush_remote_tlbs(kvm);
864 return; 1186 return;
865 } 1187 }
866 1188
@@ -879,7 +1201,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm,
879 } 1201 }
880 pt[i] = shadow_trap_nonpresent_pte; 1202 pt[i] = shadow_trap_nonpresent_pte;
881 } 1203 }
882 kvm_flush_remote_tlbs(kvm);
883} 1204}
884 1205
885static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) 1206static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
@@ -896,11 +1217,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm)
896 kvm->vcpus[i]->arch.last_pte_updated = NULL; 1217 kvm->vcpus[i]->arch.last_pte_updated = NULL;
897} 1218}
898 1219
899static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1220static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
900{ 1221{
901 u64 *parent_pte; 1222 u64 *parent_pte;
902 1223
903 ++kvm->stat.mmu_shadow_zapped;
904 while (sp->multimapped || sp->parent_pte) { 1224 while (sp->multimapped || sp->parent_pte) {
905 if (!sp->multimapped) 1225 if (!sp->multimapped)
906 parent_pte = sp->parent_pte; 1226 parent_pte = sp->parent_pte;
@@ -915,18 +1235,59 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
915 kvm_mmu_put_page(sp, parent_pte); 1235 kvm_mmu_put_page(sp, parent_pte);
916 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); 1236 set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte);
917 } 1237 }
1238}
1239
1240struct zap_walker {
1241 struct kvm_unsync_walk walker;
1242 struct kvm *kvm;
1243 int zapped;
1244};
1245
1246static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk)
1247{
1248 struct zap_walker *zap_walk = container_of(walk, struct zap_walker,
1249 walker);
1250 kvm_mmu_zap_page(zap_walk->kvm, sp);
1251 zap_walk->zapped = 1;
1252 return 0;
1253}
1254
1255static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp)
1256{
1257 struct zap_walker walker = {
1258 .walker = { .entry = mmu_zap_fn, },
1259 .kvm = kvm,
1260 .zapped = 0,
1261 };
1262
1263 if (sp->role.level == PT_PAGE_TABLE_LEVEL)
1264 return 0;
1265 mmu_unsync_walk(sp, &walker.walker);
1266 return walker.zapped;
1267}
1268
1269static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp)
1270{
1271 int ret;
1272 ++kvm->stat.mmu_shadow_zapped;
1273 ret = mmu_zap_unsync_children(kvm, sp);
918 kvm_mmu_page_unlink_children(kvm, sp); 1274 kvm_mmu_page_unlink_children(kvm, sp);
1275 kvm_mmu_unlink_parents(kvm, sp);
1276 kvm_flush_remote_tlbs(kvm);
1277 if (!sp->role.invalid && !sp->role.metaphysical)
1278 unaccount_shadowed(kvm, sp->gfn);
1279 if (sp->unsync)
1280 kvm_unlink_unsync_page(kvm, sp);
919 if (!sp->root_count) { 1281 if (!sp->root_count) {
920 if (!sp->role.metaphysical)
921 unaccount_shadowed(kvm, sp->gfn);
922 hlist_del(&sp->hash_link); 1282 hlist_del(&sp->hash_link);
923 kvm_mmu_free_page(kvm, sp); 1283 kvm_mmu_free_page(kvm, sp);
924 } else { 1284 } else {
925 list_move(&sp->link, &kvm->arch.active_mmu_pages);
926 sp->role.invalid = 1; 1285 sp->role.invalid = 1;
1286 list_move(&sp->link, &kvm->arch.active_mmu_pages);
927 kvm_reload_remote_mmus(kvm); 1287 kvm_reload_remote_mmus(kvm);
928 } 1288 }
929 kvm_mmu_reset_last_pte_updated(kvm); 1289 kvm_mmu_reset_last_pte_updated(kvm);
1290 return ret;
930} 1291}
931 1292
932/* 1293/*
@@ -979,8 +1340,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
979 if (sp->gfn == gfn && !sp->role.metaphysical) { 1340 if (sp->gfn == gfn && !sp->role.metaphysical) {
980 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1341 pgprintk("%s: gfn %lx role %x\n", __func__, gfn,
981 sp->role.word); 1342 sp->role.word);
982 kvm_mmu_zap_page(kvm, sp);
983 r = 1; 1343 r = 1;
1344 if (kvm_mmu_zap_page(kvm, sp))
1345 n = bucket->first;
984 } 1346 }
985 return r; 1347 return r;
986} 1348}
@@ -1003,6 +1365,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1003 __set_bit(slot, &sp->slot_bitmap); 1365 __set_bit(slot, &sp->slot_bitmap);
1004} 1366}
1005 1367
1368static void mmu_convert_notrap(struct kvm_mmu_page *sp)
1369{
1370 int i;
1371 u64 *pt = sp->spt;
1372
1373 if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte)
1374 return;
1375
1376 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
1377 if (pt[i] == shadow_notrap_nonpresent_pte)
1378 set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte);
1379 }
1380}
1381
1006struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) 1382struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1007{ 1383{
1008 struct page *page; 1384 struct page *page;
@@ -1012,51 +1388,60 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1012 if (gpa == UNMAPPED_GVA) 1388 if (gpa == UNMAPPED_GVA)
1013 return NULL; 1389 return NULL;
1014 1390
1015 down_read(&current->mm->mmap_sem);
1016 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 1391 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1017 up_read(&current->mm->mmap_sem);
1018 1392
1019 return page; 1393 return page;
1020} 1394}
1021 1395
1022static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, 1396static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
1023 unsigned pt_access, unsigned pte_access,
1024 int user_fault, int write_fault, int dirty,
1025 int *ptwrite, int largepage, gfn_t gfn,
1026 pfn_t pfn, bool speculative)
1027{ 1397{
1028 u64 spte; 1398 unsigned index;
1029 int was_rmapped = 0; 1399 struct hlist_head *bucket;
1030 int was_writeble = is_writeble_pte(*shadow_pte); 1400 struct kvm_mmu_page *s;
1401 struct hlist_node *node, *n;
1031 1402
1032 pgprintk("%s: spte %llx access %x write_fault %d" 1403 index = kvm_page_table_hashfn(sp->gfn);
1033 " user_fault %d gfn %lx\n", 1404 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1034 __func__, *shadow_pte, pt_access, 1405 /* don't unsync if pagetable is shadowed with multiple roles */
1035 write_fault, user_fault, gfn); 1406 hlist_for_each_entry_safe(s, node, n, bucket, hash_link) {
1407 if (s->gfn != sp->gfn || s->role.metaphysical)
1408 continue;
1409 if (s->role.word != sp->role.word)
1410 return 1;
1411 }
1412 kvm_mmu_mark_parents_unsync(vcpu, sp);
1413 ++vcpu->kvm->stat.mmu_unsync;
1414 sp->unsync = 1;
1415 mmu_convert_notrap(sp);
1416 return 0;
1417}
1036 1418
1037 if (is_rmap_pte(*shadow_pte)) { 1419static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
1038 /* 1420 bool can_unsync)
1039 * If we overwrite a PTE page pointer with a 2MB PMD, unlink 1421{
1040 * the parent of the now unreachable PTE. 1422 struct kvm_mmu_page *shadow;
1041 */
1042 if (largepage && !is_large_pte(*shadow_pte)) {
1043 struct kvm_mmu_page *child;
1044 u64 pte = *shadow_pte;
1045 1423
1046 child = page_header(pte & PT64_BASE_ADDR_MASK); 1424 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn);
1047 mmu_page_remove_parent_pte(child, shadow_pte); 1425 if (shadow) {
1048 } else if (pfn != spte_to_pfn(*shadow_pte)) { 1426 if (shadow->role.level != PT_PAGE_TABLE_LEVEL)
1049 pgprintk("hfn old %lx new %lx\n", 1427 return 1;
1050 spte_to_pfn(*shadow_pte), pfn); 1428 if (shadow->unsync)
1051 rmap_remove(vcpu->kvm, shadow_pte); 1429 return 0;
1052 } else { 1430 if (can_unsync && oos_shadow)
1053 if (largepage) 1431 return kvm_unsync_page(vcpu, shadow);
1054 was_rmapped = is_large_pte(*shadow_pte); 1432 return 1;
1055 else
1056 was_rmapped = 1;
1057 }
1058 } 1433 }
1434 return 0;
1435}
1059 1436
1437static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1438 unsigned pte_access, int user_fault,
1439 int write_fault, int dirty, int largepage,
1440 gfn_t gfn, pfn_t pfn, bool speculative,
1441 bool can_unsync)
1442{
1443 u64 spte;
1444 int ret = 0;
1060 /* 1445 /*
1061 * We don't set the accessed bit, since we sometimes want to see 1446 * We don't set the accessed bit, since we sometimes want to see
1062 * whether the guest actually used the pte (in order to detect 1447 * whether the guest actually used the pte (in order to detect
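
The new mmu_need_write_protect()/kvm_unsync_page() pair above decides whether a writable mapping of a guest page table must be made read-only or can instead stay writable and be tracked as out of sync. A simplified userspace decision model, with behaviour approximated from the hunk rather than copied from it, is sketched below:

#include <stdio.h>
#include <stdbool.h>

struct shadow_model {
	bool exists;		/* some shadow page maps this gfn */
	bool is_leaf;		/* it is a last-level page table */
	bool unsync;		/* already marked out of sync */
};

/* Returns true when the new spte must be made read-only. */
static bool need_write_protect(struct shadow_model *s, bool can_unsync,
			       bool oos_shadow)
{
	if (!s->exists)
		return false;
	if (!s->is_leaf)
		return true;		/* higher-level tables stay protected */
	if (s->unsync)
		return false;		/* already tracked as out of sync */
	if (can_unsync && oos_shadow) {
		s->unsync = true;	/* the kvm_unsync_page() step */
		return false;
	}
	return true;
}

int main(void)
{
	struct shadow_model leaf = { true, true, false };
	struct shadow_model dir  = { true, false, false };

	printf("%d\n", need_write_protect(&leaf, true, true));	/* 0: left writable, now unsync */
	printf("%d\n", need_write_protect(&leaf, true, true));	/* 0: already unsync */
	printf("%d\n", need_write_protect(&dir, true, true));	/* 1: must write-protect */
	return 0;
}
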
@@ -1064,7 +1449,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1064 */ 1449 */
1065 spte = shadow_base_present_pte | shadow_dirty_mask; 1450 spte = shadow_base_present_pte | shadow_dirty_mask;
1066 if (!speculative) 1451 if (!speculative)
1067 pte_access |= PT_ACCESSED_MASK; 1452 spte |= shadow_accessed_mask;
1068 if (!dirty) 1453 if (!dirty)
1069 pte_access &= ~ACC_WRITE_MASK; 1454 pte_access &= ~ACC_WRITE_MASK;
1070 if (pte_access & ACC_EXEC_MASK) 1455 if (pte_access & ACC_EXEC_MASK)
@@ -1080,35 +1465,82 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1080 1465
1081 if ((pte_access & ACC_WRITE_MASK) 1466 if ((pte_access & ACC_WRITE_MASK)
1082 || (write_fault && !is_write_protection(vcpu) && !user_fault)) { 1467 || (write_fault && !is_write_protection(vcpu) && !user_fault)) {
1083 struct kvm_mmu_page *shadow; 1468
1469 if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) {
1470 ret = 1;
1471 spte = shadow_trap_nonpresent_pte;
1472 goto set_pte;
1473 }
1084 1474
1085 spte |= PT_WRITABLE_MASK; 1475 spte |= PT_WRITABLE_MASK;
1086 1476
1087 shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); 1477 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1088 if (shadow ||
1089 (largepage && has_wrprotected_page(vcpu->kvm, gfn))) {
1090 pgprintk("%s: found shadow page for %lx, marking ro\n", 1478 pgprintk("%s: found shadow page for %lx, marking ro\n",
1091 __func__, gfn); 1479 __func__, gfn);
1480 ret = 1;
1092 pte_access &= ~ACC_WRITE_MASK; 1481 pte_access &= ~ACC_WRITE_MASK;
1093 if (is_writeble_pte(spte)) { 1482 if (is_writeble_pte(spte))
1094 spte &= ~PT_WRITABLE_MASK; 1483 spte &= ~PT_WRITABLE_MASK;
1095 kvm_x86_ops->tlb_flush(vcpu);
1096 }
1097 if (write_fault)
1098 *ptwrite = 1;
1099 } 1484 }
1100 } 1485 }
1101 1486
1102 if (pte_access & ACC_WRITE_MASK) 1487 if (pte_access & ACC_WRITE_MASK)
1103 mark_page_dirty(vcpu->kvm, gfn); 1488 mark_page_dirty(vcpu->kvm, gfn);
1104 1489
1105 pgprintk("%s: setting spte %llx\n", __func__, spte); 1490set_pte:
1106 pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n",
1107 (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB",
1108 (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte);
1109 set_shadow_pte(shadow_pte, spte); 1491 set_shadow_pte(shadow_pte, spte);
1110 if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) 1492 return ret;
1111 && (spte & PT_PRESENT_MASK)) 1493}
1494
1495static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1496 unsigned pt_access, unsigned pte_access,
1497 int user_fault, int write_fault, int dirty,
1498 int *ptwrite, int largepage, gfn_t gfn,
1499 pfn_t pfn, bool speculative)
1500{
1501 int was_rmapped = 0;
1502 int was_writeble = is_writeble_pte(*shadow_pte);
1503
1504 pgprintk("%s: spte %llx access %x write_fault %d"
1505 " user_fault %d gfn %lx\n",
1506 __func__, *shadow_pte, pt_access,
1507 write_fault, user_fault, gfn);
1508
1509 if (is_rmap_pte(*shadow_pte)) {
1510 /*
1511 * If we overwrite a PTE page pointer with a 2MB PMD, unlink
1512 * the parent of the now unreachable PTE.
1513 */
1514 if (largepage && !is_large_pte(*shadow_pte)) {
1515 struct kvm_mmu_page *child;
1516 u64 pte = *shadow_pte;
1517
1518 child = page_header(pte & PT64_BASE_ADDR_MASK);
1519 mmu_page_remove_parent_pte(child, shadow_pte);
1520 } else if (pfn != spte_to_pfn(*shadow_pte)) {
1521 pgprintk("hfn old %lx new %lx\n",
1522 spte_to_pfn(*shadow_pte), pfn);
1523 rmap_remove(vcpu->kvm, shadow_pte);
1524 } else {
1525 if (largepage)
1526 was_rmapped = is_large_pte(*shadow_pte);
1527 else
1528 was_rmapped = 1;
1529 }
1530 }
1531 if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault,
1532 dirty, largepage, gfn, pfn, speculative, true)) {
1533 if (write_fault)
1534 *ptwrite = 1;
1535 kvm_x86_ops->tlb_flush(vcpu);
1536 }
1537
1538 pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte);
1539 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n",
1540 is_large_pte(*shadow_pte)? "2MB" : "4kB",
1541 is_present_pte(*shadow_pte)?"RW":"R", gfn,
1542 *shadow_pte, shadow_pte);
1543 if (!was_rmapped && is_large_pte(*shadow_pte))
1112 ++vcpu->kvm->stat.lpages; 1544 ++vcpu->kvm->stat.lpages;
1113 1545
1114 page_header_update_slot(vcpu->kvm, shadow_pte, gfn); 1546 page_header_update_slot(vcpu->kvm, shadow_pte, gfn);
@@ -1122,61 +1554,77 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte,
1122 else 1554 else
1123 kvm_release_pfn_clean(pfn); 1555 kvm_release_pfn_clean(pfn);
1124 } 1556 }
1125 if (!ptwrite || !*ptwrite) 1557 if (speculative) {
1126 vcpu->arch.last_pte_updated = shadow_pte; 1558 vcpu->arch.last_pte_updated = shadow_pte;
1559 vcpu->arch.last_pte_gfn = gfn;
1560 }
1127} 1561}
1128 1562
1129static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) 1563static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
1130{ 1564{
1131} 1565}
1132 1566
1133static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 1567struct direct_shadow_walk {
1134 int largepage, gfn_t gfn, pfn_t pfn, 1568 struct kvm_shadow_walk walker;
1135 int level) 1569 pfn_t pfn;
1136{ 1570 int write;
1137 hpa_t table_addr = vcpu->arch.mmu.root_hpa; 1571 int largepage;
1138 int pt_write = 0; 1572 int pt_write;
1139 1573};
1140 for (; ; level--) {
1141 u32 index = PT64_INDEX(v, level);
1142 u64 *table;
1143
1144 ASSERT(VALID_PAGE(table_addr));
1145 table = __va(table_addr);
1146 1574
1147 if (level == 1) { 1575static int direct_map_entry(struct kvm_shadow_walk *_walk,
1148 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1576 struct kvm_vcpu *vcpu,
1149 0, write, 1, &pt_write, 0, gfn, pfn, false); 1577 u64 addr, u64 *sptep, int level)
1150 return pt_write; 1578{
1151 } 1579 struct direct_shadow_walk *walk =
1580 container_of(_walk, struct direct_shadow_walk, walker);
1581 struct kvm_mmu_page *sp;
1582 gfn_t pseudo_gfn;
1583 gfn_t gfn = addr >> PAGE_SHIFT;
1584
1585 if (level == PT_PAGE_TABLE_LEVEL
1586 || (walk->largepage && level == PT_DIRECTORY_LEVEL)) {
1587 mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL,
1588 0, walk->write, 1, &walk->pt_write,
1589 walk->largepage, gfn, walk->pfn, false);
1590 ++vcpu->stat.pf_fixed;
1591 return 1;
1592 }
1152 1593
1153 if (largepage && level == 2) { 1594 if (*sptep == shadow_trap_nonpresent_pte) {
1154 mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, 1595 pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT;
1155 0, write, 1, &pt_write, 1, gfn, pfn, false); 1596 sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1,
1156 return pt_write; 1597 1, ACC_ALL, sptep);
1598 if (!sp) {
1599 pgprintk("nonpaging_map: ENOMEM\n");
1600 kvm_release_pfn_clean(walk->pfn);
1601 return -ENOMEM;
1157 } 1602 }
1158 1603
1159 if (table[index] == shadow_trap_nonpresent_pte) { 1604 set_shadow_pte(sptep,
1160 struct kvm_mmu_page *new_table; 1605 __pa(sp->spt)
1161 gfn_t pseudo_gfn; 1606 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1162 1607 | shadow_user_mask | shadow_x_mask);
1163 pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK)
1164 >> PAGE_SHIFT;
1165 new_table = kvm_mmu_get_page(vcpu, pseudo_gfn,
1166 v, level - 1,
1167 1, ACC_ALL, &table[index]);
1168 if (!new_table) {
1169 pgprintk("nonpaging_map: ENOMEM\n");
1170 kvm_release_pfn_clean(pfn);
1171 return -ENOMEM;
1172 }
1173
1174 table[index] = __pa(new_table->spt)
1175 | PT_PRESENT_MASK | PT_WRITABLE_MASK
1176 | shadow_user_mask | shadow_x_mask;
1177 }
1178 table_addr = table[index] & PT64_BASE_ADDR_MASK;
1179 } 1608 }
1609 return 0;
1610}
1611
1612static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
1613 int largepage, gfn_t gfn, pfn_t pfn)
1614{
1615 int r;
1616 struct direct_shadow_walk walker = {
1617 .walker = { .entry = direct_map_entry, },
1618 .pfn = pfn,
1619 .largepage = largepage,
1620 .write = write,
1621 .pt_write = 0,
1622 };
1623
1624 r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT);
1625 if (r < 0)
1626 return r;
1627 return walker.pt_write;
1180} 1628}
1181 1629
1182static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 1630static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
@@ -1184,15 +1632,16 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1184 int r; 1632 int r;
1185 int largepage = 0; 1633 int largepage = 0;
1186 pfn_t pfn; 1634 pfn_t pfn;
1635 unsigned long mmu_seq;
1187 1636
1188 down_read(&current->mm->mmap_sem);
1189 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1637 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1190 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1638 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1191 largepage = 1; 1639 largepage = 1;
1192 } 1640 }
1193 1641
1642 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1643 smp_rmb();
1194 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1644 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1195 up_read(&current->mm->mmap_sem);
1196 1645
1197 /* mmio */ 1646 /* mmio */
1198 if (is_error_pfn(pfn)) { 1647 if (is_error_pfn(pfn)) {
@@ -1201,25 +1650,22 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
1201 } 1650 }
1202 1651
1203 spin_lock(&vcpu->kvm->mmu_lock); 1652 spin_lock(&vcpu->kvm->mmu_lock);
1653 if (mmu_notifier_retry(vcpu, mmu_seq))
1654 goto out_unlock;
1204 kvm_mmu_free_some_pages(vcpu); 1655 kvm_mmu_free_some_pages(vcpu);
1205 r = __direct_map(vcpu, v, write, largepage, gfn, pfn, 1656 r = __direct_map(vcpu, v, write, largepage, gfn, pfn);
1206 PT32E_ROOT_LEVEL);
1207 spin_unlock(&vcpu->kvm->mmu_lock); 1657 spin_unlock(&vcpu->kvm->mmu_lock);
1208 1658
1209 1659
1210 return r; 1660 return r;
1211}
1212
1213
1214static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1215 struct kvm_mmu_page *sp)
1216{
1217 int i;
1218 1661
1219 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 1662out_unlock:
1220 sp->spt[i] = shadow_trap_nonpresent_pte; 1663 spin_unlock(&vcpu->kvm->mmu_lock);
1664 kvm_release_pfn_clean(pfn);
1665 return 0;
1221} 1666}
1222 1667
1668
1223static void mmu_free_roots(struct kvm_vcpu *vcpu) 1669static void mmu_free_roots(struct kvm_vcpu *vcpu)
1224{ 1670{
1225 int i; 1671 int i;
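
nonpaging_map() above samples mmu_notifier_seq before the sleepable gfn_to_pfn() and, once mmu_lock is held, backs out if mmu_notifier_retry() reports that an invalidation intervened. The retry check can be modelled in userspace as a simple sequence-counter protocol (names invented for the example):

#include <stdio.h>
#include <stdbool.h>

struct kvm_model {
	unsigned long mmu_notifier_seq;
	unsigned long mmu_notifier_count;	/* nonzero while an invalidation runs */
};

static bool retry_needed(struct kvm_model *kvm, unsigned long seq)
{
	if (kvm->mmu_notifier_count)
		return true;			/* invalidation in progress */
	if (kvm->mmu_notifier_seq != seq)
		return true;			/* invalidation completed since sampling */
	return false;
}

int main(void)
{
	struct kvm_model kvm = { 0, 0 };

	unsigned long seq = kvm.mmu_notifier_seq;	/* sampled before the page lookup */
	/* ... sleepable gfn_to_pfn()-like work would happen here ... */
	kvm.mmu_notifier_seq++;				/* a concurrent invalidation finishes */

	/* ... now "take mmu_lock" and decide whether to install the pfn ... */
	printf("retry: %d\n", retry_needed(&kvm, seq));	/* 1: must drop the pfn and retry */
	return 0;
}
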
@@ -1303,6 +1749,37 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu)
1303 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 1749 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
1304} 1750}
1305 1751
1752static void mmu_sync_roots(struct kvm_vcpu *vcpu)
1753{
1754 int i;
1755 struct kvm_mmu_page *sp;
1756
1757 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
1758 return;
1759 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
1760 hpa_t root = vcpu->arch.mmu.root_hpa;
1761 sp = page_header(root);
1762 mmu_sync_children(vcpu, sp);
1763 return;
1764 }
1765 for (i = 0; i < 4; ++i) {
1766 hpa_t root = vcpu->arch.mmu.pae_root[i];
1767
1768 if (root) {
1769 root &= PT64_BASE_ADDR_MASK;
1770 sp = page_header(root);
1771 mmu_sync_children(vcpu, sp);
1772 }
1773 }
1774}
1775
1776void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
1777{
1778 spin_lock(&vcpu->kvm->mmu_lock);
1779 mmu_sync_roots(vcpu);
1780 spin_unlock(&vcpu->kvm->mmu_lock);
1781}
1782
1306static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 1783static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr)
1307{ 1784{
1308 return vaddr; 1785 return vaddr;
@@ -1335,6 +1812,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1335 int r; 1812 int r;
1336 int largepage = 0; 1813 int largepage = 0;
1337 gfn_t gfn = gpa >> PAGE_SHIFT; 1814 gfn_t gfn = gpa >> PAGE_SHIFT;
1815 unsigned long mmu_seq;
1338 1816
1339 ASSERT(vcpu); 1817 ASSERT(vcpu);
1340 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 1818 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -1343,24 +1821,31 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
1343 if (r) 1821 if (r)
1344 return r; 1822 return r;
1345 1823
1346 down_read(&current->mm->mmap_sem);
1347 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { 1824 if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) {
1348 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 1825 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1349 largepage = 1; 1826 largepage = 1;
1350 } 1827 }
1828 mmu_seq = vcpu->kvm->mmu_notifier_seq;
1829 smp_rmb();
1351 pfn = gfn_to_pfn(vcpu->kvm, gfn); 1830 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1352 up_read(&current->mm->mmap_sem);
1353 if (is_error_pfn(pfn)) { 1831 if (is_error_pfn(pfn)) {
1354 kvm_release_pfn_clean(pfn); 1832 kvm_release_pfn_clean(pfn);
1355 return 1; 1833 return 1;
1356 } 1834 }
1357 spin_lock(&vcpu->kvm->mmu_lock); 1835 spin_lock(&vcpu->kvm->mmu_lock);
1836 if (mmu_notifier_retry(vcpu, mmu_seq))
1837 goto out_unlock;
1358 kvm_mmu_free_some_pages(vcpu); 1838 kvm_mmu_free_some_pages(vcpu);
1359 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 1839 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK,
1360 largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); 1840 largepage, gfn, pfn);
1361 spin_unlock(&vcpu->kvm->mmu_lock); 1841 spin_unlock(&vcpu->kvm->mmu_lock);
1362 1842
1363 return r; 1843 return r;
1844
1845out_unlock:
1846 spin_unlock(&vcpu->kvm->mmu_lock);
1847 kvm_release_pfn_clean(pfn);
1848 return 0;
1364} 1849}
1365 1850
1366static void nonpaging_free(struct kvm_vcpu *vcpu) 1851static void nonpaging_free(struct kvm_vcpu *vcpu)
@@ -1377,6 +1862,8 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
1377 context->gva_to_gpa = nonpaging_gva_to_gpa; 1862 context->gva_to_gpa = nonpaging_gva_to_gpa;
1378 context->free = nonpaging_free; 1863 context->free = nonpaging_free;
1379 context->prefetch_page = nonpaging_prefetch_page; 1864 context->prefetch_page = nonpaging_prefetch_page;
1865 context->sync_page = nonpaging_sync_page;
1866 context->invlpg = nonpaging_invlpg;
1380 context->root_level = 0; 1867 context->root_level = 0;
1381 context->shadow_root_level = PT32E_ROOT_LEVEL; 1868 context->shadow_root_level = PT32E_ROOT_LEVEL;
1382 context->root_hpa = INVALID_PAGE; 1869 context->root_hpa = INVALID_PAGE;
@@ -1424,6 +1911,8 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
1424 context->page_fault = paging64_page_fault; 1911 context->page_fault = paging64_page_fault;
1425 context->gva_to_gpa = paging64_gva_to_gpa; 1912 context->gva_to_gpa = paging64_gva_to_gpa;
1426 context->prefetch_page = paging64_prefetch_page; 1913 context->prefetch_page = paging64_prefetch_page;
1914 context->sync_page = paging64_sync_page;
1915 context->invlpg = paging64_invlpg;
1427 context->free = paging_free; 1916 context->free = paging_free;
1428 context->root_level = level; 1917 context->root_level = level;
1429 context->shadow_root_level = level; 1918 context->shadow_root_level = level;
@@ -1445,6 +1934,8 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
1445 context->gva_to_gpa = paging32_gva_to_gpa; 1934 context->gva_to_gpa = paging32_gva_to_gpa;
1446 context->free = paging_free; 1935 context->free = paging_free;
1447 context->prefetch_page = paging32_prefetch_page; 1936 context->prefetch_page = paging32_prefetch_page;
1937 context->sync_page = paging32_sync_page;
1938 context->invlpg = paging32_invlpg;
1448 context->root_level = PT32_ROOT_LEVEL; 1939 context->root_level = PT32_ROOT_LEVEL;
1449 context->shadow_root_level = PT32E_ROOT_LEVEL; 1940 context->shadow_root_level = PT32E_ROOT_LEVEL;
1450 context->root_hpa = INVALID_PAGE; 1941 context->root_hpa = INVALID_PAGE;
@@ -1464,6 +1955,8 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
1464 context->page_fault = tdp_page_fault; 1955 context->page_fault = tdp_page_fault;
1465 context->free = nonpaging_free; 1956 context->free = nonpaging_free;
1466 context->prefetch_page = nonpaging_prefetch_page; 1957 context->prefetch_page = nonpaging_prefetch_page;
1958 context->sync_page = nonpaging_sync_page;
1959 context->invlpg = nonpaging_invlpg;
1467 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 1960 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
1468 context->root_hpa = INVALID_PAGE; 1961 context->root_hpa = INVALID_PAGE;
1469 1962
@@ -1535,6 +2028,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
1535 spin_lock(&vcpu->kvm->mmu_lock); 2028 spin_lock(&vcpu->kvm->mmu_lock);
1536 kvm_mmu_free_some_pages(vcpu); 2029 kvm_mmu_free_some_pages(vcpu);
1537 mmu_alloc_roots(vcpu); 2030 mmu_alloc_roots(vcpu);
2031 mmu_sync_roots(vcpu);
1538 spin_unlock(&vcpu->kvm->mmu_lock); 2032 spin_unlock(&vcpu->kvm->mmu_lock);
1539 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 2033 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
1540 kvm_mmu_flush_tlb(vcpu); 2034 kvm_mmu_flush_tlb(vcpu);
@@ -1655,13 +2149,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1655 return; 2149 return;
1656 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; 2150 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
1657 2151
1658 down_read(&current->mm->mmap_sem);
1659 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { 2152 if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) {
1660 gfn &= ~(KVM_PAGES_PER_HPAGE-1); 2153 gfn &= ~(KVM_PAGES_PER_HPAGE-1);
1661 vcpu->arch.update_pte.largepage = 1; 2154 vcpu->arch.update_pte.largepage = 1;
1662 } 2155 }
2156 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2157 smp_rmb();
1663 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2158 pfn = gfn_to_pfn(vcpu->kvm, gfn);
1664 up_read(&current->mm->mmap_sem);
1665 2159
1666 if (is_error_pfn(pfn)) { 2160 if (is_error_pfn(pfn)) {
1667 kvm_release_pfn_clean(pfn); 2161 kvm_release_pfn_clean(pfn);
@@ -1671,6 +2165,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1671 vcpu->arch.update_pte.pfn = pfn; 2165 vcpu->arch.update_pte.pfn = pfn;
1672} 2166}
1673 2167
2168static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2169{
2170 u64 *spte = vcpu->arch.last_pte_updated;
2171
2172 if (spte
2173 && vcpu->arch.last_pte_gfn == gfn
2174 && shadow_accessed_mask
2175 && !(*spte & shadow_accessed_mask)
2176 && is_shadow_present_pte(*spte))
2177 set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
2178}
2179
1674void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, 2180void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1675 const u8 *new, int bytes) 2181 const u8 *new, int bytes)
1676{ 2182{
@@ -1694,6 +2200,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1694 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 2200 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
1695 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); 2201 mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes);
1696 spin_lock(&vcpu->kvm->mmu_lock); 2202 spin_lock(&vcpu->kvm->mmu_lock);
2203 kvm_mmu_access_page(vcpu, gfn);
1697 kvm_mmu_free_some_pages(vcpu); 2204 kvm_mmu_free_some_pages(vcpu);
1698 ++vcpu->kvm->stat.mmu_pte_write; 2205 ++vcpu->kvm->stat.mmu_pte_write;
1699 kvm_mmu_audit(vcpu, "pre pte write"); 2206 kvm_mmu_audit(vcpu, "pre pte write");
@@ -1710,7 +2217,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1710 index = kvm_page_table_hashfn(gfn); 2217 index = kvm_page_table_hashfn(gfn);
1711 bucket = &vcpu->kvm->arch.mmu_page_hash[index]; 2218 bucket = &vcpu->kvm->arch.mmu_page_hash[index];
1712 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { 2219 hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) {
1713 if (sp->gfn != gfn || sp->role.metaphysical) 2220 if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid)
1714 continue; 2221 continue;
1715 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; 2222 pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8;
1716 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); 2223 misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
@@ -1728,7 +2235,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
1728 */ 2235 */
1729 pgprintk("misaligned: gpa %llx bytes %d role %x\n", 2236 pgprintk("misaligned: gpa %llx bytes %d role %x\n",
1730 gpa, bytes, sp->role.word); 2237 gpa, bytes, sp->role.word);
1731 kvm_mmu_zap_page(vcpu->kvm, sp); 2238 if (kvm_mmu_zap_page(vcpu->kvm, sp))
2239 n = bucket->first;
1732 ++vcpu->kvm->stat.mmu_flooded; 2240 ++vcpu->kvm->stat.mmu_flooded;
1733 continue; 2241 continue;
1734 } 2242 }
@@ -1791,6 +2299,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
1791 spin_unlock(&vcpu->kvm->mmu_lock); 2299 spin_unlock(&vcpu->kvm->mmu_lock);
1792 return r; 2300 return r;
1793} 2301}
2302EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
1794 2303
1795void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 2304void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
1796{ 2305{
@@ -1841,12 +2350,28 @@ out:
1841} 2350}
1842EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); 2351EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
1843 2352
2353void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
2354{
2355 spin_lock(&vcpu->kvm->mmu_lock);
2356 vcpu->arch.mmu.invlpg(vcpu, gva);
2357 spin_unlock(&vcpu->kvm->mmu_lock);
2358 kvm_mmu_flush_tlb(vcpu);
2359 ++vcpu->stat.invlpg;
2360}
2361EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
2362
1844void kvm_enable_tdp(void) 2363void kvm_enable_tdp(void)
1845{ 2364{
1846 tdp_enabled = true; 2365 tdp_enabled = true;
1847} 2366}
1848EXPORT_SYMBOL_GPL(kvm_enable_tdp); 2367EXPORT_SYMBOL_GPL(kvm_enable_tdp);
1849 2368
2369void kvm_disable_tdp(void)
2370{
2371 tdp_enabled = false;
2372}
2373EXPORT_SYMBOL_GPL(kvm_disable_tdp);
2374
1850static void free_mmu_pages(struct kvm_vcpu *vcpu) 2375static void free_mmu_pages(struct kvm_vcpu *vcpu)
1851{ 2376{
1852 struct kvm_mmu_page *sp; 2377 struct kvm_mmu_page *sp;
@@ -1921,6 +2446,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1921{ 2446{
1922 struct kvm_mmu_page *sp; 2447 struct kvm_mmu_page *sp;
1923 2448
2449 spin_lock(&kvm->mmu_lock);
1924 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { 2450 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) {
1925 int i; 2451 int i;
1926 u64 *pt; 2452 u64 *pt;
@@ -1934,6 +2460,8 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
1934 if (pt[i] & PT_WRITABLE_MASK) 2460 if (pt[i] & PT_WRITABLE_MASK)
1935 pt[i] &= ~PT_WRITABLE_MASK; 2461 pt[i] &= ~PT_WRITABLE_MASK;
1936 } 2462 }
2463 kvm_flush_remote_tlbs(kvm);
2464 spin_unlock(&kvm->mmu_lock);
1937} 2465}
1938 2466
1939void kvm_mmu_zap_all(struct kvm *kvm) 2467void kvm_mmu_zap_all(struct kvm *kvm)
@@ -1942,13 +2470,15 @@ void kvm_mmu_zap_all(struct kvm *kvm)
1942 2470
1943 spin_lock(&kvm->mmu_lock); 2471 spin_lock(&kvm->mmu_lock);
1944 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) 2472 list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link)
1945 kvm_mmu_zap_page(kvm, sp); 2473 if (kvm_mmu_zap_page(kvm, sp))
2474 node = container_of(kvm->arch.active_mmu_pages.next,
2475 struct kvm_mmu_page, link);
1946 spin_unlock(&kvm->mmu_lock); 2476 spin_unlock(&kvm->mmu_lock);
1947 2477
1948 kvm_flush_remote_tlbs(kvm); 2478 kvm_flush_remote_tlbs(kvm);
1949} 2479}
1950 2480
1951void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) 2481static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm)
1952{ 2482{
1953 struct kvm_mmu_page *page; 2483 struct kvm_mmu_page *page;
1954 2484
@@ -1968,6 +2498,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1968 list_for_each_entry(kvm, &vm_list, vm_list) { 2498 list_for_each_entry(kvm, &vm_list, vm_list) {
1969 int npages; 2499 int npages;
1970 2500
2501 if (!down_read_trylock(&kvm->slots_lock))
2502 continue;
1971 spin_lock(&kvm->mmu_lock); 2503 spin_lock(&kvm->mmu_lock);
1972 npages = kvm->arch.n_alloc_mmu_pages - 2504 npages = kvm->arch.n_alloc_mmu_pages -
1973 kvm->arch.n_free_mmu_pages; 2505 kvm->arch.n_free_mmu_pages;
@@ -1980,6 +2512,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
1980 nr_to_scan--; 2512 nr_to_scan--;
1981 2513
1982 spin_unlock(&kvm->mmu_lock); 2514 spin_unlock(&kvm->mmu_lock);
2515 up_read(&kvm->slots_lock);
1983 } 2516 }
1984 if (kvm_freed) 2517 if (kvm_freed)
1985 list_move_tail(&kvm_freed->vm_list, &vm_list); 2518 list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -2154,18 +2687,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2154 gpa_t addr, unsigned long *ret) 2687 gpa_t addr, unsigned long *ret)
2155{ 2688{
2156 int r; 2689 int r;
2157 struct kvm_pv_mmu_op_buffer buffer; 2690 struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer;
2158 2691
2159 buffer.ptr = buffer.buf; 2692 buffer->ptr = buffer->buf;
2160 buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); 2693 buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf);
2161 buffer.processed = 0; 2694 buffer->processed = 0;
2162 2695
2163 r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); 2696 r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len);
2164 if (r) 2697 if (r)
2165 goto out; 2698 goto out;
2166 2699
2167 while (buffer.len) { 2700 while (buffer->len) {
2168 r = kvm_pv_mmu_op_one(vcpu, &buffer); 2701 r = kvm_pv_mmu_op_one(vcpu, buffer);
2169 if (r < 0) 2702 if (r < 0)
2170 goto out; 2703 goto out;
2171 if (r == 0) 2704 if (r == 0)
@@ -2174,7 +2707,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes,
2174 2707
2175 r = 1; 2708 r = 1;
2176out: 2709out:
2177 *ret = buffer.processed; 2710 *ret = buffer->processed;
2178 return r; 2711 return r;
2179} 2712}
2180 2713
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1730757bbc7a..258e5d56298e 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -15,7 +15,8 @@
15#define PT_USER_MASK (1ULL << 2) 15#define PT_USER_MASK (1ULL << 2)
16#define PT_PWT_MASK (1ULL << 3) 16#define PT_PWT_MASK (1ULL << 3)
17#define PT_PCD_MASK (1ULL << 4) 17#define PT_PCD_MASK (1ULL << 4)
18#define PT_ACCESSED_MASK (1ULL << 5) 18#define PT_ACCESSED_SHIFT 5
19#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT)
19#define PT_DIRTY_MASK (1ULL << 6) 20#define PT_DIRTY_MASK (1ULL << 6)
20#define PT_PAGE_SIZE_MASK (1ULL << 7) 21#define PT_PAGE_SIZE_MASK (1ULL << 7)
21#define PT_PAT_MASK (1ULL << 7) 22#define PT_PAT_MASK (1ULL << 7)
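
PT_ACCESSED_MASK is now derived from a separate PT_ACCESSED_SHIFT so the accessed bit can be set atomically with set_bit(), which is what the new kvm_mmu_access_page() in the mmu.c hunk above does. A hedged sketch of that use; the wrapper name is illustrative, while the mask, shift and predicates are the ones appearing in this diff:

	static void example_mark_spte_accessed(u64 *sptep)
	{
		/* avoid a racy read-modify-write on a live shadow pte */
		if (shadow_accessed_mask &&
		    is_shadow_present_pte(*sptep) &&
		    !(*sptep & shadow_accessed_mask))
			set_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
	}
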
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 934c7b619396..613ec9aa674a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -25,11 +25,11 @@
25#if PTTYPE == 64 25#if PTTYPE == 64
26 #define pt_element_t u64 26 #define pt_element_t u64
27 #define guest_walker guest_walker64 27 #define guest_walker guest_walker64
28 #define shadow_walker shadow_walker64
28 #define FNAME(name) paging##64_##name 29 #define FNAME(name) paging##64_##name
29 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK 30 #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
30 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK 31 #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK
31 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 32 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
32 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) 33 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
34 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
35 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
@@ -42,11 +42,11 @@
42#elif PTTYPE == 32 42#elif PTTYPE == 32
43 #define pt_element_t u32 43 #define pt_element_t u32
44 #define guest_walker guest_walker32 44 #define guest_walker guest_walker32
45 #define shadow_walker shadow_walker32
45 #define FNAME(name) paging##32_##name 46 #define FNAME(name) paging##32_##name
46 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK 47 #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
47 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK 48 #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK
48 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
49 #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) 50 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
51 #define PT_LEVEL_BITS PT32_LEVEL_BITS 51 #define PT_LEVEL_BITS PT32_LEVEL_BITS
52 #define PT_MAX_FULL_LEVELS 2 52 #define PT_MAX_FULL_LEVELS 2
@@ -73,6 +73,17 @@ struct guest_walker {
73 u32 error_code; 73 u32 error_code;
74}; 74};
75 75
76struct shadow_walker {
77 struct kvm_shadow_walk walker;
78 struct guest_walker *guest_walker;
79 int user_fault;
80 int write_fault;
81 int largepage;
82 int *ptwrite;
83 pfn_t pfn;
84 u64 *sptep;
85};
86
76static gfn_t gpte_to_gfn(pt_element_t gpte) 87static gfn_t gpte_to_gfn(pt_element_t gpte)
77{ 88{
78 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; 89 return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -91,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm,
91 pt_element_t *table; 102 pt_element_t *table;
92 struct page *page; 103 struct page *page;
93 104
94 down_read(&current->mm->mmap_sem);
95 page = gfn_to_page(kvm, table_gfn); 105 page = gfn_to_page(kvm, table_gfn);
96 up_read(&current->mm->mmap_sem);
97 106
98 table = kmap_atomic(page, KM_USER0); 107 table = kmap_atomic(page, KM_USER0);
99
100 ret = CMPXCHG(&table[index], orig_pte, new_pte); 108 ret = CMPXCHG(&table[index], orig_pte, new_pte);
101
102 kunmap_atomic(table, KM_USER0); 109 kunmap_atomic(table, KM_USER0);
103 110
104 kvm_release_page_dirty(page); 111 kvm_release_page_dirty(page);
@@ -263,6 +270,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
263 pfn = vcpu->arch.update_pte.pfn; 270 pfn = vcpu->arch.update_pte.pfn;
264 if (is_error_pfn(pfn)) 271 if (is_error_pfn(pfn))
265 return; 272 return;
273 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
274 return;
266 kvm_get_pfn(pfn); 275 kvm_get_pfn(pfn);
267 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, 276 mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0,
268 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), 277 gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte),
@@ -272,86 +281,89 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page,
272/* 281/*
273 * Fetch a shadow pte for a specific level in the paging hierarchy. 282 * Fetch a shadow pte for a specific level in the paging hierarchy.
274 */ 283 */
275static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 284static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw,
276 struct guest_walker *walker, 285 struct kvm_vcpu *vcpu, u64 addr,
277 int user_fault, int write_fault, int largepage, 286 u64 *sptep, int level)
278 int *ptwrite, pfn_t pfn)
279{ 287{
280 hpa_t shadow_addr; 288 struct shadow_walker *sw =
281 int level; 289 container_of(_sw, struct shadow_walker, walker);
282 u64 *shadow_ent; 290 struct guest_walker *gw = sw->guest_walker;
283 unsigned access = walker->pt_access; 291 unsigned access = gw->pt_access;
284 292 struct kvm_mmu_page *shadow_page;
285 if (!is_present_pte(walker->ptes[walker->level - 1])) 293 u64 spte;
286 return NULL; 294 int metaphysical;
287 295 gfn_t table_gfn;
288 shadow_addr = vcpu->arch.mmu.root_hpa; 296 int r;
289 level = vcpu->arch.mmu.shadow_root_level; 297 pt_element_t curr_pte;
290 if (level == PT32E_ROOT_LEVEL) { 298
291 shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 299 if (level == PT_PAGE_TABLE_LEVEL
292 shadow_addr &= PT64_BASE_ADDR_MASK; 300 || (sw->largepage && level == PT_DIRECTORY_LEVEL)) {
293 --level; 301 mmu_set_spte(vcpu, sptep, access, gw->pte_access & access,
302 sw->user_fault, sw->write_fault,
303 gw->ptes[gw->level-1] & PT_DIRTY_MASK,
304 sw->ptwrite, sw->largepage, gw->gfn, sw->pfn,
305 false);
306 sw->sptep = sptep;
307 return 1;
294 } 308 }
295 309
296 for (; ; level--) { 310 if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep))
297 u32 index = SHADOW_PT_INDEX(addr, level); 311 return 0;
298 struct kvm_mmu_page *shadow_page;
299 u64 shadow_pte;
300 int metaphysical;
301 gfn_t table_gfn;
302
303 shadow_ent = ((u64 *)__va(shadow_addr)) + index;
304 if (level == PT_PAGE_TABLE_LEVEL)
305 break;
306
307 if (largepage && level == PT_DIRECTORY_LEVEL)
308 break;
309 312
310 if (is_shadow_present_pte(*shadow_ent) 313 if (is_large_pte(*sptep)) {
311 && !is_large_pte(*shadow_ent)) { 314 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
312 shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; 315 kvm_flush_remote_tlbs(vcpu->kvm);
313 continue; 316 rmap_remove(vcpu->kvm, sptep);
314 } 317 }
315 318
316 if (is_large_pte(*shadow_ent)) 319 if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) {
317 rmap_remove(vcpu->kvm, shadow_ent); 320 metaphysical = 1;
318 321 if (!is_dirty_pte(gw->ptes[level - 1]))
319 if (level - 1 == PT_PAGE_TABLE_LEVEL 322 access &= ~ACC_WRITE_MASK;
320 && walker->level == PT_DIRECTORY_LEVEL) { 323 table_gfn = gpte_to_gfn(gw->ptes[level - 1]);
321 metaphysical = 1; 324 } else {
322 if (!is_dirty_pte(walker->ptes[level - 1])) 325 metaphysical = 0;
323 access &= ~ACC_WRITE_MASK; 326 table_gfn = gw->table_gfn[level - 2];
324 table_gfn = gpte_to_gfn(walker->ptes[level - 1]); 327 }
325 } else { 328 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1,
326 metaphysical = 0; 329 metaphysical, access, sptep);
327 table_gfn = walker->table_gfn[level - 2]; 330 if (!metaphysical) {
328 } 331 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2],
329 shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, 332 &curr_pte, sizeof(curr_pte));
330 metaphysical, access, 333 if (r || curr_pte != gw->ptes[level - 2]) {
331 shadow_ent); 334 kvm_release_pfn_clean(sw->pfn);
332 if (!metaphysical) { 335 sw->sptep = NULL;
333 int r; 336 return 1;
334 pt_element_t curr_pte;
335 r = kvm_read_guest_atomic(vcpu->kvm,
336 walker->pte_gpa[level - 2],
337 &curr_pte, sizeof(curr_pte));
338 if (r || curr_pte != walker->ptes[level - 2]) {
339 kvm_release_pfn_clean(pfn);
340 return NULL;
341 }
342 } 337 }
343 shadow_addr = __pa(shadow_page->spt);
344 shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK
345 | PT_WRITABLE_MASK | PT_USER_MASK;
346 *shadow_ent = shadow_pte;
347 } 338 }
348 339
349 mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, 340 spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK
350 user_fault, write_fault, 341 | PT_WRITABLE_MASK | PT_USER_MASK;
351 walker->ptes[walker->level-1] & PT_DIRTY_MASK, 342 *sptep = spte;
352 ptwrite, largepage, walker->gfn, pfn, false); 343 return 0;
344}
345
346static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
347 struct guest_walker *guest_walker,
348 int user_fault, int write_fault, int largepage,
349 int *ptwrite, pfn_t pfn)
350{
351 struct shadow_walker walker = {
352 .walker = { .entry = FNAME(shadow_walk_entry), },
353 .guest_walker = guest_walker,
354 .user_fault = user_fault,
355 .write_fault = write_fault,
356 .largepage = largepage,
357 .ptwrite = ptwrite,
358 .pfn = pfn,
359 };
360
361 if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1]))
362 return NULL;
353 363
354 return shadow_ent; 364 walk_shadow(&walker.walker, vcpu, addr);
365
366 return walker.sptep;
355} 367}
356 368
357/* 369/*
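
The open-coded level-by-level loop in FNAME(fetch) is replaced by a callback driven from the generic walk_shadow() iterator: the per-PTTYPE code now fills a struct shadow_walker that embeds a struct kvm_shadow_walk whose ->entry hook runs once per shadow level, and the same machinery is reused by the new invlpg handler further down. A hedged sketch of the callback shape, assuming the signature visible in shadow_walk_entry() above; the walker and function names below are illustrative:

	struct example_walker {
		struct kvm_shadow_walk walker;
		u64 *found_sptep;
	};

	/* Returning nonzero from the ->entry hook stops the walk at that level. */
	static int example_entry(struct kvm_shadow_walk *_sw, struct kvm_vcpu *vcpu,
				 u64 addr, u64 *sptep, int level)
	{
		struct example_walker *sw =
			container_of(_sw, struct example_walker, walker);

		if (level == PT_PAGE_TABLE_LEVEL) {
			sw->found_sptep = sptep;	/* leaf reached, stop */
			return 1;
		}
		return is_shadow_present_pte(*sptep) ? 0 : 1;	/* descend or give up */
	}

	static u64 *example_find_leaf(struct kvm_vcpu *vcpu, u64 addr)
	{
		struct example_walker w = {
			.walker = { .entry = example_entry, },
		};

		walk_shadow(&w.walker, vcpu, addr);
		return w.found_sptep;
	}
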
@@ -380,6 +392,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
380 int r; 392 int r;
381 pfn_t pfn; 393 pfn_t pfn;
382 int largepage = 0; 394 int largepage = 0;
395 unsigned long mmu_seq;
383 396
384 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 397 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
385 kvm_mmu_audit(vcpu, "pre page fault"); 398 kvm_mmu_audit(vcpu, "pre page fault");
@@ -404,7 +417,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
404 return 0; 417 return 0;
405 } 418 }
406 419
407 down_read(&current->mm->mmap_sem);
408 if (walker.level == PT_DIRECTORY_LEVEL) { 420 if (walker.level == PT_DIRECTORY_LEVEL) {
409 gfn_t large_gfn; 421 gfn_t large_gfn;
410 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); 422 large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1);
@@ -413,8 +425,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
413 largepage = 1; 425 largepage = 1;
414 } 426 }
415 } 427 }
428 mmu_seq = vcpu->kvm->mmu_notifier_seq;
429 smp_rmb();
416 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 430 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn);
417 up_read(&current->mm->mmap_sem);
418 431
419 /* mmio */ 432 /* mmio */
420 if (is_error_pfn(pfn)) { 433 if (is_error_pfn(pfn)) {
@@ -424,6 +437,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
424 } 437 }
425 438
426 spin_lock(&vcpu->kvm->mmu_lock); 439 spin_lock(&vcpu->kvm->mmu_lock);
440 if (mmu_notifier_retry(vcpu, mmu_seq))
441 goto out_unlock;
427 kvm_mmu_free_some_pages(vcpu); 442 kvm_mmu_free_some_pages(vcpu);
428 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 443 shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
429 largepage, &write_pt, pfn); 444 largepage, &write_pt, pfn);
@@ -439,6 +454,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 spin_unlock(&vcpu->kvm->mmu_lock); 454 spin_unlock(&vcpu->kvm->mmu_lock);
440 455
441 return write_pt; 456 return write_pt;
457
458out_unlock:
459 spin_unlock(&vcpu->kvm->mmu_lock);
460 kvm_release_pfn_clean(pfn);
461 return 0;
462}
463
464static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw,
465 struct kvm_vcpu *vcpu, u64 addr,
466 u64 *sptep, int level)
467{
468
469 if (level == PT_PAGE_TABLE_LEVEL) {
470 if (is_shadow_present_pte(*sptep))
471 rmap_remove(vcpu->kvm, sptep);
472 set_shadow_pte(sptep, shadow_trap_nonpresent_pte);
473 return 1;
474 }
475 if (!is_shadow_present_pte(*sptep))
476 return 1;
477 return 0;
478}
479
480static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
481{
482 struct shadow_walker walker = {
483 .walker = { .entry = FNAME(shadow_invlpg_entry), },
484 };
485
486 walk_shadow(&walker.walker, vcpu, gva);
442} 487}
443 488
444static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 489static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
@@ -460,8 +505,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr)
460static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, 505static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
461 struct kvm_mmu_page *sp) 506 struct kvm_mmu_page *sp)
462{ 507{
463 int i, offset = 0, r = 0; 508 int i, j, offset, r;
464 pt_element_t pt; 509 pt_element_t pt[256 / sizeof(pt_element_t)];
510 gpa_t pte_gpa;
465 511
466 if (sp->role.metaphysical 512 if (sp->role.metaphysical
467 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { 513 || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) {
@@ -469,28 +515,83 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
469 return; 515 return;
470 } 516 }
471 517
518 pte_gpa = gfn_to_gpa(sp->gfn);
519 if (PTTYPE == 32) {
520 offset = sp->role.quadrant << PT64_LEVEL_BITS;
521 pte_gpa += offset * sizeof(pt_element_t);
522 }
523
524 for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) {
525 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt);
526 pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t);
527 for (j = 0; j < ARRAY_SIZE(pt); ++j)
528 if (r || is_present_pte(pt[j]))
529 sp->spt[i+j] = shadow_trap_nonpresent_pte;
530 else
531 sp->spt[i+j] = shadow_notrap_nonpresent_pte;
532 }
533}
534
535/*
536 * Using the cached information from sp->gfns is safe because:
537 * - The spte has a reference to the struct page, so the pfn for a given gfn
538 * can't change unless all sptes pointing to it are nuked first.
539 * - Alias changes zap the entire shadow cache.
540 */
541static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
542{
543 int i, offset, nr_present;
544
545 offset = nr_present = 0;
546
472 if (PTTYPE == 32) 547 if (PTTYPE == 32)
473 offset = sp->role.quadrant << PT64_LEVEL_BITS; 548 offset = sp->role.quadrant << PT64_LEVEL_BITS;
474 549
475 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { 550 for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
476 gpa_t pte_gpa = gfn_to_gpa(sp->gfn); 551 unsigned pte_access;
552 pt_element_t gpte;
553 gpa_t pte_gpa;
554 gfn_t gfn = sp->gfns[i];
555
556 if (!is_shadow_present_pte(sp->spt[i]))
557 continue;
558
559 pte_gpa = gfn_to_gpa(sp->gfn);
477 pte_gpa += (i+offset) * sizeof(pt_element_t); 560 pte_gpa += (i+offset) * sizeof(pt_element_t);
478 561
479 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, 562 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
480 sizeof(pt_element_t)); 563 sizeof(pt_element_t)))
481 if (r || is_present_pte(pt)) 564 return -EINVAL;
482 sp->spt[i] = shadow_trap_nonpresent_pte; 565
483 else 566 if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) ||
484 sp->spt[i] = shadow_notrap_nonpresent_pte; 567 !(gpte & PT_ACCESSED_MASK)) {
568 u64 nonpresent;
569
570 rmap_remove(vcpu->kvm, &sp->spt[i]);
571 if (is_present_pte(gpte))
572 nonpresent = shadow_trap_nonpresent_pte;
573 else
574 nonpresent = shadow_notrap_nonpresent_pte;
575 set_shadow_pte(&sp->spt[i], nonpresent);
576 continue;
577 }
578
579 nr_present++;
580 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
581 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
582 is_dirty_pte(gpte), 0, gfn,
583 spte_to_pfn(sp->spt[i]), true, false);
485 } 584 }
585
586 return !nr_present;
486} 587}
487 588
488#undef pt_element_t 589#undef pt_element_t
489#undef guest_walker 590#undef guest_walker
591#undef shadow_walker
490#undef FNAME 592#undef FNAME
491#undef PT_BASE_ADDR_MASK 593#undef PT_BASE_ADDR_MASK
492#undef PT_INDEX 594#undef PT_INDEX
493#undef SHADOW_PT_INDEX
494#undef PT_LEVEL_MASK 595#undef PT_LEVEL_MASK
495#undef PT_DIR_BASE_ADDR_MASK 596#undef PT_DIR_BASE_ADDR_MASK
496#undef PT_LEVEL_BITS 597#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6b0d5fa5bab3..9c4ce657d963 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -18,6 +18,7 @@
18#include "kvm_svm.h" 18#include "kvm_svm.h"
19#include "irq.h" 19#include "irq.h"
20#include "mmu.h" 20#include "mmu.h"
21#include "kvm_cache_regs.h"
21 22
22#include <linux/module.h> 23#include <linux/module.h>
23#include <linux/kernel.h> 24#include <linux/kernel.h>
@@ -27,16 +28,14 @@
27 28
28#include <asm/desc.h> 29#include <asm/desc.h>
29 30
31#define __ex(x) __kvm_handle_fault_on_reboot(x)
32
30MODULE_AUTHOR("Qumranet"); 33MODULE_AUTHOR("Qumranet");
31MODULE_LICENSE("GPL"); 34MODULE_LICENSE("GPL");
32 35
33#define IOPM_ALLOC_ORDER 2 36#define IOPM_ALLOC_ORDER 2
34#define MSRPM_ALLOC_ORDER 1 37#define MSRPM_ALLOC_ORDER 1
35 38
36#define DB_VECTOR 1
37#define UD_VECTOR 6
38#define GP_VECTOR 13
39
40#define DR7_GD_MASK (1 << 13) 39#define DR7_GD_MASK (1 << 13)
41#define DR6_BD_MASK (1 << 13) 40#define DR6_BD_MASK (1 << 13)
42 41
@@ -45,7 +44,7 @@ MODULE_LICENSE("GPL");
45 44
46#define SVM_FEATURE_NPT (1 << 0) 45#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 46#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_DEATURE_SVML (1 << 2) 47#define SVM_FEATURE_SVML (1 << 2)
49 48
50#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 49#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
51 50
@@ -60,6 +59,7 @@ static int npt = 1;
60module_param(npt, int, S_IRUGO); 59module_param(npt, int, S_IRUGO);
61 60
62static void kvm_reput_irq(struct vcpu_svm *svm); 61static void kvm_reput_irq(struct vcpu_svm *svm);
62static void svm_flush_tlb(struct kvm_vcpu *vcpu);
63 63
64static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 64static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
65{ 65{
@@ -129,17 +129,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq)
129 129
130static inline void clgi(void) 130static inline void clgi(void)
131{ 131{
132 asm volatile (SVM_CLGI); 132 asm volatile (__ex(SVM_CLGI));
133} 133}
134 134
135static inline void stgi(void) 135static inline void stgi(void)
136{ 136{
137 asm volatile (SVM_STGI); 137 asm volatile (__ex(SVM_STGI));
138} 138}
139 139
140static inline void invlpga(unsigned long addr, u32 asid) 140static inline void invlpga(unsigned long addr, u32 asid)
141{ 141{
142 asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); 142 asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid));
143} 143}
144 144
145static inline unsigned long kvm_read_cr2(void) 145static inline unsigned long kvm_read_cr2(void)
@@ -233,13 +233,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
233 printk(KERN_DEBUG "%s: NOP\n", __func__); 233 printk(KERN_DEBUG "%s: NOP\n", __func__);
234 return; 234 return;
235 } 235 }
236 if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) 236 if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
237 printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", 237 printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
238 __func__, 238 __func__, kvm_rip_read(vcpu), svm->next_rip);
239 svm->vmcb->save.rip,
240 svm->next_rip);
241 239
242 vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; 240 kvm_rip_write(vcpu, svm->next_rip);
243 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; 241 svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
244 242
245 vcpu->arch.interrupt_window_open = 1; 243 vcpu->arch.interrupt_window_open = 1;
@@ -270,19 +268,11 @@ static int has_svm(void)
270 268
271static void svm_hardware_disable(void *garbage) 269static void svm_hardware_disable(void *garbage)
272{ 270{
273 struct svm_cpu_data *svm_data 271 uint64_t efer;
274 = per_cpu(svm_data, raw_smp_processor_id());
275
276 if (svm_data) {
277 uint64_t efer;
278 272
279 wrmsrl(MSR_VM_HSAVE_PA, 0); 273 wrmsrl(MSR_VM_HSAVE_PA, 0);
280 rdmsrl(MSR_EFER, efer); 274 rdmsrl(MSR_EFER, efer);
281 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); 275 wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK);
282 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
283 __free_page(svm_data->save_area);
284 kfree(svm_data);
285 }
286} 276}
287 277
288static void svm_hardware_enable(void *garbage) 278static void svm_hardware_enable(void *garbage)
@@ -321,6 +311,19 @@ static void svm_hardware_enable(void *garbage)
321 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 311 page_to_pfn(svm_data->save_area) << PAGE_SHIFT);
322} 312}
323 313
314static void svm_cpu_uninit(int cpu)
315{
316 struct svm_cpu_data *svm_data
317 = per_cpu(svm_data, raw_smp_processor_id());
318
319 if (!svm_data)
320 return;
321
322 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
323 __free_page(svm_data->save_area);
324 kfree(svm_data);
325}
326
324static int svm_cpu_init(int cpu) 327static int svm_cpu_init(int cpu)
325{ 328{
326 struct svm_cpu_data *svm_data; 329 struct svm_cpu_data *svm_data;
@@ -446,7 +449,8 @@ static __init int svm_hardware_setup(void)
446 if (npt_enabled) { 449 if (npt_enabled) {
447 printk(KERN_INFO "kvm: Nested Paging enabled\n"); 450 printk(KERN_INFO "kvm: Nested Paging enabled\n");
448 kvm_enable_tdp(); 451 kvm_enable_tdp();
449 } 452 } else
453 kvm_disable_tdp();
450 454
451 return 0; 455 return 0;
452 456
@@ -458,6 +462,11 @@ err:
458 462
459static __exit void svm_hardware_unsetup(void) 463static __exit void svm_hardware_unsetup(void)
460{ 464{
465 int cpu;
466
467 for_each_online_cpu(cpu)
468 svm_cpu_uninit(cpu);
469
461 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 470 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
462 iopm_base = 0; 471 iopm_base = 0;
463} 472}
@@ -516,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm)
516 (1ULL << INTERCEPT_CPUID) | 525 (1ULL << INTERCEPT_CPUID) |
517 (1ULL << INTERCEPT_INVD) | 526 (1ULL << INTERCEPT_INVD) |
518 (1ULL << INTERCEPT_HLT) | 527 (1ULL << INTERCEPT_HLT) |
528 (1ULL << INTERCEPT_INVLPG) |
519 (1ULL << INTERCEPT_INVLPGA) | 529 (1ULL << INTERCEPT_INVLPGA) |
520 (1ULL << INTERCEPT_IOIO_PROT) | 530 (1ULL << INTERCEPT_IOIO_PROT) |
521 (1ULL << INTERCEPT_MSR_PROT) | 531 (1ULL << INTERCEPT_MSR_PROT) |
@@ -567,6 +577,7 @@ static void init_vmcb(struct vcpu_svm *svm)
567 save->dr7 = 0x400; 577 save->dr7 = 0x400;
568 save->rflags = 2; 578 save->rflags = 2;
569 save->rip = 0x0000fff0; 579 save->rip = 0x0000fff0;
580 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
570 581
571 /* 582 /*
572 * cr0 val on cpu init should be 0x60000010, we enable cpu 583 * cr0 val on cpu init should be 0x60000010, we enable cpu
@@ -579,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm)
579 if (npt_enabled) { 590 if (npt_enabled) {
580 /* Setup VMCB for Nested Paging */ 591 /* Setup VMCB for Nested Paging */
581 control->nested_ctl = 1; 592 control->nested_ctl = 1;
582 control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); 593 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
594 (1ULL << INTERCEPT_INVLPG));
583 control->intercept_exceptions &= ~(1 << PF_VECTOR); 595 control->intercept_exceptions &= ~(1 << PF_VECTOR);
584 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 596 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK|
585 INTERCEPT_CR3_MASK); 597 INTERCEPT_CR3_MASK);
@@ -601,10 +613,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu)
601 init_vmcb(svm); 613 init_vmcb(svm);
602 614
603 if (vcpu->vcpu_id != 0) { 615 if (vcpu->vcpu_id != 0) {
604 svm->vmcb->save.rip = 0; 616 kvm_rip_write(vcpu, 0);
605 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; 617 svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12;
606 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; 618 svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8;
607 } 619 }
620 vcpu->arch.regs_avail = ~0;
621 vcpu->arch.regs_dirty = ~0;
608 622
609 return 0; 623 return 0;
610} 624}
@@ -707,27 +721,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
707 rdtscll(vcpu->arch.host_tsc); 721 rdtscll(vcpu->arch.host_tsc);
708} 722}
709 723
710static void svm_vcpu_decache(struct kvm_vcpu *vcpu)
711{
712}
713
714static void svm_cache_regs(struct kvm_vcpu *vcpu)
715{
716 struct vcpu_svm *svm = to_svm(vcpu);
717
718 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
719 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
720 vcpu->arch.rip = svm->vmcb->save.rip;
721}
722
723static void svm_decache_regs(struct kvm_vcpu *vcpu)
724{
725 struct vcpu_svm *svm = to_svm(vcpu);
726 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
727 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
728 svm->vmcb->save.rip = vcpu->arch.rip;
729}
730
731static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 724static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
732{ 725{
733 return to_svm(vcpu)->vmcb->save.rflags; 726 return to_svm(vcpu)->vmcb->save.rflags;
@@ -869,6 +862,10 @@ set:
869static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 862static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
870{ 863{
871 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; 864 unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE;
865 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
866
867 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
868 force_new_asid(vcpu);
872 869
873 vcpu->arch.cr4 = cr4; 870 vcpu->arch.cr4 = cr4;
874 if (!npt_enabled) 871 if (!npt_enabled)
@@ -949,7 +946,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data)
949 946
950static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 947static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr)
951{ 948{
952 return to_svm(vcpu)->db_regs[dr]; 949 unsigned long val = to_svm(vcpu)->db_regs[dr];
950 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
951 return val;
953} 952}
954 953
955static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 954static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value,
@@ -997,13 +996,35 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
997 struct kvm *kvm = svm->vcpu.kvm; 996 struct kvm *kvm = svm->vcpu.kvm;
998 u64 fault_address; 997 u64 fault_address;
999 u32 error_code; 998 u32 error_code;
999 bool event_injection = false;
1000 1000
1001 if (!irqchip_in_kernel(kvm) && 1001 if (!irqchip_in_kernel(kvm) &&
1002 is_external_interrupt(exit_int_info)) 1002 is_external_interrupt(exit_int_info)) {
1003 event_injection = true;
1003 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); 1004 push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK);
1005 }
1004 1006
1005 fault_address = svm->vmcb->control.exit_info_2; 1007 fault_address = svm->vmcb->control.exit_info_2;
1006 error_code = svm->vmcb->control.exit_info_1; 1008 error_code = svm->vmcb->control.exit_info_1;
1009
1010 if (!npt_enabled)
1011 KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code,
1012 (u32)fault_address, (u32)(fault_address >> 32),
1013 handler);
1014 else
1015 KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code,
1016 (u32)fault_address, (u32)(fault_address >> 32),
1017 handler);
1018 /*
1019 * FIXME: This shouldn't be necessary here, but there is a flush
1020 * missing in the MMU code. Until we find this bug, flush the
1021 * complete TLB here on an NPF
1022 */
1023 if (npt_enabled)
1024 svm_flush_tlb(&svm->vcpu);
1025
1026 if (!npt_enabled && event_injection)
1027 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1007 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1028 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1008} 1029}
1009 1030
@@ -1081,6 +1102,19 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1081 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1102 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port);
1082} 1103}
1083 1104
1105static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1106{
1107 KVMTRACE_0D(NMI, &svm->vcpu, handler);
1108 return 1;
1109}
1110
1111static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1112{
1113 ++svm->vcpu.stat.irq_exits;
1114 KVMTRACE_0D(INTR, &svm->vcpu, handler);
1115 return 1;
1116}
1117
1084static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1118static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1085{ 1119{
1086 return 1; 1120 return 1;
@@ -1088,14 +1122,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1088 1122
1089static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1123static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1090{ 1124{
1091 svm->next_rip = svm->vmcb->save.rip + 1; 1125 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1092 skip_emulated_instruction(&svm->vcpu); 1126 skip_emulated_instruction(&svm->vcpu);
1093 return kvm_emulate_halt(&svm->vcpu); 1127 return kvm_emulate_halt(&svm->vcpu);
1094} 1128}
1095 1129
1096static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1130static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1097{ 1131{
1098 svm->next_rip = svm->vmcb->save.rip + 3; 1132 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1099 skip_emulated_instruction(&svm->vcpu); 1133 skip_emulated_instruction(&svm->vcpu);
1100 kvm_emulate_hypercall(&svm->vcpu); 1134 kvm_emulate_hypercall(&svm->vcpu);
1101 return 1; 1135 return 1;
@@ -1127,11 +1161,18 @@ static int task_switch_interception(struct vcpu_svm *svm,
1127 1161
1128static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1162static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1129{ 1163{
1130 svm->next_rip = svm->vmcb->save.rip + 2; 1164 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1131 kvm_emulate_cpuid(&svm->vcpu); 1165 kvm_emulate_cpuid(&svm->vcpu);
1132 return 1; 1166 return 1;
1133} 1167}
1134 1168
1169static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1170{
1171 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE)
1172 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
1173 return 1;
1174}
1175
1135static int emulate_on_interception(struct vcpu_svm *svm, 1176static int emulate_on_interception(struct vcpu_svm *svm,
1136 struct kvm_run *kvm_run) 1177 struct kvm_run *kvm_run)
1137{ 1178{
@@ -1219,9 +1260,12 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1219 if (svm_get_msr(&svm->vcpu, ecx, &data)) 1260 if (svm_get_msr(&svm->vcpu, ecx, &data))
1220 kvm_inject_gp(&svm->vcpu, 0); 1261 kvm_inject_gp(&svm->vcpu, 0);
1221 else { 1262 else {
1222 svm->vmcb->save.rax = data & 0xffffffff; 1263 KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data,
1264 (u32)(data >> 32), handler);
1265
1266 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
1223 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; 1267 svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32;
1224 svm->next_rip = svm->vmcb->save.rip + 2; 1268 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1225 skip_emulated_instruction(&svm->vcpu); 1269 skip_emulated_instruction(&svm->vcpu);
1226 } 1270 }
1227 return 1; 1271 return 1;
@@ -1284,16 +1328,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1284 case MSR_K7_EVNTSEL1: 1328 case MSR_K7_EVNTSEL1:
1285 case MSR_K7_EVNTSEL2: 1329 case MSR_K7_EVNTSEL2:
1286 case MSR_K7_EVNTSEL3: 1330 case MSR_K7_EVNTSEL3:
1331 case MSR_K7_PERFCTR0:
1332 case MSR_K7_PERFCTR1:
1333 case MSR_K7_PERFCTR2:
1334 case MSR_K7_PERFCTR3:
1287 /* 1335 /*
1288 * only support writing 0 to the performance counters for now 1336 * Just discard all writes to the performance counters; this
1289 * to make Windows happy. Should be replaced by a real 1337 * should keep both older linux and windows 64-bit guests
1290 * performance counter emulation later. 1338 * happy
1291 */ 1339 */
1292 if (data != 0) 1340 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data);
1293 goto unhandled; 1341
1294 break; 1342 break;
1295 default: 1343 default:
1296 unhandled:
1297 return kvm_set_msr_common(vcpu, ecx, data); 1344 return kvm_set_msr_common(vcpu, ecx, data);
1298 } 1345 }
1299 return 0; 1346 return 0;
@@ -1302,9 +1349,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
1302static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1349static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1303{ 1350{
1304 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 1351 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
1305 u64 data = (svm->vmcb->save.rax & -1u) 1352 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
1306 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 1353 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
1307 svm->next_rip = svm->vmcb->save.rip + 2; 1354
1355 KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32),
1356 handler);
1357
1358 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
1308 if (svm_set_msr(&svm->vcpu, ecx, data)) 1359 if (svm_set_msr(&svm->vcpu, ecx, data))
1309 kvm_inject_gp(&svm->vcpu, 0); 1360 kvm_inject_gp(&svm->vcpu, 0);
1310 else 1361 else
@@ -1323,6 +1374,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1323static int interrupt_window_interception(struct vcpu_svm *svm, 1374static int interrupt_window_interception(struct vcpu_svm *svm,
1324 struct kvm_run *kvm_run) 1375 struct kvm_run *kvm_run)
1325{ 1376{
1377 KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler);
1378
1326 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1379 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR);
1327 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 1380 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
1328 /* 1381 /*
@@ -1364,8 +1417,8 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1364 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, 1417 [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception,
1365 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, 1418 [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception,
1366 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, 1419 [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception,
1367 [SVM_EXIT_INTR] = nop_on_interception, 1420 [SVM_EXIT_INTR] = intr_interception,
1368 [SVM_EXIT_NMI] = nop_on_interception, 1421 [SVM_EXIT_NMI] = nmi_interception,
1369 [SVM_EXIT_SMI] = nop_on_interception, 1422 [SVM_EXIT_SMI] = nop_on_interception,
1370 [SVM_EXIT_INIT] = nop_on_interception, 1423 [SVM_EXIT_INIT] = nop_on_interception,
1371 [SVM_EXIT_VINTR] = interrupt_window_interception, 1424 [SVM_EXIT_VINTR] = interrupt_window_interception,
@@ -1373,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
1373 [SVM_EXIT_CPUID] = cpuid_interception, 1426 [SVM_EXIT_CPUID] = cpuid_interception,
1374 [SVM_EXIT_INVD] = emulate_on_interception, 1427 [SVM_EXIT_INVD] = emulate_on_interception,
1375 [SVM_EXIT_HLT] = halt_interception, 1428 [SVM_EXIT_HLT] = halt_interception,
1376 [SVM_EXIT_INVLPG] = emulate_on_interception, 1429 [SVM_EXIT_INVLPG] = invlpg_interception,
1377 [SVM_EXIT_INVLPGA] = invalid_op_interception, 1430 [SVM_EXIT_INVLPGA] = invalid_op_interception,
1378 [SVM_EXIT_IOIO] = io_interception, 1431 [SVM_EXIT_IOIO] = io_interception,
1379 [SVM_EXIT_MSR] = msr_interception, 1432 [SVM_EXIT_MSR] = msr_interception,
@@ -1397,6 +1450,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
1397 struct vcpu_svm *svm = to_svm(vcpu); 1450 struct vcpu_svm *svm = to_svm(vcpu);
1398 u32 exit_code = svm->vmcb->control.exit_code; 1451 u32 exit_code = svm->vmcb->control.exit_code;
1399 1452
1453 KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip,
1454 (u32)((u64)svm->vmcb->save.rip >> 32), entryexit);
1455
1400 if (npt_enabled) { 1456 if (npt_enabled) {
1401 int mmu_reload = 0; 1457 int mmu_reload = 0;
1402 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { 1458 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
@@ -1470,6 +1526,9 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
1470{ 1526{
1471 struct vmcb_control_area *control; 1527 struct vmcb_control_area *control;
1472 1528
1529 KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler);
1530
1531 ++svm->vcpu.stat.irq_injections;
1473 control = &svm->vmcb->control; 1532 control = &svm->vmcb->control;
1474 control->int_vector = irq; 1533 control->int_vector = irq;
1475 control->int_ctl &= ~V_INTR_PRIO_MASK; 1534 control->int_ctl &= ~V_INTR_PRIO_MASK;
@@ -1648,6 +1707,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
1648 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; 1707 svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK;
1649} 1708}
1650 1709
1710#ifdef CONFIG_X86_64
1711#define R "r"
1712#else
1713#define R "e"
1714#endif
1715
1651static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 1716static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1652{ 1717{
1653 struct vcpu_svm *svm = to_svm(vcpu); 1718 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1655,14 +1720,18 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1655 u16 gs_selector; 1720 u16 gs_selector;
1656 u16 ldt_selector; 1721 u16 ldt_selector;
1657 1722
1723 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
1724 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
1725 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
1726
1658 pre_svm_run(svm); 1727 pre_svm_run(svm);
1659 1728
1660 sync_lapic_to_cr8(vcpu); 1729 sync_lapic_to_cr8(vcpu);
1661 1730
1662 save_host_msrs(vcpu); 1731 save_host_msrs(vcpu);
1663 fs_selector = read_fs(); 1732 fs_selector = kvm_read_fs();
1664 gs_selector = read_gs(); 1733 gs_selector = kvm_read_gs();
1665 ldt_selector = read_ldt(); 1734 ldt_selector = kvm_read_ldt();
1666 svm->host_cr2 = kvm_read_cr2(); 1735 svm->host_cr2 = kvm_read_cr2();
1667 svm->host_dr6 = read_dr6(); 1736 svm->host_dr6 = read_dr6();
1668 svm->host_dr7 = read_dr7(); 1737 svm->host_dr7 = read_dr7();
@@ -1682,19 +1751,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1682 local_irq_enable(); 1751 local_irq_enable();
1683 1752
1684 asm volatile ( 1753 asm volatile (
1754 "push %%"R"bp; \n\t"
1755 "mov %c[rbx](%[svm]), %%"R"bx \n\t"
1756 "mov %c[rcx](%[svm]), %%"R"cx \n\t"
1757 "mov %c[rdx](%[svm]), %%"R"dx \n\t"
1758 "mov %c[rsi](%[svm]), %%"R"si \n\t"
1759 "mov %c[rdi](%[svm]), %%"R"di \n\t"
1760 "mov %c[rbp](%[svm]), %%"R"bp \n\t"
1685#ifdef CONFIG_X86_64 1761#ifdef CONFIG_X86_64
1686 "push %%rbp; \n\t"
1687#else
1688 "push %%ebp; \n\t"
1689#endif
1690
1691#ifdef CONFIG_X86_64
1692 "mov %c[rbx](%[svm]), %%rbx \n\t"
1693 "mov %c[rcx](%[svm]), %%rcx \n\t"
1694 "mov %c[rdx](%[svm]), %%rdx \n\t"
1695 "mov %c[rsi](%[svm]), %%rsi \n\t"
1696 "mov %c[rdi](%[svm]), %%rdi \n\t"
1697 "mov %c[rbp](%[svm]), %%rbp \n\t"
1698 "mov %c[r8](%[svm]), %%r8 \n\t" 1762 "mov %c[r8](%[svm]), %%r8 \n\t"
1699 "mov %c[r9](%[svm]), %%r9 \n\t" 1763 "mov %c[r9](%[svm]), %%r9 \n\t"
1700 "mov %c[r10](%[svm]), %%r10 \n\t" 1764 "mov %c[r10](%[svm]), %%r10 \n\t"
@@ -1703,41 +1767,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1703 "mov %c[r13](%[svm]), %%r13 \n\t" 1767 "mov %c[r13](%[svm]), %%r13 \n\t"
1704 "mov %c[r14](%[svm]), %%r14 \n\t" 1768 "mov %c[r14](%[svm]), %%r14 \n\t"
1705 "mov %c[r15](%[svm]), %%r15 \n\t" 1769 "mov %c[r15](%[svm]), %%r15 \n\t"
1706#else
1707 "mov %c[rbx](%[svm]), %%ebx \n\t"
1708 "mov %c[rcx](%[svm]), %%ecx \n\t"
1709 "mov %c[rdx](%[svm]), %%edx \n\t"
1710 "mov %c[rsi](%[svm]), %%esi \n\t"
1711 "mov %c[rdi](%[svm]), %%edi \n\t"
1712 "mov %c[rbp](%[svm]), %%ebp \n\t"
1713#endif 1770#endif
1714 1771
1715#ifdef CONFIG_X86_64
1716 /* Enter guest mode */
1717 "push %%rax \n\t"
1718 "mov %c[vmcb](%[svm]), %%rax \n\t"
1719 SVM_VMLOAD "\n\t"
1720 SVM_VMRUN "\n\t"
1721 SVM_VMSAVE "\n\t"
1722 "pop %%rax \n\t"
1723#else
1724 /* Enter guest mode */ 1772 /* Enter guest mode */
1725 "push %%eax \n\t" 1773 "push %%"R"ax \n\t"
1726 "mov %c[vmcb](%[svm]), %%eax \n\t" 1774 "mov %c[vmcb](%[svm]), %%"R"ax \n\t"
1727 SVM_VMLOAD "\n\t" 1775 __ex(SVM_VMLOAD) "\n\t"
1728 SVM_VMRUN "\n\t" 1776 __ex(SVM_VMRUN) "\n\t"
1729 SVM_VMSAVE "\n\t" 1777 __ex(SVM_VMSAVE) "\n\t"
1730 "pop %%eax \n\t" 1778 "pop %%"R"ax \n\t"
1731#endif
1732 1779
1733 /* Save guest registers, load host registers */ 1780 /* Save guest registers, load host registers */
1781 "mov %%"R"bx, %c[rbx](%[svm]) \n\t"
1782 "mov %%"R"cx, %c[rcx](%[svm]) \n\t"
1783 "mov %%"R"dx, %c[rdx](%[svm]) \n\t"
1784 "mov %%"R"si, %c[rsi](%[svm]) \n\t"
1785 "mov %%"R"di, %c[rdi](%[svm]) \n\t"
1786 "mov %%"R"bp, %c[rbp](%[svm]) \n\t"
1734#ifdef CONFIG_X86_64 1787#ifdef CONFIG_X86_64
1735 "mov %%rbx, %c[rbx](%[svm]) \n\t"
1736 "mov %%rcx, %c[rcx](%[svm]) \n\t"
1737 "mov %%rdx, %c[rdx](%[svm]) \n\t"
1738 "mov %%rsi, %c[rsi](%[svm]) \n\t"
1739 "mov %%rdi, %c[rdi](%[svm]) \n\t"
1740 "mov %%rbp, %c[rbp](%[svm]) \n\t"
1741 "mov %%r8, %c[r8](%[svm]) \n\t" 1788 "mov %%r8, %c[r8](%[svm]) \n\t"
1742 "mov %%r9, %c[r9](%[svm]) \n\t" 1789 "mov %%r9, %c[r9](%[svm]) \n\t"
1743 "mov %%r10, %c[r10](%[svm]) \n\t" 1790 "mov %%r10, %c[r10](%[svm]) \n\t"
@@ -1746,18 +1793,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1746 "mov %%r13, %c[r13](%[svm]) \n\t" 1793 "mov %%r13, %c[r13](%[svm]) \n\t"
1747 "mov %%r14, %c[r14](%[svm]) \n\t" 1794 "mov %%r14, %c[r14](%[svm]) \n\t"
1748 "mov %%r15, %c[r15](%[svm]) \n\t" 1795 "mov %%r15, %c[r15](%[svm]) \n\t"
1749
1750 "pop %%rbp; \n\t"
1751#else
1752 "mov %%ebx, %c[rbx](%[svm]) \n\t"
1753 "mov %%ecx, %c[rcx](%[svm]) \n\t"
1754 "mov %%edx, %c[rdx](%[svm]) \n\t"
1755 "mov %%esi, %c[rsi](%[svm]) \n\t"
1756 "mov %%edi, %c[rdi](%[svm]) \n\t"
1757 "mov %%ebp, %c[rbp](%[svm]) \n\t"
1758
1759 "pop %%ebp; \n\t"
1760#endif 1796#endif
1797 "pop %%"R"bp"
1761 : 1798 :
1762 : [svm]"a"(svm), 1799 : [svm]"a"(svm),
1763 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), 1800 [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)),
@@ -1778,11 +1815,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1778 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) 1815 [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15]))
1779#endif 1816#endif
1780 : "cc", "memory" 1817 : "cc", "memory"
1818 , R"bx", R"cx", R"dx", R"si", R"di"
1781#ifdef CONFIG_X86_64 1819#ifdef CONFIG_X86_64
1782 , "rbx", "rcx", "rdx", "rsi", "rdi"
1783 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" 1820 , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15"
1784#else
1785 , "ebx", "ecx", "edx" , "esi", "edi"
1786#endif 1821#endif
1787 ); 1822 );
1788 1823
@@ -1790,14 +1825,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1790 load_db_regs(svm->host_db_regs); 1825 load_db_regs(svm->host_db_regs);
1791 1826
1792 vcpu->arch.cr2 = svm->vmcb->save.cr2; 1827 vcpu->arch.cr2 = svm->vmcb->save.cr2;
1828 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
1829 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
1830 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
1793 1831
1794 write_dr6(svm->host_dr6); 1832 write_dr6(svm->host_dr6);
1795 write_dr7(svm->host_dr7); 1833 write_dr7(svm->host_dr7);
1796 kvm_write_cr2(svm->host_cr2); 1834 kvm_write_cr2(svm->host_cr2);
1797 1835
1798 load_fs(fs_selector); 1836 kvm_load_fs(fs_selector);
1799 load_gs(gs_selector); 1837 kvm_load_gs(gs_selector);
1800 load_ldt(ldt_selector); 1838 kvm_load_ldt(ldt_selector);
1801 load_host_msrs(vcpu); 1839 load_host_msrs(vcpu);
1802 1840
1803 reload_tss(vcpu); 1841 reload_tss(vcpu);
@@ -1811,6 +1849,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
1811 svm->next_rip = 0; 1849 svm->next_rip = 0;
1812} 1850}
1813 1851
1852#undef R
1853
1814static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) 1854static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
1815{ 1855{
1816 struct vcpu_svm *svm = to_svm(vcpu); 1856 struct vcpu_svm *svm = to_svm(vcpu);
@@ -1889,7 +1929,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1889 .prepare_guest_switch = svm_prepare_guest_switch, 1929 .prepare_guest_switch = svm_prepare_guest_switch,
1890 .vcpu_load = svm_vcpu_load, 1930 .vcpu_load = svm_vcpu_load,
1891 .vcpu_put = svm_vcpu_put, 1931 .vcpu_put = svm_vcpu_put,
1892 .vcpu_decache = svm_vcpu_decache,
1893 1932
1894 .set_guest_debug = svm_guest_debug, 1933 .set_guest_debug = svm_guest_debug,
1895 .get_msr = svm_get_msr, 1934 .get_msr = svm_get_msr,
@@ -1910,8 +1949,6 @@ static struct kvm_x86_ops svm_x86_ops = {
1910 .set_gdt = svm_set_gdt, 1949 .set_gdt = svm_set_gdt,
1911 .get_dr = svm_get_dr, 1950 .get_dr = svm_get_dr,
1912 .set_dr = svm_set_dr, 1951 .set_dr = svm_set_dr,
1913 .cache_regs = svm_cache_regs,
1914 .decache_regs = svm_decache_regs,
1915 .get_rflags = svm_get_rflags, 1952 .get_rflags = svm_get_rflags,
1916 .set_rflags = svm_set_rflags, 1953 .set_rflags = svm_set_rflags,
1917 1954
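
Two themes run through the svm.c changes above: guest register state now goes through the kvm_cache_regs.h helpers (kvm_rip_read()/kvm_rip_write() replace direct vmcb->save.rip accesses, and rax/rsp/rip are synced with vcpu->arch.regs[] around svm_vcpu_run(), making the old cache_regs/decache_regs hooks redundant), and the guest-entry asm is shared between 32-bit and 64-bit builds via the R register-prefix macro. A small illustrative sketch of the macro idea, assuming nothing beyond what the diff shows:

	#ifdef CONFIG_X86_64
	#define R "r"		/* "mov %%"R"bx" expands to "mov %%rbx" */
	#else
	#define R "e"		/* "mov %%"R"bx" expands to "mov %%ebx" */
	#endif

	/* one asm body serves both word sizes */
	static inline void example_save_bx(unsigned long *slot)
	{
		asm volatile("mov %%"R"bx, %0" : "=m"(*slot));
	}
	#undef R
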
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 10ce6ee4c491..2643b430d83a 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,10 +26,14 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/moduleparam.h> 28#include <linux/moduleparam.h>
29#include "kvm_cache_regs.h"
30#include "x86.h"
29 31
30#include <asm/io.h> 32#include <asm/io.h>
31#include <asm/desc.h> 33#include <asm/desc.h>
32 34
35#define __ex(x) __kvm_handle_fault_on_reboot(x)
36
33MODULE_AUTHOR("Qumranet"); 37MODULE_AUTHOR("Qumranet");
34MODULE_LICENSE("GPL"); 38MODULE_LICENSE("GPL");
35 39
@@ -45,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0);
45static int enable_ept = 1; 49static int enable_ept = 1;
46module_param(enable_ept, bool, 0); 50module_param(enable_ept, bool, 0);
47 51
52static int emulate_invalid_guest_state = 0;
53module_param(emulate_invalid_guest_state, bool, 0);
54
48struct vmcs { 55struct vmcs {
49 u32 revision_id; 56 u32 revision_id;
50 u32 abort; 57 u32 abort;
@@ -53,6 +60,8 @@ struct vmcs {
53 60
54struct vcpu_vmx { 61struct vcpu_vmx {
55 struct kvm_vcpu vcpu; 62 struct kvm_vcpu vcpu;
63 struct list_head local_vcpus_link;
64 unsigned long host_rsp;
56 int launched; 65 int launched;
57 u8 fail; 66 u8 fail;
58 u32 idt_vectoring_info; 67 u32 idt_vectoring_info;
@@ -80,6 +89,7 @@ struct vcpu_vmx {
80 } irq; 89 } irq;
81 } rmode; 90 } rmode;
82 int vpid; 91 int vpid;
92 bool emulation_required;
83}; 93};
84 94
85static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 95static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -88,9 +98,11 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
88} 98}
89 99
90static int init_rmode(struct kvm *kvm); 100static int init_rmode(struct kvm *kvm);
101static u64 construct_eptp(unsigned long root_hpa);
91 102
92static DEFINE_PER_CPU(struct vmcs *, vmxarea); 103static DEFINE_PER_CPU(struct vmcs *, vmxarea);
93static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 104static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
105static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu);
94 106
95static struct page *vmx_io_bitmap_a; 107static struct page *vmx_io_bitmap_a;
96static struct page *vmx_io_bitmap_b; 108static struct page *vmx_io_bitmap_b;
@@ -260,6 +272,11 @@ static inline int cpu_has_vmx_vpid(void)
260 SECONDARY_EXEC_ENABLE_VPID); 272 SECONDARY_EXEC_ENABLE_VPID);
261} 273}
262 274
275static inline int cpu_has_virtual_nmis(void)
276{
277 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
278}
279
263static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) 280static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
264{ 281{
265 int i; 282 int i;
@@ -278,7 +295,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva)
278 u64 gva; 295 u64 gva;
279 } operand = { vpid, 0, gva }; 296 } operand = { vpid, 0, gva };
280 297
281 asm volatile (ASM_VMX_INVVPID 298 asm volatile (__ex(ASM_VMX_INVVPID)
282 /* CF==1 or ZF==1 --> rc = -1 */ 299 /* CF==1 or ZF==1 --> rc = -1 */
283 "; ja 1f ; ud2 ; 1:" 300 "; ja 1f ; ud2 ; 1:"
284 : : "a"(&operand), "c"(ext) : "cc", "memory"); 301 : : "a"(&operand), "c"(ext) : "cc", "memory");
@@ -290,7 +307,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
290 u64 eptp, gpa; 307 u64 eptp, gpa;
291 } operand = {eptp, gpa}; 308 } operand = {eptp, gpa};
292 309
293 asm volatile (ASM_VMX_INVEPT 310 asm volatile (__ex(ASM_VMX_INVEPT)
294 /* CF==1 or ZF==1 --> rc = -1 */ 311 /* CF==1 or ZF==1 --> rc = -1 */
295 "; ja 1f ; ud2 ; 1:\n" 312 "; ja 1f ; ud2 ; 1:\n"
296 : : "a" (&operand), "c" (ext) : "cc", "memory"); 313 : : "a" (&operand), "c" (ext) : "cc", "memory");
@@ -311,7 +328,7 @@ static void vmcs_clear(struct vmcs *vmcs)
311 u64 phys_addr = __pa(vmcs); 328 u64 phys_addr = __pa(vmcs);
312 u8 error; 329 u8 error;
313 330
314 asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" 331 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
315 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 332 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
316 : "cc", "memory"); 333 : "cc", "memory");
317 if (error) 334 if (error)
@@ -329,6 +346,9 @@ static void __vcpu_clear(void *arg)
329 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 346 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
330 per_cpu(current_vmcs, cpu) = NULL; 347 per_cpu(current_vmcs, cpu) = NULL;
331 rdtscll(vmx->vcpu.arch.host_tsc); 348 rdtscll(vmx->vcpu.arch.host_tsc);
349 list_del(&vmx->local_vcpus_link);
350 vmx->vcpu.cpu = -1;
351 vmx->launched = 0;
332} 352}
333 353
334static void vcpu_clear(struct vcpu_vmx *vmx) 354static void vcpu_clear(struct vcpu_vmx *vmx)
@@ -336,7 +356,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx)
336 if (vmx->vcpu.cpu == -1) 356 if (vmx->vcpu.cpu == -1)
337 return; 357 return;
338 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); 358 smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1);
339 vmx->launched = 0;
340} 359}
341 360
342static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) 361static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx)
@@ -378,7 +397,7 @@ static unsigned long vmcs_readl(unsigned long field)
378{ 397{
379 unsigned long value; 398 unsigned long value;
380 399
381 asm volatile (ASM_VMX_VMREAD_RDX_RAX 400 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
382 : "=a"(value) : "d"(field) : "cc"); 401 : "=a"(value) : "d"(field) : "cc");
383 return value; 402 return value;
384} 403}
@@ -413,7 +432,7 @@ static void vmcs_writel(unsigned long field, unsigned long value)
413{ 432{
414 u8 error; 433 u8 error;
415 434
416 asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" 435 asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0"
417 : "=q"(error) : "a"(value), "d"(field) : "cc"); 436 : "=q"(error) : "a"(value), "d"(field) : "cc");
418 if (unlikely(error)) 437 if (unlikely(error))
419 vmwrite_error(field, value); 438 vmwrite_error(field, value);
@@ -431,10 +450,8 @@ static void vmcs_write32(unsigned long field, u32 value)
431 450
432static void vmcs_write64(unsigned long field, u64 value) 451static void vmcs_write64(unsigned long field, u64 value)
433{ 452{
434#ifdef CONFIG_X86_64
435 vmcs_writel(field, value);
436#else
437 vmcs_writel(field, value); 453 vmcs_writel(field, value);
454#ifndef CONFIG_X86_64
438 asm volatile (""); 455 asm volatile ("");
439 vmcs_writel(field+1, value >> 32); 456 vmcs_writel(field+1, value >> 32);
440#endif 457#endif
@@ -458,7 +475,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
458 if (!vcpu->fpu_active) 475 if (!vcpu->fpu_active)
459 eb |= 1u << NM_VECTOR; 476 eb |= 1u << NM_VECTOR;
460 if (vcpu->guest_debug.enabled) 477 if (vcpu->guest_debug.enabled)
461 eb |= 1u << 1; 478 eb |= 1u << DB_VECTOR;
462 if (vcpu->arch.rmode.active) 479 if (vcpu->arch.rmode.active)
463 eb = ~0; 480 eb = ~0;
464 if (vm_need_ept()) 481 if (vm_need_ept())
@@ -474,7 +491,7 @@ static void reload_tss(void)
474 struct descriptor_table gdt; 491 struct descriptor_table gdt;
475 struct desc_struct *descs; 492 struct desc_struct *descs;
476 493
477 get_gdt(&gdt); 494 kvm_get_gdt(&gdt);
478 descs = (void *)gdt.base; 495 descs = (void *)gdt.base;
479 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 496 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
480 load_TR_desc(); 497 load_TR_desc();
@@ -530,9 +547,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
530 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not 547 * Set host fs and gs selectors. Unfortunately, 22.2.3 does not
531 * allow segment selectors with cpl > 0 or ti == 1. 548 * allow segment selectors with cpl > 0 or ti == 1.
532 */ 549 */
533 vmx->host_state.ldt_sel = read_ldt(); 550 vmx->host_state.ldt_sel = kvm_read_ldt();
534 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; 551 vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel;
535 vmx->host_state.fs_sel = read_fs(); 552 vmx->host_state.fs_sel = kvm_read_fs();
536 if (!(vmx->host_state.fs_sel & 7)) { 553 if (!(vmx->host_state.fs_sel & 7)) {
537 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); 554 vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel);
538 vmx->host_state.fs_reload_needed = 0; 555 vmx->host_state.fs_reload_needed = 0;
@@ -540,7 +557,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
540 vmcs_write16(HOST_FS_SELECTOR, 0); 557 vmcs_write16(HOST_FS_SELECTOR, 0);
541 vmx->host_state.fs_reload_needed = 1; 558 vmx->host_state.fs_reload_needed = 1;
542 } 559 }
543 vmx->host_state.gs_sel = read_gs(); 560 vmx->host_state.gs_sel = kvm_read_gs();
544 if (!(vmx->host_state.gs_sel & 7)) 561 if (!(vmx->host_state.gs_sel & 7))
545 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); 562 vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel);
546 else { 563 else {
@@ -576,15 +593,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
576 ++vmx->vcpu.stat.host_state_reload; 593 ++vmx->vcpu.stat.host_state_reload;
577 vmx->host_state.loaded = 0; 594 vmx->host_state.loaded = 0;
578 if (vmx->host_state.fs_reload_needed) 595 if (vmx->host_state.fs_reload_needed)
579 load_fs(vmx->host_state.fs_sel); 596 kvm_load_fs(vmx->host_state.fs_sel);
580 if (vmx->host_state.gs_ldt_reload_needed) { 597 if (vmx->host_state.gs_ldt_reload_needed) {
581 load_ldt(vmx->host_state.ldt_sel); 598 kvm_load_ldt(vmx->host_state.ldt_sel);
582 /* 599 /*
583 * If we have to reload gs, we must take care to 600 * If we have to reload gs, we must take care to
584 * preserve our gs base. 601 * preserve our gs base.
585 */ 602 */
586 local_irq_save(flags); 603 local_irq_save(flags);
587 load_gs(vmx->host_state.gs_sel); 604 kvm_load_gs(vmx->host_state.gs_sel);
588#ifdef CONFIG_X86_64 605#ifdef CONFIG_X86_64
589 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); 606 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE));
590#endif 607#endif
@@ -617,13 +634,17 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
617 vcpu_clear(vmx); 634 vcpu_clear(vmx);
618 kvm_migrate_timers(vcpu); 635 kvm_migrate_timers(vcpu);
619 vpid_sync_vcpu_all(vmx); 636 vpid_sync_vcpu_all(vmx);
637 local_irq_disable();
638 list_add(&vmx->local_vcpus_link,
639 &per_cpu(vcpus_on_cpu, cpu));
640 local_irq_enable();
620 } 641 }
621 642
622 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { 643 if (per_cpu(current_vmcs, cpu) != vmx->vmcs) {
623 u8 error; 644 u8 error;
624 645
625 per_cpu(current_vmcs, cpu) = vmx->vmcs; 646 per_cpu(current_vmcs, cpu) = vmx->vmcs;
626 asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" 647 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
627 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 648 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr)
628 : "cc"); 649 : "cc");
629 if (error) 650 if (error)
@@ -640,8 +661,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
640 * Linux uses per-cpu TSS and GDT, so set these when switching 661 * Linux uses per-cpu TSS and GDT, so set these when switching
641 * processors. 662 * processors.
642 */ 663 */
643 vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ 664 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
644 get_gdt(&dt); 665 kvm_get_gdt(&dt);
645 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ 666 vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */
646 667
647 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 668 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
@@ -684,11 +705,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
684 update_exception_bitmap(vcpu); 705 update_exception_bitmap(vcpu);
685} 706}
686 707
687static void vmx_vcpu_decache(struct kvm_vcpu *vcpu)
688{
689 vcpu_clear(to_vmx(vcpu));
690}
691
692static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 708static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
693{ 709{
694 return vmcs_readl(GUEST_RFLAGS); 710 return vmcs_readl(GUEST_RFLAGS);
@@ -706,9 +722,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
706 unsigned long rip; 722 unsigned long rip;
707 u32 interruptibility; 723 u32 interruptibility;
708 724
709 rip = vmcs_readl(GUEST_RIP); 725 rip = kvm_rip_read(vcpu);
710 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 726 rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
711 vmcs_writel(GUEST_RIP, rip); 727 kvm_rip_write(vcpu, rip);
712 728
713 /* 729 /*
714 * We emulated an instruction, so temporary interrupt blocking 730 * We emulated an instruction, so temporary interrupt blocking
@@ -724,19 +740,35 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
724static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 740static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
725 bool has_error_code, u32 error_code) 741 bool has_error_code, u32 error_code)
726{ 742{
743 struct vcpu_vmx *vmx = to_vmx(vcpu);
744
745 if (has_error_code)
746 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
747
748 if (vcpu->arch.rmode.active) {
749 vmx->rmode.irq.pending = true;
750 vmx->rmode.irq.vector = nr;
751 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
752 if (nr == BP_VECTOR)
753 vmx->rmode.irq.rip++;
754 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
755 nr | INTR_TYPE_SOFT_INTR
756 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
757 | INTR_INFO_VALID_MASK);
758 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
759 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
760 return;
761 }
762
727 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 763 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
728 nr | INTR_TYPE_EXCEPTION 764 nr | INTR_TYPE_EXCEPTION
729 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) 765 | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0)
730 | INTR_INFO_VALID_MASK); 766 | INTR_INFO_VALID_MASK);
731 if (has_error_code)
732 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
733} 767}
734 768
735static bool vmx_exception_injected(struct kvm_vcpu *vcpu) 769static bool vmx_exception_injected(struct kvm_vcpu *vcpu)
736{ 770{
737 struct vcpu_vmx *vmx = to_vmx(vcpu); 771 return false;
738
739 return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
740} 772}
741 773
742/* 774/*
@@ -913,6 +945,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
913 case MSR_IA32_TIME_STAMP_COUNTER: 945 case MSR_IA32_TIME_STAMP_COUNTER:
914 guest_write_tsc(data); 946 guest_write_tsc(data);
915 break; 947 break;
948 case MSR_P6_PERFCTR0:
949 case MSR_P6_PERFCTR1:
950 case MSR_P6_EVNTSEL0:
951 case MSR_P6_EVNTSEL1:
952 /*
953 * Just discard all writes to the performance counters; this
954 * should keep both older linux and windows 64-bit guests
955 * happy
956 */
957 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data);
958
959 break;
916 default: 960 default:
917 vmx_load_host_state(vmx); 961 vmx_load_host_state(vmx);
918 msr = find_msr_entry(vmx, msr_index); 962 msr = find_msr_entry(vmx, msr_index);
@@ -926,24 +970,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
926 return ret; 970 return ret;
927} 971}
928 972
929/* 973static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
930 * Sync the rsp and rip registers into the vcpu structure. This allows
931 * registers to be accessed by indexing vcpu->arch.regs.
932 */
933static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu)
934{
935 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
936 vcpu->arch.rip = vmcs_readl(GUEST_RIP);
937}
938
939/*
940 * Syncs rsp and rip back into the vmcs. Should be called after possible
941 * modification.
942 */
943static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu)
944{ 974{
945 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 975 __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
946 vmcs_writel(GUEST_RIP, vcpu->arch.rip); 976 switch (reg) {
977 case VCPU_REGS_RSP:
978 vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
979 break;
980 case VCPU_REGS_RIP:
981 vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP);
982 break;
983 default:
984 break;
985 }
947} 986}
948 987
949static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) 988static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
@@ -986,17 +1025,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg)
986 1025
987static int vmx_get_irq(struct kvm_vcpu *vcpu) 1026static int vmx_get_irq(struct kvm_vcpu *vcpu)
988{ 1027{
989 struct vcpu_vmx *vmx = to_vmx(vcpu); 1028 if (!vcpu->arch.interrupt.pending)
990 u32 idtv_info_field; 1029 return -1;
991 1030 return vcpu->arch.interrupt.nr;
992 idtv_info_field = vmx->idt_vectoring_info;
993 if (idtv_info_field & INTR_INFO_VALID_MASK) {
994 if (is_external_interrupt(idtv_info_field))
995 return idtv_info_field & VECTORING_INFO_VECTOR_MASK;
996 else
997 printk(KERN_DEBUG "pending exception: not handled yet\n");
998 }
999 return -1;
1000} 1031}
1001 1032
1002static __init int cpu_has_kvm_support(void) 1033static __init int cpu_has_kvm_support(void)
@@ -1010,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void)
1010 u64 msr; 1041 u64 msr;
1011 1042
1012 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1043 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1013 return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1044 return (msr & (FEATURE_CONTROL_LOCKED |
1014 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1045 FEATURE_CONTROL_VMXON_ENABLED))
1015 == MSR_IA32_FEATURE_CONTROL_LOCKED; 1046 == FEATURE_CONTROL_LOCKED;
1016 /* locked but not enabled */ 1047 /* locked but not enabled */
1017} 1048}
1018 1049
@@ -1022,23 +1053,36 @@ static void hardware_enable(void *garbage)
1022 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1053 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1023 u64 old; 1054 u64 old;
1024 1055
1056 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1025 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1057 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1026 if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | 1058 if ((old & (FEATURE_CONTROL_LOCKED |
1027 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1059 FEATURE_CONTROL_VMXON_ENABLED))
1028 != (MSR_IA32_FEATURE_CONTROL_LOCKED | 1060 != (FEATURE_CONTROL_LOCKED |
1029 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) 1061 FEATURE_CONTROL_VMXON_ENABLED))
1030 /* enable and lock */ 1062 /* enable and lock */
1031 wrmsrl(MSR_IA32_FEATURE_CONTROL, old | 1063 wrmsrl(MSR_IA32_FEATURE_CONTROL, old |
1032 MSR_IA32_FEATURE_CONTROL_LOCKED | 1064 FEATURE_CONTROL_LOCKED |
1033 MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); 1065 FEATURE_CONTROL_VMXON_ENABLED);
1034 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ 1066 write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */
1035 asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) 1067 asm volatile (ASM_VMX_VMXON_RAX
1068 : : "a"(&phys_addr), "m"(phys_addr)
1036 : "memory", "cc"); 1069 : "memory", "cc");
1037} 1070}
1038 1071
1072static void vmclear_local_vcpus(void)
1073{
1074 int cpu = raw_smp_processor_id();
1075 struct vcpu_vmx *vmx, *n;
1076
1077 list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu),
1078 local_vcpus_link)
1079 __vcpu_clear(vmx);
1080}
1081
1039static void hardware_disable(void *garbage) 1082static void hardware_disable(void *garbage)
1040{ 1083{
1041 asm volatile (ASM_VMX_VMXOFF : : : "cc"); 1084 vmclear_local_vcpus();
1085 asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc");
1042 write_cr4(read_cr4() & ~X86_CR4_VMXE); 1086 write_cr4(read_cr4() & ~X86_CR4_VMXE);
1043} 1087}
1044 1088
@@ -1072,7 +1116,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1072 u32 _vmentry_control = 0; 1116 u32 _vmentry_control = 0;
1073 1117
1074 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; 1118 min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
1075 opt = 0; 1119 opt = PIN_BASED_VIRTUAL_NMIS;
1076 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, 1120 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
1077 &_pin_based_exec_control) < 0) 1121 &_pin_based_exec_control) < 0)
1078 return -EIO; 1122 return -EIO;
@@ -1086,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1086 CPU_BASED_CR3_STORE_EXITING | 1130 CPU_BASED_CR3_STORE_EXITING |
1087 CPU_BASED_USE_IO_BITMAPS | 1131 CPU_BASED_USE_IO_BITMAPS |
1088 CPU_BASED_MOV_DR_EXITING | 1132 CPU_BASED_MOV_DR_EXITING |
1089 CPU_BASED_USE_TSC_OFFSETING; 1133 CPU_BASED_USE_TSC_OFFSETING |
1134 CPU_BASED_INVLPG_EXITING;
1090 opt = CPU_BASED_TPR_SHADOW | 1135 opt = CPU_BASED_TPR_SHADOW |
1091 CPU_BASED_USE_MSR_BITMAPS | 1136 CPU_BASED_USE_MSR_BITMAPS |
1092 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1137 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1115,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1115 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; 1160 _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
1116#endif 1161#endif
1117 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { 1162 if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) {
1118 /* CR3 accesses don't need to cause VM Exits when EPT enabled */ 1163 /* CR3 accesses and invlpg don't need to cause VM Exits when EPT
1164 enabled */
1119 min &= ~(CPU_BASED_CR3_LOAD_EXITING | 1165 min &= ~(CPU_BASED_CR3_LOAD_EXITING |
1120 CPU_BASED_CR3_STORE_EXITING); 1166 CPU_BASED_CR3_STORE_EXITING |
1167 CPU_BASED_INVLPG_EXITING);
1121 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, 1168 if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS,
1122 &_cpu_based_exec_control) < 0) 1169 &_cpu_based_exec_control) < 0)
1123 return -EIO; 1170 return -EIO;
@@ -1254,7 +1301,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save)
1254static void enter_pmode(struct kvm_vcpu *vcpu) 1301static void enter_pmode(struct kvm_vcpu *vcpu)
1255{ 1302{
1256 unsigned long flags; 1303 unsigned long flags;
1304 struct vcpu_vmx *vmx = to_vmx(vcpu);
1257 1305
1306 vmx->emulation_required = 1;
1258 vcpu->arch.rmode.active = 0; 1307 vcpu->arch.rmode.active = 0;
1259 1308
1260 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); 1309 vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base);
@@ -1271,6 +1320,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1271 1320
1272 update_exception_bitmap(vcpu); 1321 update_exception_bitmap(vcpu);
1273 1322
1323 if (emulate_invalid_guest_state)
1324 return;
1325
1274 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); 1326 fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es);
1275 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); 1327 fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds);
1276 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1328 fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
@@ -1311,7 +1363,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1311static void enter_rmode(struct kvm_vcpu *vcpu) 1363static void enter_rmode(struct kvm_vcpu *vcpu)
1312{ 1364{
1313 unsigned long flags; 1365 unsigned long flags;
1366 struct vcpu_vmx *vmx = to_vmx(vcpu);
1314 1367
1368 vmx->emulation_required = 1;
1315 vcpu->arch.rmode.active = 1; 1369 vcpu->arch.rmode.active = 1;
1316 1370
1317 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1371 vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
@@ -1333,6 +1387,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1333 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); 1387 vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME);
1334 update_exception_bitmap(vcpu); 1388 update_exception_bitmap(vcpu);
1335 1389
1390 if (emulate_invalid_guest_state)
1391 goto continue_rmode;
1392
1336 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); 1393 vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4);
1337 vmcs_write32(GUEST_SS_LIMIT, 0xffff); 1394 vmcs_write32(GUEST_SS_LIMIT, 0xffff);
1338 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); 1395 vmcs_write32(GUEST_SS_AR_BYTES, 0xf3);
@@ -1348,6 +1405,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1348 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); 1405 fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs);
1349 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); 1406 fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs);
1350 1407
1408continue_rmode:
1351 kvm_mmu_reset_context(vcpu); 1409 kvm_mmu_reset_context(vcpu);
1352 init_rmode(vcpu->kvm); 1410 init_rmode(vcpu->kvm);
1353} 1411}
@@ -1389,6 +1447,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu)
1389static void vmx_flush_tlb(struct kvm_vcpu *vcpu) 1447static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1390{ 1448{
1391 vpid_sync_vcpu_all(to_vmx(vcpu)); 1449 vpid_sync_vcpu_all(to_vmx(vcpu));
1450 if (vm_need_ept())
1451 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1392} 1452}
1393 1453
1394static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1454static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
@@ -1420,7 +1480,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1420 if (!(cr0 & X86_CR0_PG)) { 1480 if (!(cr0 & X86_CR0_PG)) {
1421 /* From paging/starting to nonpaging */ 1481 /* From paging/starting to nonpaging */
1422 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1482 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1423 vmcs_config.cpu_based_exec_ctrl | 1483 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) |
1424 (CPU_BASED_CR3_LOAD_EXITING | 1484 (CPU_BASED_CR3_LOAD_EXITING |
1425 CPU_BASED_CR3_STORE_EXITING)); 1485 CPU_BASED_CR3_STORE_EXITING));
1426 vcpu->arch.cr0 = cr0; 1486 vcpu->arch.cr0 = cr0;
@@ -1430,7 +1490,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1430 } else if (!is_paging(vcpu)) { 1490 } else if (!is_paging(vcpu)) {
1431 /* From nonpaging to paging */ 1491 /* From nonpaging to paging */
1432 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1492 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
1433 vmcs_config.cpu_based_exec_ctrl & 1493 vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) &
1434 ~(CPU_BASED_CR3_LOAD_EXITING | 1494 ~(CPU_BASED_CR3_LOAD_EXITING |
1435 CPU_BASED_CR3_STORE_EXITING)); 1495 CPU_BASED_CR3_STORE_EXITING));
1436 vcpu->arch.cr0 = cr0; 1496 vcpu->arch.cr0 = cr0;
@@ -1679,6 +1739,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
1679 vmcs_writel(GUEST_GDTR_BASE, dt->base); 1739 vmcs_writel(GUEST_GDTR_BASE, dt->base);
1680} 1740}
1681 1741
1742static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg)
1743{
1744 struct kvm_segment var;
1745 u32 ar;
1746
1747 vmx_get_segment(vcpu, &var, seg);
1748 ar = vmx_segment_access_rights(&var);
1749
1750 if (var.base != (var.selector << 4))
1751 return false;
1752 if (var.limit != 0xffff)
1753 return false;
1754 if (ar != 0xf3)
1755 return false;
1756
1757 return true;
1758}
1759
1760static bool code_segment_valid(struct kvm_vcpu *vcpu)
1761{
1762 struct kvm_segment cs;
1763 unsigned int cs_rpl;
1764
1765 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1766 cs_rpl = cs.selector & SELECTOR_RPL_MASK;
1767
1768 if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK))
1769 return false;
1770 if (!cs.s)
1771 return false;
1772 if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) {
1773 if (cs.dpl > cs_rpl)
1774 return false;
1775 } else if (cs.type & AR_TYPE_CODE_MASK) {
1776 if (cs.dpl != cs_rpl)
1777 return false;
1778 }
1779 if (!cs.present)
1780 return false;
1781
1782 /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */
1783 return true;
1784}
1785
1786static bool stack_segment_valid(struct kvm_vcpu *vcpu)
1787{
1788 struct kvm_segment ss;
1789 unsigned int ss_rpl;
1790
1791 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1792 ss_rpl = ss.selector & SELECTOR_RPL_MASK;
1793
1794 if (ss.type != 3 && ss.type != 7)
1795 return false;
1796 if (!ss.s)
1797 return false;
1798 if (ss.dpl != ss_rpl) /* DPL != RPL */
1799 return false;
1800 if (!ss.present)
1801 return false;
1802
1803 return true;
1804}
1805
1806static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg)
1807{
1808 struct kvm_segment var;
1809 unsigned int rpl;
1810
1811 vmx_get_segment(vcpu, &var, seg);
1812 rpl = var.selector & SELECTOR_RPL_MASK;
1813
1814 if (!var.s)
1815 return false;
1816 if (!var.present)
1817 return false;
1818 if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) {
1819 if (var.dpl < rpl) /* DPL < RPL */
1820 return false;
1821 }
1822
1823 /* TODO: Add other members to kvm_segment_field to allow checking for other access
1824 * rights flags
1825 */
1826 return true;
1827}
1828
1829static bool tr_valid(struct kvm_vcpu *vcpu)
1830{
1831 struct kvm_segment tr;
1832
1833 vmx_get_segment(vcpu, &tr, VCPU_SREG_TR);
1834
1835 if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1836 return false;
1837 if (tr.type != 3 && tr.type != 11) /* TODO: Check if guest is in IA32e mode */
1838 return false;
1839 if (!tr.present)
1840 return false;
1841
1842 return true;
1843}
1844
1845static bool ldtr_valid(struct kvm_vcpu *vcpu)
1846{
1847 struct kvm_segment ldtr;
1848
1849 vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR);
1850
1851 if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */
1852 return false;
1853 if (ldtr.type != 2)
1854 return false;
1855 if (!ldtr.present)
1856 return false;
1857
1858 return true;
1859}
1860
1861static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
1862{
1863 struct kvm_segment cs, ss;
1864
1865 vmx_get_segment(vcpu, &cs, VCPU_SREG_CS);
1866 vmx_get_segment(vcpu, &ss, VCPU_SREG_SS);
1867
1868 return ((cs.selector & SELECTOR_RPL_MASK) ==
1869 (ss.selector & SELECTOR_RPL_MASK));
1870}
1871
1872/*
1873 * Check if guest state is valid. Returns true if valid, false if
1874 * not.
1875 * We assume that registers are always usable
1876 */
1877static bool guest_state_valid(struct kvm_vcpu *vcpu)
1878{
1879 /* real mode guest state checks */
1880 if (!(vcpu->arch.cr0 & X86_CR0_PE)) {
1881 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
1882 return false;
1883 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
1884 return false;
1885 if (!rmode_segment_valid(vcpu, VCPU_SREG_DS))
1886 return false;
1887 if (!rmode_segment_valid(vcpu, VCPU_SREG_ES))
1888 return false;
1889 if (!rmode_segment_valid(vcpu, VCPU_SREG_FS))
1890 return false;
1891 if (!rmode_segment_valid(vcpu, VCPU_SREG_GS))
1892 return false;
1893 } else {
1894 /* protected mode guest state checks */
1895 if (!cs_ss_rpl_check(vcpu))
1896 return false;
1897 if (!code_segment_valid(vcpu))
1898 return false;
1899 if (!stack_segment_valid(vcpu))
1900 return false;
1901 if (!data_segment_valid(vcpu, VCPU_SREG_DS))
1902 return false;
1903 if (!data_segment_valid(vcpu, VCPU_SREG_ES))
1904 return false;
1905 if (!data_segment_valid(vcpu, VCPU_SREG_FS))
1906 return false;
1907 if (!data_segment_valid(vcpu, VCPU_SREG_GS))
1908 return false;
1909 if (!tr_valid(vcpu))
1910 return false;
1911 if (!ldtr_valid(vcpu))
1912 return false;
1913 }
1914 /* TODO:
1915 * - Add checks on RIP
1916 * - Add checks on RFLAGS
1917 */
1918
1919 return true;
1920}
1921
1682static int init_rmode_tss(struct kvm *kvm) 1922static int init_rmode_tss(struct kvm *kvm)
1683{ 1923{
1684 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 1924 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
@@ -1690,7 +1930,8 @@ static int init_rmode_tss(struct kvm *kvm)
1690 if (r < 0) 1930 if (r < 0)
1691 goto out; 1931 goto out;
1692 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; 1932 data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
1693 r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); 1933 r = kvm_write_guest_page(kvm, fn++, &data,
1934 TSS_IOPB_BASE_OFFSET, sizeof(u16));
1694 if (r < 0) 1935 if (r < 0)
1695 goto out; 1936 goto out;
1696 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); 1937 r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE);
@@ -1753,7 +1994,7 @@ static void seg_setup(int seg)
1753 vmcs_write16(sf->selector, 0); 1994 vmcs_write16(sf->selector, 0);
1754 vmcs_writel(sf->base, 0); 1995 vmcs_writel(sf->base, 0);
1755 vmcs_write32(sf->limit, 0xffff); 1996 vmcs_write32(sf->limit, 0xffff);
1756 vmcs_write32(sf->ar_bytes, 0x93); 1997 vmcs_write32(sf->ar_bytes, 0xf3);
1757} 1998}
1758 1999
1759static int alloc_apic_access_page(struct kvm *kvm) 2000static int alloc_apic_access_page(struct kvm *kvm)
@@ -1772,9 +2013,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
1772 if (r) 2013 if (r)
1773 goto out; 2014 goto out;
1774 2015
1775 down_read(&current->mm->mmap_sem);
1776 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2016 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
1777 up_read(&current->mm->mmap_sem);
1778out: 2017out:
1779 up_write(&kvm->slots_lock); 2018 up_write(&kvm->slots_lock);
1780 return r; 2019 return r;
@@ -1796,10 +2035,8 @@ static int alloc_identity_pagetable(struct kvm *kvm)
1796 if (r) 2035 if (r)
1797 goto out; 2036 goto out;
1798 2037
1799 down_read(&current->mm->mmap_sem);
1800 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2038 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
1801 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); 2039 VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT);
1802 up_read(&current->mm->mmap_sem);
1803out: 2040out:
1804 up_write(&kvm->slots_lock); 2041 up_write(&kvm->slots_lock);
1805 return r; 2042 return r;
@@ -1821,7 +2058,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx)
1821 spin_unlock(&vmx_vpid_lock); 2058 spin_unlock(&vmx_vpid_lock);
1822} 2059}
1823 2060
1824void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) 2061static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr)
1825{ 2062{
1826 void *va; 2063 void *va;
1827 2064
@@ -1881,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1881 } 2118 }
1882 if (!vm_need_ept()) 2119 if (!vm_need_ept())
1883 exec_control |= CPU_BASED_CR3_STORE_EXITING | 2120 exec_control |= CPU_BASED_CR3_STORE_EXITING |
1884 CPU_BASED_CR3_LOAD_EXITING; 2121 CPU_BASED_CR3_LOAD_EXITING |
2122 CPU_BASED_INVLPG_EXITING;
1885 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); 2123 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);
1886 2124
1887 if (cpu_has_secondary_exec_ctrls()) { 2125 if (cpu_has_secondary_exec_ctrls()) {
@@ -1907,8 +2145,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1907 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ 2145 vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */
1908 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2146 vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1909 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2147 vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1910 vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ 2148 vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */
1911 vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ 2149 vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */
1912 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ 2150 vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */
1913#ifdef CONFIG_X86_64 2151#ifdef CONFIG_X86_64
1914 rdmsrl(MSR_FS_BASE, a); 2152 rdmsrl(MSR_FS_BASE, a);
@@ -1922,7 +2160,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
1922 2160
1923 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ 2161 vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */
1924 2162
1925 get_idt(&dt); 2163 kvm_get_idt(&dt);
1926 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ 2164 vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */
1927 2165
1928 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); 2166 asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return));
@@ -1983,6 +2221,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
1983 u64 msr; 2221 u64 msr;
1984 int ret; 2222 int ret;
1985 2223
2224 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
1986 down_read(&vcpu->kvm->slots_lock); 2225 down_read(&vcpu->kvm->slots_lock);
1987 if (!init_rmode(vmx->vcpu.kvm)) { 2226 if (!init_rmode(vmx->vcpu.kvm)) {
1988 ret = -ENOMEM; 2227 ret = -ENOMEM;
@@ -2000,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2000 2239
2001 fx_init(&vmx->vcpu); 2240 fx_init(&vmx->vcpu);
2002 2241
2242 seg_setup(VCPU_SREG_CS);
2003 /* 2243 /*
2004 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2244 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
2005 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. 2245 * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh.
@@ -2011,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2011 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); 2251 vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8);
2012 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); 2252 vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12);
2013 } 2253 }
2014 vmcs_write32(GUEST_CS_LIMIT, 0xffff);
2015 vmcs_write32(GUEST_CS_AR_BYTES, 0x9b);
2016 2254
2017 seg_setup(VCPU_SREG_DS); 2255 seg_setup(VCPU_SREG_DS);
2018 seg_setup(VCPU_SREG_ES); 2256 seg_setup(VCPU_SREG_ES);
@@ -2036,10 +2274,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2036 2274
2037 vmcs_writel(GUEST_RFLAGS, 0x02); 2275 vmcs_writel(GUEST_RFLAGS, 0x02);
2038 if (vmx->vcpu.vcpu_id == 0) 2276 if (vmx->vcpu.vcpu_id == 0)
2039 vmcs_writel(GUEST_RIP, 0xfff0); 2277 kvm_rip_write(vcpu, 0xfff0);
2040 else 2278 else
2041 vmcs_writel(GUEST_RIP, 0); 2279 kvm_rip_write(vcpu, 0);
2042 vmcs_writel(GUEST_RSP, 0); 2280 kvm_register_write(vcpu, VCPU_REGS_RSP, 0);
2043 2281
2044 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ 2282 /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */
2045 vmcs_writel(GUEST_DR7, 0x400); 2283 vmcs_writel(GUEST_DR7, 0x400);
@@ -2089,6 +2327,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2089 2327
2090 ret = 0; 2328 ret = 0;
2091 2329
2330 /* HACK: Don't enable emulation on guest boot/reset */
2331 vmx->emulation_required = 0;
2332
2092out: 2333out:
2093 up_read(&vcpu->kvm->slots_lock); 2334 up_read(&vcpu->kvm->slots_lock);
2094 return ret; 2335 return ret;
@@ -2100,20 +2341,27 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq)
2100 2341
2101 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); 2342 KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler);
2102 2343
2344 ++vcpu->stat.irq_injections;
2103 if (vcpu->arch.rmode.active) { 2345 if (vcpu->arch.rmode.active) {
2104 vmx->rmode.irq.pending = true; 2346 vmx->rmode.irq.pending = true;
2105 vmx->rmode.irq.vector = irq; 2347 vmx->rmode.irq.vector = irq;
2106 vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); 2348 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2107 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2349 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2108 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); 2350 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2109 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); 2351 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2110 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); 2352 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2111 return; 2353 return;
2112 } 2354 }
2113 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 2355 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2114 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); 2356 irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK);
2115} 2357}
2116 2358
2359static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2360{
2361 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2362 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
2363}
2364
2117static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) 2365static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2118{ 2366{
2119 int word_index = __ffs(vcpu->arch.irq_summary); 2367 int word_index = __ffs(vcpu->arch.irq_summary);
@@ -2123,7 +2371,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu)
2123 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); 2371 clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]);
2124 if (!vcpu->arch.irq_pending[word_index]) 2372 if (!vcpu->arch.irq_pending[word_index])
2125 clear_bit(word_index, &vcpu->arch.irq_summary); 2373 clear_bit(word_index, &vcpu->arch.irq_summary);
2126 vmx_inject_irq(vcpu, irq); 2374 kvm_queue_interrupt(vcpu, irq);
2127} 2375}
2128 2376
2129 2377
@@ -2137,13 +2385,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu,
2137 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); 2385 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2138 2386
2139 if (vcpu->arch.interrupt_window_open && 2387 if (vcpu->arch.interrupt_window_open &&
2140 vcpu->arch.irq_summary && 2388 vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending)
2141 !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK))
2142 /*
2143 * If interrupts enabled, and not blocked by sti or mov ss. Good.
2144 */
2145 kvm_do_inject_irq(vcpu); 2389 kvm_do_inject_irq(vcpu);
2146 2390
2391 if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending)
2392 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2393
2147 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 2394 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2148 if (!vcpu->arch.interrupt_window_open && 2395 if (!vcpu->arch.interrupt_window_open &&
2149 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) 2396 (vcpu->arch.irq_summary || kvm_run->request_interrupt_window))
@@ -2194,9 +2441,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu)
2194static int handle_rmode_exception(struct kvm_vcpu *vcpu, 2441static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2195 int vec, u32 err_code) 2442 int vec, u32 err_code)
2196{ 2443{
2197 if (!vcpu->arch.rmode.active)
2198 return 0;
2199
2200 /* 2444 /*
2201 * Instruction with address size override prefix opcode 0x67 2445 * Instruction with address size override prefix opcode 0x67
2202 * Causes the #SS fault with 0 error code in VM86 mode. 2446 * Causes the #SS fault with 0 error code in VM86 mode.
@@ -2204,6 +2448,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2204 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2448 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2205 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2449 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE)
2206 return 1; 2450 return 1;
2451 /*
2452 * Forward all other exceptions that are valid in real mode.
2453 * FIXME: Breaks guest debugging in real mode; needs to be fixed with
2454 * the required debugging infrastructure rework.
2455 */
2456 switch (vec) {
2457 case DE_VECTOR:
2458 case DB_VECTOR:
2459 case BP_VECTOR:
2460 case OF_VECTOR:
2461 case BR_VECTOR:
2462 case UD_VECTOR:
2463 case DF_VECTOR:
2464 case SS_VECTOR:
2465 case GP_VECTOR:
2466 case MF_VECTOR:
2467 kvm_queue_exception(vcpu, vec);
2468 return 1;
2469 }
2207 return 0; 2470 return 0;
2208} 2471}
2209 2472
@@ -2245,7 +2508,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2245 } 2508 }
2246 2509
2247 error_code = 0; 2510 error_code = 0;
2248 rip = vmcs_readl(GUEST_RIP); 2511 rip = kvm_rip_read(vcpu);
2249 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 2512 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
2250 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 2513 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
2251 if (is_page_fault(intr_info)) { 2514 if (is_page_fault(intr_info)) {
@@ -2255,6 +2518,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2255 cr2 = vmcs_readl(EXIT_QUALIFICATION); 2518 cr2 = vmcs_readl(EXIT_QUALIFICATION);
2256 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, 2519 KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2,
2257 (u32)((u64)cr2 >> 32), handler); 2520 (u32)((u64)cr2 >> 32), handler);
2521 if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending)
2522 kvm_mmu_unprotect_page_virt(vcpu, cr2);
2258 return kvm_mmu_page_fault(vcpu, cr2, error_code); 2523 return kvm_mmu_page_fault(vcpu, cr2, error_code);
2259 } 2524 }
2260 2525
@@ -2341,27 +2606,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2341 reg = (exit_qualification >> 8) & 15; 2606 reg = (exit_qualification >> 8) & 15;
2342 switch ((exit_qualification >> 4) & 3) { 2607 switch ((exit_qualification >> 4) & 3) {
2343 case 0: /* mov to cr */ 2608 case 0: /* mov to cr */
2344 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], 2609 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr,
2345 (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); 2610 (u32)kvm_register_read(vcpu, reg),
2611 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2612 handler);
2346 switch (cr) { 2613 switch (cr) {
2347 case 0: 2614 case 0:
2348 vcpu_load_rsp_rip(vcpu); 2615 kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg));
2349 kvm_set_cr0(vcpu, vcpu->arch.regs[reg]);
2350 skip_emulated_instruction(vcpu); 2616 skip_emulated_instruction(vcpu);
2351 return 1; 2617 return 1;
2352 case 3: 2618 case 3:
2353 vcpu_load_rsp_rip(vcpu); 2619 kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg));
2354 kvm_set_cr3(vcpu, vcpu->arch.regs[reg]);
2355 skip_emulated_instruction(vcpu); 2620 skip_emulated_instruction(vcpu);
2356 return 1; 2621 return 1;
2357 case 4: 2622 case 4:
2358 vcpu_load_rsp_rip(vcpu); 2623 kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg));
2359 kvm_set_cr4(vcpu, vcpu->arch.regs[reg]);
2360 skip_emulated_instruction(vcpu); 2624 skip_emulated_instruction(vcpu);
2361 return 1; 2625 return 1;
2362 case 8: 2626 case 8:
2363 vcpu_load_rsp_rip(vcpu); 2627 kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg));
2364 kvm_set_cr8(vcpu, vcpu->arch.regs[reg]);
2365 skip_emulated_instruction(vcpu); 2628 skip_emulated_instruction(vcpu);
2366 if (irqchip_in_kernel(vcpu->kvm)) 2629 if (irqchip_in_kernel(vcpu->kvm))
2367 return 1; 2630 return 1;
@@ -2370,7 +2633,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2370 }; 2633 };
2371 break; 2634 break;
2372 case 2: /* clts */ 2635 case 2: /* clts */
2373 vcpu_load_rsp_rip(vcpu);
2374 vmx_fpu_deactivate(vcpu); 2636 vmx_fpu_deactivate(vcpu);
2375 vcpu->arch.cr0 &= ~X86_CR0_TS; 2637 vcpu->arch.cr0 &= ~X86_CR0_TS;
2376 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); 2638 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
@@ -2381,21 +2643,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2381 case 1: /*mov from cr*/ 2643 case 1: /*mov from cr*/
2382 switch (cr) { 2644 switch (cr) {
2383 case 3: 2645 case 3:
2384 vcpu_load_rsp_rip(vcpu); 2646 kvm_register_write(vcpu, reg, vcpu->arch.cr3);
2385 vcpu->arch.regs[reg] = vcpu->arch.cr3;
2386 vcpu_put_rsp_rip(vcpu);
2387 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, 2647 KVMTRACE_3D(CR_READ, vcpu, (u32)cr,
2388 (u32)vcpu->arch.regs[reg], 2648 (u32)kvm_register_read(vcpu, reg),
2389 (u32)((u64)vcpu->arch.regs[reg] >> 32), 2649 (u32)((u64)kvm_register_read(vcpu, reg) >> 32),
2390 handler); 2650 handler);
2391 skip_emulated_instruction(vcpu); 2651 skip_emulated_instruction(vcpu);
2392 return 1; 2652 return 1;
2393 case 8: 2653 case 8:
2394 vcpu_load_rsp_rip(vcpu); 2654 kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu));
2395 vcpu->arch.regs[reg] = kvm_get_cr8(vcpu);
2396 vcpu_put_rsp_rip(vcpu);
2397 KVMTRACE_2D(CR_READ, vcpu, (u32)cr, 2655 KVMTRACE_2D(CR_READ, vcpu, (u32)cr,
2398 (u32)vcpu->arch.regs[reg], handler); 2656 (u32)kvm_register_read(vcpu, reg), handler);
2399 skip_emulated_instruction(vcpu); 2657 skip_emulated_instruction(vcpu);
2400 return 1; 2658 return 1;
2401 } 2659 }
@@ -2427,7 +2685,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2427 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 2685 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
2428 dr = exit_qualification & 7; 2686 dr = exit_qualification & 7;
2429 reg = (exit_qualification >> 8) & 15; 2687 reg = (exit_qualification >> 8) & 15;
2430 vcpu_load_rsp_rip(vcpu);
2431 if (exit_qualification & 16) { 2688 if (exit_qualification & 16) {
2432 /* mov from dr */ 2689 /* mov from dr */
2433 switch (dr) { 2690 switch (dr) {
@@ -2440,12 +2697,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2440 default: 2697 default:
2441 val = 0; 2698 val = 0;
2442 } 2699 }
2443 vcpu->arch.regs[reg] = val; 2700 kvm_register_write(vcpu, reg, val);
2444 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); 2701 KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler);
2445 } else { 2702 } else {
2446 /* mov to dr */ 2703 /* mov to dr */
2447 } 2704 }
2448 vcpu_put_rsp_rip(vcpu);
2449 skip_emulated_instruction(vcpu); 2705 skip_emulated_instruction(vcpu);
2450 return 1; 2706 return 1;
2451} 2707}
@@ -2538,6 +2794,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2538 return 1; 2794 return 1;
2539} 2795}
2540 2796
2797static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2798{
2799 u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2800
2801 kvm_mmu_invlpg(vcpu, exit_qualification);
2802 skip_emulated_instruction(vcpu);
2803 return 1;
2804}
2805
2541static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2806static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2542{ 2807{
2543 skip_emulated_instruction(vcpu); 2808 skip_emulated_instruction(vcpu);
@@ -2554,8 +2819,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2554 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); 2819 exit_qualification = vmcs_read64(EXIT_QUALIFICATION);
2555 offset = exit_qualification & 0xffful; 2820 offset = exit_qualification & 0xffful;
2556 2821
2557 KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler);
2558
2559 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 2822 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2560 2823
2561 if (er != EMULATE_DONE) { 2824 if (er != EMULATE_DONE) {
@@ -2639,6 +2902,56 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2639 return 1; 2902 return 1;
2640} 2903}
2641 2904
2905static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2906{
2907 u32 cpu_based_vm_exec_control;
2908
2909 /* clear pending NMI */
2910 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2911 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
2912 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2913 ++vcpu->stat.nmi_window_exits;
2914
2915 return 1;
2916}
2917
2918static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
2919 struct kvm_run *kvm_run)
2920{
2921 struct vcpu_vmx *vmx = to_vmx(vcpu);
2922 int err;
2923
2924 preempt_enable();
2925 local_irq_enable();
2926
2927 while (!guest_state_valid(vcpu)) {
2928 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0);
2929
2930 switch (err) {
2931 case EMULATE_DONE:
2932 break;
2933 case EMULATE_DO_MMIO:
2934 kvm_report_emulation_failure(vcpu, "mmio");
2935 /* TODO: Handle MMIO */
2936 return;
2937 default:
2938 kvm_report_emulation_failure(vcpu, "emulation failure");
2939 return;
2940 }
2941
2942 if (signal_pending(current))
2943 break;
2944 if (need_resched())
2945 schedule();
2946 }
2947
2948 local_irq_disable();
2949 preempt_disable();
2950
2951 /* Guest state should be valid now; no more emulation should be needed */
2952 vmx->emulation_required = 0;
2953}
2954
2642/* 2955/*
2643 * The exit handlers return 1 if the exit was handled fully and guest execution 2956 * The exit handlers return 1 if the exit was handled fully and guest execution
2644 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 2957 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -2649,6 +2962,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2649 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 2962 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
2650 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 2963 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
2651 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 2964 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
2965 [EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
2652 [EXIT_REASON_IO_INSTRUCTION] = handle_io, 2966 [EXIT_REASON_IO_INSTRUCTION] = handle_io,
2653 [EXIT_REASON_CR_ACCESS] = handle_cr, 2967 [EXIT_REASON_CR_ACCESS] = handle_cr,
2654 [EXIT_REASON_DR_ACCESS] = handle_dr, 2968 [EXIT_REASON_DR_ACCESS] = handle_dr,
@@ -2657,6 +2971,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
2657 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 2971 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
2658 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 2972 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
2659 [EXIT_REASON_HLT] = handle_halt, 2973 [EXIT_REASON_HLT] = handle_halt,
2974 [EXIT_REASON_INVLPG] = handle_invlpg,
2660 [EXIT_REASON_VMCALL] = handle_vmcall, 2975 [EXIT_REASON_VMCALL] = handle_vmcall,
2661 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, 2976 [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
2662 [EXIT_REASON_APIC_ACCESS] = handle_apic_access, 2977 [EXIT_REASON_APIC_ACCESS] = handle_apic_access,
@@ -2678,8 +2993,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2678 struct vcpu_vmx *vmx = to_vmx(vcpu); 2993 struct vcpu_vmx *vmx = to_vmx(vcpu);
2679 u32 vectoring_info = vmx->idt_vectoring_info; 2994 u32 vectoring_info = vmx->idt_vectoring_info;
2680 2995
2681 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), 2996 KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu),
2682 (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); 2997 (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit);
2683 2998
2684 /* Accesses to CR3 don't cause a VMExit in paging mode, so we need 2999 /* Accesses to CR3 don't cause a VMExit in paging mode, so we need
2685 * to sync with the guest's real CR3. */ 3000 * to sync with the guest's real CR3. */
@@ -2736,64 +3051,128 @@ static void enable_irq_window(struct kvm_vcpu *vcpu)
2736 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3051 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2737} 3052}
2738 3053
2739static void vmx_intr_assist(struct kvm_vcpu *vcpu) 3054static void enable_nmi_window(struct kvm_vcpu *vcpu)
2740{ 3055{
2741 struct vcpu_vmx *vmx = to_vmx(vcpu); 3056 u32 cpu_based_vm_exec_control;
2742 u32 idtv_info_field, intr_info_field;
2743 int has_ext_irq, interrupt_window_open;
2744 int vector;
2745
2746 update_tpr_threshold(vcpu);
2747 3057
2748 has_ext_irq = kvm_cpu_has_interrupt(vcpu); 3058 if (!cpu_has_virtual_nmis())
2749 intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD);
2750 idtv_info_field = vmx->idt_vectoring_info;
2751 if (intr_info_field & INTR_INFO_VALID_MASK) {
2752 if (idtv_info_field & INTR_INFO_VALID_MASK) {
2753 /* TODO: fault when IDT_Vectoring */
2754 if (printk_ratelimit())
2755 printk(KERN_ERR "Fault when IDT_Vectoring\n");
2756 }
2757 if (has_ext_irq)
2758 enable_irq_window(vcpu);
2759 return; 3059 return;
2760 }
2761 if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) {
2762 if ((idtv_info_field & VECTORING_INFO_TYPE_MASK)
2763 == INTR_TYPE_EXT_INTR
2764 && vcpu->arch.rmode.active) {
2765 u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK;
2766 3060
2767 vmx_inject_irq(vcpu, vect); 3061 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2768 if (unlikely(has_ext_irq)) 3062 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2769 enable_irq_window(vcpu); 3063 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
2770 return; 3064}
2771 }
2772 3065
2773 KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); 3066static int vmx_nmi_enabled(struct kvm_vcpu *vcpu)
3067{
3068 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3069 return !(guest_intr & (GUEST_INTR_STATE_NMI |
3070 GUEST_INTR_STATE_MOV_SS |
3071 GUEST_INTR_STATE_STI));
3072}
2774 3073
2775 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); 3074static int vmx_irq_enabled(struct kvm_vcpu *vcpu)
2776 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 3075{
2777 vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); 3076 u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
3077 return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS |
3078 GUEST_INTR_STATE_STI)) &&
3079 (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF));
3080}
2778 3081
2779 if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) 3082static void enable_intr_window(struct kvm_vcpu *vcpu)
2780 vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, 3083{
2781 vmcs_read32(IDT_VECTORING_ERROR_CODE)); 3084 if (vcpu->arch.nmi_pending)
2782 if (unlikely(has_ext_irq)) 3085 enable_nmi_window(vcpu);
3086 else if (kvm_cpu_has_interrupt(vcpu))
3087 enable_irq_window(vcpu);
3088}
3089
3090static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3091{
3092 u32 exit_intr_info;
3093 u32 idt_vectoring_info;
3094 bool unblock_nmi;
3095 u8 vector;
3096 int type;
3097 bool idtv_info_valid;
3098 u32 error;
3099
3100 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3101 if (cpu_has_virtual_nmis()) {
3102 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3103 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3104 /*
3105 * SDM 3: 25.7.1.2
3106 * Re-set bit "block by NMI" before VM entry if vmexit caused by
3107 * a guest IRET fault.
3108 */
3109 if (unblock_nmi && vector != DF_VECTOR)
3110 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3111 GUEST_INTR_STATE_NMI);
3112 }
3113
3114 idt_vectoring_info = vmx->idt_vectoring_info;
3115 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3116 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3117 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3118 if (vmx->vcpu.arch.nmi_injected) {
3119 /*
3120 * SDM 3: 25.7.1.2
3121 * Clear bit "block by NMI" before VM entry if an NMI delivery
3122 * faulted.
3123 */
3124 if (idtv_info_valid && type == INTR_TYPE_NMI_INTR)
3125 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
3126 GUEST_INTR_STATE_NMI);
3127 else
3128 vmx->vcpu.arch.nmi_injected = false;
3129 }
3130 kvm_clear_exception_queue(&vmx->vcpu);
3131 if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) {
3132 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3133 error = vmcs_read32(IDT_VECTORING_ERROR_CODE);
3134 kvm_queue_exception_e(&vmx->vcpu, vector, error);
3135 } else
3136 kvm_queue_exception(&vmx->vcpu, vector);
3137 vmx->idt_vectoring_info = 0;
3138 }
3139 kvm_clear_interrupt_queue(&vmx->vcpu);
3140 if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) {
3141 kvm_queue_interrupt(&vmx->vcpu, vector);
3142 vmx->idt_vectoring_info = 0;
3143 }
3144}
3145
3146static void vmx_intr_assist(struct kvm_vcpu *vcpu)
3147{
3148 update_tpr_threshold(vcpu);
3149
3150 if (cpu_has_virtual_nmis()) {
3151 if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) {
3152 if (vmx_nmi_enabled(vcpu)) {
3153 vcpu->arch.nmi_pending = false;
3154 vcpu->arch.nmi_injected = true;
3155 } else {
3156 enable_intr_window(vcpu);
3157 return;
3158 }
3159 }
3160 if (vcpu->arch.nmi_injected) {
3161 vmx_inject_nmi(vcpu);
3162 enable_intr_window(vcpu);
3163 return;
3164 }
3165 }
3166 if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) {
3167 if (vmx_irq_enabled(vcpu))
3168 kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu));
3169 else
2783 enable_irq_window(vcpu); 3170 enable_irq_window(vcpu);
2784 return;
2785 } 3171 }
2786 if (!has_ext_irq) 3172 if (vcpu->arch.interrupt.pending) {
2787 return; 3173 vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr);
2788 interrupt_window_open = 3174 kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr);
2789 ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && 3175 }
2790 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0);
2791 if (interrupt_window_open) {
2792 vector = kvm_cpu_get_interrupt(vcpu);
2793 vmx_inject_irq(vcpu, vector);
2794 kvm_timer_intr_post(vcpu, vector);
2795 } else
2796 enable_irq_window(vcpu);
2797} 3176}
2798 3177
2799/* 3178/*
@@ -2805,9 +3184,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu)
2805static void fixup_rmode_irq(struct vcpu_vmx *vmx) 3184static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2806{ 3185{
2807 vmx->rmode.irq.pending = 0; 3186 vmx->rmode.irq.pending = 0;
2808 if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) 3187 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip)
2809 return; 3188 return;
2810 vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); 3189 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip);
2811 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 3190 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
2812 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 3191 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK;
2813 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 3192 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR;
@@ -2819,11 +3198,30 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
2819 | vmx->rmode.irq.vector; 3198 | vmx->rmode.irq.vector;
2820} 3199}
2821 3200
3201#ifdef CONFIG_X86_64
3202#define R "r"
3203#define Q "q"
3204#else
3205#define R "e"
3206#define Q "l"
3207#endif
3208
2822static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3209static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2823{ 3210{
2824 struct vcpu_vmx *vmx = to_vmx(vcpu); 3211 struct vcpu_vmx *vmx = to_vmx(vcpu);
2825 u32 intr_info; 3212 u32 intr_info;
2826 3213
3214 /* Handle invalid guest state instead of entering VMX */
3215 if (vmx->emulation_required && emulate_invalid_guest_state) {
3216 handle_invalid_guest_state(vcpu, kvm_run);
3217 return;
3218 }
3219
3220 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3221 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
3222 if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
3223 vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
3224
2827 /* 3225 /*
2828 * Loading guest fpu may have cleared host cr0.ts 3226 * Loading guest fpu may have cleared host cr0.ts
2829 */ 3227 */
@@ -2831,26 +3229,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2831 3229
2832 asm( 3230 asm(
2833 /* Store host registers */ 3231 /* Store host registers */
2834#ifdef CONFIG_X86_64 3232 "push %%"R"dx; push %%"R"bp;"
2835 "push %%rdx; push %%rbp;" 3233 "push %%"R"cx \n\t"
2836 "push %%rcx \n\t" 3234 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
2837#else 3235 "je 1f \n\t"
2838 "push %%edx; push %%ebp;" 3236 "mov %%"R"sp, %c[host_rsp](%0) \n\t"
2839 "push %%ecx \n\t" 3237 __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t"
2840#endif 3238 "1: \n\t"
2841 ASM_VMX_VMWRITE_RSP_RDX "\n\t"
2842 /* Check if vmlaunch or vmresume is needed */ 3239 /* Check if vmlaunch or vmresume is needed */
2843 "cmpl $0, %c[launched](%0) \n\t" 3240 "cmpl $0, %c[launched](%0) \n\t"
2844 /* Load guest registers. Don't clobber flags. */ 3241 /* Load guest registers. Don't clobber flags. */
3242 "mov %c[cr2](%0), %%"R"ax \n\t"
3243 "mov %%"R"ax, %%cr2 \n\t"
3244 "mov %c[rax](%0), %%"R"ax \n\t"
3245 "mov %c[rbx](%0), %%"R"bx \n\t"
3246 "mov %c[rdx](%0), %%"R"dx \n\t"
3247 "mov %c[rsi](%0), %%"R"si \n\t"
3248 "mov %c[rdi](%0), %%"R"di \n\t"
3249 "mov %c[rbp](%0), %%"R"bp \n\t"
2845#ifdef CONFIG_X86_64 3250#ifdef CONFIG_X86_64
2846 "mov %c[cr2](%0), %%rax \n\t"
2847 "mov %%rax, %%cr2 \n\t"
2848 "mov %c[rax](%0), %%rax \n\t"
2849 "mov %c[rbx](%0), %%rbx \n\t"
2850 "mov %c[rdx](%0), %%rdx \n\t"
2851 "mov %c[rsi](%0), %%rsi \n\t"
2852 "mov %c[rdi](%0), %%rdi \n\t"
2853 "mov %c[rbp](%0), %%rbp \n\t"
2854 "mov %c[r8](%0), %%r8 \n\t" 3251 "mov %c[r8](%0), %%r8 \n\t"
2855 "mov %c[r9](%0), %%r9 \n\t" 3252 "mov %c[r9](%0), %%r9 \n\t"
2856 "mov %c[r10](%0), %%r10 \n\t" 3253 "mov %c[r10](%0), %%r10 \n\t"
@@ -2859,34 +3256,25 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2859 "mov %c[r13](%0), %%r13 \n\t" 3256 "mov %c[r13](%0), %%r13 \n\t"
2860 "mov %c[r14](%0), %%r14 \n\t" 3257 "mov %c[r14](%0), %%r14 \n\t"
2861 "mov %c[r15](%0), %%r15 \n\t" 3258 "mov %c[r15](%0), %%r15 \n\t"
2862 "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */
2863#else
2864 "mov %c[cr2](%0), %%eax \n\t"
2865 "mov %%eax, %%cr2 \n\t"
2866 "mov %c[rax](%0), %%eax \n\t"
2867 "mov %c[rbx](%0), %%ebx \n\t"
2868 "mov %c[rdx](%0), %%edx \n\t"
2869 "mov %c[rsi](%0), %%esi \n\t"
2870 "mov %c[rdi](%0), %%edi \n\t"
2871 "mov %c[rbp](%0), %%ebp \n\t"
2872 "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */
2873#endif 3259#endif
3260 "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */
3261
2874 /* Enter guest mode */ 3262 /* Enter guest mode */
2875 "jne .Llaunched \n\t" 3263 "jne .Llaunched \n\t"
2876 ASM_VMX_VMLAUNCH "\n\t" 3264 __ex(ASM_VMX_VMLAUNCH) "\n\t"
2877 "jmp .Lkvm_vmx_return \n\t" 3265 "jmp .Lkvm_vmx_return \n\t"
2878 ".Llaunched: " ASM_VMX_VMRESUME "\n\t" 3266 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
2879 ".Lkvm_vmx_return: " 3267 ".Lkvm_vmx_return: "
2880 /* Save guest registers, load host registers, keep flags */ 3268 /* Save guest registers, load host registers, keep flags */
3269 "xchg %0, (%%"R"sp) \n\t"
3270 "mov %%"R"ax, %c[rax](%0) \n\t"
3271 "mov %%"R"bx, %c[rbx](%0) \n\t"
3272 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t"
3273 "mov %%"R"dx, %c[rdx](%0) \n\t"
3274 "mov %%"R"si, %c[rsi](%0) \n\t"
3275 "mov %%"R"di, %c[rdi](%0) \n\t"
3276 "mov %%"R"bp, %c[rbp](%0) \n\t"
2881#ifdef CONFIG_X86_64 3277#ifdef CONFIG_X86_64
2882 "xchg %0, (%%rsp) \n\t"
2883 "mov %%rax, %c[rax](%0) \n\t"
2884 "mov %%rbx, %c[rbx](%0) \n\t"
2885 "pushq (%%rsp); popq %c[rcx](%0) \n\t"
2886 "mov %%rdx, %c[rdx](%0) \n\t"
2887 "mov %%rsi, %c[rsi](%0) \n\t"
2888 "mov %%rdi, %c[rdi](%0) \n\t"
2889 "mov %%rbp, %c[rbp](%0) \n\t"
2890 "mov %%r8, %c[r8](%0) \n\t" 3278 "mov %%r8, %c[r8](%0) \n\t"
2891 "mov %%r9, %c[r9](%0) \n\t" 3279 "mov %%r9, %c[r9](%0) \n\t"
2892 "mov %%r10, %c[r10](%0) \n\t" 3280 "mov %%r10, %c[r10](%0) \n\t"
@@ -2895,28 +3283,16 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2895 "mov %%r13, %c[r13](%0) \n\t" 3283 "mov %%r13, %c[r13](%0) \n\t"
2896 "mov %%r14, %c[r14](%0) \n\t" 3284 "mov %%r14, %c[r14](%0) \n\t"
2897 "mov %%r15, %c[r15](%0) \n\t" 3285 "mov %%r15, %c[r15](%0) \n\t"
2898 "mov %%cr2, %%rax \n\t"
2899 "mov %%rax, %c[cr2](%0) \n\t"
2900
2901 "pop %%rbp; pop %%rbp; pop %%rdx \n\t"
2902#else
2903 "xchg %0, (%%esp) \n\t"
2904 "mov %%eax, %c[rax](%0) \n\t"
2905 "mov %%ebx, %c[rbx](%0) \n\t"
2906 "pushl (%%esp); popl %c[rcx](%0) \n\t"
2907 "mov %%edx, %c[rdx](%0) \n\t"
2908 "mov %%esi, %c[rsi](%0) \n\t"
2909 "mov %%edi, %c[rdi](%0) \n\t"
2910 "mov %%ebp, %c[rbp](%0) \n\t"
2911 "mov %%cr2, %%eax \n\t"
2912 "mov %%eax, %c[cr2](%0) \n\t"
2913
2914 "pop %%ebp; pop %%ebp; pop %%edx \n\t"
2915#endif 3286#endif
3287 "mov %%cr2, %%"R"ax \n\t"
3288 "mov %%"R"ax, %c[cr2](%0) \n\t"
3289
3290 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t"
2916 "setbe %c[fail](%0) \n\t" 3291 "setbe %c[fail](%0) \n\t"
2917 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 3292 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
2918 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 3293 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
2919 [fail]"i"(offsetof(struct vcpu_vmx, fail)), 3294 [fail]"i"(offsetof(struct vcpu_vmx, fail)),
3295 [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)),
2920 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), 3296 [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])),
2921 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), 3297 [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])),
2922 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), 3298 [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])),
@@ -2936,20 +3312,22 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2936#endif 3312#endif
2937 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 3313 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2))
2938 : "cc", "memory" 3314 : "cc", "memory"
3315 , R"bx", R"di", R"si"
2939#ifdef CONFIG_X86_64 3316#ifdef CONFIG_X86_64
2940 , "rbx", "rdi", "rsi"
2941 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 3317 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
2942#else
2943 , "ebx", "edi", "rsi"
2944#endif 3318#endif
2945 ); 3319 );
2946 3320
3321 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
3322 vcpu->arch.regs_dirty = 0;
3323
2947 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3324 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
2948 if (vmx->rmode.irq.pending) 3325 if (vmx->rmode.irq.pending)
2949 fixup_rmode_irq(vmx); 3326 fixup_rmode_irq(vmx);
2950 3327
2951 vcpu->arch.interrupt_window_open = 3328 vcpu->arch.interrupt_window_open =
2952 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; 3329 (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
3330 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0;
2953 3331
2954 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 3332 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
2955 vmx->launched = 1; 3333 vmx->launched = 1;
@@ -2957,18 +3335,24 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2957 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3335 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2958 3336
2959 /* We need to handle NMIs before interrupts are enabled */ 3337 /* We need to handle NMIs before interrupts are enabled */
2960 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ 3338 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 &&
3339 (intr_info & INTR_INFO_VALID_MASK)) {
2961 KVMTRACE_0D(NMI, vcpu, handler); 3340 KVMTRACE_0D(NMI, vcpu, handler);
2962 asm("int $2"); 3341 asm("int $2");
2963 } 3342 }
3343
3344 vmx_complete_interrupts(vmx);
2964} 3345}
2965 3346
3347#undef R
3348#undef Q
3349
2966static void vmx_free_vmcs(struct kvm_vcpu *vcpu) 3350static void vmx_free_vmcs(struct kvm_vcpu *vcpu)
2967{ 3351{
2968 struct vcpu_vmx *vmx = to_vmx(vcpu); 3352 struct vcpu_vmx *vmx = to_vmx(vcpu);
2969 3353
2970 if (vmx->vmcs) { 3354 if (vmx->vmcs) {
2971 on_each_cpu(__vcpu_clear, vmx, 1); 3355 vcpu_clear(vmx);
2972 free_vmcs(vmx->vmcs); 3356 free_vmcs(vmx->vmcs);
2973 vmx->vmcs = NULL; 3357 vmx->vmcs = NULL;
2974 } 3358 }
@@ -2999,15 +3383,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
2999 return ERR_PTR(-ENOMEM); 3383 return ERR_PTR(-ENOMEM);
3000 3384
3001 allocate_vpid(vmx); 3385 allocate_vpid(vmx);
3002 if (id == 0 && vm_need_ept()) {
3003 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3004 VMX_EPT_WRITABLE_MASK |
3005 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3006 kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK,
3007 VMX_EPT_FAKE_DIRTY_MASK, 0ull,
3008 VMX_EPT_EXECUTABLE_MASK);
3009 kvm_enable_tdp();
3010 }
3011 3386
3012 err = kvm_vcpu_init(&vmx->vcpu, kvm, id); 3387 err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
3013 if (err) 3388 if (err)
@@ -3095,7 +3470,6 @@ static struct kvm_x86_ops vmx_x86_ops = {
3095 .prepare_guest_switch = vmx_save_host_state, 3470 .prepare_guest_switch = vmx_save_host_state,
3096 .vcpu_load = vmx_vcpu_load, 3471 .vcpu_load = vmx_vcpu_load,
3097 .vcpu_put = vmx_vcpu_put, 3472 .vcpu_put = vmx_vcpu_put,
3098 .vcpu_decache = vmx_vcpu_decache,
3099 3473
3100 .set_guest_debug = set_guest_debug, 3474 .set_guest_debug = set_guest_debug,
3101 .guest_debug_pre = kvm_guest_debug_pre, 3475 .guest_debug_pre = kvm_guest_debug_pre,
@@ -3115,8 +3489,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3115 .set_idt = vmx_set_idt, 3489 .set_idt = vmx_set_idt,
3116 .get_gdt = vmx_get_gdt, 3490 .get_gdt = vmx_get_gdt,
3117 .set_gdt = vmx_set_gdt, 3491 .set_gdt = vmx_set_gdt,
3118 .cache_regs = vcpu_load_rsp_rip, 3492 .cache_reg = vmx_cache_reg,
3119 .decache_regs = vcpu_put_rsp_rip,
3120 .get_rflags = vmx_get_rflags, 3493 .get_rflags = vmx_get_rflags,
3121 .set_rflags = vmx_set_rflags, 3494 .set_rflags = vmx_set_rflags,
3122 3495
@@ -3187,8 +3560,16 @@ static int __init vmx_init(void)
3187 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); 3560 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP);
3188 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); 3561 vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP);
3189 3562
3190 if (cpu_has_vmx_ept()) 3563 if (vm_need_ept()) {
3191 bypass_guest_pf = 0; 3564 bypass_guest_pf = 0;
3565 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
3566 VMX_EPT_WRITABLE_MASK |
3567 VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT);
3568 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
3569 VMX_EPT_EXECUTABLE_MASK);
3570 kvm_enable_tdp();
3571 } else
3572 kvm_disable_tdp();
3192 3573
3193 if (bypass_guest_pf) 3574 if (bypass_guest_pf)
3194 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 3575 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
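For readers who have not seen the width-prefix trick before: the R/Q macros introduced above let a single asm template serve both 32-bit and 64-bit builds, because adjacent string literals are concatenated at compile time. A minimal standalone sketch of the same idea (hosted C with GCC inline asm, x86 only; an illustration, not kernel code):

#include <stdio.h>

#ifdef __x86_64__
#define R "r"	/* register prefix: %rax, %rsp, ... */
#define Q "q"	/* operand-size suffix: pushq/popq   */
#else
#define R "e"	/* %eax, %esp, ... */
#define Q "l"	/* pushl/popl      */
#endif

static unsigned long read_sp(void)
{
	unsigned long sp;

	/* Expands to "pushq %%rsp; popq %0" or "pushl %%esp; popl %0". */
	asm volatile("push" Q " %%" R "sp \n\t"
		     "pop" Q " %0 \n\t"
		     : "=r" (sp));
	return sp;
}

int main(void)
{
	printf("stack pointer: %#lx\n", read_sp());
	return 0;
}

The generated template text ends up identical to what the removed #ifdef CONFIG_X86_64 blocks used to spell out twice by hand.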
diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h
index 79d94c610dfe..3e010d21fdd7 100644
--- a/arch/x86/kvm/vmx.h
+++ b/arch/x86/kvm/vmx.h
@@ -40,6 +40,7 @@
40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000 40#define CPU_BASED_CR8_LOAD_EXITING 0x00080000
41#define CPU_BASED_CR8_STORE_EXITING 0x00100000 41#define CPU_BASED_CR8_STORE_EXITING 0x00100000
42#define CPU_BASED_TPR_SHADOW 0x00200000 42#define CPU_BASED_TPR_SHADOW 0x00200000
43#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000
43#define CPU_BASED_MOV_DR_EXITING 0x00800000 44#define CPU_BASED_MOV_DR_EXITING 0x00800000
44#define CPU_BASED_UNCOND_IO_EXITING 0x01000000 45#define CPU_BASED_UNCOND_IO_EXITING 0x01000000
45#define CPU_BASED_USE_IO_BITMAPS 0x02000000 46#define CPU_BASED_USE_IO_BITMAPS 0x02000000
@@ -216,7 +217,7 @@ enum vmcs_field {
216#define EXIT_REASON_TRIPLE_FAULT 2 217#define EXIT_REASON_TRIPLE_FAULT 2
217 218
218#define EXIT_REASON_PENDING_INTERRUPT 7 219#define EXIT_REASON_PENDING_INTERRUPT 7
219 220#define EXIT_REASON_NMI_WINDOW 8
220#define EXIT_REASON_TASK_SWITCH 9 221#define EXIT_REASON_TASK_SWITCH 9
221#define EXIT_REASON_CPUID 10 222#define EXIT_REASON_CPUID 10
222#define EXIT_REASON_HLT 12 223#define EXIT_REASON_HLT 12
@@ -251,7 +252,9 @@ enum vmcs_field {
251#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ 252#define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */
252#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ 253#define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */
253#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ 254#define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */
255#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */
254#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ 256#define INTR_INFO_VALID_MASK 0x80000000 /* 31 */
257#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000
255 258
256#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK 259#define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK
257#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK 260#define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK
@@ -259,9 +262,16 @@ enum vmcs_field {
259#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK 262#define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK
260 263
261#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ 264#define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */
265#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */
262#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ 266#define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */
263#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ 267#define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */
264 268
269/* GUEST_INTERRUPTIBILITY_INFO flags. */
270#define GUEST_INTR_STATE_STI 0x00000001
271#define GUEST_INTR_STATE_MOV_SS 0x00000002
272#define GUEST_INTR_STATE_SMI 0x00000004
273#define GUEST_INTR_STATE_NMI 0x00000008
274
265/* 275/*
266 * Exit Qualifications for MOV for Control Register Access 276 * Exit Qualifications for MOV for Control Register Access
267 */ 277 */
@@ -321,24 +331,6 @@ enum vmcs_field {
321 331
322#define AR_RESERVD_MASK 0xfffe0f00 332#define AR_RESERVD_MASK 0xfffe0f00
323 333
324#define MSR_IA32_VMX_BASIC 0x480
325#define MSR_IA32_VMX_PINBASED_CTLS 0x481
326#define MSR_IA32_VMX_PROCBASED_CTLS 0x482
327#define MSR_IA32_VMX_EXIT_CTLS 0x483
328#define MSR_IA32_VMX_ENTRY_CTLS 0x484
329#define MSR_IA32_VMX_MISC 0x485
330#define MSR_IA32_VMX_CR0_FIXED0 0x486
331#define MSR_IA32_VMX_CR0_FIXED1 0x487
332#define MSR_IA32_VMX_CR4_FIXED0 0x488
333#define MSR_IA32_VMX_CR4_FIXED1 0x489
334#define MSR_IA32_VMX_VMCS_ENUM 0x48a
335#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b
336#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c
337
338#define MSR_IA32_FEATURE_CONTROL 0x3a
339#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1
340#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4
341
342#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 334#define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9
343#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 335#define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10
344 336
@@ -360,8 +352,6 @@ enum vmcs_field {
360#define VMX_EPT_READABLE_MASK 0x1ull 352#define VMX_EPT_READABLE_MASK 0x1ull
361#define VMX_EPT_WRITABLE_MASK 0x2ull 353#define VMX_EPT_WRITABLE_MASK 0x2ull
362#define VMX_EPT_EXECUTABLE_MASK 0x4ull 354#define VMX_EPT_EXECUTABLE_MASK 0x4ull
363#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62)
364#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63)
365 355
366#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 356#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
367 357
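A note on the new GUEST_INTR_STATE_* flags defined above: the bare "& 3" test that vmx.c used to apply to GUEST_INTERRUPTIBILITY_INFO is rewritten in terms of the named STI and MOV-SS blocking bits. A standalone sketch of the combined check (constants copied from this header; the helper name and the folded-in EFLAGS.IF test are illustrative only, not kernel code):

#include <stdbool.h>
#include <stdint.h>

#define GUEST_INTR_STATE_STI	0x00000001
#define GUEST_INTR_STATE_MOV_SS	0x00000002
#define GUEST_INTR_STATE_SMI	0x00000004
#define GUEST_INTR_STATE_NMI	0x00000008

#define X86_EFLAGS_IF		0x00000200	/* interrupt enable flag */

/* An external interrupt may be injected only if the guest has IF set and
 * is not in the one-instruction STI/MOV-SS interrupt shadow. */
static bool interrupt_window_open(uint64_t rflags, uint32_t interruptibility)
{
	return (rflags & X86_EFLAGS_IF) &&
	       !(interruptibility & (GUEST_INTR_STATE_STI |
				     GUEST_INTR_STATE_MOV_SS));
}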
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0faa2546b1cd..4f0677d1eae8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4,10 +4,14 @@
4 * derived from drivers/kvm/kvm_main.c 4 * derived from drivers/kvm/kvm_main.c
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008
7 * 9 *
8 * Authors: 10 * Authors:
9 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
10 * Yaniv Kamay <yaniv@qumranet.com> 12 * Yaniv Kamay <yaniv@qumranet.com>
13 * Amit Shah <amit.shah@qumranet.com>
14 * Ben-Ami Yassour <benami@il.ibm.com>
11 * 15 *
12 * This work is licensed under the terms of the GNU GPL, version 2. See 16 * This work is licensed under the terms of the GNU GPL, version 2. See
13 * the COPYING file in the top-level directory. 17 * the COPYING file in the top-level directory.
@@ -19,14 +23,18 @@
19#include "mmu.h" 23#include "mmu.h"
20#include "i8254.h" 24#include "i8254.h"
21#include "tss.h" 25#include "tss.h"
26#include "kvm_cache_regs.h"
27#include "x86.h"
22 28
23#include <linux/clocksource.h> 29#include <linux/clocksource.h>
30#include <linux/interrupt.h>
24#include <linux/kvm.h> 31#include <linux/kvm.h>
25#include <linux/fs.h> 32#include <linux/fs.h>
26#include <linux/vmalloc.h> 33#include <linux/vmalloc.h>
27#include <linux/module.h> 34#include <linux/module.h>
28#include <linux/mman.h> 35#include <linux/mman.h>
29#include <linux/highmem.h> 36#include <linux/highmem.h>
37#include <linux/intel-iommu.h>
30 38
31#include <asm/uaccess.h> 39#include <asm/uaccess.h>
32#include <asm/msr.h> 40#include <asm/msr.h>
@@ -61,6 +69,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
61 struct kvm_cpuid_entry2 __user *entries); 69 struct kvm_cpuid_entry2 __user *entries);
62 70
63struct kvm_x86_ops *kvm_x86_ops; 71struct kvm_x86_ops *kvm_x86_ops;
72EXPORT_SYMBOL_GPL(kvm_x86_ops);
64 73
65struct kvm_stats_debugfs_item debugfs_entries[] = { 74struct kvm_stats_debugfs_item debugfs_entries[] = {
66 { "pf_fixed", VCPU_STAT(pf_fixed) }, 75 { "pf_fixed", VCPU_STAT(pf_fixed) },
@@ -72,6 +81,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
72 { "mmio_exits", VCPU_STAT(mmio_exits) }, 81 { "mmio_exits", VCPU_STAT(mmio_exits) },
73 { "signal_exits", VCPU_STAT(signal_exits) }, 82 { "signal_exits", VCPU_STAT(signal_exits) },
74 { "irq_window", VCPU_STAT(irq_window_exits) }, 83 { "irq_window", VCPU_STAT(irq_window_exits) },
84 { "nmi_window", VCPU_STAT(nmi_window_exits) },
75 { "halt_exits", VCPU_STAT(halt_exits) }, 85 { "halt_exits", VCPU_STAT(halt_exits) },
76 { "halt_wakeup", VCPU_STAT(halt_wakeup) }, 86 { "halt_wakeup", VCPU_STAT(halt_wakeup) },
77 { "hypercalls", VCPU_STAT(hypercalls) }, 87 { "hypercalls", VCPU_STAT(hypercalls) },
@@ -82,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
82 { "fpu_reload", VCPU_STAT(fpu_reload) }, 92 { "fpu_reload", VCPU_STAT(fpu_reload) },
83 { "insn_emulation", VCPU_STAT(insn_emulation) }, 93 { "insn_emulation", VCPU_STAT(insn_emulation) },
84 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, 94 { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
95 { "irq_injections", VCPU_STAT(irq_injections) },
85 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, 96 { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
86 { "mmu_pte_write", VM_STAT(mmu_pte_write) }, 97 { "mmu_pte_write", VM_STAT(mmu_pte_write) },
87 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, 98 { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
@@ -89,12 +100,12 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
89 { "mmu_flooded", VM_STAT(mmu_flooded) }, 100 { "mmu_flooded", VM_STAT(mmu_flooded) },
90 { "mmu_recycled", VM_STAT(mmu_recycled) }, 101 { "mmu_recycled", VM_STAT(mmu_recycled) },
91 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, 102 { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
103 { "mmu_unsync", VM_STAT(mmu_unsync) },
92 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, 104 { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
93 { "largepages", VM_STAT(lpages) }, 105 { "largepages", VM_STAT(lpages) },
94 { NULL } 106 { NULL }
95}; 107};
96 108
97
98unsigned long segment_base(u16 selector) 109unsigned long segment_base(u16 selector)
99{ 110{
100 struct descriptor_table gdt; 111 struct descriptor_table gdt;
@@ -173,6 +184,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
173 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 184 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
174} 185}
175 186
187void kvm_inject_nmi(struct kvm_vcpu *vcpu)
188{
189 vcpu->arch.nmi_pending = 1;
190}
191EXPORT_SYMBOL_GPL(kvm_inject_nmi);
192
176void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 193void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
177{ 194{
178 WARN_ON(vcpu->arch.exception.pending); 195 WARN_ON(vcpu->arch.exception.pending);
@@ -345,6 +362,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
345void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 362void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
346{ 363{
347 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 364 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) {
365 kvm_mmu_sync_roots(vcpu);
348 kvm_mmu_flush_tlb(vcpu); 366 kvm_mmu_flush_tlb(vcpu);
349 return; 367 return;
350 } 368 }
@@ -557,7 +575,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *
557 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 575 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32);
558 576
559 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 577 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n",
560 __FUNCTION__, tsc_khz, hv_clock->tsc_shift, 578 __func__, tsc_khz, hv_clock->tsc_shift,
561 hv_clock->tsc_to_system_mul); 579 hv_clock->tsc_to_system_mul);
562} 580}
563 581
@@ -604,6 +622,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
604 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 622 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
605} 623}
606 624
625static bool msr_mtrr_valid(unsigned msr)
626{
627 switch (msr) {
628 case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1:
629 case MSR_MTRRfix64K_00000:
630 case MSR_MTRRfix16K_80000:
631 case MSR_MTRRfix16K_A0000:
632 case MSR_MTRRfix4K_C0000:
633 case MSR_MTRRfix4K_C8000:
634 case MSR_MTRRfix4K_D0000:
635 case MSR_MTRRfix4K_D8000:
636 case MSR_MTRRfix4K_E0000:
637 case MSR_MTRRfix4K_E8000:
638 case MSR_MTRRfix4K_F0000:
639 case MSR_MTRRfix4K_F8000:
640 case MSR_MTRRdefType:
641 case MSR_IA32_CR_PAT:
642 return true;
643 case 0x2f8:
644 return true;
645 }
646 return false;
647}
648
649static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
650{
651 if (!msr_mtrr_valid(msr))
652 return 1;
653
654 vcpu->arch.mtrr[msr - 0x200] = data;
655 return 0;
656}
607 657
608int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 658int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
609{ 659{
@@ -623,10 +673,23 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
623 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", 673 pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n",
624 __func__, data); 674 __func__, data);
625 break; 675 break;
676 case MSR_IA32_DEBUGCTLMSR:
677 if (!data) {
678 /* We support the non-activated case already */
679 break;
680 } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) {
681 /* Values other than LBR and BTF are vendor-specific,
682 thus reserved and should throw a #GP */
683 return 1;
684 }
685 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
686 __func__, data);
687 break;
626 case MSR_IA32_UCODE_REV: 688 case MSR_IA32_UCODE_REV:
627 case MSR_IA32_UCODE_WRITE: 689 case MSR_IA32_UCODE_WRITE:
628 case 0x200 ... 0x2ff: /* MTRRs */
629 break; 690 break;
691 case 0x200 ... 0x2ff:
692 return set_msr_mtrr(vcpu, msr, data);
630 case MSR_IA32_APICBASE: 693 case MSR_IA32_APICBASE:
631 kvm_set_apic_base(vcpu, data); 694 kvm_set_apic_base(vcpu, data);
632 break; 695 break;
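The MSR_IA32_DEBUGCTLMSR case added in the hunk above reduces to a reserved-bit mask test. A hedged sketch of that decision (bit positions per the SDM; the function itself and its return convention are illustrative, not the kernel's definitions):

#include <stdint.h>

#define DEBUGCTLMSR_LBR	(1ULL << 0)	/* last-branch recording       */
#define DEBUGCTLMSR_BTF	(1ULL << 1)	/* single-step on branches     */

/* Returns 1 if the guest write must raise #GP, 0 if it is accepted. */
static int debugctl_write_faults(uint64_t data)
{
	if (!data)
		return 0;			/* disabling is always fine   */
	if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF))
		return 1;			/* reserved/vendor bits set   */
	return 0;				/* LBR/BTF: accepted, ignored */
}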
@@ -652,10 +715,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
652 /* ...but clean it before doing the actual write */ 715 /* ...but clean it before doing the actual write */
653 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); 716 vcpu->arch.time_offset = data & ~(PAGE_MASK | 1);
654 717
655 down_read(&current->mm->mmap_sem);
656 vcpu->arch.time_page = 718 vcpu->arch.time_page =
657 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); 719 gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT);
658 up_read(&current->mm->mmap_sem);
659 720
660 if (is_error_page(vcpu->arch.time_page)) { 721 if (is_error_page(vcpu->arch.time_page)) {
661 kvm_release_page_clean(vcpu->arch.time_page); 722 kvm_release_page_clean(vcpu->arch.time_page);
@@ -684,6 +745,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
684 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); 745 return kvm_x86_ops->get_msr(vcpu, msr_index, pdata);
685} 746}
686 747
748static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
749{
750 if (!msr_mtrr_valid(msr))
751 return 1;
752
753 *pdata = vcpu->arch.mtrr[msr - 0x200];
754 return 0;
755}
756
687int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 757int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
688{ 758{
689 u64 data; 759 u64 data;
@@ -703,13 +773,21 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
703 case MSR_IA32_MC0_MISC+8: 773 case MSR_IA32_MC0_MISC+8:
704 case MSR_IA32_MC0_MISC+12: 774 case MSR_IA32_MC0_MISC+12:
705 case MSR_IA32_MC0_MISC+16: 775 case MSR_IA32_MC0_MISC+16:
776 case MSR_IA32_MC0_MISC+20:
706 case MSR_IA32_UCODE_REV: 777 case MSR_IA32_UCODE_REV:
707 case MSR_IA32_EBL_CR_POWERON: 778 case MSR_IA32_EBL_CR_POWERON:
708 /* MTRR registers */ 779 case MSR_IA32_DEBUGCTLMSR:
709 case 0xfe: 780 case MSR_IA32_LASTBRANCHFROMIP:
710 case 0x200 ... 0x2ff: 781 case MSR_IA32_LASTBRANCHTOIP:
782 case MSR_IA32_LASTINTFROMIP:
783 case MSR_IA32_LASTINTTOIP:
711 data = 0; 784 data = 0;
712 break; 785 break;
786 case MSR_MTRRcap:
787 data = 0x500 | KVM_NR_VAR_MTRR;
788 break;
789 case 0x200 ... 0x2ff:
790 return get_msr_mtrr(vcpu, msr, pdata);
713 case 0xcd: /* fsb frequency */ 791 case 0xcd: /* fsb frequency */
714 data = 3; 792 data = 3;
715 break; 793 break;
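The MTRR plumbing above keys everything off the 0x200-based MSR index, so "msr - 0x200" indexes a flat per-vcpu array of PHYSBASEn/PHYSMASKn pairs. A simplified standalone sketch of just the variable-range part (KVM_NR_VAR_MTRR is assumed to be 8 purely for this example; the fixed-range, MTRRdefType and PAT cases accepted by msr_mtrr_valid() are omitted):

#include <stdbool.h>
#include <stdint.h>

#define KVM_NR_VAR_MTRR	8			/* assumption for this sketch */

static uint64_t mtrr[2 * KVM_NR_VAR_MTRR];	/* PHYSBASE0, PHYSMASK0, ...  */

static bool var_mtrr_msr(uint32_t msr)
{
	return msr >= 0x200 && msr < 0x200 + 2 * KVM_NR_VAR_MTRR;
}

/* Same convention as set_msr_mtrr()/get_msr_mtrr(): nonzero means the MSR
 * is not handled and the caller should fault the guest. */
static int set_var_mtrr(uint32_t msr, uint64_t data)
{
	if (!var_mtrr_msr(msr))
		return 1;
	mtrr[msr - 0x200] = data;
	return 0;
}

static int get_var_mtrr(uint32_t msr, uint64_t *pdata)
{
	if (!var_mtrr_msr(msr))
		return 1;
	*pdata = mtrr[msr - 0x200];
	return 0;
}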
@@ -817,41 +895,6 @@ out:
817 return r; 895 return r;
818} 896}
819 897
820/*
821 * Make sure that a cpu that is being hot-unplugged does not have any vcpus
822 * cached on it.
823 */
824void decache_vcpus_on_cpu(int cpu)
825{
826 struct kvm *vm;
827 struct kvm_vcpu *vcpu;
828 int i;
829
830 spin_lock(&kvm_lock);
831 list_for_each_entry(vm, &vm_list, vm_list)
832 for (i = 0; i < KVM_MAX_VCPUS; ++i) {
833 vcpu = vm->vcpus[i];
834 if (!vcpu)
835 continue;
836 /*
837 * If the vcpu is locked, then it is running on some
838 * other cpu and therefore it is not cached on the
839 * cpu in question.
840 *
841 * If it's not locked, check the last cpu it executed
842 * on.
843 */
844 if (mutex_trylock(&vcpu->mutex)) {
845 if (vcpu->cpu == cpu) {
846 kvm_x86_ops->vcpu_decache(vcpu);
847 vcpu->cpu = -1;
848 }
849 mutex_unlock(&vcpu->mutex);
850 }
851 }
852 spin_unlock(&kvm_lock);
853}
854
855int kvm_dev_ioctl_check_extension(long ext) 898int kvm_dev_ioctl_check_extension(long ext)
856{ 899{
857 int r; 900 int r;
@@ -867,8 +910,12 @@ int kvm_dev_ioctl_check_extension(long ext)
867 case KVM_CAP_PIT: 910 case KVM_CAP_PIT:
868 case KVM_CAP_NOP_IO_DELAY: 911 case KVM_CAP_NOP_IO_DELAY:
869 case KVM_CAP_MP_STATE: 912 case KVM_CAP_MP_STATE:
913 case KVM_CAP_SYNC_MMU:
870 r = 1; 914 r = 1;
871 break; 915 break;
916 case KVM_CAP_COALESCED_MMIO:
917 r = KVM_COALESCED_MMIO_PAGE_OFFSET;
918 break;
872 case KVM_CAP_VAPIC: 919 case KVM_CAP_VAPIC:
873 r = !kvm_x86_ops->cpu_has_accelerated_tpr(); 920 r = !kvm_x86_ops->cpu_has_accelerated_tpr();
874 break; 921 break;
@@ -881,6 +928,9 @@ int kvm_dev_ioctl_check_extension(long ext)
881 case KVM_CAP_PV_MMU: 928 case KVM_CAP_PV_MMU:
882 r = !tdp_enabled; 929 r = !tdp_enabled;
883 break; 930 break;
931 case KVM_CAP_IOMMU:
932 r = intel_iommu_found();
933 break;
884 default: 934 default:
885 r = 0; 935 r = 0;
886 break; 936 break;
@@ -1283,28 +1333,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1283 struct kvm_vcpu *vcpu = filp->private_data; 1333 struct kvm_vcpu *vcpu = filp->private_data;
1284 void __user *argp = (void __user *)arg; 1334 void __user *argp = (void __user *)arg;
1285 int r; 1335 int r;
1336 struct kvm_lapic_state *lapic = NULL;
1286 1337
1287 switch (ioctl) { 1338 switch (ioctl) {
1288 case KVM_GET_LAPIC: { 1339 case KVM_GET_LAPIC: {
1289 struct kvm_lapic_state lapic; 1340 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1290 1341
1291 memset(&lapic, 0, sizeof lapic); 1342 r = -ENOMEM;
1292 r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); 1343 if (!lapic)
1344 goto out;
1345 r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic);
1293 if (r) 1346 if (r)
1294 goto out; 1347 goto out;
1295 r = -EFAULT; 1348 r = -EFAULT;
1296 if (copy_to_user(argp, &lapic, sizeof lapic)) 1349 if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state)))
1297 goto out; 1350 goto out;
1298 r = 0; 1351 r = 0;
1299 break; 1352 break;
1300 } 1353 }
1301 case KVM_SET_LAPIC: { 1354 case KVM_SET_LAPIC: {
1302 struct kvm_lapic_state lapic; 1355 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1303 1356 r = -ENOMEM;
1357 if (!lapic)
1358 goto out;
1304 r = -EFAULT; 1359 r = -EFAULT;
1305 if (copy_from_user(&lapic, argp, sizeof lapic)) 1360 if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state)))
1306 goto out; 1361 goto out;
1307 r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; 1362 r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic);
1308 if (r) 1363 if (r)
1309 goto out; 1364 goto out;
1310 r = 0; 1365 r = 0;
@@ -1402,6 +1457,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1402 r = -EINVAL; 1457 r = -EINVAL;
1403 } 1458 }
1404out: 1459out:
1460 if (lapic)
1461 kfree(lapic);
1405 return r; 1462 return r;
1406} 1463}
1407 1464
@@ -1476,6 +1533,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1476 goto out; 1533 goto out;
1477 1534
1478 down_write(&kvm->slots_lock); 1535 down_write(&kvm->slots_lock);
1536 spin_lock(&kvm->mmu_lock);
1479 1537
1480 p = &kvm->arch.aliases[alias->slot]; 1538 p = &kvm->arch.aliases[alias->slot];
1481 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 1539 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
@@ -1487,6 +1545,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1487 break; 1545 break;
1488 kvm->arch.naliases = n; 1546 kvm->arch.naliases = n;
1489 1547
1548 spin_unlock(&kvm->mmu_lock);
1490 kvm_mmu_zap_all(kvm); 1549 kvm_mmu_zap_all(kvm);
1491 1550
1492 up_write(&kvm->slots_lock); 1551 up_write(&kvm->slots_lock);
@@ -1608,6 +1667,15 @@ long kvm_arch_vm_ioctl(struct file *filp,
1608 struct kvm *kvm = filp->private_data; 1667 struct kvm *kvm = filp->private_data;
1609 void __user *argp = (void __user *)arg; 1668 void __user *argp = (void __user *)arg;
1610 int r = -EINVAL; 1669 int r = -EINVAL;
1670 /*
1671 * This union makes it completely explicit to gcc-3.x
1672 * that these two variables' stack usage should be
1673 * combined, not added together.
1674 */
1675 union {
1676 struct kvm_pit_state ps;
1677 struct kvm_memory_alias alias;
1678 } u;
1611 1679
1612 switch (ioctl) { 1680 switch (ioctl) {
1613 case KVM_SET_TSS_ADDR: 1681 case KVM_SET_TSS_ADDR:
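The union added in this hunk is a general pattern worth spelling out: when only one of several large ioctl argument structures can be live at a time, folding them into a union caps the stack frame even on compilers (gcc 3.x, per the comment) that do not overlap sibling locals. A toy example of the idiom, with made-up structure names:

#include <string.h>

struct big_a { char buf[512]; };
struct big_b { char buf[512]; };

void handle(int cmd, const void *arg)
{
	union {				/* one 512-byte slot, not two */
		struct big_a a;
		struct big_b b;
	} u;

	switch (cmd) {
	case 0:
		memcpy(&u.a, arg, sizeof(u.a));
		/* ... act on u.a ... */
		break;
	case 1:
		memcpy(&u.b, arg, sizeof(u.b));
		/* ... act on u.b ... */
		break;
	}
}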
@@ -1639,17 +1707,14 @@ long kvm_arch_vm_ioctl(struct file *filp,
1639 case KVM_GET_NR_MMU_PAGES: 1707 case KVM_GET_NR_MMU_PAGES:
1640 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); 1708 r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
1641 break; 1709 break;
1642 case KVM_SET_MEMORY_ALIAS: { 1710 case KVM_SET_MEMORY_ALIAS:
1643 struct kvm_memory_alias alias;
1644
1645 r = -EFAULT; 1711 r = -EFAULT;
1646 if (copy_from_user(&alias, argp, sizeof alias)) 1712 if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias)))
1647 goto out; 1713 goto out;
1648 r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); 1714 r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias);
1649 if (r) 1715 if (r)
1650 goto out; 1716 goto out;
1651 break; 1717 break;
1652 }
1653 case KVM_CREATE_IRQCHIP: 1718 case KVM_CREATE_IRQCHIP:
1654 r = -ENOMEM; 1719 r = -ENOMEM;
1655 kvm->arch.vpic = kvm_create_pic(kvm); 1720 kvm->arch.vpic = kvm_create_pic(kvm);
@@ -1677,13 +1742,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
1677 goto out; 1742 goto out;
1678 if (irqchip_in_kernel(kvm)) { 1743 if (irqchip_in_kernel(kvm)) {
1679 mutex_lock(&kvm->lock); 1744 mutex_lock(&kvm->lock);
1680 if (irq_event.irq < 16) 1745 kvm_set_irq(kvm, irq_event.irq, irq_event.level);
1681 kvm_pic_set_irq(pic_irqchip(kvm),
1682 irq_event.irq,
1683 irq_event.level);
1684 kvm_ioapic_set_irq(kvm->arch.vioapic,
1685 irq_event.irq,
1686 irq_event.level);
1687 mutex_unlock(&kvm->lock); 1746 mutex_unlock(&kvm->lock);
1688 r = 0; 1747 r = 0;
1689 } 1748 }
@@ -1691,65 +1750,77 @@ long kvm_arch_vm_ioctl(struct file *filp,
1691 } 1750 }
1692 case KVM_GET_IRQCHIP: { 1751 case KVM_GET_IRQCHIP: {
1693 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1752 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1694 struct kvm_irqchip chip; 1753 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1695 1754
1696 r = -EFAULT; 1755 r = -ENOMEM;
1697 if (copy_from_user(&chip, argp, sizeof chip)) 1756 if (!chip)
1698 goto out; 1757 goto out;
1758 r = -EFAULT;
1759 if (copy_from_user(chip, argp, sizeof *chip))
1760 goto get_irqchip_out;
1699 r = -ENXIO; 1761 r = -ENXIO;
1700 if (!irqchip_in_kernel(kvm)) 1762 if (!irqchip_in_kernel(kvm))
1701 goto out; 1763 goto get_irqchip_out;
1702 r = kvm_vm_ioctl_get_irqchip(kvm, &chip); 1764 r = kvm_vm_ioctl_get_irqchip(kvm, chip);
1703 if (r) 1765 if (r)
1704 goto out; 1766 goto get_irqchip_out;
1705 r = -EFAULT; 1767 r = -EFAULT;
1706 if (copy_to_user(argp, &chip, sizeof chip)) 1768 if (copy_to_user(argp, chip, sizeof *chip))
1707 goto out; 1769 goto get_irqchip_out;
1708 r = 0; 1770 r = 0;
1771 get_irqchip_out:
1772 kfree(chip);
1773 if (r)
1774 goto out;
1709 break; 1775 break;
1710 } 1776 }
1711 case KVM_SET_IRQCHIP: { 1777 case KVM_SET_IRQCHIP: {
1712 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ 1778 /* 0: PIC master, 1: PIC slave, 2: IOAPIC */
1713 struct kvm_irqchip chip; 1779 struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL);
1714 1780
1715 r = -EFAULT; 1781 r = -ENOMEM;
1716 if (copy_from_user(&chip, argp, sizeof chip)) 1782 if (!chip)
1717 goto out; 1783 goto out;
1784 r = -EFAULT;
1785 if (copy_from_user(chip, argp, sizeof *chip))
1786 goto set_irqchip_out;
1718 r = -ENXIO; 1787 r = -ENXIO;
1719 if (!irqchip_in_kernel(kvm)) 1788 if (!irqchip_in_kernel(kvm))
1720 goto out; 1789 goto set_irqchip_out;
1721 r = kvm_vm_ioctl_set_irqchip(kvm, &chip); 1790 r = kvm_vm_ioctl_set_irqchip(kvm, chip);
1722 if (r) 1791 if (r)
1723 goto out; 1792 goto set_irqchip_out;
1724 r = 0; 1793 r = 0;
1794 set_irqchip_out:
1795 kfree(chip);
1796 if (r)
1797 goto out;
1725 break; 1798 break;
1726 } 1799 }
1727 case KVM_GET_PIT: { 1800 case KVM_GET_PIT: {
1728 struct kvm_pit_state ps;
1729 r = -EFAULT; 1801 r = -EFAULT;
1730 if (copy_from_user(&ps, argp, sizeof ps)) 1802 if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
1731 goto out; 1803 goto out;
1732 r = -ENXIO; 1804 r = -ENXIO;
1733 if (!kvm->arch.vpit) 1805 if (!kvm->arch.vpit)
1734 goto out; 1806 goto out;
1735 r = kvm_vm_ioctl_get_pit(kvm, &ps); 1807 r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
1736 if (r) 1808 if (r)
1737 goto out; 1809 goto out;
1738 r = -EFAULT; 1810 r = -EFAULT;
1739 if (copy_to_user(argp, &ps, sizeof ps)) 1811 if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
1740 goto out; 1812 goto out;
1741 r = 0; 1813 r = 0;
1742 break; 1814 break;
1743 } 1815 }
1744 case KVM_SET_PIT: { 1816 case KVM_SET_PIT: {
1745 struct kvm_pit_state ps;
1746 r = -EFAULT; 1817 r = -EFAULT;
1747 if (copy_from_user(&ps, argp, sizeof ps)) 1818 if (copy_from_user(&u.ps, argp, sizeof u.ps))
1748 goto out; 1819 goto out;
1749 r = -ENXIO; 1820 r = -ENXIO;
1750 if (!kvm->arch.vpit) 1821 if (!kvm->arch.vpit)
1751 goto out; 1822 goto out;
1752 r = kvm_vm_ioctl_set_pit(kvm, &ps); 1823 r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
1753 if (r) 1824 if (r)
1754 goto out; 1825 goto out;
1755 r = 0; 1826 r = 0;
@@ -1781,13 +1852,14 @@ static void kvm_init_msr_list(void)
1781 * Only apic need an MMIO device hook, so shortcut now.. 1852 * Only apic need an MMIO device hook, so shortcut now..
1782 */ 1853 */
1783static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, 1854static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1784 gpa_t addr) 1855 gpa_t addr, int len,
1856 int is_write)
1785{ 1857{
1786 struct kvm_io_device *dev; 1858 struct kvm_io_device *dev;
1787 1859
1788 if (vcpu->arch.apic) { 1860 if (vcpu->arch.apic) {
1789 dev = &vcpu->arch.apic->dev; 1861 dev = &vcpu->arch.apic->dev;
1790 if (dev->in_range(dev, addr)) 1862 if (dev->in_range(dev, addr, len, is_write))
1791 return dev; 1863 return dev;
1792 } 1864 }
1793 return NULL; 1865 return NULL;
@@ -1795,13 +1867,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu,
1795 1867
1796 1868
1797static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, 1869static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu,
1798 gpa_t addr) 1870 gpa_t addr, int len,
1871 int is_write)
1799{ 1872{
1800 struct kvm_io_device *dev; 1873 struct kvm_io_device *dev;
1801 1874
1802 dev = vcpu_find_pervcpu_dev(vcpu, addr); 1875 dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write);
1803 if (dev == NULL) 1876 if (dev == NULL)
1804 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); 1877 dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len,
1878 is_write);
1805 return dev; 1879 return dev;
1806} 1880}
1807 1881
@@ -1869,7 +1943,7 @@ mmio:
1869 * Is this MMIO handled locally? 1943 * Is this MMIO handled locally?
1870 */ 1944 */
1871 mutex_lock(&vcpu->kvm->lock); 1945 mutex_lock(&vcpu->kvm->lock);
1872 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 1946 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0);
1873 if (mmio_dev) { 1947 if (mmio_dev) {
1874 kvm_iodevice_read(mmio_dev, gpa, bytes, val); 1948 kvm_iodevice_read(mmio_dev, gpa, bytes, val);
1875 mutex_unlock(&vcpu->kvm->lock); 1949 mutex_unlock(&vcpu->kvm->lock);
@@ -1924,7 +1998,7 @@ mmio:
1924 * Is this MMIO handled locally? 1998 * Is this MMIO handled locally?
1925 */ 1999 */
1926 mutex_lock(&vcpu->kvm->lock); 2000 mutex_lock(&vcpu->kvm->lock);
1927 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); 2001 mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1);
1928 if (mmio_dev) { 2002 if (mmio_dev) {
1929 kvm_iodevice_write(mmio_dev, gpa, bytes, val); 2003 kvm_iodevice_write(mmio_dev, gpa, bytes, val);
1930 mutex_unlock(&vcpu->kvm->lock); 2004 mutex_unlock(&vcpu->kvm->lock);
@@ -1993,9 +2067,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
1993 2067
1994 val = *(u64 *)new; 2068 val = *(u64 *)new;
1995 2069
1996 down_read(&current->mm->mmap_sem);
1997 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2070 page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT);
1998 up_read(&current->mm->mmap_sem);
1999 2071
2000 kaddr = kmap_atomic(page, KM_USER0); 2072 kaddr = kmap_atomic(page, KM_USER0);
2001 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); 2073 set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val);
@@ -2015,11 +2087,13 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
2015 2087
2016int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 2088int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2017{ 2089{
2090 kvm_mmu_invlpg(vcpu, address);
2018 return X86EMUL_CONTINUE; 2091 return X86EMUL_CONTINUE;
2019} 2092}
2020 2093
2021int emulate_clts(struct kvm_vcpu *vcpu) 2094int emulate_clts(struct kvm_vcpu *vcpu)
2022{ 2095{
2096 KVMTRACE_0D(CLTS, vcpu, handler);
2023 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 2097 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS);
2024 return X86EMUL_CONTINUE; 2098 return X86EMUL_CONTINUE;
2025} 2099}
@@ -2053,21 +2127,19 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2053 2127
2054void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 2128void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2055{ 2129{
2056 static int reported;
2057 u8 opcodes[4]; 2130 u8 opcodes[4];
2058 unsigned long rip = vcpu->arch.rip; 2131 unsigned long rip = kvm_rip_read(vcpu);
2059 unsigned long rip_linear; 2132 unsigned long rip_linear;
2060 2133
2061 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 2134 if (!printk_ratelimit())
2062
2063 if (reported)
2064 return; 2135 return;
2065 2136
2137 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2138
2066 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); 2139 emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu);
2067 2140
2068 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 2141 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2069 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 2142 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
2070 reported = 1;
2071} 2143}
2072EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 2144EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2073 2145
@@ -2078,6 +2150,14 @@ static struct x86_emulate_ops emulate_ops = {
2078 .cmpxchg_emulated = emulator_cmpxchg_emulated, 2150 .cmpxchg_emulated = emulator_cmpxchg_emulated,
2079}; 2151};
2080 2152
2153static void cache_all_regs(struct kvm_vcpu *vcpu)
2154{
2155 kvm_register_read(vcpu, VCPU_REGS_RAX);
2156 kvm_register_read(vcpu, VCPU_REGS_RSP);
2157 kvm_register_read(vcpu, VCPU_REGS_RIP);
2158 vcpu->arch.regs_dirty = ~0;
2159}
2160
2081int emulate_instruction(struct kvm_vcpu *vcpu, 2161int emulate_instruction(struct kvm_vcpu *vcpu,
2082 struct kvm_run *run, 2162 struct kvm_run *run,
2083 unsigned long cr2, 2163 unsigned long cr2,
@@ -2087,8 +2167,15 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2087 int r; 2167 int r;
2088 struct decode_cache *c; 2168 struct decode_cache *c;
2089 2169
2170 kvm_clear_exception_queue(vcpu);
2090 vcpu->arch.mmio_fault_cr2 = cr2; 2171 vcpu->arch.mmio_fault_cr2 = cr2;
2091 kvm_x86_ops->cache_regs(vcpu); 2172 /*
2173 * TODO: fix x86_emulate.c to use guest_read/write_register
2174 * instead of direct ->regs accesses, can save hundred cycles
2175 * on Intel for instructions that don't read/change RSP, for
2176 * example.
2177 */
2178 cache_all_regs(vcpu);
2092 2179
2093 vcpu->mmio_is_write = 0; 2180 vcpu->mmio_is_write = 0;
2094 vcpu->arch.pio.string = 0; 2181 vcpu->arch.pio.string = 0;
@@ -2105,27 +2192,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2105 ? X86EMUL_MODE_PROT64 : cs_db 2192 ? X86EMUL_MODE_PROT64 : cs_db
2106 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 2193 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2107 2194
2108 if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) {
2109 vcpu->arch.emulate_ctxt.cs_base = 0;
2110 vcpu->arch.emulate_ctxt.ds_base = 0;
2111 vcpu->arch.emulate_ctxt.es_base = 0;
2112 vcpu->arch.emulate_ctxt.ss_base = 0;
2113 } else {
2114 vcpu->arch.emulate_ctxt.cs_base =
2115 get_segment_base(vcpu, VCPU_SREG_CS);
2116 vcpu->arch.emulate_ctxt.ds_base =
2117 get_segment_base(vcpu, VCPU_SREG_DS);
2118 vcpu->arch.emulate_ctxt.es_base =
2119 get_segment_base(vcpu, VCPU_SREG_ES);
2120 vcpu->arch.emulate_ctxt.ss_base =
2121 get_segment_base(vcpu, VCPU_SREG_SS);
2122 }
2123
2124 vcpu->arch.emulate_ctxt.gs_base =
2125 get_segment_base(vcpu, VCPU_SREG_GS);
2126 vcpu->arch.emulate_ctxt.fs_base =
2127 get_segment_base(vcpu, VCPU_SREG_FS);
2128
2129 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 2195 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
2130 2196
2131 /* Reject the instructions other than VMCALL/VMMCALL when 2197 /* Reject the instructions other than VMCALL/VMMCALL when
@@ -2169,7 +2235,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2169 return EMULATE_DO_MMIO; 2235 return EMULATE_DO_MMIO;
2170 } 2236 }
2171 2237
2172 kvm_x86_ops->decache_regs(vcpu);
2173 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 2238 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2174 2239
2175 if (vcpu->mmio_is_write) { 2240 if (vcpu->mmio_is_write) {
@@ -2222,20 +2287,19 @@ int complete_pio(struct kvm_vcpu *vcpu)
2222 struct kvm_pio_request *io = &vcpu->arch.pio; 2287 struct kvm_pio_request *io = &vcpu->arch.pio;
2223 long delta; 2288 long delta;
2224 int r; 2289 int r;
2225 2290 unsigned long val;
2226 kvm_x86_ops->cache_regs(vcpu);
2227 2291
2228 if (!io->string) { 2292 if (!io->string) {
2229 if (io->in) 2293 if (io->in) {
2230 memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, 2294 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2231 io->size); 2295 memcpy(&val, vcpu->arch.pio_data, io->size);
2296 kvm_register_write(vcpu, VCPU_REGS_RAX, val);
2297 }
2232 } else { 2298 } else {
2233 if (io->in) { 2299 if (io->in) {
2234 r = pio_copy_data(vcpu); 2300 r = pio_copy_data(vcpu);
2235 if (r) { 2301 if (r)
2236 kvm_x86_ops->cache_regs(vcpu);
2237 return r; 2302 return r;
2238 }
2239 } 2303 }
2240 2304
2241 delta = 1; 2305 delta = 1;
@@ -2245,19 +2309,24 @@ int complete_pio(struct kvm_vcpu *vcpu)
2245 * The size of the register should really depend on 2309 * The size of the register should really depend on
2246 * current address size. 2310 * current address size.
2247 */ 2311 */
2248 vcpu->arch.regs[VCPU_REGS_RCX] -= delta; 2312 val = kvm_register_read(vcpu, VCPU_REGS_RCX);
2313 val -= delta;
2314 kvm_register_write(vcpu, VCPU_REGS_RCX, val);
2249 } 2315 }
2250 if (io->down) 2316 if (io->down)
2251 delta = -delta; 2317 delta = -delta;
2252 delta *= io->size; 2318 delta *= io->size;
2253 if (io->in) 2319 if (io->in) {
2254 vcpu->arch.regs[VCPU_REGS_RDI] += delta; 2320 val = kvm_register_read(vcpu, VCPU_REGS_RDI);
2255 else 2321 val += delta;
2256 vcpu->arch.regs[VCPU_REGS_RSI] += delta; 2322 kvm_register_write(vcpu, VCPU_REGS_RDI, val);
2323 } else {
2324 val = kvm_register_read(vcpu, VCPU_REGS_RSI);
2325 val += delta;
2326 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2327 }
2257 } 2328 }
2258 2329
2259 kvm_x86_ops->decache_regs(vcpu);
2260
2261 io->count -= io->cur_count; 2330 io->count -= io->cur_count;
2262 io->cur_count = 0; 2331 io->cur_count = 0;
2263 2332
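Most of the x86.c churn in this series, as in complete_pio() above, replaces direct vcpu->arch.regs[] accesses bracketed by the cache_regs()/decache_regs() callbacks with kvm_register_read()/kvm_register_write(). A simplified model of that lazy register cache, using the regs_avail/regs_dirty bitmaps seen in vmx_vcpu_run(); the helper bodies are an approximation, not the kernel's kvm_cache_regs.h:

enum { REG_RAX, REG_RCX, REG_RDX, REG_RBX, REG_RSP, REG_RIP, NR_REGS };

struct reg_cache {
	unsigned long regs[NR_REGS];
	unsigned long regs_avail;	/* bit set: regs[i] is up to date        */
	unsigned long regs_dirty;	/* bit set: regs[i] must be written back */
	unsigned long (*read_hw)(int reg);	/* e.g. a vmcs_readl() wrapper   */
};

/* Fetch from hardware only on first use since the last guest entry. */
static unsigned long reg_read(struct reg_cache *c, int reg)
{
	if (!(c->regs_avail & (1UL << reg))) {
		c->regs[reg] = c->read_hw(reg);
		c->regs_avail |= 1UL << reg;
	}
	return c->regs[reg];
}

/* Writes only mark the slot dirty; the flush back to hardware happens
 * right before the next guest entry (cf. the GUEST_RSP/GUEST_RIP writes
 * at the top of vmx_vcpu_run()). */
static void reg_write(struct reg_cache *c, int reg, unsigned long val)
{
	c->regs[reg] = val;
	c->regs_avail |= 1UL << reg;
	c->regs_dirty |= 1UL << reg;
}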
@@ -2300,15 +2369,17 @@ static void pio_string_write(struct kvm_io_device *pio_dev,
2300} 2369}
2301 2370
2302static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, 2371static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu,
2303 gpa_t addr) 2372 gpa_t addr, int len,
2373 int is_write)
2304{ 2374{
2305 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); 2375 return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write);
2306} 2376}
2307 2377
2308int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 2378int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2309 int size, unsigned port) 2379 int size, unsigned port)
2310{ 2380{
2311 struct kvm_io_device *pio_dev; 2381 struct kvm_io_device *pio_dev;
2382 unsigned long val;
2312 2383
2313 vcpu->run->exit_reason = KVM_EXIT_IO; 2384 vcpu->run->exit_reason = KVM_EXIT_IO;
2314 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 2385 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
@@ -2329,13 +2400,12 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2329 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, 2400 KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size,
2330 handler); 2401 handler);
2331 2402
2332 kvm_x86_ops->cache_regs(vcpu); 2403 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2333 memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); 2404 memcpy(vcpu->arch.pio_data, &val, 4);
2334 kvm_x86_ops->decache_regs(vcpu);
2335 2405
2336 kvm_x86_ops->skip_emulated_instruction(vcpu); 2406 kvm_x86_ops->skip_emulated_instruction(vcpu);
2337 2407
2338 pio_dev = vcpu_find_pio_dev(vcpu, port); 2408 pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in);
2339 if (pio_dev) { 2409 if (pio_dev) {
2340 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); 2410 kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data);
2341 complete_pio(vcpu); 2411 complete_pio(vcpu);
@@ -2417,7 +2487,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2417 } 2487 }
2418 } 2488 }
2419 2489
2420 pio_dev = vcpu_find_pio_dev(vcpu, port); 2490 pio_dev = vcpu_find_pio_dev(vcpu, port,
2491 vcpu->arch.pio.cur_count,
2492 !vcpu->arch.pio.in);
2421 if (!vcpu->arch.pio.in) { 2493 if (!vcpu->arch.pio.in) {
2422 /* string PIO write */ 2494 /* string PIO write */
2423 ret = pio_copy_data(vcpu); 2495 ret = pio_copy_data(vcpu);
@@ -2487,11 +2559,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
2487 KVMTRACE_0D(HLT, vcpu, handler); 2559 KVMTRACE_0D(HLT, vcpu, handler);
2488 if (irqchip_in_kernel(vcpu->kvm)) { 2560 if (irqchip_in_kernel(vcpu->kvm)) {
2489 vcpu->arch.mp_state = KVM_MP_STATE_HALTED; 2561 vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
2490 up_read(&vcpu->kvm->slots_lock);
2491 kvm_vcpu_block(vcpu);
2492 down_read(&vcpu->kvm->slots_lock);
2493 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2494 return -EINTR;
2495 return 1; 2562 return 1;
2496 } else { 2563 } else {
2497 vcpu->run->exit_reason = KVM_EXIT_HLT; 2564 vcpu->run->exit_reason = KVM_EXIT_HLT;
@@ -2514,13 +2581,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2514 unsigned long nr, a0, a1, a2, a3, ret; 2581 unsigned long nr, a0, a1, a2, a3, ret;
2515 int r = 1; 2582 int r = 1;
2516 2583
2517 kvm_x86_ops->cache_regs(vcpu); 2584 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
2518 2585 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
2519 nr = vcpu->arch.regs[VCPU_REGS_RAX]; 2586 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
2520 a0 = vcpu->arch.regs[VCPU_REGS_RBX]; 2587 a2 = kvm_register_read(vcpu, VCPU_REGS_RDX);
2521 a1 = vcpu->arch.regs[VCPU_REGS_RCX]; 2588 a3 = kvm_register_read(vcpu, VCPU_REGS_RSI);
2522 a2 = vcpu->arch.regs[VCPU_REGS_RDX];
2523 a3 = vcpu->arch.regs[VCPU_REGS_RSI];
2524 2589
2525 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); 2590 KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler);
2526 2591
@@ -2543,8 +2608,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
2543 ret = -KVM_ENOSYS; 2608 ret = -KVM_ENOSYS;
2544 break; 2609 break;
2545 } 2610 }
2546 vcpu->arch.regs[VCPU_REGS_RAX] = ret; 2611 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
2547 kvm_x86_ops->decache_regs(vcpu);
2548 ++vcpu->stat.hypercalls; 2612 ++vcpu->stat.hypercalls;
2549 return r; 2613 return r;
2550} 2614}
@@ -2554,6 +2618,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2554{ 2618{
2555 char instruction[3]; 2619 char instruction[3];
2556 int ret = 0; 2620 int ret = 0;
2621 unsigned long rip = kvm_rip_read(vcpu);
2557 2622
2558 2623
2559 /* 2624 /*
@@ -2563,9 +2628,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
2563 */ 2628 */
2564 kvm_mmu_zap_all(vcpu->kvm); 2629 kvm_mmu_zap_all(vcpu->kvm);
2565 2630
2566 kvm_x86_ops->cache_regs(vcpu);
2567 kvm_x86_ops->patch_hypercall(vcpu, instruction); 2631 kvm_x86_ops->patch_hypercall(vcpu, instruction);
2568 if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) 2632 if (emulator_write_emulated(rip, instruction, 3, vcpu)
2569 != X86EMUL_CONTINUE) 2633 != X86EMUL_CONTINUE)
2570 ret = -EFAULT; 2634 ret = -EFAULT;
2571 2635
@@ -2600,27 +2664,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
2600 2664
2601unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 2665unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
2602{ 2666{
2667 unsigned long value;
2668
2603 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 2669 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
2604 switch (cr) { 2670 switch (cr) {
2605 case 0: 2671 case 0:
2606 return vcpu->arch.cr0; 2672 value = vcpu->arch.cr0;
2673 break;
2607 case 2: 2674 case 2:
2608 return vcpu->arch.cr2; 2675 value = vcpu->arch.cr2;
2676 break;
2609 case 3: 2677 case 3:
2610 return vcpu->arch.cr3; 2678 value = vcpu->arch.cr3;
2679 break;
2611 case 4: 2680 case 4:
2612 return vcpu->arch.cr4; 2681 value = vcpu->arch.cr4;
2682 break;
2613 case 8: 2683 case 8:
2614 return kvm_get_cr8(vcpu); 2684 value = kvm_get_cr8(vcpu);
2685 break;
2615 default: 2686 default:
2616 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 2687 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
2617 return 0; 2688 return 0;
2618 } 2689 }
2690 KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value,
2691 (u32)((u64)value >> 32), handler);
2692
2693 return value;
2619} 2694}
2620 2695
2621void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, 2696void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
2622 unsigned long *rflags) 2697 unsigned long *rflags)
2623{ 2698{
2699 KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val,
2700 (u32)((u64)val >> 32), handler);
2701
2624 switch (cr) { 2702 switch (cr) {
2625 case 0: 2703 case 0:
2626 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 2704 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val));
@@ -2681,13 +2759,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2681 u32 function, index; 2759 u32 function, index;
2682 struct kvm_cpuid_entry2 *e, *best; 2760 struct kvm_cpuid_entry2 *e, *best;
2683 2761
2684 kvm_x86_ops->cache_regs(vcpu); 2762 function = kvm_register_read(vcpu, VCPU_REGS_RAX);
2685 function = vcpu->arch.regs[VCPU_REGS_RAX]; 2763 index = kvm_register_read(vcpu, VCPU_REGS_RCX);
2686 index = vcpu->arch.regs[VCPU_REGS_RCX]; 2764 kvm_register_write(vcpu, VCPU_REGS_RAX, 0);
2687 vcpu->arch.regs[VCPU_REGS_RAX] = 0; 2765 kvm_register_write(vcpu, VCPU_REGS_RBX, 0);
2688 vcpu->arch.regs[VCPU_REGS_RBX] = 0; 2766 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
2689 vcpu->arch.regs[VCPU_REGS_RCX] = 0; 2767 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
2690 vcpu->arch.regs[VCPU_REGS_RDX] = 0;
2691 best = NULL; 2768 best = NULL;
2692 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { 2769 for (i = 0; i < vcpu->arch.cpuid_nent; ++i) {
2693 e = &vcpu->arch.cpuid_entries[i]; 2770 e = &vcpu->arch.cpuid_entries[i];
@@ -2705,18 +2782,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
2705 best = e; 2782 best = e;
2706 } 2783 }
2707 if (best) { 2784 if (best) {
2708 vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; 2785 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
2709 vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; 2786 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
2710 vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; 2787 kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx);
2711 vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; 2788 kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx);
2712 } 2789 }
2713 kvm_x86_ops->decache_regs(vcpu);
2714 kvm_x86_ops->skip_emulated_instruction(vcpu); 2790 kvm_x86_ops->skip_emulated_instruction(vcpu);
2715 KVMTRACE_5D(CPUID, vcpu, function, 2791 KVMTRACE_5D(CPUID, vcpu, function,
2716 (u32)vcpu->arch.regs[VCPU_REGS_RAX], 2792 (u32)kvm_register_read(vcpu, VCPU_REGS_RAX),
2717 (u32)vcpu->arch.regs[VCPU_REGS_RBX], 2793 (u32)kvm_register_read(vcpu, VCPU_REGS_RBX),
2718 (u32)vcpu->arch.regs[VCPU_REGS_RCX], 2794 (u32)kvm_register_read(vcpu, VCPU_REGS_RCX),
2719 (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); 2795 (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler);
2720} 2796}
2721EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); 2797EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
2722 2798
@@ -2757,9 +2833,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
2757 if (!apic || !apic->vapic_addr) 2833 if (!apic || !apic->vapic_addr)
2758 return; 2834 return;
2759 2835
2760 down_read(&current->mm->mmap_sem);
2761 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2836 page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2762 up_read(&current->mm->mmap_sem);
2763 2837
2764 vcpu->arch.apic->vapic_page = page; 2838 vcpu->arch.apic->vapic_page = page;
2765} 2839}
@@ -2771,32 +2845,16 @@ static void vapic_exit(struct kvm_vcpu *vcpu)
2771 if (!apic || !apic->vapic_addr) 2845 if (!apic || !apic->vapic_addr)
2772 return; 2846 return;
2773 2847
2848 down_read(&vcpu->kvm->slots_lock);
2774 kvm_release_page_dirty(apic->vapic_page); 2849 kvm_release_page_dirty(apic->vapic_page);
2775 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 2850 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
2851 up_read(&vcpu->kvm->slots_lock);
2776} 2852}
2777 2853
2778static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2854static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2779{ 2855{
2780 int r; 2856 int r;
2781 2857
2782 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2783 pr_debug("vcpu %d received sipi with vector # %x\n",
2784 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2785 kvm_lapic_reset(vcpu);
2786 r = kvm_x86_ops->vcpu_reset(vcpu);
2787 if (r)
2788 return r;
2789 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2790 }
2791
2792 down_read(&vcpu->kvm->slots_lock);
2793 vapic_enter(vcpu);
2794
2795preempted:
2796 if (vcpu->guest_debug.enabled)
2797 kvm_x86_ops->guest_debug_pre(vcpu);
2798
2799again:
2800 if (vcpu->requests) 2858 if (vcpu->requests)
2801 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 2859 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
2802 kvm_mmu_unload(vcpu); 2860 kvm_mmu_unload(vcpu);
@@ -2808,6 +2866,8 @@ again:
2808 if (vcpu->requests) { 2866 if (vcpu->requests) {
2809 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) 2867 if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests))
2810 __kvm_migrate_timers(vcpu); 2868 __kvm_migrate_timers(vcpu);
2869 if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests))
2870 kvm_mmu_sync_roots(vcpu);
2811 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) 2871 if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests))
2812 kvm_x86_ops->tlb_flush(vcpu); 2872 kvm_x86_ops->tlb_flush(vcpu);
2813 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 2873 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
@@ -2833,21 +2893,15 @@ again:
2833 2893
2834 local_irq_disable(); 2894 local_irq_disable();
2835 2895
2836 if (vcpu->requests || need_resched()) { 2896 if (vcpu->requests || need_resched() || signal_pending(current)) {
2837 local_irq_enable(); 2897 local_irq_enable();
2838 preempt_enable(); 2898 preempt_enable();
2839 r = 1; 2899 r = 1;
2840 goto out; 2900 goto out;
2841 } 2901 }
2842 2902
2843 if (signal_pending(current)) { 2903 if (vcpu->guest_debug.enabled)
2844 local_irq_enable(); 2904 kvm_x86_ops->guest_debug_pre(vcpu);
2845 preempt_enable();
2846 r = -EINTR;
2847 kvm_run->exit_reason = KVM_EXIT_INTR;
2848 ++vcpu->stat.signal_exits;
2849 goto out;
2850 }
2851 2905
2852 vcpu->guest_mode = 1; 2906 vcpu->guest_mode = 1;
2853 /* 2907 /*
@@ -2896,8 +2950,8 @@ again:
2896 * Profile KVM exit RIPs: 2950 * Profile KVM exit RIPs:
2897 */ 2951 */
2898 if (unlikely(prof_on == KVM_PROFILING)) { 2952 if (unlikely(prof_on == KVM_PROFILING)) {
2899 kvm_x86_ops->cache_regs(vcpu); 2953 unsigned long rip = kvm_rip_read(vcpu);
2900 profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); 2954 profile_hit(KVM_PROFILING, (void *)rip);
2901 } 2955 }
2902 2956
2903 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) 2957 if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu))
@@ -2906,31 +2960,66 @@ again:
2906 kvm_lapic_sync_from_vapic(vcpu); 2960 kvm_lapic_sync_from_vapic(vcpu);
2907 2961
2908 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 2962 r = kvm_x86_ops->handle_exit(kvm_run, vcpu);
2963out:
2964 return r;
2965}
2909 2966
2910 if (r > 0) { 2967static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2911 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 2968{
2912 r = -EINTR; 2969 int r;
2913 kvm_run->exit_reason = KVM_EXIT_INTR; 2970
2914 ++vcpu->stat.request_irq_exits; 2971 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
2915 goto out; 2972 pr_debug("vcpu %d received sipi with vector # %x\n",
2916 } 2973 vcpu->vcpu_id, vcpu->arch.sipi_vector);
2917 if (!need_resched()) 2974 kvm_lapic_reset(vcpu);
2918 goto again; 2975 r = kvm_x86_ops->vcpu_reset(vcpu);
2976 if (r)
2977 return r;
2978 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
2919 } 2979 }
2920 2980
2921out: 2981 down_read(&vcpu->kvm->slots_lock);
2922 up_read(&vcpu->kvm->slots_lock); 2982 vapic_enter(vcpu);
2923 if (r > 0) { 2983
2924 kvm_resched(vcpu); 2984 r = 1;
2925 down_read(&vcpu->kvm->slots_lock); 2985 while (r > 0) {
2926 goto preempted; 2986 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
2987 r = vcpu_enter_guest(vcpu, kvm_run);
2988 else {
2989 up_read(&vcpu->kvm->slots_lock);
2990 kvm_vcpu_block(vcpu);
2991 down_read(&vcpu->kvm->slots_lock);
2992 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
2993 if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED)
2994 vcpu->arch.mp_state =
2995 KVM_MP_STATE_RUNNABLE;
2996 if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE)
2997 r = -EINTR;
2998 }
2999
3000 if (r > 0) {
3001 if (dm_request_for_irq_injection(vcpu, kvm_run)) {
3002 r = -EINTR;
3003 kvm_run->exit_reason = KVM_EXIT_INTR;
3004 ++vcpu->stat.request_irq_exits;
3005 }
3006 if (signal_pending(current)) {
3007 r = -EINTR;
3008 kvm_run->exit_reason = KVM_EXIT_INTR;
3009 ++vcpu->stat.signal_exits;
3010 }
3011 if (need_resched()) {
3012 up_read(&vcpu->kvm->slots_lock);
3013 kvm_resched(vcpu);
3014 down_read(&vcpu->kvm->slots_lock);
3015 }
3016 }
2927 } 3017 }
2928 3018
3019 up_read(&vcpu->kvm->slots_lock);
2929 post_kvm_run_save(vcpu, kvm_run); 3020 post_kvm_run_save(vcpu, kvm_run);
2930 3021
2931 down_read(&vcpu->kvm->slots_lock);
2932 vapic_exit(vcpu); 3022 vapic_exit(vcpu);
2933 up_read(&vcpu->kvm->slots_lock);
2934 3023
2935 return r; 3024 return r;
2936} 3025}
@@ -2942,15 +3031,16 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2942 3031
2943 vcpu_load(vcpu); 3032 vcpu_load(vcpu);
2944 3033
3034 if (vcpu->sigset_active)
3035 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
3036
2945 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { 3037 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
2946 kvm_vcpu_block(vcpu); 3038 kvm_vcpu_block(vcpu);
2947 vcpu_put(vcpu); 3039 clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
2948 return -EAGAIN; 3040 r = -EAGAIN;
3041 goto out;
2949 } 3042 }
2950 3043
2951 if (vcpu->sigset_active)
2952 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
2953
2954 /* re-sync apic's tpr */ 3044 /* re-sync apic's tpr */
2955 if (!irqchip_in_kernel(vcpu->kvm)) 3045 if (!irqchip_in_kernel(vcpu->kvm))
2956 kvm_set_cr8(vcpu, kvm_run->cr8); 3046 kvm_set_cr8(vcpu, kvm_run->cr8);
@@ -2980,11 +3070,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2980 } 3070 }
2981 } 3071 }
2982#endif 3072#endif
2983 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { 3073 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
2984 kvm_x86_ops->cache_regs(vcpu); 3074 kvm_register_write(vcpu, VCPU_REGS_RAX,
2985 vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; 3075 kvm_run->hypercall.ret);
2986 kvm_x86_ops->decache_regs(vcpu);
2987 }
2988 3076
2989 r = __vcpu_run(vcpu, kvm_run); 3077 r = __vcpu_run(vcpu, kvm_run);
2990 3078
@@ -3000,28 +3088,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3000{ 3088{
3001 vcpu_load(vcpu); 3089 vcpu_load(vcpu);
3002 3090
3003 kvm_x86_ops->cache_regs(vcpu); 3091 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3004 3092 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3005 regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3093 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3006 regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; 3094 regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3007 regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; 3095 regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3008 regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; 3096 regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3009 regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; 3097 regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3010 regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; 3098 regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3011 regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP];
3012 regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP];
3013#ifdef CONFIG_X86_64 3099#ifdef CONFIG_X86_64
3014 regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; 3100 regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8);
3015 regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; 3101 regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9);
3016 regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; 3102 regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10);
3017 regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; 3103 regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11);
3018 regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; 3104 regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12);
3019 regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; 3105 regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13);
3020 regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; 3106 regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14);
3021 regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; 3107 regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15);
3022#endif 3108#endif
3023 3109
3024 regs->rip = vcpu->arch.rip; 3110 regs->rip = kvm_rip_read(vcpu);
3025 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 3111 regs->rflags = kvm_x86_ops->get_rflags(vcpu);
3026 3112
3027 /* 3113 /*
@@ -3039,29 +3125,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3039{ 3125{
3040 vcpu_load(vcpu); 3126 vcpu_load(vcpu);
3041 3127
3042 vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; 3128 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
3043 vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; 3129 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
3044 vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; 3130 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
3045 vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; 3131 kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx);
3046 vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; 3132 kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi);
3047 vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; 3133 kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi);
3048 vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; 3134 kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp);
3049 vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; 3135 kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp);
3050#ifdef CONFIG_X86_64 3136#ifdef CONFIG_X86_64
3051 vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; 3137 kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8);
3052 vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; 3138 kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9);
3053 vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; 3139 kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10);
3054 vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; 3140 kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11);
3055 vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; 3141 kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12);
3056 vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; 3142 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3057 vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; 3143 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3058 vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; 3144 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3145
3059#endif 3146#endif
3060 3147
3061 vcpu->arch.rip = regs->rip; 3148 kvm_rip_write(vcpu, regs->rip);
3062 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 3149 kvm_x86_ops->set_rflags(vcpu, regs->rflags);
3063 3150
3064 kvm_x86_ops->decache_regs(vcpu);
3065 3151
3066 vcpu->arch.exception.pending = false; 3152 vcpu->arch.exception.pending = false;
3067 3153
@@ -3070,8 +3156,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3070 return 0; 3156 return 0;
3071} 3157}
3072 3158
3073static void get_segment(struct kvm_vcpu *vcpu, 3159void kvm_get_segment(struct kvm_vcpu *vcpu,
3074 struct kvm_segment *var, int seg) 3160 struct kvm_segment *var, int seg)
3075{ 3161{
3076 kvm_x86_ops->get_segment(vcpu, var, seg); 3162 kvm_x86_ops->get_segment(vcpu, var, seg);
3077} 3163}
@@ -3080,7 +3166,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
3080{ 3166{
3081 struct kvm_segment cs; 3167 struct kvm_segment cs;
3082 3168
3083 get_segment(vcpu, &cs, VCPU_SREG_CS); 3169 kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
3084 *db = cs.db; 3170 *db = cs.db;
3085 *l = cs.l; 3171 *l = cs.l;
3086} 3172}
@@ -3094,15 +3180,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3094 3180
3095 vcpu_load(vcpu); 3181 vcpu_load(vcpu);
3096 3182
3097 get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3183 kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3098 get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3184 kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3099 get_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3185 kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3100 get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3186 kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3101 get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3187 kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3102 get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3188 kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3103 3189
3104 get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3190 kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3105 get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3191 kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3106 3192
3107 kvm_x86_ops->get_idt(vcpu, &dt); 3193 kvm_x86_ops->get_idt(vcpu, &dt);
3108 sregs->idt.limit = dt.limit; 3194 sregs->idt.limit = dt.limit;
@@ -3154,7 +3240,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
3154 return 0; 3240 return 0;
3155} 3241}
3156 3242
3157static void set_segment(struct kvm_vcpu *vcpu, 3243static void kvm_set_segment(struct kvm_vcpu *vcpu,
3158 struct kvm_segment *var, int seg) 3244 struct kvm_segment *var, int seg)
3159{ 3245{
3160 kvm_x86_ops->set_segment(vcpu, var, seg); 3246 kvm_x86_ops->set_segment(vcpu, var, seg);
@@ -3168,6 +3254,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector,
3168 kvm_desct->base |= seg_desc->base2 << 24; 3254 kvm_desct->base |= seg_desc->base2 << 24;
3169 kvm_desct->limit = seg_desc->limit0; 3255 kvm_desct->limit = seg_desc->limit0;
3170 kvm_desct->limit |= seg_desc->limit << 16; 3256 kvm_desct->limit |= seg_desc->limit << 16;
3257 if (seg_desc->g) {
3258 kvm_desct->limit <<= 12;
3259 kvm_desct->limit |= 0xfff;
3260 }
3171 kvm_desct->selector = selector; 3261 kvm_desct->selector = selector;
3172 kvm_desct->type = seg_desc->type; 3262 kvm_desct->type = seg_desc->type;
3173 kvm_desct->present = seg_desc->p; 3263 kvm_desct->present = seg_desc->p;
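Illustrative aside, not part of the patch: the limit handling added in this hunk follows the descriptor granularity rule — when the g bit is set, the 20-bit limit field counts 4 KiB pages, so the byte limit becomes (limit << 12) | 0xfff. A minimal standalone sketch of that computation:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t limit = 0xfffff;                  /* 20-bit limit field, g = 1 */
	uint32_t bytes = (limit << 12) | 0xfff;    /* scaled to bytes: 0xffffffff */

	printf("limit field %#x -> byte limit %#x\n", limit, bytes);
	return 0;
}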
@@ -3191,7 +3281,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3191 if (selector & 1 << 2) { 3281 if (selector & 1 << 2) {
3192 struct kvm_segment kvm_seg; 3282 struct kvm_segment kvm_seg;
3193 3283
3194 get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); 3284 kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR);
3195 3285
3196 if (kvm_seg.unusable) 3286 if (kvm_seg.unusable)
3197 dtable->limit = 0; 3287 dtable->limit = 0;
@@ -3207,6 +3297,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu,
3207static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3297static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3208 struct desc_struct *seg_desc) 3298 struct desc_struct *seg_desc)
3209{ 3299{
3300 gpa_t gpa;
3210 struct descriptor_table dtable; 3301 struct descriptor_table dtable;
3211 u16 index = selector >> 3; 3302 u16 index = selector >> 3;
3212 3303
@@ -3216,13 +3307,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3216 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 3307 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
3217 return 1; 3308 return 1;
3218 } 3309 }
3219 return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3310 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3311 gpa += index * 8;
3312 return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8);
3220} 3313}
3221 3314
3222/* allowed just for 8 bytes segments */ 3315/* allowed just for 8 bytes segments */
3223static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3316static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3224 struct desc_struct *seg_desc) 3317 struct desc_struct *seg_desc)
3225{ 3318{
3319 gpa_t gpa;
3226 struct descriptor_table dtable; 3320 struct descriptor_table dtable;
3227 u16 index = selector >> 3; 3321 u16 index = selector >> 3;
3228 3322
@@ -3230,7 +3324,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3230 3324
3231 if (dtable.limit < index * 8 + 7) 3325 if (dtable.limit < index * 8 + 7)
3232 return 1; 3326 return 1;
3233 return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); 3327 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base);
3328 gpa += index * 8;
3329 return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8);
3234} 3330}
3235 3331
3236static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, 3332static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
@@ -3242,62 +3338,14 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu,
3242 base_addr |= (seg_desc->base1 << 16); 3338 base_addr |= (seg_desc->base1 << 16);
3243 base_addr |= (seg_desc->base2 << 24); 3339 base_addr |= (seg_desc->base2 << 24);
3244 3340
3245 return base_addr; 3341 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr);
3246}
3247
3248static int load_tss_segment32(struct kvm_vcpu *vcpu,
3249 struct desc_struct *seg_desc,
3250 struct tss_segment_32 *tss)
3251{
3252 u32 base_addr;
3253
3254 base_addr = get_tss_base_addr(vcpu, seg_desc);
3255
3256 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3257 sizeof(struct tss_segment_32));
3258}
3259
3260static int save_tss_segment32(struct kvm_vcpu *vcpu,
3261 struct desc_struct *seg_desc,
3262 struct tss_segment_32 *tss)
3263{
3264 u32 base_addr;
3265
3266 base_addr = get_tss_base_addr(vcpu, seg_desc);
3267
3268 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3269 sizeof(struct tss_segment_32));
3270}
3271
3272static int load_tss_segment16(struct kvm_vcpu *vcpu,
3273 struct desc_struct *seg_desc,
3274 struct tss_segment_16 *tss)
3275{
3276 u32 base_addr;
3277
3278 base_addr = get_tss_base_addr(vcpu, seg_desc);
3279
3280 return kvm_read_guest(vcpu->kvm, base_addr, tss,
3281 sizeof(struct tss_segment_16));
3282}
3283
3284static int save_tss_segment16(struct kvm_vcpu *vcpu,
3285 struct desc_struct *seg_desc,
3286 struct tss_segment_16 *tss)
3287{
3288 u32 base_addr;
3289
3290 base_addr = get_tss_base_addr(vcpu, seg_desc);
3291
3292 return kvm_write_guest(vcpu->kvm, base_addr, tss,
3293 sizeof(struct tss_segment_16));
3294} 3342}
3295 3343
3296static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 3344static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
3297{ 3345{
3298 struct kvm_segment kvm_seg; 3346 struct kvm_segment kvm_seg;
3299 3347
3300 get_segment(vcpu, &kvm_seg, seg); 3348 kvm_get_segment(vcpu, &kvm_seg, seg);
3301 return kvm_seg.selector; 3349 return kvm_seg.selector;
3302} 3350}
3303 3351
@@ -3313,11 +3361,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
3313 return 0; 3361 return 0;
3314} 3362}
3315 3363
3316static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 3364static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
3317 int type_bits, int seg) 3365{
3366 struct kvm_segment segvar = {
3367 .base = selector << 4,
3368 .limit = 0xffff,
3369 .selector = selector,
3370 .type = 3,
3371 .present = 1,
3372 .dpl = 3,
3373 .db = 0,
3374 .s = 1,
3375 .l = 0,
3376 .g = 0,
3377 .avl = 0,
3378 .unusable = 0,
3379 };
3380 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
3381 return 0;
3382}
3383
3384int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3385 int type_bits, int seg)
3318{ 3386{
3319 struct kvm_segment kvm_seg; 3387 struct kvm_segment kvm_seg;
3320 3388
3389 if (!(vcpu->arch.cr0 & X86_CR0_PE))
3390 return kvm_load_realmode_segment(vcpu, selector, seg);
3321 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) 3391 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
3322 return 1; 3392 return 1;
3323 kvm_seg.type |= type_bits; 3393 kvm_seg.type |= type_bits;
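Illustrative aside, not part of the patch: kvm_load_realmode_segment above relies on the real-mode rule that a segment's base is simply its selector shifted left by four, with a fixed 64 KiB limit. A minimal standalone sketch (the selector value is just an example):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t selector = 0xb800;                  /* e.g. the VGA text segment */
	uint32_t base     = (uint32_t)selector << 4; /* 0xb8000 */
	uint32_t limit    = 0xffff;                  /* real-mode 64 KiB limit */

	printf("selector %#x -> base %#x, limit %#x\n", selector, base, limit);
	return 0;
}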
@@ -3327,7 +3397,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
3327 if (!kvm_seg.s) 3397 if (!kvm_seg.s)
3328 kvm_seg.unusable = 1; 3398 kvm_seg.unusable = 1;
3329 3399
3330 set_segment(vcpu, &kvm_seg, seg); 3400 kvm_set_segment(vcpu, &kvm_seg, seg);
3331 return 0; 3401 return 0;
3332} 3402}
3333 3403
@@ -3335,17 +3405,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
3335 struct tss_segment_32 *tss) 3405 struct tss_segment_32 *tss)
3336{ 3406{
3337 tss->cr3 = vcpu->arch.cr3; 3407 tss->cr3 = vcpu->arch.cr3;
3338 tss->eip = vcpu->arch.rip; 3408 tss->eip = kvm_rip_read(vcpu);
3339 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 3409 tss->eflags = kvm_x86_ops->get_rflags(vcpu);
3340 tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; 3410 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3341 tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3411 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3342 tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; 3412 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3343 tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; 3413 tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3344 tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; 3414 tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3345 tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; 3415 tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3346 tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; 3416 tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI);
3347 tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; 3417 tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI);
3348
3349 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3418 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3350 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3419 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
3351 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 3420 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
@@ -3361,37 +3430,37 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3361{ 3430{
3362 kvm_set_cr3(vcpu, tss->cr3); 3431 kvm_set_cr3(vcpu, tss->cr3);
3363 3432
3364 vcpu->arch.rip = tss->eip; 3433 kvm_rip_write(vcpu, tss->eip);
3365 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 3434 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2);
3366 3435
3367 vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; 3436 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
3368 vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; 3437 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
3369 vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; 3438 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx);
3370 vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; 3439 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx);
3371 vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; 3440 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp);
3372 vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; 3441 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp);
3373 vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; 3442 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
3374 vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; 3443 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
3375 3444
3376 if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 3445 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR))
3377 return 1; 3446 return 1;
3378 3447
3379 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3448 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3380 return 1; 3449 return 1;
3381 3450
3382 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3451 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3383 return 1; 3452 return 1;
3384 3453
3385 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3454 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3386 return 1; 3455 return 1;
3387 3456
3388 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3457 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3389 return 1; 3458 return 1;
3390 3459
3391 if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 3460 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS))
3392 return 1; 3461 return 1;
3393 3462
3394 if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 3463 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS))
3395 return 1; 3464 return 1;
3396 return 0; 3465 return 0;
3397} 3466}
@@ -3399,16 +3468,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
3399static void save_state_to_tss16(struct kvm_vcpu *vcpu, 3468static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3400 struct tss_segment_16 *tss) 3469 struct tss_segment_16 *tss)
3401{ 3470{
3402 tss->ip = vcpu->arch.rip; 3471 tss->ip = kvm_rip_read(vcpu);
3403 tss->flag = kvm_x86_ops->get_rflags(vcpu); 3472 tss->flag = kvm_x86_ops->get_rflags(vcpu);
3404 tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; 3473 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
3405 tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; 3474 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
3406 tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; 3475 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
3407 tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; 3476 tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX);
3408 tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; 3477 tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP);
3409 tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; 3478 tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP);
3410 tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; 3479 tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI);
3411 tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; 3480 tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI);
3412 3481
3413 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); 3482 tss->es = get_segment_selector(vcpu, VCPU_SREG_ES);
3414 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); 3483 tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS);
@@ -3421,49 +3490,55 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
3421static int load_state_from_tss16(struct kvm_vcpu *vcpu, 3490static int load_state_from_tss16(struct kvm_vcpu *vcpu,
3422 struct tss_segment_16 *tss) 3491 struct tss_segment_16 *tss)
3423{ 3492{
3424 vcpu->arch.rip = tss->ip; 3493 kvm_rip_write(vcpu, tss->ip);
3425 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 3494 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2);
3426 vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; 3495 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
3427 vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; 3496 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
3428 vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; 3497 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
3429 vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; 3498 kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx);
3430 vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; 3499 kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp);
3431 vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; 3500 kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp);
3432 vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; 3501 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
3433 vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; 3502 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
3434 3503
3435 if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 3504 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR))
3436 return 1; 3505 return 1;
3437 3506
3438 if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 3507 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES))
3439 return 1; 3508 return 1;
3440 3509
3441 if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 3510 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS))
3442 return 1; 3511 return 1;
3443 3512
3444 if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 3513 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS))
3445 return 1; 3514 return 1;
3446 3515
3447 if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 3516 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS))
3448 return 1; 3517 return 1;
3449 return 0; 3518 return 0;
3450} 3519}
3451 3520
3452int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, 3521static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
3453 struct desc_struct *cseg_desc, 3522 u32 old_tss_base,
3454 struct desc_struct *nseg_desc) 3523 struct desc_struct *nseg_desc)
3455{ 3524{
3456 struct tss_segment_16 tss_segment_16; 3525 struct tss_segment_16 tss_segment_16;
3457 int ret = 0; 3526 int ret = 0;
3458 3527
3459 if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) 3528 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3529 sizeof tss_segment_16))
3460 goto out; 3530 goto out;
3461 3531
3462 save_state_to_tss16(vcpu, &tss_segment_16); 3532 save_state_to_tss16(vcpu, &tss_segment_16);
3463 save_tss_segment16(vcpu, cseg_desc, &tss_segment_16);
3464 3533
3465 if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) 3534 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16,
3535 sizeof tss_segment_16))
3466 goto out; 3536 goto out;
3537
3538 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3539 &tss_segment_16, sizeof tss_segment_16))
3540 goto out;
3541
3467 if (load_state_from_tss16(vcpu, &tss_segment_16)) 3542 if (load_state_from_tss16(vcpu, &tss_segment_16))
3468 goto out; 3543 goto out;
3469 3544
@@ -3472,21 +3547,27 @@ out:
3472 return ret; 3547 return ret;
3473} 3548}
3474 3549
3475int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, 3550static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
3476 struct desc_struct *cseg_desc, 3551 u32 old_tss_base,
3477 struct desc_struct *nseg_desc) 3552 struct desc_struct *nseg_desc)
3478{ 3553{
3479 struct tss_segment_32 tss_segment_32; 3554 struct tss_segment_32 tss_segment_32;
3480 int ret = 0; 3555 int ret = 0;
3481 3556
3482 if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) 3557 if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3558 sizeof tss_segment_32))
3483 goto out; 3559 goto out;
3484 3560
3485 save_state_to_tss32(vcpu, &tss_segment_32); 3561 save_state_to_tss32(vcpu, &tss_segment_32);
3486 save_tss_segment32(vcpu, cseg_desc, &tss_segment_32);
3487 3562
3488 if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) 3563 if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32,
3564 sizeof tss_segment_32))
3489 goto out; 3565 goto out;
3566
3567 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc),
3568 &tss_segment_32, sizeof tss_segment_32))
3569 goto out;
3570
3490 if (load_state_from_tss32(vcpu, &tss_segment_32)) 3571 if (load_state_from_tss32(vcpu, &tss_segment_32))
3491 goto out; 3572 goto out;
3492 3573
@@ -3501,16 +3582,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3501 struct desc_struct cseg_desc; 3582 struct desc_struct cseg_desc;
3502 struct desc_struct nseg_desc; 3583 struct desc_struct nseg_desc;
3503 int ret = 0; 3584 int ret = 0;
3585 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
3586 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
3504 3587
3505 get_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3588 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base);
3506 3589
3590 /* FIXME: Handle errors. Failure to read either TSS or their
3591 * descriptors should generate a pagefault.
3592 */
3507 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) 3593 if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc))
3508 goto out; 3594 goto out;
3509 3595
3510 if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) 3596 if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc))
3511 goto out; 3597 goto out;
3512 3598
3513
3514 if (reason != TASK_SWITCH_IRET) { 3599 if (reason != TASK_SWITCH_IRET) {
3515 int cpl; 3600 int cpl;
3516 3601
@@ -3528,8 +3613,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3528 3613
3529 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { 3614 if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) {
3530 cseg_desc.type &= ~(1 << 1); //clear the B flag 3615 cseg_desc.type &= ~(1 << 1); //clear the B flag
3531 save_guest_segment_descriptor(vcpu, tr_seg.selector, 3616 save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc);
3532 &cseg_desc);
3533 } 3617 }
3534 3618
3535 if (reason == TASK_SWITCH_IRET) { 3619 if (reason == TASK_SWITCH_IRET) {
@@ -3538,13 +3622,12 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3538 } 3622 }
3539 3623
3540 kvm_x86_ops->skip_emulated_instruction(vcpu); 3624 kvm_x86_ops->skip_emulated_instruction(vcpu);
3541 kvm_x86_ops->cache_regs(vcpu);
3542 3625
3543 if (nseg_desc.type & 8) 3626 if (nseg_desc.type & 8)
3544 ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, 3627 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base,
3545 &nseg_desc); 3628 &nseg_desc);
3546 else 3629 else
3547 ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, 3630 ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base,
3548 &nseg_desc); 3631 &nseg_desc);
3549 3632
3550 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 3633 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
@@ -3561,9 +3644,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
3561 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 3644 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS);
3562 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 3645 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
3563 tr_seg.type = 11; 3646 tr_seg.type = 11;
3564 set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 3647 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
3565out: 3648out:
3566 kvm_x86_ops->decache_regs(vcpu);
3567 return ret; 3649 return ret;
3568} 3650}
3569EXPORT_SYMBOL_GPL(kvm_task_switch); 3651EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -3626,17 +3708,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
3626 pr_debug("Set back pending irq %d\n", 3708 pr_debug("Set back pending irq %d\n",
3627 pending_vec); 3709 pending_vec);
3628 } 3710 }
3711 kvm_pic_clear_isr_ack(vcpu->kvm);
3629 } 3712 }
3630 3713
3631 set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 3714 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
3632 set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); 3715 kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
3633 set_segment(vcpu, &sregs->es, VCPU_SREG_ES); 3716 kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
3634 set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); 3717 kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
3635 set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); 3718 kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
3636 set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); 3719 kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
3720
3721 kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
3722 kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
3637 3723
3638 set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); 3724 /* Older userspace won't unhalt the vcpu on reset. */
3639 set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); 3725 if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 &&
3726 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
3727 !(vcpu->arch.cr0 & X86_CR0_PE))
3728 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3640 3729
3641 vcpu_put(vcpu); 3730 vcpu_put(vcpu);
3642 3731
@@ -3751,14 +3840,14 @@ void fx_init(struct kvm_vcpu *vcpu)
3751 * allocate ram with GFP_KERNEL. 3840 * allocate ram with GFP_KERNEL.
3752 */ 3841 */
3753 if (!used_math()) 3842 if (!used_math())
3754 fx_save(&vcpu->arch.host_fx_image); 3843 kvm_fx_save(&vcpu->arch.host_fx_image);
3755 3844
3756 /* Initialize guest FPU by resetting ours and saving into guest's */ 3845 /* Initialize guest FPU by resetting ours and saving into guest's */
3757 preempt_disable(); 3846 preempt_disable();
3758 fx_save(&vcpu->arch.host_fx_image); 3847 kvm_fx_save(&vcpu->arch.host_fx_image);
3759 fx_finit(); 3848 kvm_fx_finit();
3760 fx_save(&vcpu->arch.guest_fx_image); 3849 kvm_fx_save(&vcpu->arch.guest_fx_image);
3761 fx_restore(&vcpu->arch.host_fx_image); 3850 kvm_fx_restore(&vcpu->arch.host_fx_image);
3762 preempt_enable(); 3851 preempt_enable();
3763 3852
3764 vcpu->arch.cr0 |= X86_CR0_ET; 3853 vcpu->arch.cr0 |= X86_CR0_ET;
@@ -3775,8 +3864,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
3775 return; 3864 return;
3776 3865
3777 vcpu->guest_fpu_loaded = 1; 3866 vcpu->guest_fpu_loaded = 1;
3778 fx_save(&vcpu->arch.host_fx_image); 3867 kvm_fx_save(&vcpu->arch.host_fx_image);
3779 fx_restore(&vcpu->arch.guest_fx_image); 3868 kvm_fx_restore(&vcpu->arch.guest_fx_image);
3780} 3869}
3781EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); 3870EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
3782 3871
@@ -3786,8 +3875,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
3786 return; 3875 return;
3787 3876
3788 vcpu->guest_fpu_loaded = 0; 3877 vcpu->guest_fpu_loaded = 0;
3789 fx_save(&vcpu->arch.guest_fx_image); 3878 kvm_fx_save(&vcpu->arch.guest_fx_image);
3790 fx_restore(&vcpu->arch.host_fx_image); 3879 kvm_fx_restore(&vcpu->arch.host_fx_image);
3791 ++vcpu->stat.fpu_reload; 3880 ++vcpu->stat.fpu_reload;
3792} 3881}
3793EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); 3882EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
@@ -3922,6 +4011,7 @@ struct kvm *kvm_arch_create_vm(void)
3922 return ERR_PTR(-ENOMEM); 4011 return ERR_PTR(-ENOMEM);
3923 4012
3924 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 4013 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4014 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
3925 4015
3926 return kvm; 4016 return kvm;
3927} 4017}
@@ -3954,6 +4044,8 @@ static void kvm_free_vcpus(struct kvm *kvm)
3954 4044
3955void kvm_arch_destroy_vm(struct kvm *kvm) 4045void kvm_arch_destroy_vm(struct kvm *kvm)
3956{ 4046{
4047 kvm_iommu_unmap_guest(kvm);
4048 kvm_free_all_assigned_devices(kvm);
3957 kvm_free_pit(kvm); 4049 kvm_free_pit(kvm);
3958 kfree(kvm->arch.vpic); 4050 kfree(kvm->arch.vpic);
3959 kfree(kvm->arch.vioapic); 4051 kfree(kvm->arch.vioapic);
@@ -3979,16 +4071,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
3979 */ 4071 */
3980 if (!user_alloc) { 4072 if (!user_alloc) {
3981 if (npages && !old.rmap) { 4073 if (npages && !old.rmap) {
4074 unsigned long userspace_addr;
4075
3982 down_write(&current->mm->mmap_sem); 4076 down_write(&current->mm->mmap_sem);
3983 memslot->userspace_addr = do_mmap(NULL, 0, 4077 userspace_addr = do_mmap(NULL, 0,
3984 npages * PAGE_SIZE, 4078 npages * PAGE_SIZE,
3985 PROT_READ | PROT_WRITE, 4079 PROT_READ | PROT_WRITE,
3986 MAP_SHARED | MAP_ANONYMOUS, 4080 MAP_PRIVATE | MAP_ANONYMOUS,
3987 0); 4081 0);
3988 up_write(&current->mm->mmap_sem); 4082 up_write(&current->mm->mmap_sem);
3989 4083
3990 if (IS_ERR((void *)memslot->userspace_addr)) 4084 if (IS_ERR((void *)userspace_addr))
3991 return PTR_ERR((void *)memslot->userspace_addr); 4085 return PTR_ERR((void *)userspace_addr);
4086
4087 /* set userspace_addr atomically for kvm_hva_to_rmapp */
4088 spin_lock(&kvm->mmu_lock);
4089 memslot->userspace_addr = userspace_addr;
4090 spin_unlock(&kvm->mmu_lock);
3992 } else { 4091 } else {
3993 if (!old.user_alloc && old.rmap) { 4092 if (!old.user_alloc && old.rmap) {
3994 int ret; 4093 int ret;
@@ -4016,6 +4115,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4016 return 0; 4115 return 0;
4017} 4116}
4018 4117
4118void kvm_arch_flush_shadow(struct kvm *kvm)
4119{
4120 kvm_mmu_zap_all(kvm);
4121}
4122
4019int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 4123int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
4020{ 4124{
4021 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 4125 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
new file mode 100644
index 000000000000..6a4be78a7384
--- /dev/null
+++ b/arch/x86/kvm/x86.h
@@ -0,0 +1,22 @@
1#ifndef ARCH_X86_KVM_X86_H
2#define ARCH_X86_KVM_X86_H
3
4#include <linux/kvm_host.h>
5
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{
8 vcpu->arch.exception.pending = false;
9}
10
11static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector)
12{
13 vcpu->arch.interrupt.pending = true;
14 vcpu->arch.interrupt.nr = vector;
15}
16
17static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu)
18{
19 vcpu->arch.interrupt.pending = false;
20}
21
22#endif
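Illustrative aside, not part of the patch: the new helpers simply latch or clear a pending event on the vcpu. A standalone mock of the same pattern, using a hypothetical struct in place of kvm_vcpu:

#include <stdbool.h>
#include <stdio.h>

struct mock_vcpu {                          /* hypothetical stand-in for kvm_vcpu */
	struct { bool pending; unsigned char nr; } interrupt;
	struct { bool pending; } exception;
};

static void mock_queue_interrupt(struct mock_vcpu *v, unsigned char vector)
{
	v->interrupt.pending = true;
	v->interrupt.nr = vector;
}

static void mock_clear_interrupt_queue(struct mock_vcpu *v)
{
	v->interrupt.pending = false;
}

int main(void)
{
	struct mock_vcpu v = { 0 };

	mock_queue_interrupt(&v, 0x20);         /* example vector */
	printf("pending=%d vector=%#x\n", v.interrupt.pending, v.interrupt.nr);
	mock_clear_interrupt_queue(&v);
	printf("pending=%d\n", v.interrupt.pending);
	return 0;
}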
diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c
index 932f216d890c..ea051173b0da 100644
--- a/arch/x86/kvm/x86_emulate.c
+++ b/arch/x86/kvm/x86_emulate.c
@@ -26,6 +26,7 @@
26#define DPRINTF(_f, _a ...) printf(_f , ## _a) 26#define DPRINTF(_f, _a ...) printf(_f , ## _a)
27#else 27#else
28#include <linux/kvm_host.h> 28#include <linux/kvm_host.h>
29#include "kvm_cache_regs.h"
29#define DPRINTF(x...) do {} while (0) 30#define DPRINTF(x...) do {} while (0)
30#endif 31#endif
31#include <linux/module.h> 32#include <linux/module.h>
@@ -46,25 +47,26 @@
46#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 47#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
47#define DstReg (2<<1) /* Register operand. */ 48#define DstReg (2<<1) /* Register operand. */
48#define DstMem (3<<1) /* Memory operand. */ 49#define DstMem (3<<1) /* Memory operand. */
49#define DstMask (3<<1) 50#define DstAcc (4<<1) /* Destination Accumulator */
51#define DstMask (7<<1)
50/* Source operand type. */ 52/* Source operand type. */
51#define SrcNone (0<<3) /* No source operand. */ 53#define SrcNone (0<<4) /* No source operand. */
52#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ 54#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */
53#define SrcReg (1<<3) /* Register operand. */ 55#define SrcReg (1<<4) /* Register operand. */
54#define SrcMem (2<<3) /* Memory operand. */ 56#define SrcMem (2<<4) /* Memory operand. */
55#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ 57#define SrcMem16 (3<<4) /* Memory operand (16-bit). */
56#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ 58#define SrcMem32 (4<<4) /* Memory operand (32-bit). */
57#define SrcImm (5<<3) /* Immediate operand. */ 59#define SrcImm (5<<4) /* Immediate operand. */
58#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ 60#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */
59#define SrcMask (7<<3) 61#define SrcMask (7<<4)
60/* Generic ModRM decode. */ 62/* Generic ModRM decode. */
61#define ModRM (1<<6) 63#define ModRM (1<<7)
62/* Destination is only written; never read. */ 64/* Destination is only written; never read. */
63#define Mov (1<<7) 65#define Mov (1<<8)
64#define BitOp (1<<8) 66#define BitOp (1<<9)
65#define MemAbs (1<<9) /* Memory operand is absolute displacement */ 67#define MemAbs (1<<10) /* Memory operand is absolute displacement */
66#define String (1<<10) /* String instruction (rep capable) */ 68#define String (1<<12) /* String instruction (rep capable) */
67#define Stack (1<<11) /* Stack instruction (push/pop) */ 69#define Stack (1<<13) /* Stack instruction (push/pop) */
68#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 70#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
69#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 71#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
70#define GroupMask 0xff /* Group number stored in bits 0:7 */ 72#define GroupMask 0xff /* Group number stored in bits 0:7 */
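Illustrative aside, not part of the patch: these defines form a packed flag word; the change widens the destination field to three bits (so DstAcc fits) and moves the source field up to bits 4-6. A standalone sketch that extracts both fields from one of the new table entries:

#include <stdio.h>
#include <stdint.h>

#define ByteOp  (1 << 0)
#define DstAcc  (4 << 1)
#define DstMask (7 << 1)
#define SrcImm  (5 << 4)
#define SrcMask (7 << 4)

int main(void)
{
	uint16_t d = ByteOp | DstAcc | SrcImm;   /* e.g. the new cmp al,imm8 entry */

	printf("dst=%d src=%d byteop=%d\n",
	       (d & DstMask) >> 1, (d & SrcMask) >> 4, d & ByteOp);
	return 0;
}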
@@ -94,7 +96,7 @@ static u16 opcode_table[256] = {
94 /* 0x20 - 0x27 */ 96 /* 0x20 - 0x27 */
95 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
96 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
97 SrcImmByte, SrcImm, 0, 0, 99 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
98 /* 0x28 - 0x2F */ 100 /* 0x28 - 0x2F */
99 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
100 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
@@ -106,7 +108,8 @@ static u16 opcode_table[256] = {
106 /* 0x38 - 0x3F */ 108 /* 0x38 - 0x3F */
107 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
108 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
109 0, 0, 0, 0, 111 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
112 0, 0,
110 /* 0x40 - 0x47 */ 113 /* 0x40 - 0x47 */
111 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 114 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
112 /* 0x48 - 0x4F */ 115 /* 0x48 - 0x4F */
@@ -121,7 +124,7 @@ static u16 opcode_table[256] = {
121 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 124 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
122 0, 0, 0, 0, 125 0, 0, 0, 0,
123 /* 0x68 - 0x6F */ 126 /* 0x68 - 0x6F */
124 0, 0, ImplicitOps | Mov | Stack, 0, 127 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
125 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ 128 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */
126 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ 129 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */
127 /* 0x70 - 0x77 */ 130 /* 0x70 - 0x77 */
@@ -138,9 +141,11 @@ static u16 opcode_table[256] = {
138 /* 0x88 - 0x8F */ 141 /* 0x88 - 0x8F */
139 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 142 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
140 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 143 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
141 0, ModRM | DstReg, 0, Group | Group1A, 144 DstMem | SrcReg | ModRM | Mov, ModRM | DstReg,
142 /* 0x90 - 0x9F */ 145 DstReg | SrcMem | ModRM | Mov, Group | Group1A,
143 0, 0, 0, 0, 0, 0, 0, 0, 146 /* 0x90 - 0x97 */
147 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
148 /* 0x98 - 0x9F */
144 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 149 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
145 /* 0xA0 - 0xA7 */ 150 /* 0xA0 - 0xA7 */
146 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 151 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -151,8 +156,16 @@ static u16 opcode_table[256] = {
151 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 156 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
152 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, 157 ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String,
153 ByteOp | ImplicitOps | String, ImplicitOps | String, 158 ByteOp | ImplicitOps | String, ImplicitOps | String,
154 /* 0xB0 - 0xBF */ 159 /* 0xB0 - 0xB7 */
155 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 160 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
161 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
162 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
163 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
164 /* 0xB8 - 0xBF */
165 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
166 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
167 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
168 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
156 /* 0xC0 - 0xC7 */ 169 /* 0xC0 - 0xC7 */
157 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 170 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
158 0, ImplicitOps | Stack, 0, 0, 171 0, ImplicitOps | Stack, 0, 0,
@@ -166,16 +179,20 @@ static u16 opcode_table[256] = {
166 /* 0xD8 - 0xDF */ 179 /* 0xD8 - 0xDF */
167 0, 0, 0, 0, 0, 0, 0, 0, 180 0, 0, 0, 0, 0, 0, 0, 0,
168 /* 0xE0 - 0xE7 */ 181 /* 0xE0 - 0xE7 */
169 0, 0, 0, 0, 0, 0, 0, 0,
170 /* 0xE8 - 0xEF */
171 ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps,
172 0, 0, 0, 0, 182 0, 0, 0, 0,
183 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
184 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
185 /* 0xE8 - 0xEF */
186 ImplicitOps | Stack, SrcImm | ImplicitOps,
187 ImplicitOps, SrcImmByte | ImplicitOps,
188 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
189 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
173 /* 0xF0 - 0xF7 */ 190 /* 0xF0 - 0xF7 */
174 0, 0, 0, 0, 191 0, 0, 0, 0,
175 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 192 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3,
176 /* 0xF8 - 0xFF */ 193 /* 0xF8 - 0xFF */
177 ImplicitOps, 0, ImplicitOps, ImplicitOps, 194 ImplicitOps, 0, ImplicitOps, ImplicitOps,
178 0, 0, Group | Group4, Group | Group5, 195 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
179}; 196};
180 197
181static u16 twobyte_table[256] = { 198static u16 twobyte_table[256] = {
@@ -215,7 +232,7 @@ static u16 twobyte_table[256] = {
215 /* 0xA0 - 0xA7 */ 232 /* 0xA0 - 0xA7 */
216 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 233 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0,
217 /* 0xA8 - 0xAF */ 234 /* 0xA8 - 0xAF */
218 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, 235 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0,
219 /* 0xB0 - 0xB7 */ 236 /* 0xB0 - 0xB7 */
220 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 237 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0,
221 DstMem | SrcReg | ModRM | BitOp, 238 DstMem | SrcReg | ModRM | BitOp,
@@ -264,15 +281,16 @@ static u16 group_table[] = {
264 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 281 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
265 0, 0, 0, 0, 282 0, 0, 0, 0,
266 [Group3*8] = 283 [Group3*8] =
267 DstMem | SrcImm | ModRM | SrcImm, 0, 284 DstMem | SrcImm | ModRM, 0,
268 DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 285 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
269 0, 0, 0, 0, 286 0, 0, 0, 0,
270 [Group4*8] = 287 [Group4*8] =
271 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 288 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
272 0, 0, 0, 0, 0, 0, 289 0, 0, 0, 0, 0, 0,
273 [Group5*8] = 290 [Group5*8] =
274 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 291 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
275 SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, 292 SrcMem | ModRM | Stack, 0,
293 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
276 [Group7*8] = 294 [Group7*8] =
277 0, 0, ModRM | SrcMem, ModRM | SrcMem, 295 0, 0, ModRM | SrcMem, ModRM | SrcMem,
278 SrcNone | ModRM | DstMem | Mov, 0, 296 SrcNone | ModRM | DstMem | Mov, 0,
@@ -518,6 +536,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
518 register_address_increment(c, &c->eip, rel); 536 register_address_increment(c, &c->eip, rel);
519} 537}
520 538
539static void set_seg_override(struct decode_cache *c, int seg)
540{
541 c->has_seg_override = true;
542 c->seg_override = seg;
543}
544
545static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg)
546{
547 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
548 return 0;
549
550 return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg);
551}
552
553static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt,
554 struct decode_cache *c)
555{
556 if (!c->has_seg_override)
557 return 0;
558
559 return seg_base(ctxt, c->seg_override);
560}
561
562static unsigned long es_base(struct x86_emulate_ctxt *ctxt)
563{
564 return seg_base(ctxt, VCPU_SREG_ES);
565}
566
567static unsigned long ss_base(struct x86_emulate_ctxt *ctxt)
568{
569 return seg_base(ctxt, VCPU_SREG_SS);
570}
571
521static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 572static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
522 struct x86_emulate_ops *ops, 573 struct x86_emulate_ops *ops,
523 unsigned long linear, u8 *dest) 574 unsigned long linear, u8 *dest)
@@ -660,7 +711,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
660{ 711{
661 struct decode_cache *c = &ctxt->decode; 712 struct decode_cache *c = &ctxt->decode;
662 u8 sib; 713 u8 sib;
663 int index_reg = 0, base_reg = 0, scale, rip_relative = 0; 714 int index_reg = 0, base_reg = 0, scale;
664 int rc = 0; 715 int rc = 0;
665 716
666 if (c->rex_prefix) { 717 if (c->rex_prefix) {
@@ -731,47 +782,28 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
731 } 782 }
732 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 783 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
733 (c->modrm_rm == 6 && c->modrm_mod != 0)) 784 (c->modrm_rm == 6 && c->modrm_mod != 0))
734 if (!c->override_base) 785 if (!c->has_seg_override)
735 c->override_base = &ctxt->ss_base; 786 set_seg_override(c, VCPU_SREG_SS);
736 c->modrm_ea = (u16)c->modrm_ea; 787 c->modrm_ea = (u16)c->modrm_ea;
737 } else { 788 } else {
738 /* 32/64-bit ModR/M decode. */ 789 /* 32/64-bit ModR/M decode. */
739 switch (c->modrm_rm) { 790 if ((c->modrm_rm & 7) == 4) {
740 case 4:
741 case 12:
742 sib = insn_fetch(u8, 1, c->eip); 791 sib = insn_fetch(u8, 1, c->eip);
743 index_reg |= (sib >> 3) & 7; 792 index_reg |= (sib >> 3) & 7;
744 base_reg |= sib & 7; 793 base_reg |= sib & 7;
745 scale = sib >> 6; 794 scale = sib >> 6;
746 795
747 switch (base_reg) { 796 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
748 case 5: 797 c->modrm_ea += insn_fetch(s32, 4, c->eip);
749 if (c->modrm_mod != 0) 798 else
750 c->modrm_ea += c->regs[base_reg];
751 else
752 c->modrm_ea +=
753 insn_fetch(s32, 4, c->eip);
754 break;
755 default:
756 c->modrm_ea += c->regs[base_reg]; 799 c->modrm_ea += c->regs[base_reg];
757 } 800 if (index_reg != 4)
758 switch (index_reg) {
759 case 4:
760 break;
761 default:
762 c->modrm_ea += c->regs[index_reg] << scale; 801 c->modrm_ea += c->regs[index_reg] << scale;
763 } 802 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
764 break; 803 if (ctxt->mode == X86EMUL_MODE_PROT64)
765 case 5: 804 c->rip_relative = 1;
766 if (c->modrm_mod != 0) 805 } else
767 c->modrm_ea += c->regs[c->modrm_rm];
768 else if (ctxt->mode == X86EMUL_MODE_PROT64)
769 rip_relative = 1;
770 break;
771 default:
772 c->modrm_ea += c->regs[c->modrm_rm]; 806 c->modrm_ea += c->regs[c->modrm_rm];
773 break;
774 }
775 switch (c->modrm_mod) { 807 switch (c->modrm_mod) {
776 case 0: 808 case 0:
777 if (c->modrm_rm == 5) 809 if (c->modrm_rm == 5)
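Illustrative aside, not part of the patch: the reworked 32-bit SIB handling above computes the effective address as base register plus (index register << scale), where index 4 means "no index" and base 5 with mod 0 takes a 32-bit displacement instead. A standalone sketch with made-up register values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint8_t sib = 0x8d;                     /* scale=2, index=1, base=5 */
	uint32_t regs[8] = { 0, 0x10, 0, 0, 0, 0x2000, 0, 0 };  /* hypothetical */
	unsigned scale = sib >> 6;
	unsigned index = (sib >> 3) & 7;
	unsigned base  = sib & 7;
	int mod = 1;                            /* assume mod != 0 here */
	uint32_t ea = 0;

	if (!(base == 5 && mod == 0))           /* base 5 + mod 0 would mean disp32 */
		ea += regs[base];
	if (index != 4)                         /* index 4 means "no index" */
		ea += regs[index] << scale;

	printf("ea=%#x\n", ea);                 /* 0x2000 + (0x10 << 2) = 0x2040 */
	return 0;
}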
@@ -785,22 +817,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
785 break; 817 break;
786 } 818 }
787 } 819 }
788 if (rip_relative) {
789 c->modrm_ea += c->eip;
790 switch (c->d & SrcMask) {
791 case SrcImmByte:
792 c->modrm_ea += 1;
793 break;
794 case SrcImm:
795 if (c->d & ByteOp)
796 c->modrm_ea += 1;
797 else
798 if (c->op_bytes == 8)
799 c->modrm_ea += 4;
800 else
801 c->modrm_ea += c->op_bytes;
802 }
803 }
804done: 820done:
805 return rc; 821 return rc;
806} 822}
@@ -837,7 +853,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
837 /* Shadow copy of register state. Committed on successful emulation. */ 853 /* Shadow copy of register state. Committed on successful emulation. */
838 854
839 memset(c, 0, sizeof(struct decode_cache)); 855 memset(c, 0, sizeof(struct decode_cache));
840 c->eip = ctxt->vcpu->arch.rip; 856 c->eip = kvm_rip_read(ctxt->vcpu);
857 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
841 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 858 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
842 859
843 switch (mode) { 860 switch (mode) {
@@ -876,23 +893,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
876 /* switch between 2/4 bytes */ 893 /* switch between 2/4 bytes */
877 c->ad_bytes = def_ad_bytes ^ 6; 894 c->ad_bytes = def_ad_bytes ^ 6;
878 break; 895 break;
896 case 0x26: /* ES override */
879 case 0x2e: /* CS override */ 897 case 0x2e: /* CS override */
880 c->override_base = &ctxt->cs_base; 898 case 0x36: /* SS override */
881 break;
882 case 0x3e: /* DS override */ 899 case 0x3e: /* DS override */
883 c->override_base = &ctxt->ds_base; 900 set_seg_override(c, (c->b >> 3) & 3);
884 break;
885 case 0x26: /* ES override */
886 c->override_base = &ctxt->es_base;
887 break; 901 break;
888 case 0x64: /* FS override */ 902 case 0x64: /* FS override */
889 c->override_base = &ctxt->fs_base;
890 break;
891 case 0x65: /* GS override */ 903 case 0x65: /* GS override */
892 c->override_base = &ctxt->gs_base; 904 set_seg_override(c, c->b & 7);
893 break;
894 case 0x36: /* SS override */
895 c->override_base = &ctxt->ss_base;
896 break; 905 break;
897 case 0x40 ... 0x4f: /* REX */ 906 case 0x40 ... 0x4f: /* REX */
898 if (mode != X86EMUL_MODE_PROT64) 907 if (mode != X86EMUL_MODE_PROT64)
@@ -964,15 +973,11 @@ done_prefixes:
964 if (rc) 973 if (rc)
965 goto done; 974 goto done;
966 975
967 if (!c->override_base) 976 if (!c->has_seg_override)
968 c->override_base = &ctxt->ds_base; 977 set_seg_override(c, VCPU_SREG_DS);
969 if (mode == X86EMUL_MODE_PROT64 &&
970 c->override_base != &ctxt->fs_base &&
971 c->override_base != &ctxt->gs_base)
972 c->override_base = NULL;
973 978
974 if (c->override_base) 979 if (!(!c->twobyte && c->b == 0x8d))
975 c->modrm_ea += *c->override_base; 980 c->modrm_ea += seg_override_base(ctxt, c);
976 981
977 if (c->ad_bytes != 8) 982 if (c->ad_bytes != 8)
978 c->modrm_ea = (u32)c->modrm_ea; 983 c->modrm_ea = (u32)c->modrm_ea;
@@ -1049,6 +1054,7 @@ done_prefixes:
1049 break; 1054 break;
1050 case DstMem: 1055 case DstMem:
1051 if ((c->d & ModRM) && c->modrm_mod == 3) { 1056 if ((c->d & ModRM) && c->modrm_mod == 3) {
1057 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1052 c->dst.type = OP_REG; 1058 c->dst.type = OP_REG;
1053 c->dst.val = c->dst.orig_val = c->modrm_val; 1059 c->dst.val = c->dst.orig_val = c->modrm_val;
1054 c->dst.ptr = c->modrm_ptr; 1060 c->dst.ptr = c->modrm_ptr;
@@ -1056,8 +1062,28 @@ done_prefixes:
1056 } 1062 }
1057 c->dst.type = OP_MEM; 1063 c->dst.type = OP_MEM;
1058 break; 1064 break;
1065 case DstAcc:
1066 c->dst.type = OP_REG;
1067 c->dst.bytes = c->op_bytes;
1068 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1069 switch (c->op_bytes) {
1070 case 1:
1071 c->dst.val = *(u8 *)c->dst.ptr;
1072 break;
1073 case 2:
1074 c->dst.val = *(u16 *)c->dst.ptr;
1075 break;
1076 case 4:
1077 c->dst.val = *(u32 *)c->dst.ptr;
1078 break;
1079 }
1080 c->dst.orig_val = c->dst.val;
1081 break;
1059 } 1082 }
1060 1083
1084 if (c->rip_relative)
1085 c->modrm_ea += c->eip;
1086
1061done: 1087done:
1062 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 1088 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1063} 1089}
@@ -1070,7 +1096,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt)
1070 c->dst.bytes = c->op_bytes; 1096 c->dst.bytes = c->op_bytes;
1071 c->dst.val = c->src.val; 1097 c->dst.val = c->src.val;
1072 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1098 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1073 c->dst.ptr = (void *) register_address(c, ctxt->ss_base, 1099 c->dst.ptr = (void *) register_address(c, ss_base(ctxt),
1074 c->regs[VCPU_REGS_RSP]); 1100 c->regs[VCPU_REGS_RSP]);
1075} 1101}
1076 1102
@@ -1080,7 +1106,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1080 struct decode_cache *c = &ctxt->decode; 1106 struct decode_cache *c = &ctxt->decode;
1081 int rc; 1107 int rc;
1082 1108
1083 rc = ops->read_std(register_address(c, ctxt->ss_base, 1109 rc = ops->read_std(register_address(c, ss_base(ctxt),
1084 c->regs[VCPU_REGS_RSP]), 1110 c->regs[VCPU_REGS_RSP]),
1085 &c->dst.val, c->dst.bytes, ctxt->vcpu); 1111 &c->dst.val, c->dst.bytes, ctxt->vcpu);
1086 if (rc != 0) 1112 if (rc != 0)
@@ -1156,6 +1182,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1156 case 1: /* dec */ 1182 case 1: /* dec */
1157 emulate_1op("dec", c->dst, ctxt->eflags); 1183 emulate_1op("dec", c->dst, ctxt->eflags);
1158 break; 1184 break;
1185 case 2: /* call near abs */ {
1186 long int old_eip;
1187 old_eip = c->eip;
1188 c->eip = c->src.val;
1189 c->src.val = old_eip;
1190 emulate_push(ctxt);
1191 break;
1192 }
1159 case 4: /* jmp abs */ 1193 case 4: /* jmp abs */
1160 c->eip = c->src.val; 1194 c->eip = c->src.val;
1161 break; 1195 break;
@@ -1256,6 +1290,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1256 u64 msr_data; 1290 u64 msr_data;
1257 unsigned long saved_eip = 0; 1291 unsigned long saved_eip = 0;
1258 struct decode_cache *c = &ctxt->decode; 1292 struct decode_cache *c = &ctxt->decode;
1293 unsigned int port;
1294 int io_dir_in;
1259 int rc = 0; 1295 int rc = 0;
1260 1296
1261 /* Shadow copy of register state. Committed on successful emulation. 1297 /* Shadow copy of register state. Committed on successful emulation.
@@ -1272,7 +1308,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1272 if (c->rep_prefix && (c->d & String)) { 1308 if (c->rep_prefix && (c->d & String)) {
1273 /* All REP prefixes have the same first termination condition */ 1309 /* All REP prefixes have the same first termination condition */
1274 if (c->regs[VCPU_REGS_RCX] == 0) { 1310 if (c->regs[VCPU_REGS_RCX] == 0) {
1275 ctxt->vcpu->arch.rip = c->eip; 1311 kvm_rip_write(ctxt->vcpu, c->eip);
1276 goto done; 1312 goto done;
1277 } 1313 }
1278 /* The second termination condition only applies for REPE 1314 /* The second termination condition only applies for REPE
@@ -1286,17 +1322,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1286 (c->b == 0xae) || (c->b == 0xaf)) { 1322 (c->b == 0xae) || (c->b == 0xaf)) {
1287 if ((c->rep_prefix == REPE_PREFIX) && 1323 if ((c->rep_prefix == REPE_PREFIX) &&
1288 ((ctxt->eflags & EFLG_ZF) == 0)) { 1324 ((ctxt->eflags & EFLG_ZF) == 0)) {
1289 ctxt->vcpu->arch.rip = c->eip; 1325 kvm_rip_write(ctxt->vcpu, c->eip);
1290 goto done; 1326 goto done;
1291 } 1327 }
1292 if ((c->rep_prefix == REPNE_PREFIX) && 1328 if ((c->rep_prefix == REPNE_PREFIX) &&
1293 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { 1329 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) {
1294 ctxt->vcpu->arch.rip = c->eip; 1330 kvm_rip_write(ctxt->vcpu, c->eip);
1295 goto done; 1331 goto done;
1296 } 1332 }
1297 } 1333 }
1298 c->regs[VCPU_REGS_RCX]--; 1334 c->regs[VCPU_REGS_RCX]--;
1299 c->eip = ctxt->vcpu->arch.rip; 1335 c->eip = kvm_rip_read(ctxt->vcpu);
1300 } 1336 }
1301 1337
1302 if (c->src.type == OP_MEM) { 1338 if (c->src.type == OP_MEM) {
@@ -1356,27 +1392,10 @@ special_insn:
1356 sbb: /* sbb */ 1392 sbb: /* sbb */
1357 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1393 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1358 break; 1394 break;
1359 case 0x20 ... 0x23: 1395 case 0x20 ... 0x25:
1360 and: /* and */ 1396 and: /* and */
1361 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1397 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
1362 break; 1398 break;
1363 case 0x24: /* and al imm8 */
1364 c->dst.type = OP_REG;
1365 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1366 c->dst.val = *(u8 *)c->dst.ptr;
1367 c->dst.bytes = 1;
1368 c->dst.orig_val = c->dst.val;
1369 goto and;
1370 case 0x25: /* and ax imm16, or eax imm32 */
1371 c->dst.type = OP_REG;
1372 c->dst.bytes = c->op_bytes;
1373 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1374 if (c->op_bytes == 2)
1375 c->dst.val = *(u16 *)c->dst.ptr;
1376 else
1377 c->dst.val = *(u32 *)c->dst.ptr;
1378 c->dst.orig_val = c->dst.val;
1379 goto and;
1380 case 0x28 ... 0x2d: 1399 case 0x28 ... 0x2d:
1381 sub: /* sub */ 1400 sub: /* sub */
1382 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); 1401 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
@@ -1402,11 +1421,11 @@ special_insn:
1402 register_address_increment(c, &c->regs[VCPU_REGS_RSP], 1421 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1403 -c->op_bytes); 1422 -c->op_bytes);
1404 c->dst.ptr = (void *) register_address( 1423 c->dst.ptr = (void *) register_address(
1405 c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); 1424 c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]);
1406 break; 1425 break;
1407 case 0x58 ... 0x5f: /* pop reg */ 1426 case 0x58 ... 0x5f: /* pop reg */
1408 pop_instruction: 1427 pop_instruction:
1409 if ((rc = ops->read_std(register_address(c, ctxt->ss_base, 1428 if ((rc = ops->read_std(register_address(c, ss_base(ctxt),
1410 c->regs[VCPU_REGS_RSP]), c->dst.ptr, 1429 c->regs[VCPU_REGS_RSP]), c->dst.ptr,
1411 c->op_bytes, ctxt->vcpu)) != 0) 1430 c->op_bytes, ctxt->vcpu)) != 0)
1412 goto done; 1431 goto done;
@@ -1420,9 +1439,8 @@ special_insn:
1420 goto cannot_emulate; 1439 goto cannot_emulate;
1421 c->dst.val = (s32) c->src.val; 1440 c->dst.val = (s32) c->src.val;
1422 break; 1441 break;
1442 case 0x68: /* push imm */
1423 case 0x6a: /* push imm8 */ 1443 case 0x6a: /* push imm8 */
1424 c->src.val = 0L;
1425 c->src.val = insn_fetch(s8, 1, c->eip);
1426 emulate_push(ctxt); 1444 emulate_push(ctxt);
1427 break; 1445 break;
1428 case 0x6c: /* insb */ 1446 case 0x6c: /* insb */
@@ -1433,7 +1451,7 @@ special_insn:
1433 c->rep_prefix ? 1451 c->rep_prefix ?
1434 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1452 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1435 (ctxt->eflags & EFLG_DF), 1453 (ctxt->eflags & EFLG_DF),
1436 register_address(c, ctxt->es_base, 1454 register_address(c, es_base(ctxt),
1437 c->regs[VCPU_REGS_RDI]), 1455 c->regs[VCPU_REGS_RDI]),
1438 c->rep_prefix, 1456 c->rep_prefix,
1439 c->regs[VCPU_REGS_RDX]) == 0) { 1457 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1449,9 +1467,8 @@ special_insn:
1449 c->rep_prefix ? 1467 c->rep_prefix ?
1450 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, 1468 address_mask(c, c->regs[VCPU_REGS_RCX]) : 1,
1451 (ctxt->eflags & EFLG_DF), 1469 (ctxt->eflags & EFLG_DF),
1452 register_address(c, c->override_base ? 1470 register_address(c,
1453 *c->override_base : 1471 seg_override_base(ctxt, c),
1454 ctxt->ds_base,
1455 c->regs[VCPU_REGS_RSI]), 1472 c->regs[VCPU_REGS_RSI]),
1456 c->rep_prefix, 1473 c->rep_prefix,
1457 c->regs[VCPU_REGS_RDX]) == 0) { 1474 c->regs[VCPU_REGS_RDX]) == 0) {
@@ -1490,6 +1507,7 @@ special_insn:
1490 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 1507 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
1491 break; 1508 break;
1492 case 0x86 ... 0x87: /* xchg */ 1509 case 0x86 ... 0x87: /* xchg */
1510 xchg:
1493 /* Write back the register source. */ 1511 /* Write back the register source. */
1494 switch (c->dst.bytes) { 1512 switch (c->dst.bytes) {
1495 case 1: 1513 case 1:
@@ -1514,14 +1532,60 @@ special_insn:
1514 break; 1532 break;
1515 case 0x88 ... 0x8b: /* mov */ 1533 case 0x88 ... 0x8b: /* mov */
1516 goto mov; 1534 goto mov;
1535 case 0x8c: { /* mov r/m, sreg */
1536 struct kvm_segment segreg;
1537
1538 if (c->modrm_reg <= 5)
1539 kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg);
1540 else {
1541 printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n",
1542 c->modrm);
1543 goto cannot_emulate;
1544 }
1545 c->dst.val = segreg.selector;
1546 break;
1547 }
1517 case 0x8d: /* lea r16/r32, m */ 1548 case 0x8d: /* lea r16/r32, m */
1518 c->dst.val = c->modrm_ea; 1549 c->dst.val = c->modrm_ea;
1519 break; 1550 break;
1551 case 0x8e: { /* mov seg, r/m16 */
1552 uint16_t sel;
1553 int type_bits;
1554 int err;
1555
1556 sel = c->src.val;
1557 if (c->modrm_reg <= 5) {
1558 type_bits = (c->modrm_reg == 1) ? 9 : 1;
1559 err = kvm_load_segment_descriptor(ctxt->vcpu, sel,
1560 type_bits, c->modrm_reg);
1561 } else {
1562 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1563 c->modrm);
1564 goto cannot_emulate;
1565 }
1566
1567 if (err < 0)
1568 goto cannot_emulate;
1569
1570 c->dst.type = OP_NONE; /* Disable writeback. */
1571 break;
1572 }
1520 case 0x8f: /* pop (sole member of Grp1a) */ 1573 case 0x8f: /* pop (sole member of Grp1a) */
1521 rc = emulate_grp1a(ctxt, ops); 1574 rc = emulate_grp1a(ctxt, ops);
1522 if (rc != 0) 1575 if (rc != 0)
1523 goto done; 1576 goto done;
1524 break; 1577 break;
1578 case 0x90: /* nop / xchg r8,rax */
1579 if (!(c->rex_prefix & 1)) { /* nop */
1580 c->dst.type = OP_NONE;
1581 break;
1582 }
1583 case 0x91 ... 0x97: /* xchg reg,rax */
1584 c->src.type = c->dst.type = OP_REG;
1585 c->src.bytes = c->dst.bytes = c->op_bytes;
1586 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
1587 c->src.val = *(c->src.ptr);
1588 goto xchg;
1525 case 0x9c: /* pushf */ 1589 case 0x9c: /* pushf */
1526 c->src.val = (unsigned long) ctxt->eflags; 1590 c->src.val = (unsigned long) ctxt->eflags;
1527 emulate_push(ctxt); 1591 emulate_push(ctxt);
@@ -1540,11 +1604,10 @@ special_insn:
1540 c->dst.type = OP_MEM; 1604 c->dst.type = OP_MEM;
1541 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1605 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1542 c->dst.ptr = (unsigned long *)register_address(c, 1606 c->dst.ptr = (unsigned long *)register_address(c,
1543 ctxt->es_base, 1607 es_base(ctxt),
1544 c->regs[VCPU_REGS_RDI]); 1608 c->regs[VCPU_REGS_RDI]);
1545 if ((rc = ops->read_emulated(register_address(c, 1609 if ((rc = ops->read_emulated(register_address(c,
1546 c->override_base ? *c->override_base : 1610 seg_override_base(ctxt, c),
1547 ctxt->ds_base,
1548 c->regs[VCPU_REGS_RSI]), 1611 c->regs[VCPU_REGS_RSI]),
1549 &c->dst.val, 1612 &c->dst.val,
1550 c->dst.bytes, ctxt->vcpu)) != 0) 1613 c->dst.bytes, ctxt->vcpu)) != 0)
@@ -1560,8 +1623,7 @@ special_insn:
1560 c->src.type = OP_NONE; /* Disable writeback. */ 1623 c->src.type = OP_NONE; /* Disable writeback. */
1561 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1624 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1562 c->src.ptr = (unsigned long *)register_address(c, 1625 c->src.ptr = (unsigned long *)register_address(c,
1563 c->override_base ? *c->override_base : 1626 seg_override_base(ctxt, c),
1564 ctxt->ds_base,
1565 c->regs[VCPU_REGS_RSI]); 1627 c->regs[VCPU_REGS_RSI]);
1566 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 1628 if ((rc = ops->read_emulated((unsigned long)c->src.ptr,
1567 &c->src.val, 1629 &c->src.val,
@@ -1572,7 +1634,7 @@ special_insn:
1572 c->dst.type = OP_NONE; /* Disable writeback. */ 1634 c->dst.type = OP_NONE; /* Disable writeback. */
1573 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1635 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1574 c->dst.ptr = (unsigned long *)register_address(c, 1636 c->dst.ptr = (unsigned long *)register_address(c,
1575 ctxt->es_base, 1637 es_base(ctxt),
1576 c->regs[VCPU_REGS_RDI]); 1638 c->regs[VCPU_REGS_RDI]);
1577 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1639 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr,
1578 &c->dst.val, 1640 &c->dst.val,
@@ -1596,7 +1658,7 @@ special_insn:
1596 c->dst.type = OP_MEM; 1658 c->dst.type = OP_MEM;
1597 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1659 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1598 c->dst.ptr = (unsigned long *)register_address(c, 1660 c->dst.ptr = (unsigned long *)register_address(c,
1599 ctxt->es_base, 1661 es_base(ctxt),
1600 c->regs[VCPU_REGS_RDI]); 1662 c->regs[VCPU_REGS_RDI]);
1601 c->dst.val = c->regs[VCPU_REGS_RAX]; 1663 c->dst.val = c->regs[VCPU_REGS_RAX];
1602 register_address_increment(c, &c->regs[VCPU_REGS_RDI], 1664 register_address_increment(c, &c->regs[VCPU_REGS_RDI],
@@ -1608,8 +1670,7 @@ special_insn:
1608 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 1670 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1609 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 1671 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1610 if ((rc = ops->read_emulated(register_address(c, 1672 if ((rc = ops->read_emulated(register_address(c,
1611 c->override_base ? *c->override_base : 1673 seg_override_base(ctxt, c),
1612 ctxt->ds_base,
1613 c->regs[VCPU_REGS_RSI]), 1674 c->regs[VCPU_REGS_RSI]),
1614 &c->dst.val, 1675 &c->dst.val,
1615 c->dst.bytes, 1676 c->dst.bytes,
@@ -1622,6 +1683,8 @@ special_insn:
1622 case 0xae ... 0xaf: /* scas */ 1683 case 0xae ... 0xaf: /* scas */
1623 DPRINTF("Urk! I don't handle SCAS.\n"); 1684 DPRINTF("Urk! I don't handle SCAS.\n");
1624 goto cannot_emulate; 1685 goto cannot_emulate;
1686 case 0xb0 ... 0xbf: /* mov r, imm */
1687 goto mov;
1625 case 0xc0 ... 0xc1: 1688 case 0xc0 ... 0xc1:
1626 emulate_grp2(ctxt); 1689 emulate_grp2(ctxt);
1627 break; 1690 break;
@@ -1640,6 +1703,16 @@ special_insn:
1640 c->src.val = c->regs[VCPU_REGS_RCX]; 1703 c->src.val = c->regs[VCPU_REGS_RCX];
1641 emulate_grp2(ctxt); 1704 emulate_grp2(ctxt);
1642 break; 1705 break;
1706 case 0xe4: /* inb */
1707 case 0xe5: /* in */
1708 port = insn_fetch(u8, 1, c->eip);
1709 io_dir_in = 1;
1710 goto do_io;
1711 case 0xe6: /* outb */
1712 case 0xe7: /* out */
1713 port = insn_fetch(u8, 1, c->eip);
1714 io_dir_in = 0;
1715 goto do_io;
1643 case 0xe8: /* call (near) */ { 1716 case 0xe8: /* call (near) */ {
1644 long int rel; 1717 long int rel;
1645 switch (c->op_bytes) { 1718 switch (c->op_bytes) {
@@ -1660,13 +1733,55 @@ special_insn:
1660 break; 1733 break;
1661 } 1734 }
1662 case 0xe9: /* jmp rel */ 1735 case 0xe9: /* jmp rel */
1663 case 0xeb: /* jmp rel short */ 1736 goto jmp;
1737 case 0xea: /* jmp far */ {
1738 uint32_t eip;
1739 uint16_t sel;
1740
1741 switch (c->op_bytes) {
1742 case 2:
1743 eip = insn_fetch(u16, 2, c->eip);
1744 break;
1745 case 4:
1746 eip = insn_fetch(u32, 4, c->eip);
1747 break;
1748 default:
1749 DPRINTF("jmp far: Invalid op_bytes\n");
1750 goto cannot_emulate;
1751 }
1752 sel = insn_fetch(u16, 2, c->eip);
1753 if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) {
1754 DPRINTF("jmp far: Failed to load CS descriptor\n");
1755 goto cannot_emulate;
1756 }
1757
1758 c->eip = eip;
1759 break;
1760 }
1761 case 0xeb:
1762 jmp: /* jmp rel short */
1664 jmp_rel(c, c->src.val); 1763 jmp_rel(c, c->src.val);
1665 c->dst.type = OP_NONE; /* Disable writeback. */ 1764 c->dst.type = OP_NONE; /* Disable writeback. */
1666 break; 1765 break;
1766 case 0xec: /* in al,dx */
1767 case 0xed: /* in (e/r)ax,dx */
1768 port = c->regs[VCPU_REGS_RDX];
1769 io_dir_in = 1;
1770 goto do_io;
1771 case 0xee: /* out al,dx */
1772 case 0xef: /* out (e/r)ax,dx */
1773 port = c->regs[VCPU_REGS_RDX];
1774 io_dir_in = 0;
1775 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in,
1776 (c->d & ByteOp) ? 1 : c->op_bytes,
1777 port) != 0) {
1778 c->eip = saved_eip;
1779 goto cannot_emulate;
1780 }
1781 return 0;
1667 case 0xf4: /* hlt */ 1782 case 0xf4: /* hlt */
1668 ctxt->vcpu->arch.halt_request = 1; 1783 ctxt->vcpu->arch.halt_request = 1;
1669 goto done; 1784 break;
1670 case 0xf5: /* cmc */ 1785 case 0xf5: /* cmc */
1671 /* complement carry flag from eflags reg */ 1786 /* complement carry flag from eflags reg */
1672 ctxt->eflags ^= EFLG_CF; 1787 ctxt->eflags ^= EFLG_CF;
@@ -1689,6 +1804,14 @@ special_insn:
1689 ctxt->eflags |= X86_EFLAGS_IF; 1804 ctxt->eflags |= X86_EFLAGS_IF;
1690 c->dst.type = OP_NONE; /* Disable writeback. */ 1805 c->dst.type = OP_NONE; /* Disable writeback. */
1691 break; 1806 break;
1807 case 0xfc: /* cld */
1808 ctxt->eflags &= ~EFLG_DF;
1809 c->dst.type = OP_NONE; /* Disable writeback. */
1810 break;
1811 case 0xfd: /* std */
1812 ctxt->eflags |= EFLG_DF;
1813 c->dst.type = OP_NONE; /* Disable writeback. */
1814 break;
1692 case 0xfe ... 0xff: /* Grp4/Grp5 */ 1815 case 0xfe ... 0xff: /* Grp4/Grp5 */
1693 rc = emulate_grp45(ctxt, ops); 1816 rc = emulate_grp45(ctxt, ops);
1694 if (rc != 0) 1817 if (rc != 0)
@@ -1703,7 +1826,7 @@ writeback:
1703 1826
1704 /* Commit shadow register state. */ 1827 /* Commit shadow register state. */
1705 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); 1828 memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs);
1706 ctxt->vcpu->arch.rip = c->eip; 1829 kvm_rip_write(ctxt->vcpu, c->eip);
1707 1830
1708done: 1831done:
1709 if (rc == X86EMUL_UNHANDLEABLE) { 1832 if (rc == X86EMUL_UNHANDLEABLE) {
@@ -1728,7 +1851,7 @@ twobyte_insn:
1728 goto done; 1851 goto done;
1729 1852
1730 /* Let the processor re-execute the fixed hypercall */ 1853 /* Let the processor re-execute the fixed hypercall */
1731 c->eip = ctxt->vcpu->arch.rip; 1854 c->eip = kvm_rip_read(ctxt->vcpu);
1732 /* Disable writeback. */ 1855 /* Disable writeback. */
1733 c->dst.type = OP_NONE; 1856 c->dst.type = OP_NONE;
1734 break; 1857 break;
@@ -1824,7 +1947,7 @@ twobyte_insn:
1824 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); 1947 rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data);
1825 if (rc) { 1948 if (rc) {
1826 kvm_inject_gp(ctxt->vcpu, 0); 1949 kvm_inject_gp(ctxt->vcpu, 0);
1827 c->eip = ctxt->vcpu->arch.rip; 1950 c->eip = kvm_rip_read(ctxt->vcpu);
1828 } 1951 }
1829 rc = X86EMUL_CONTINUE; 1952 rc = X86EMUL_CONTINUE;
1830 c->dst.type = OP_NONE; 1953 c->dst.type = OP_NONE;
@@ -1834,7 +1957,7 @@ twobyte_insn:
1834 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); 1957 rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data);
1835 if (rc) { 1958 if (rc) {
1836 kvm_inject_gp(ctxt->vcpu, 0); 1959 kvm_inject_gp(ctxt->vcpu, 0);
1837 c->eip = ctxt->vcpu->arch.rip; 1960 c->eip = kvm_rip_read(ctxt->vcpu);
1838 } else { 1961 } else {
1839 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 1962 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
1840 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 1963 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
@@ -1882,6 +2005,8 @@ twobyte_insn:
1882 c->src.val &= (c->dst.bytes << 3) - 1; 2005 c->src.val &= (c->dst.bytes << 3) - 1;
1883 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 2006 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
1884 break; 2007 break;
2008 case 0xae: /* clflush */
2009 break;
1885 case 0xb0 ... 0xb1: /* cmpxchg */ 2010 case 0xb0 ... 0xb1: /* cmpxchg */
1886 /* 2011 /*
1887 * Save real source value, then compare EAX against 2012 * Save real source value, then compare EAX against
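The bulk of the x86_emulate.c hunks above replace the per-segment override_base pointers with a single recorded segment index: the decoder calls set_seg_override() when it sees a prefix byte and resolves the base lazily through seg_override_base(). The index falls straight out of the prefix byte encoding. The stand-alone sketch below is hypothetical userspace code, not part of the patch; the ES=0 .. GS=5 ordering mirrors KVM's VCPU_SREG_* enumeration.

/*
 * Hypothetical userspace sketch (not the emulator code): how the six
 * segment-override prefix bytes collapse onto one segment index, which is
 * what set_seg_override() records instead of a per-segment base pointer.
 */
#include <stdio.h>

static const char * const seg_name[] = { "ES", "CS", "SS", "DS", "FS", "GS" };

static int seg_from_prefix(unsigned char b)
{
	switch (b) {
	case 0x26: case 0x2e: case 0x36: case 0x3e:
		return (b >> 3) & 3;		/* ES, CS, SS, DS */
	case 0x64: case 0x65:
		return b & 7;			/* FS, GS */
	default:
		return -1;			/* not an override prefix */
	}
}

int main(void)
{
	unsigned char prefixes[] = { 0x26, 0x2e, 0x36, 0x3e, 0x64, 0x65 };
	unsigned int i;

	for (i = 0; i < sizeof(prefixes); i++)
		printf("prefix 0x%02x -> segment %s\n",
		       prefixes[i], seg_name[seg_from_prefix(prefixes[i])]);
	return 0;
}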
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 50dad44fb542..48ee4f9435f4 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -55,6 +55,7 @@
55#include <linux/lguest_launcher.h> 55#include <linux/lguest_launcher.h>
56#include <linux/virtio_console.h> 56#include <linux/virtio_console.h>
57#include <linux/pm.h> 57#include <linux/pm.h>
58#include <asm/apic.h>
58#include <asm/lguest.h> 59#include <asm/lguest.h>
59#include <asm/paravirt.h> 60#include <asm/paravirt.h>
60#include <asm/param.h> 61#include <asm/param.h>
@@ -581,7 +582,7 @@ static void __init lguest_init_IRQ(void)
581 for (i = 0; i < LGUEST_IRQS; i++) { 582 for (i = 0; i < LGUEST_IRQS; i++) {
582 int vector = FIRST_EXTERNAL_VECTOR + i; 583 int vector = FIRST_EXTERNAL_VECTOR + i;
583 if (vector != SYSCALL_VECTOR) { 584 if (vector != SYSCALL_VECTOR) {
584 set_intr_gate(vector, interrupt[i]); 585 set_intr_gate(vector, interrupt[vector]);
585 set_irq_chip_and_handler_name(i, &lguest_irq_controller, 586 set_irq_chip_and_handler_name(i, &lguest_irq_controller,
586 handle_level_irq, 587 handle_level_irq,
587 "level"); 588 "level");
@@ -783,14 +784,44 @@ static void lguest_wbinvd(void)
783 * code qualifies for Advanced. It will also never interrupt anything. It 784 * code qualifies for Advanced. It will also never interrupt anything. It
784 * does, however, allow us to get through the Linux boot code. */ 785 * does, however, allow us to get through the Linux boot code. */
785#ifdef CONFIG_X86_LOCAL_APIC 786#ifdef CONFIG_X86_LOCAL_APIC
786static void lguest_apic_write(unsigned long reg, u32 v) 787static void lguest_apic_write(u32 reg, u32 v)
787{ 788{
788} 789}
789 790
790static u32 lguest_apic_read(unsigned long reg) 791static u32 lguest_apic_read(u32 reg)
791{ 792{
792 return 0; 793 return 0;
793} 794}
795
796static u64 lguest_apic_icr_read(void)
797{
798 return 0;
799}
800
801static void lguest_apic_icr_write(u32 low, u32 id)
802{
803 /* Warn to see if there are any stray references */
804 WARN_ON(1);
805}
806
807static void lguest_apic_wait_icr_idle(void)
808{
809 return;
810}
811
812static u32 lguest_apic_safe_wait_icr_idle(void)
813{
814 return 0;
815}
816
817static struct apic_ops lguest_basic_apic_ops = {
818 .read = lguest_apic_read,
819 .write = lguest_apic_write,
820 .icr_read = lguest_apic_icr_read,
821 .icr_write = lguest_apic_icr_write,
822 .wait_icr_idle = lguest_apic_wait_icr_idle,
823 .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle,
824};
794#endif 825#endif
795 826
796/* STOP! Until an interrupt comes in. */ 827/* STOP! Until an interrupt comes in. */
@@ -990,9 +1021,7 @@ __init void lguest_init(void)
990 1021
991#ifdef CONFIG_X86_LOCAL_APIC 1022#ifdef CONFIG_X86_LOCAL_APIC
992 /* apic read/write intercepts */ 1023 /* apic read/write intercepts */
993 pv_apic_ops.apic_write = lguest_apic_write; 1024 apic_ops = &lguest_basic_apic_ops;
994 pv_apic_ops.apic_write_atomic = lguest_apic_write;
995 pv_apic_ops.apic_read = lguest_apic_read;
996#endif 1025#endif
997 1026
998 /* time operations */ 1027 /* time operations */
@@ -1015,6 +1044,9 @@ __init void lguest_init(void)
1015 init_pg_tables_start = __pa(pg0); 1044 init_pg_tables_start = __pa(pg0);
1016 init_pg_tables_end = __pa(pg0); 1045 init_pg_tables_end = __pa(pg0);
1017 1046
1047 /* As described in head_32.S, we map the first 128M of memory. */
1048 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1049
1018 /* Load the %fs segment register (the per-cpu segment register) with 1050 /* Load the %fs segment register (the per-cpu segment register) with
1019 * the normal data segment to get through booting. */ 1051 * the normal data segment to get through booting. */
1020 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); 1052 asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory");
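The lguest hunks above swap the old pv_apic_ops function-pointer overrides for a complete apic_ops table of stubs, so the guest satisfies the new APIC interface (including the ICR accessors) with a single assignment. Below is a minimal sketch of that table-of-stubs pattern; the demo_* names are hypothetical and only the shape mirrors the patch.

/*
 * Minimal sketch of the "complete table of stubs" pattern, with
 * hypothetical names; only the shape mirrors the lguest change.
 */
#include <stdint.h>
#include <stdio.h>

struct demo_apic_ops {
	uint32_t (*read)(uint32_t reg);
	void (*write)(uint32_t reg, uint32_t val);
};

static uint32_t stub_read(uint32_t reg)
{
	(void)reg;
	return 0;			/* pretend every register reads as 0 */
}

static void stub_write(uint32_t reg, uint32_t val)
{
	(void)reg;
	(void)val;			/* silently drop the write */
}

static const struct demo_apic_ops stub_ops = {
	.read	= stub_read,
	.write	= stub_write,
};

static const struct demo_apic_ops *apic = &stub_ops;	/* one assignment */

int main(void)
{
	apic->write(0x80, 0);
	printf("read -> %u\n", (unsigned)apic->read(0x20));
	return 0;
}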
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index aa3fa4119424..55e11aa6d66c 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y)
17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 17 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
18else 18else
19 obj-y += io_64.o iomap_copy_64.o 19 obj-y += io_64.o iomap_copy_64.o
20
21 CFLAGS_csum-partial_64.o := -funroll-loops
22
23 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 20 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
24 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 21 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
25 lib-y += memmove_64.o memset_64.o 22 lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dfdf428975c0..f118c110af32 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -52,7 +52,7 @@
52 jnz 100b 52 jnz 100b
53102: 53102:
54 .section .fixup,"ax" 54 .section .fixup,"ax"
55103: addl %r8d,%edx /* ecx is zerorest also */ 55103: addl %ecx,%edx /* ecx is zerorest also */
56 jmp copy_user_handle_tail 56 jmp copy_user_handle_tail
57 .previous 57 .previous
58 58
diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S
index 40e0e309d27e..cb0c112386fb 100644
--- a/arch/x86/lib/copy_user_nocache_64.S
+++ b/arch/x86/lib/copy_user_nocache_64.S
@@ -32,7 +32,7 @@
32 jnz 100b 32 jnz 100b
33102: 33102:
34 .section .fixup,"ax" 34 .section .fixup,"ax"
35103: addl %r8d,%edx /* ecx is zerorest also */ 35103: addl %ecx,%edx /* ecx is zerorest also */
36 jmp copy_user_handle_tail 36 jmp copy_user_handle_tail
37 .previous 37 .previous
38 38
@@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache)
108 jmp 60f 108 jmp 60f
10950: movl %ecx,%edx 10950: movl %ecx,%edx
11060: sfence 11060: sfence
111 movl %r8d,%ecx
112 jmp copy_user_handle_tail 111 jmp copy_user_handle_tail
113 .previous 112 .previous
114 113
diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c
index d5a2b39f882b..321cf720dbb6 100644
--- a/arch/x86/lib/msr-on-cpu.c
+++ b/arch/x86/lib/msr-on-cpu.c
@@ -16,36 +16,46 @@ static void __rdmsr_on_cpu(void *info)
16 rdmsr(rv->msr_no, rv->l, rv->h); 16 rdmsr(rv->msr_no, rv->l, rv->h);
17} 17}
18 18
19static void __rdmsr_safe_on_cpu(void *info) 19static void __wrmsr_on_cpu(void *info)
20{ 20{
21 struct msr_info *rv = info; 21 struct msr_info *rv = info;
22 22
23 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); 23 wrmsr(rv->msr_no, rv->l, rv->h);
24} 24}
25 25
26static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) 26int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
27{ 27{
28 int err = 0; 28 int err;
29 struct msr_info rv; 29 struct msr_info rv;
30 30
31 rv.msr_no = msr_no; 31 rv.msr_no = msr_no;
32 if (safe) { 32 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
33 smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
34 err = rv.err;
35 } else {
36 smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
37 }
38 *l = rv.l; 33 *l = rv.l;
39 *h = rv.h; 34 *h = rv.h;
40 35
41 return err; 36 return err;
42} 37}
43 38
44static void __wrmsr_on_cpu(void *info) 39int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
40{
41 int err;
42 struct msr_info rv;
43
44 rv.msr_no = msr_no;
45 rv.l = l;
46 rv.h = h;
47 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
48
49 return err;
50}
51
52/* These "safe" variants are slower and should be used when the target MSR
53 may not actually exist. */
54static void __rdmsr_safe_on_cpu(void *info)
45{ 55{
46 struct msr_info *rv = info; 56 struct msr_info *rv = info;
47 57
48 wrmsr(rv->msr_no, rv->l, rv->h); 58 rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h);
49} 59}
50 60
51static void __wrmsr_safe_on_cpu(void *info) 61static void __wrmsr_safe_on_cpu(void *info)
@@ -55,44 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info)
55 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); 65 rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h);
56} 66}
57 67
58static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) 68int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
59{ 69{
60 int err = 0; 70 int err;
61 struct msr_info rv; 71 struct msr_info rv;
62 72
63 rv.msr_no = msr_no; 73 rv.msr_no = msr_no;
64 rv.l = l; 74 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
65 rv.h = h; 75 *l = rv.l;
66 if (safe) { 76 *h = rv.h;
67 smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
68 err = rv.err;
69 } else {
70 smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
71 }
72
73 return err;
74}
75
76void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
77{
78 _wrmsr_on_cpu(cpu, msr_no, l, h, 0);
79}
80 77
81void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 78 return err ? err : rv.err;
82{
83 _rdmsr_on_cpu(cpu, msr_no, l, h, 0);
84} 79}
85 80
86/* These "safe" variants are slower and should be used when the target MSR
87 may not actually exist. */
88int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 81int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
89{ 82{
90 return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); 83 int err;
91} 84 struct msr_info rv;
92 85
93int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) 86 rv.msr_no = msr_no;
94{ 87 rv.l = l;
95 return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); 88 rv.h = h;
89 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
90
91 return err ? err : rv.err;
96} 92}
97 93
98EXPORT_SYMBOL(rdmsr_on_cpu); 94EXPORT_SYMBOL(rdmsr_on_cpu);
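The msr-on-cpu.c rewrite above drops the shared _rdmsr_on_cpu()/_wrmsr_on_cpu() helpers with their "safe" flag and open-codes four small functions, each propagating the return value of smp_call_function_single(). Every variant follows the same pattern: pack the MSR number and value into a struct, run a callback on the target CPU, then read the results back. A simplified sketch follows; run_on_cpu() is a hypothetical stand-in for the cross-CPU call and the MSR read is faked.

/*
 * Sketch of the call-on-cpu pattern: pack arguments into a struct, run a
 * callback "on" the target CPU, read the results back out.
 */
#include <stdio.h>

struct msr_call {
	unsigned int msr_no;
	unsigned int lo, hi;
	int err;
};

/* Hypothetical stand-in for smp_call_function_single(). */
static int run_on_cpu(unsigned int cpu, void (*fn)(void *), void *arg)
{
	(void)cpu;
	fn(arg);
	return 0;
}

static void do_read(void *arg)
{
	struct msr_call *c = arg;

	c->lo = 0x1234;			/* pretend rdmsr result */
	c->hi = 0;
	c->err = 0;
}

static int read_msr_on_cpu(unsigned int cpu, unsigned int msr,
			   unsigned int *lo, unsigned int *hi)
{
	struct msr_call c = { .msr_no = msr };
	int err = run_on_cpu(cpu, do_read, &c);

	*lo = c.lo;
	*hi = c.hi;
	return err ? err : c.err;	/* IPI failure wins over MSR failure */
}

int main(void)
{
	unsigned int lo, hi;
	int err = read_msr_on_cpu(1, 0x10, &lo, &hi);

	printf("err=%d lo=%#x hi=%#x\n", err, lo, hi);
	return 0;
}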
diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c
index 94972e7c094d..82004d2bf05e 100644
--- a/arch/x86/lib/string_32.c
+++ b/arch/x86/lib/string_32.c
@@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src)
22 "testb %%al,%%al\n\t" 22 "testb %%al,%%al\n\t"
23 "jne 1b" 23 "jne 1b"
24 : "=&S" (d0), "=&D" (d1), "=&a" (d2) 24 : "=&S" (d0), "=&D" (d1), "=&a" (d2)
25 :"0" (src), "1" (dest) : "memory"); 25 : "0" (src), "1" (dest) : "memory");
26 return dest; 26 return dest;
27} 27}
28EXPORT_SYMBOL(strcpy); 28EXPORT_SYMBOL(strcpy);
@@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count)
42 "stosb\n" 42 "stosb\n"
43 "2:" 43 "2:"
44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) 44 : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3)
45 :"0" (src), "1" (dest), "2" (count) : "memory"); 45 : "0" (src), "1" (dest), "2" (count) : "memory");
46 return dest; 46 return dest;
47} 47}
48EXPORT_SYMBOL(strncpy); 48EXPORT_SYMBOL(strncpy);
@@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src)
60 "testb %%al,%%al\n\t" 60 "testb %%al,%%al\n\t"
61 "jne 1b" 61 "jne 1b"
62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) 62 : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3)
63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); 63 : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory");
64 return dest; 64 return dest;
65} 65}
66EXPORT_SYMBOL(strcat); 66EXPORT_SYMBOL(strcat);
@@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct)
105 "2:\tsbbl %%eax,%%eax\n\t" 105 "2:\tsbbl %%eax,%%eax\n\t"
106 "orb $1,%%al\n" 106 "orb $1,%%al\n"
107 "3:" 107 "3:"
108 :"=a" (res), "=&S" (d0), "=&D" (d1) 108 : "=a" (res), "=&S" (d0), "=&D" (d1)
109 :"1" (cs), "2" (ct) 109 : "1" (cs), "2" (ct)
110 :"memory"); 110 : "memory");
111 return res; 111 return res;
112} 112}
113EXPORT_SYMBOL(strcmp); 113EXPORT_SYMBOL(strcmp);
@@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count)
130 "3:\tsbbl %%eax,%%eax\n\t" 130 "3:\tsbbl %%eax,%%eax\n\t"
131 "orb $1,%%al\n" 131 "orb $1,%%al\n"
132 "4:" 132 "4:"
133 :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) 133 : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2)
134 :"1" (cs), "2" (ct), "3" (count) 134 : "1" (cs), "2" (ct), "3" (count)
135 :"memory"); 135 : "memory");
136 return res; 136 return res;
137} 137}
138EXPORT_SYMBOL(strncmp); 138EXPORT_SYMBOL(strncmp);
@@ -152,9 +152,9 @@ char *strchr(const char *s, int c)
152 "movl $1,%1\n" 152 "movl $1,%1\n"
153 "2:\tmovl %1,%0\n\t" 153 "2:\tmovl %1,%0\n\t"
154 "decl %0" 154 "decl %0"
155 :"=a" (res), "=&S" (d0) 155 : "=a" (res), "=&S" (d0)
156 :"1" (s), "0" (c) 156 : "1" (s), "0" (c)
157 :"memory"); 157 : "memory");
158 return res; 158 return res;
159} 159}
160EXPORT_SYMBOL(strchr); 160EXPORT_SYMBOL(strchr);
@@ -169,9 +169,9 @@ size_t strlen(const char *s)
169 "scasb\n\t" 169 "scasb\n\t"
170 "notl %0\n\t" 170 "notl %0\n\t"
171 "decl %0" 171 "decl %0"
172 :"=c" (res), "=&D" (d0) 172 : "=c" (res), "=&D" (d0)
173 :"1" (s), "a" (0), "0" (0xffffffffu) 173 : "1" (s), "a" (0), "0" (0xffffffffu)
174 :"memory"); 174 : "memory");
175 return res; 175 return res;
176} 176}
177EXPORT_SYMBOL(strlen); 177EXPORT_SYMBOL(strlen);
@@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count)
189 "je 1f\n\t" 189 "je 1f\n\t"
190 "movl $1,%0\n" 190 "movl $1,%0\n"
191 "1:\tdecl %0" 191 "1:\tdecl %0"
192 :"=D" (res), "=&c" (d0) 192 : "=D" (res), "=&c" (d0)
193 :"a" (c), "0" (cs), "1" (count) 193 : "a" (c), "0" (cs), "1" (count)
194 :"memory"); 194 : "memory");
195 return res; 195 return res;
196} 196}
197EXPORT_SYMBOL(memchr); 197EXPORT_SYMBOL(memchr);
@@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count)
228 "cmpl $-1,%1\n\t" 228 "cmpl $-1,%1\n\t"
229 "jne 1b\n" 229 "jne 1b\n"
230 "3:\tsubl %2,%0" 230 "3:\tsubl %2,%0"
231 :"=a" (res), "=&d" (d0) 231 : "=a" (res), "=&d" (d0)
232 :"c" (s), "1" (count) 232 : "c" (s), "1" (count)
233 :"memory"); 233 : "memory");
234 return res; 234 return res;
235} 235}
236EXPORT_SYMBOL(strnlen); 236EXPORT_SYMBOL(strnlen);
diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c
index 42e8a50303f3..8e2d55f754bf 100644
--- a/arch/x86/lib/strstr_32.c
+++ b/arch/x86/lib/strstr_32.c
@@ -23,9 +23,9 @@ __asm__ __volatile__(
23 "jne 1b\n\t" 23 "jne 1b\n\t"
24 "xorl %%eax,%%eax\n\t" 24 "xorl %%eax,%%eax\n\t"
25 "2:" 25 "2:"
26 :"=a" (__res), "=&c" (d0), "=&S" (d1) 26 : "=a" (__res), "=&c" (d0), "=&S" (d1)
27 :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) 27 : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct)
28 :"dx", "di"); 28 : "dx", "di");
29return __res; 29return __res;
30} 30}
31 31
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 24e60944971a..9e68075544f6 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -14,6 +14,13 @@
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15#include <asm/mmx.h> 15#include <asm/mmx.h>
16 16
17#ifdef CONFIG_X86_INTEL_USERCOPY
18/*
19 * Alignment at which movsl is preferred for bulk memory copies.
20 */
21struct movsl_mask movsl_mask __read_mostly;
22#endif
23
17static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) 24static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n)
18{ 25{
19#ifdef CONFIG_X86_INTEL_USERCOPY 26#ifdef CONFIG_X86_INTEL_USERCOPY
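usercopy_32.c now defines movsl_mask itself; the mask records the alignment at which rep movsl is preferred for bulk copies. The sketch below shows one way such a mask-based alignment test can look. It is an illustrative assumption with hypothetical demo_* names, not the kernel's actual __movsl_is_ok() heuristic.

#include <stdio.h>

struct demo_movsl_mask {
	unsigned long mask;		/* e.g. 7 => want 8-byte alignment */
};

static struct demo_movsl_mask demo_mask = { .mask = 7 };

/* Use the word-copy path only when src, dst and len all share alignment. */
static int demo_movsl_is_ok(unsigned long src, unsigned long dst,
			    unsigned long len)
{
	return ((src | dst | len) & demo_mask.mask) == 0;
}

int main(void)
{
	printf("%d\n", demo_movsl_is_ok(0x1000, 0x2000, 64));	/* 1: aligned   */
	printf("%d\n", demo_movsl_is_ok(0x1001, 0x2000, 64));	/* 0: unaligned */
	return 0;
}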
diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c
index 48278fa7d3de..37b9ae4d44c5 100644
--- a/arch/x86/mach-default/setup.c
+++ b/arch/x86/mach-default/setup.c
@@ -10,13 +10,7 @@
10#include <asm/e820.h> 10#include <asm/e820.h>
11#include <asm/setup.h> 11#include <asm/setup.h>
12 12
13/* 13#include <mach_ipi.h>
14 * Any quirks to be performed to initialize timers/irqs/etc?
15 */
16int (*arch_time_init_quirk)(void);
17int (*arch_pre_intr_init_quirk)(void);
18int (*arch_intr_init_quirk)(void);
19int (*arch_trap_init_quirk)(void);
20 14
21#ifdef CONFIG_HOTPLUG_CPU 15#ifdef CONFIG_HOTPLUG_CPU
22#define DEFAULT_SEND_IPI (1) 16#define DEFAULT_SEND_IPI (1)
@@ -24,7 +18,7 @@ int (*arch_trap_init_quirk)(void);
24#define DEFAULT_SEND_IPI (0) 18#define DEFAULT_SEND_IPI (0)
25#endif 19#endif
26 20
27int no_broadcast=DEFAULT_SEND_IPI; 21int no_broadcast = DEFAULT_SEND_IPI;
28 22
29/** 23/**
30 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors 24 * pre_intr_init_hook - initialisation prior to setting up interrupt vectors
@@ -37,22 +31,13 @@ int no_broadcast=DEFAULT_SEND_IPI;
37 **/ 31 **/
38void __init pre_intr_init_hook(void) 32void __init pre_intr_init_hook(void)
39{ 33{
40 if (arch_pre_intr_init_quirk) { 34 if (x86_quirks->arch_pre_intr_init) {
41 if (arch_pre_intr_init_quirk()) 35 if (x86_quirks->arch_pre_intr_init())
42 return; 36 return;
43 } 37 }
44 init_ISA_irqs(); 38 init_ISA_irqs();
45} 39}
46 40
47/*
48 * IRQ2 is cascade interrupt to second interrupt controller
49 */
50static struct irqaction irq2 = {
51 .handler = no_action,
52 .mask = CPU_MASK_NONE,
53 .name = "cascade",
54};
55
56/** 41/**
57 * intr_init_hook - post gate setup interrupt initialisation 42 * intr_init_hook - post gate setup interrupt initialisation
58 * 43 *
@@ -64,16 +49,10 @@ static struct irqaction irq2 = {
64 **/ 49 **/
65void __init intr_init_hook(void) 50void __init intr_init_hook(void)
66{ 51{
67 if (arch_intr_init_quirk) { 52 if (x86_quirks->arch_intr_init) {
68 if (arch_intr_init_quirk()) 53 if (x86_quirks->arch_intr_init())
69 return; 54 return;
70 } 55 }
71#ifdef CONFIG_X86_LOCAL_APIC
72 apic_intr_init();
73#endif
74
75 if (!acpi_ioapic)
76 setup_irq(2, &irq2);
77} 56}
78 57
79/** 58/**
@@ -97,8 +76,8 @@ void __init pre_setup_arch_hook(void)
97 **/ 76 **/
98void __init trap_init_hook(void) 77void __init trap_init_hook(void)
99{ 78{
100 if (arch_trap_init_quirk) { 79 if (x86_quirks->arch_trap_init) {
101 if (arch_trap_init_quirk()) 80 if (x86_quirks->arch_trap_init())
102 return; 81 return;
103 } 82 }
104} 83}
@@ -111,6 +90,16 @@ static struct irqaction irq0 = {
111}; 90};
112 91
113/** 92/**
 93 * pre_time_init_hook - do any specific initialisations before the system timer is set up.
94 *
95 **/
96void __init pre_time_init_hook(void)
97{
98 if (x86_quirks->arch_pre_time_init)
99 x86_quirks->arch_pre_time_init();
100}
101
102/**
114 * time_init_hook - do any specific initialisations for the system timer. 103 * time_init_hook - do any specific initialisations for the system timer.
115 * 104 *
116 * Description: 105 * Description:
@@ -119,13 +108,13 @@ static struct irqaction irq0 = {
119 **/ 108 **/
120void __init time_init_hook(void) 109void __init time_init_hook(void)
121{ 110{
122 if (arch_time_init_quirk) { 111 if (x86_quirks->arch_time_init) {
123 /* 112 /*
124 * A nonzero return code does not mean failure, it means 113 * A nonzero return code does not mean failure, it means
125 * that the architecture quirk does not want any 114 * that the architecture quirk does not want any
126 * generic (timer) setup to be performed after this: 115 * generic (timer) setup to be performed after this:
127 */ 116 */
128 if (arch_time_init_quirk()) 117 if (x86_quirks->arch_time_init())
129 return; 118 return;
130 } 119 }
131 120
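All of the setup.c hooks above converge on the same shape: consult an optional x86_quirks callback first, and skip the generic path when the quirk returns nonzero. A minimal sketch of that convention, using hypothetical demo names:

/*
 * Optional-quirk pattern: a NULL-checked callback runs first, and a
 * nonzero return means "the quirk handled it, skip the generic path".
 */
#include <stdio.h>

struct demo_quirks {
	int (*arch_time_init)(void);	/* may be NULL */
};

static int my_time_quirk(void)
{
	puts("quirk timer setup");
	return 1;			/* tell the hook we handled it */
}

static struct demo_quirks quirks = { .arch_time_init = my_time_quirk };

static void time_init_hook(void)
{
	if (quirks.arch_time_init && quirks.arch_time_init())
		return;			/* quirk asked us to skip the default */
	puts("generic timer setup");
}

int main(void)
{
	time_init_hook();
	return 0;
}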
diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile
deleted file mode 100644
index 3ef8b43b62fc..000000000000
--- a/arch/x86/mach-es7000/Makefile
+++ /dev/null
@@ -1,5 +0,0 @@
1#
2# Makefile for the linux kernel.
3#
4
5obj-$(CONFIG_X86_ES7000) := es7000plat.o
diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h
deleted file mode 100644
index c8d5aa132fa0..000000000000
--- a/arch/x86/mach-es7000/es7000.h
+++ /dev/null
@@ -1,114 +0,0 @@
1/*
2 * Written by: Garry Forsgren, Unisys Corporation
3 * Natalie Protasevich, Unisys Corporation
4 * This file contains the code to configure and interface
5 * with Unisys ES7000 series hardware system manager.
6 *
7 * Copyright (c) 2003 Unisys Corporation. All Rights Reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify it
10 * under the terms of version 2 of the GNU General Public License as
11 * published by the Free Software Foundation.
12 *
13 * This program is distributed in the hope that it would be useful, but
14 * WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
16 *
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write the Free Software Foundation, Inc., 59
19 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
20 *
21 * Contact information: Unisys Corporation, Township Line & Union Meeting
22 * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or:
23 *
24 * http://www.unisys.com
25 */
26
27/*
28 * ES7000 chipsets
29 */
30
31#define NON_UNISYS 0
32#define ES7000_CLASSIC 1
33#define ES7000_ZORRO 2
34
35
36#define MIP_REG 1
37#define MIP_PSAI_REG 4
38
39#define MIP_BUSY 1
40#define MIP_SPIN 0xf0000
41#define MIP_VALID 0x0100000000000000ULL
42#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff)
43
44#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff)
45
46struct mip_reg_info {
47 unsigned long long mip_info;
48 unsigned long long delivery_info;
49 unsigned long long host_reg;
50 unsigned long long mip_reg;
51};
52
53struct part_info {
54 unsigned char type;
55 unsigned char length;
56 unsigned char part_id;
57 unsigned char apic_mode;
58 unsigned long snum;
59 char ptype[16];
60 char sname[64];
61 char pname[64];
62};
63
64struct psai {
65 unsigned long long entry_type;
66 unsigned long long addr;
67 unsigned long long bep_addr;
68};
69
70struct es7000_mem_info {
71 unsigned char type;
72 unsigned char length;
73 unsigned char resv[6];
74 unsigned long long start;
75 unsigned long long size;
76};
77
78struct es7000_oem_table {
79 unsigned long long hdr;
80 struct mip_reg_info mip;
81 struct part_info pif;
82 struct es7000_mem_info shm;
83 struct psai psai;
84};
85
86#ifdef CONFIG_ACPI
87
88struct oem_table {
89 struct acpi_table_header Header;
90 u32 OEMTableAddr;
91 u32 OEMTableSize;
92};
93
94extern int find_unisys_acpi_oem_table(unsigned long *oem_addr);
95#endif
96
97struct mip_reg {
98 unsigned long long off_0;
99 unsigned long long off_8;
100 unsigned long long off_10;
101 unsigned long long off_18;
102 unsigned long long off_20;
103 unsigned long long off_28;
104 unsigned long long off_30;
105 unsigned long long off_38;
106};
107
108#define MIP_SW_APIC 0x1020b
109#define MIP_FUNC(VALUE) (VALUE & 0xff)
110
111extern int parse_unisys_oem (char *oemptr);
112extern void setup_unisys(void);
113extern int es7000_start_cpu(int cpu, unsigned long eip);
114extern void es7000_sw_apic(void);
diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile
index 0dbd7803a1d5..6730f4e7c744 100644
--- a/arch/x86/mach-generic/Makefile
+++ b/arch/x86/mach-generic/Makefile
@@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o
9obj-$(CONFIG_X86_SUMMIT) += summit.o 9obj-$(CONFIG_X86_SUMMIT) += summit.o
10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o 10obj-$(CONFIG_X86_BIGSMP) += bigsmp.o
11obj-$(CONFIG_X86_ES7000) += es7000.o 11obj-$(CONFIG_X86_ES7000) += es7000.o
12obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/
diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c
index 59d771714559..3c3b471ea496 100644
--- a/arch/x86/mach-generic/bigsmp.c
+++ b/arch/x86/mach-generic/bigsmp.c
@@ -5,18 +5,17 @@
5#define APIC_DEFINITION 1 5#define APIC_DEFINITION 1
6#include <linux/threads.h> 6#include <linux/threads.h>
7#include <linux/cpumask.h> 7#include <linux/cpumask.h>
8#include <asm/smp.h>
9#include <asm/mpspec.h> 8#include <asm/mpspec.h>
10#include <asm/genapic.h> 9#include <asm/genapic.h>
11#include <asm/fixmap.h> 10#include <asm/fixmap.h>
12#include <asm/apicdef.h> 11#include <asm/apicdef.h>
13#include <linux/kernel.h> 12#include <linux/kernel.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <linux/dmi.h> 14#include <linux/dmi.h>
17#include <asm/mach-bigsmp/mach_apic.h> 15#include <asm/bigsmp/apicdef.h>
18#include <asm/mach-bigsmp/mach_apicdef.h> 16#include <linux/smp.h>
19#include <asm/mach-bigsmp/mach_ipi.h> 17#include <asm/bigsmp/apic.h>
18#include <asm/bigsmp/ipi.h>
20#include <asm/mach-default/mach_mpparse.h> 19#include <asm/mach-default/mach_mpparse.h>
21 20
22static int dmi_bigsmp; /* can be set by dmi scanners */ 21static int dmi_bigsmp; /* can be set by dmi scanners */
@@ -42,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = {
42 { } 41 { }
43}; 42};
44 43
44static cpumask_t vector_allocation_domain(int cpu)
45{
46 return cpumask_of_cpu(cpu);
47}
45 48
46static int probe_bigsmp(void) 49static int probe_bigsmp(void)
47{ 50{
diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c
index 4742626f08c4..28459cab3ddb 100644
--- a/arch/x86/mach-generic/es7000.c
+++ b/arch/x86/mach-generic/es7000.c
@@ -4,20 +4,19 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-es7000/mach_apicdef.h> 14#include <asm/es7000/apicdef.h>
17#include <asm/mach-es7000/mach_apic.h> 15#include <linux/smp.h>
18#include <asm/mach-es7000/mach_ipi.h> 16#include <asm/es7000/apic.h>
19#include <asm/mach-es7000/mach_mpparse.h> 17#include <asm/es7000/ipi.h>
20#include <asm/mach-es7000/mach_wakecpu.h> 18#include <asm/es7000/mpparse.h>
19#include <asm/es7000/wakecpu.h>
21 20
22static int probe_es7000(void) 21static int probe_es7000(void)
23{ 22{
@@ -48,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem,
48/* Hook from generic ACPI tables.c */ 47/* Hook from generic ACPI tables.c */
49static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 48static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 49{
51 unsigned long oem_addr; 50 unsigned long oem_addr = 0;
51 int check_dsdt;
52 int ret = 0;
53
54 /* check dsdt at first to avoid clear fix_map for oem_addr */
55 check_dsdt = es7000_check_dsdt();
56
52 if (!find_unisys_acpi_oem_table(&oem_addr)) { 57 if (!find_unisys_acpi_oem_table(&oem_addr)) {
53 if (es7000_check_dsdt()) 58 if (check_dsdt)
54 return parse_unisys_oem((char *)oem_addr); 59 ret = parse_unisys_oem((char *)oem_addr);
55 else { 60 else {
56 setup_unisys(); 61 setup_unisys();
57 return 1; 62 ret = 1;
58 } 63 }
64 /*
65 * we need to unmap it
66 */
67 unmap_unisys_acpi_oem_table(oem_addr);
59 } 68 }
60 return 0; 69 return ret;
61} 70}
62#else 71#else
63static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) 72static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
@@ -66,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id)
66} 75}
67#endif 76#endif
68 77
78static cpumask_t vector_allocation_domain(int cpu)
79{
80 /* Careful. Some cpus do not strictly honor the set of cpus
81 * specified in the interrupt destination when using lowest
82 * priority interrupt delivery mode.
83 *
84 * In particular there was a hyperthreading cpu observed to
85 * deliver interrupts to the wrong hyperthread when only one
 86 * hyperthread was specified in the interrupt destination.
87 */
88 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
89 return domain;
90}
91
69struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); 92struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000);
diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c
index 8091e68764c4..71a309b122e6 100644
--- a/arch/x86/mach-generic/numaq.c
+++ b/arch/x86/mach-generic/numaq.c
@@ -4,7 +4,6 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <linux/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
@@ -12,11 +11,12 @@
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/init.h> 13#include <linux/init.h>
15#include <asm/mach-numaq/mach_apic.h> 14#include <asm/numaq/apicdef.h>
16#include <asm/mach-numaq/mach_apicdef.h> 15#include <linux/smp.h>
17#include <asm/mach-numaq/mach_ipi.h> 16#include <asm/numaq/apic.h>
18#include <asm/mach-numaq/mach_mpparse.h> 17#include <asm/numaq/ipi.h>
19#include <asm/mach-numaq/mach_wakecpu.h> 18#include <asm/numaq/mpparse.h>
19#include <asm/numaq/wakecpu.h>
20#include <asm/numaq.h> 20#include <asm/numaq.h>
21 21
22static int mps_oem_check(struct mp_config_table *mpc, char *oem, 22static int mps_oem_check(struct mp_config_table *mpc, char *oem,
@@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id)
38 return 0; 38 return 0;
39} 39}
40 40
41static cpumask_t vector_allocation_domain(int cpu)
42{
43 /* Careful. Some cpus do not strictly honor the set of cpus
44 * specified in the interrupt destination when using lowest
45 * priority interrupt delivery mode.
46 *
47 * In particular there was a hyperthreading cpu observed to
48 * deliver interrupts to the wrong hyperthread when only one
 49 * hyperthread was specified in the interrupt destination.
50 */
51 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
52 return domain;
53}
54
41struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); 55struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq);
diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c
index a97ea0f35b1e..6272b5e69da6 100644
--- a/arch/x86/mach-generic/summit.c
+++ b/arch/x86/mach-generic/summit.c
@@ -4,19 +4,18 @@
4#define APIC_DEFINITION 1 4#define APIC_DEFINITION 1
5#include <linux/threads.h> 5#include <linux/threads.h>
6#include <linux/cpumask.h> 6#include <linux/cpumask.h>
7#include <asm/smp.h>
8#include <asm/mpspec.h> 7#include <asm/mpspec.h>
9#include <asm/genapic.h> 8#include <asm/genapic.h>
10#include <asm/fixmap.h> 9#include <asm/fixmap.h>
11#include <asm/apicdef.h> 10#include <asm/apicdef.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/smp.h>
15#include <linux/init.h> 13#include <linux/init.h>
16#include <asm/mach-summit/mach_apic.h> 14#include <asm/summit/apicdef.h>
17#include <asm/mach-summit/mach_apicdef.h> 15#include <linux/smp.h>
18#include <asm/mach-summit/mach_ipi.h> 16#include <asm/summit/apic.h>
19#include <asm/mach-summit/mach_mpparse.h> 17#include <asm/summit/ipi.h>
18#include <asm/summit/mpparse.h>
20 19
21static int probe_summit(void) 20static int probe_summit(void)
22{ 21{
@@ -24,4 +23,18 @@ static int probe_summit(void)
24 return 0; 23 return 0;
25} 24}
26 25
26static cpumask_t vector_allocation_domain(int cpu)
27{
28 /* Careful. Some cpus do not strictly honor the set of cpus
29 * specified in the interrupt destination when using lowest
30 * priority interrupt delivery mode.
31 *
32 * In particular there was a hyperthreading cpu observed to
33 * deliver interrupts to the wrong hyperthread when only one
 34 * hyperthread was specified in the interrupt destination.
35 */
36 cpumask_t domain = { { [0] = APIC_ALL_CPUS, } };
37 return domain;
38}
39
27struct genapic apic_summit = APIC_INIT("summit", probe_summit); 40struct genapic apic_summit = APIC_INIT("summit", probe_summit);
diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c
index a037041817c7..4f4e50c3ad3b 100644
--- a/arch/x86/mach-rdc321x/platform.c
+++ b/arch/x86/mach-rdc321x/platform.c
@@ -25,7 +25,6 @@
25#include <linux/list.h> 25#include <linux/list.h>
26#include <linux/device.h> 26#include <linux/device.h>
27#include <linux/platform_device.h> 27#include <linux/platform_device.h>
28#include <linux/version.h>
29#include <linux/leds.h> 28#include <linux/leds.h>
30 29
31#include <asm/gpio.h> 30#include <asm/gpio.h>
diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c
index ee0fba092157..0f6e8a6523ae 100644
--- a/arch/x86/mach-voyager/voyager_smp.c
+++ b/arch/x86/mach-voyager/voyager_smp.c
@@ -448,6 +448,8 @@ static void __init start_secondary(void *unused)
448 448
449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); 449 VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid));
450 450
451 notify_cpu_starting(cpuid);
452
451 /* enable interrupts */ 453 /* enable interrupts */
452 local_irq_enable(); 454 local_irq_enable();
453 455
@@ -1481,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq)
1481 * the interrupt off to another CPU */ 1483 * the interrupt off to another CPU */
1482static void before_handle_vic_irq(unsigned int irq) 1484static void before_handle_vic_irq(unsigned int irq)
1483{ 1485{
1484 irq_desc_t *desc = irq_desc + irq; 1486 irq_desc_t *desc = irq_to_desc(irq);
1485 __u8 cpu = smp_processor_id(); 1487 __u8 cpu = smp_processor_id();
1486 1488
1487 _raw_spin_lock(&vic_irq_lock); 1489 _raw_spin_lock(&vic_irq_lock);
@@ -1516,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq)
1516/* Finish the VIC interrupt: basically mask */ 1518/* Finish the VIC interrupt: basically mask */
1517static void after_handle_vic_irq(unsigned int irq) 1519static void after_handle_vic_irq(unsigned int irq)
1518{ 1520{
1519 irq_desc_t *desc = irq_desc + irq; 1521 irq_desc_t *desc = irq_to_desc(irq);
1520 1522
1521 _raw_spin_lock(&vic_irq_lock); 1523 _raw_spin_lock(&vic_irq_lock);
1522 { 1524 {
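The voyager changes above replace direct irq_desc + irq pointer arithmetic with the irq_to_desc() accessor, so callers keep working once the descriptor table stops being a plain static array (sparse IRQ support). A toy sketch of why the accessor matters, with hypothetical demo_* names:

#include <stdio.h>

struct demo_irq_desc {
	unsigned int status;
};

static struct demo_irq_desc demo_descs[16];

/*
 * Callers only use the accessor, so the backing store can change freely
 * (static array today, dynamic allocation or a radix tree tomorrow).
 */
static struct demo_irq_desc *demo_irq_to_desc(unsigned int irq)
{
	return irq < 16 ? &demo_descs[irq] : NULL;
}

int main(void)
{
	struct demo_irq_desc *desc = demo_irq_to_desc(3);

	if (desc)
		desc->status |= 1u;
	printf("irq 3 status = %u\n", demo_descs[3].status);
	return 0;
}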
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 9873716e9f76..59f89b434b45 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,5 +1,5 @@
1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ 1obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
2 pat.o pgtable.o 2 pat.o pgtable.o gup.o
3 3
4obj-$(CONFIG_X86_32) += pgtable_32.o 4obj-$(CONFIG_X86_32) += pgtable_32.o
5 5
@@ -13,11 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o
13mmiotrace-y := pf_in.o mmio-mod.o 13mmiotrace-y := pf_in.o mmio-mod.o
14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 14obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
15 15
16ifeq ($(CONFIG_X86_32),y) 16obj-$(CONFIG_NUMA) += numa_$(BITS).o
17obj-$(CONFIG_NUMA) += discontig_32.o
18else
19obj-$(CONFIG_NUMA) += numa_64.o
20obj-$(CONFIG_K8_NUMA) += k8topology_64.o 17obj-$(CONFIG_K8_NUMA) += k8topology_64.o
21endif
22obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 18obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o
23 19
20obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c
index 0bb0caed8971..e7277cbcfb40 100644
--- a/arch/x86/mm/dump_pagetables.c
+++ b/arch/x86/mm/dump_pagetables.c
@@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st,
148 * we have now. "break" is either changing perms, levels or 148 * we have now. "break" is either changing perms, levels or
149 * address space marker. 149 * address space marker.
150 */ 150 */
151 prot = pgprot_val(new_prot) & ~(PTE_MASK); 151 prot = pgprot_val(new_prot) & PTE_FLAGS_MASK;
152 cur = pgprot_val(st->current_prot) & ~(PTE_MASK); 152 cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK;
153 153
154 if (!st->level) { 154 if (!st->level) {
155 /* First entry */ 155 /* First entry */
@@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
221 for (i = 0; i < PTRS_PER_PMD; i++) { 221 for (i = 0; i < PTRS_PER_PMD; i++) {
222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); 222 st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
223 if (!pmd_none(*start)) { 223 if (!pmd_none(*start)) {
224 pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; 224 pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK;
225 225
226 if (pmd_large(*start) || !pmd_present(*start)) 226 if (pmd_large(*start) || !pmd_present(*start))
227 note_page(m, st, __pgprot(prot), 3); 227 note_page(m, st, __pgprot(prot), 3);
@@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
253 for (i = 0; i < PTRS_PER_PUD; i++) { 253 for (i = 0; i < PTRS_PER_PUD; i++) {
254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); 254 st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
255 if (!pud_none(*start)) { 255 if (!pud_none(*start)) {
256 pgprotval_t prot = pud_val(*start) & ~PTE_MASK; 256 pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK;
257 257
258 if (pud_large(*start) || !pud_present(*start)) 258 if (pud_large(*start) || !pud_present(*start))
259 note_page(m, st, __pgprot(prot), 2); 259 note_page(m, st, __pgprot(prot), 2);
@@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m)
288 for (i = 0; i < PTRS_PER_PGD; i++) { 288 for (i = 0; i < PTRS_PER_PGD; i++) {
289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT); 289 st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
290 if (!pgd_none(*start)) { 290 if (!pgd_none(*start)) {
291 pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; 291 pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK;
292 292
293 if (pgd_large(*start) || !pgd_present(*start)) 293 if (pgd_large(*start) || !pgd_present(*start))
294 note_page(m, &st, __pgprot(prot), 1); 294 note_page(m, &st, __pgprot(prot), 1);
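dump_pagetables.c now extracts flag bits with PTE_FLAGS_MASK instead of open-coding ~PTE_MASK. Conceptually a page-table entry splits into a frame-number part and a flags part, and the dumper only compares the flags. The sketch below uses an illustrative 12-bit split with DEMO_* names; it is not the kernel's real pteval_t masks, which also treat high bits such as NX as flags.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT		12
#define DEMO_PFN_MASK		(~0ULL << DEMO_PAGE_SHIFT)	/* frame bits */
#define DEMO_FLAGS_MASK		(~DEMO_PFN_MASK)		/* flag bits  */

int main(void)
{
	uint64_t pte = (0xabcdeULL << DEMO_PAGE_SHIFT) | 0x067;

	printf("pfn   = 0x%llx\n",
	       (unsigned long long)((pte & DEMO_PFN_MASK) >> DEMO_PAGE_SHIFT));
	printf("flags = 0x%llx\n",
	       (unsigned long long)(pte & DEMO_FLAGS_MASK));
	return 0;
}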
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 455f3fe67b42..31e8730fa246 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -35,6 +35,7 @@
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include <asm/proto.h> 36#include <asm/proto.h>
37#include <asm-generic/sections.h> 37#include <asm-generic/sections.h>
38#include <asm/traps.h>
38 39
39/* 40/*
40 * Page fault error code bits 41 * Page fault error code bits
@@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address)
357 return 0; 358 return 0;
358} 359}
359 360
360void do_invalid_op(struct pt_regs *, unsigned long);
361
362static int is_f00f_bug(struct pt_regs *regs, unsigned long address) 361static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
363{ 362{
364#ifdef CONFIG_X86_F00F_BUG 363#ifdef CONFIG_X86_F00F_BUG
@@ -593,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
593 unsigned long flags; 592 unsigned long flags;
594#endif 593#endif
595 594
596 /*
597 * We can fault from pretty much anywhere, with unknown IRQ state.
598 */
599 trace_hardirqs_fixup();
600
601 tsk = current; 595 tsk = current;
602 mm = tsk->mm; 596 mm = tsk->mm;
603 prefetchw(&mm->mmap_sem); 597 prefetchw(&mm->mmap_sem);
@@ -646,24 +640,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
646 } 640 }
647 641
648 642
649#ifdef CONFIG_X86_32
650 /* It's safe to allow irq's after cr2 has been saved and the vmalloc
651 fault has been handled. */
652 if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK))
653 local_irq_enable();
654
655 /* 643 /*
656 * If we're in an interrupt, have no user context or are running in an 644 * It's safe to allow irq's after cr2 has been saved and the
657 * atomic region then we must not take the fault. 645 * vmalloc fault has been handled.
646 *
647 * User-mode registers count as a user access even for any
648 * potential system fault or CPU buglet.
658 */ 649 */
659 if (in_atomic() || !mm) 650 if (user_mode_vm(regs)) {
660 goto bad_area_nosemaphore; 651 local_irq_enable();
661#else /* CONFIG_X86_64 */ 652 error_code |= PF_USER;
662 if (likely(regs->flags & X86_EFLAGS_IF)) 653 } else if (regs->flags & X86_EFLAGS_IF)
663 local_irq_enable(); 654 local_irq_enable();
664 655
656#ifdef CONFIG_X86_64
665 if (unlikely(error_code & PF_RSVD)) 657 if (unlikely(error_code & PF_RSVD))
666 pgtable_bad(address, regs, error_code); 658 pgtable_bad(address, regs, error_code);
659#endif
667 660
668 /* 661 /*
669 * If we're in an interrupt, have no user context or are running in an 662 * If we're in an interrupt, have no user context or are running in an
@@ -672,15 +665,9 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
672 if (unlikely(in_atomic() || !mm)) 665 if (unlikely(in_atomic() || !mm))
673 goto bad_area_nosemaphore; 666 goto bad_area_nosemaphore;
674 667
675 /*
676 * User-mode registers count as a user access even for any
677 * potential system fault or CPU buglet.
678 */
679 if (user_mode_vm(regs))
680 error_code |= PF_USER;
681again: 668again:
682#endif 669 /*
683 /* When running in the kernel we expect faults to occur only to 670 * When running in the kernel we expect faults to occur only to
684 * addresses in user space. All other faults represent errors in the 671 * addresses in user space. All other faults represent errors in the
685 * kernel and should generate an OOPS. Unfortunately, in the case of an 672 * kernel and should generate an OOPS. Unfortunately, in the case of an
686 * erroneous fault occurring in a code path which already holds mmap_sem 673 * erroneous fault occurring in a code path which already holds mmap_sem
@@ -743,9 +730,6 @@ good_area:
743 goto bad_area; 730 goto bad_area;
744 } 731 }
745 732
746#ifdef CONFIG_X86_32
747survive:
748#endif
749 /* 733 /*
750 * If for any reason at all we couldn't handle the fault, 734 * If for any reason at all we couldn't handle the fault,
751 * make sure we exit gracefully rather than endlessly redo 735 * make sure we exit gracefully rather than endlessly redo
@@ -880,12 +864,11 @@ out_of_memory:
880 up_read(&mm->mmap_sem); 864 up_read(&mm->mmap_sem);
881 if (is_global_init(tsk)) { 865 if (is_global_init(tsk)) {
882 yield(); 866 yield();
883#ifdef CONFIG_X86_32 867 /*
884 down_read(&mm->mmap_sem); 868 * Re-lookup the vma - in theory the vma tree might
885 goto survive; 869 * have changed:
886#else 870 */
887 goto again; 871 goto again;
888#endif
889 } 872 }
890 873
891 printk("VM: killing process %s\n", tsk->comm); 874 printk("VM: killing process %s\n", tsk->comm);
@@ -915,15 +898,15 @@ LIST_HEAD(pgd_list);
915 898
916void vmalloc_sync_all(void) 899void vmalloc_sync_all(void)
917{ 900{
918#ifdef CONFIG_X86_32
919 unsigned long start = VMALLOC_START & PGDIR_MASK;
920 unsigned long address; 901 unsigned long address;
921 902
903#ifdef CONFIG_X86_32
922 if (SHARED_KERNEL_PMD) 904 if (SHARED_KERNEL_PMD)
923 return; 905 return;
924 906
925 BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); 907 for (address = VMALLOC_START & PMD_MASK;
926 for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { 908 address >= TASK_SIZE && address < FIXADDR_TOP;
909 address += PMD_SIZE) {
927 unsigned long flags; 910 unsigned long flags;
928 struct page *page; 911 struct page *page;
929 912
@@ -936,10 +919,8 @@ void vmalloc_sync_all(void)
936 spin_unlock_irqrestore(&pgd_lock, flags); 919 spin_unlock_irqrestore(&pgd_lock, flags);
937 } 920 }
938#else /* CONFIG_X86_64 */ 921#else /* CONFIG_X86_64 */
939 unsigned long start = VMALLOC_START & PGDIR_MASK; 922 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
940 unsigned long address; 923 address += PGDIR_SIZE) {
941
942 for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) {
943 const pgd_t *pgd_ref = pgd_offset_k(address); 924 const pgd_t *pgd_ref = pgd_offset_k(address);
944 unsigned long flags; 925 unsigned long flags;
945 struct page *page; 926 struct page *page;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
new file mode 100644
index 000000000000..4ba373c5b8c8
--- /dev/null
+++ b/arch/x86/mm/gup.c
@@ -0,0 +1,298 @@
1/*
2 * Lockless get_user_pages_fast for x86
3 *
4 * Copyright (C) 2008 Nick Piggin
5 * Copyright (C) 2008 Novell Inc.
6 */
7#include <linux/sched.h>
8#include <linux/mm.h>
9#include <linux/vmstat.h>
10#include <linux/highmem.h>
11
12#include <asm/pgtable.h>
13
14static inline pte_t gup_get_pte(pte_t *ptep)
15{
16#ifndef CONFIG_X86_PAE
17 return *ptep;
18#else
19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking
 21 * With get_user_pages_fast, we walk down the pagetables without taking any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not
25 * switch to a completely different present page without a TLB flush in
26 * between; something that we are blocking by holding interrupts off.
27 *
28 * Setting ptes from not present to present goes:
29 * ptep->pte_high = h;
30 * smp_wmb();
31 * ptep->pte_low = l;
32 *
33 * And present to not present goes:
34 * ptep->pte_low = 0;
35 * smp_wmb();
36 * ptep->pte_high = 0;
37 *
38 * We must ensure here that the load of pte_low sees l iff pte_high
39 * sees h. We load pte_high *after* loading pte_low, which ensures we
40 * don't see an older value of pte_high. *Then* we recheck pte_low,
41 * which ensures that we haven't picked up a changed pte high. We might
42 * have got rubbish values from pte_low and pte_high, but we are
43 * guaranteed that pte_low will not have the present bit set *unless*
44 * it is 'l'. And get_user_pages_fast only operates on present ptes, so
45 * we're safe.
46 *
47 * gup_get_pte should not be used or copied outside gup.c without being
48 * very careful -- it does not atomically load the pte or anything that
49 * is likely to be useful for you.
50 */
51 pte_t pte;
52
53retry:
54 pte.pte_low = ptep->pte_low;
55 smp_rmb();
56 pte.pte_high = ptep->pte_high;
57 smp_rmb();
58 if (unlikely(pte.pte_low != ptep->pte_low))
59 goto retry;
60
61 return pte;
62#endif
63}
64
65/*
66 * The performance critical leaf functions are made noinline otherwise gcc
67 * inlines everything into a single function which results in too much
68 * register pressure.
69 */
70static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
71 unsigned long end, int write, struct page **pages, int *nr)
72{
73 unsigned long mask;
74 pte_t *ptep;
75
76 mask = _PAGE_PRESENT|_PAGE_USER;
77 if (write)
78 mask |= _PAGE_RW;
79
80 ptep = pte_offset_map(&pmd, addr);
81 do {
82 pte_t pte = gup_get_pte(ptep);
83 struct page *page;
84
85 if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) {
86 pte_unmap(ptep);
87 return 0;
88 }
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte);
91 get_page(page);
92 pages[*nr] = page;
93 (*nr)++;
94
95 } while (ptep++, addr += PAGE_SIZE, addr != end);
96 pte_unmap(ptep - 1);
97
98 return 1;
99}
100
101static inline void get_head_page_multiple(struct page *page, int nr)
102{
103 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count);
106}
107
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
109 unsigned long end, int write, struct page **pages, int *nr)
110{
111 unsigned long mask;
112 pte_t pte = *(pte_t *)&pmd;
113 struct page *head, *page;
114 int refs;
115
116 mask = _PAGE_PRESENT|_PAGE_USER;
117 if (write)
118 mask |= _PAGE_RW;
119 if ((pte_flags(pte) & mask) != mask)
120 return 0;
121 /* hugepages are never "special" */
122 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
123 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
124
125 refs = 0;
126 head = pte_page(pte);
127 page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
128 do {
129 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page;
131 (*nr)++;
132 page++;
133 refs++;
134 } while (addr += PAGE_SIZE, addr != end);
135 get_head_page_multiple(head, refs);
136
137 return 1;
138}
139
140static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
141 int write, struct page **pages, int *nr)
142{
143 unsigned long next;
144 pmd_t *pmdp;
145
146 pmdp = pmd_offset(&pud, addr);
147 do {
148 pmd_t pmd = *pmdp;
149
150 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd))
152 return 0;
153 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
155 return 0;
156 } else {
157 if (!gup_pte_range(pmd, addr, next, write, pages, nr))
158 return 0;
159 }
160 } while (pmdp++, addr = next, addr != end);
161
162 return 1;
163}
164
165static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
166 unsigned long end, int write, struct page **pages, int *nr)
167{
168 unsigned long mask;
169 pte_t pte = *(pte_t *)&pud;
170 struct page *head, *page;
171 int refs;
172
173 mask = _PAGE_PRESENT|_PAGE_USER;
174 if (write)
175 mask |= _PAGE_RW;
176 if ((pte_flags(pte) & mask) != mask)
177 return 0;
178 /* hugepages are never "special" */
179 VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL);
180 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
181
182 refs = 0;
183 head = pte_page(pte);
184 page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
185 do {
186 VM_BUG_ON(compound_head(page) != head);
187 pages[*nr] = page;
188 (*nr)++;
189 page++;
190 refs++;
191 } while (addr += PAGE_SIZE, addr != end);
192 get_head_page_multiple(head, refs);
193
194 return 1;
195}
196
197static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end,
198 int write, struct page **pages, int *nr)
199{
200 unsigned long next;
201 pud_t *pudp;
202
203 pudp = pud_offset(&pgd, addr);
204 do {
205 pud_t pud = *pudp;
206
207 next = pud_addr_end(addr, end);
208 if (pud_none(pud))
209 return 0;
210 if (unlikely(pud_large(pud))) {
211 if (!gup_huge_pud(pud, addr, next, write, pages, nr))
212 return 0;
213 } else {
214 if (!gup_pmd_range(pud, addr, next, write, pages, nr))
215 return 0;
216 }
217 } while (pudp++, addr = next, addr != end);
218
219 return 1;
220}
221
222int get_user_pages_fast(unsigned long start, int nr_pages, int write,
223 struct page **pages)
224{
225 struct mm_struct *mm = current->mm;
226 unsigned long addr, len, end;
227 unsigned long next;
228 pgd_t *pgdp;
229 int nr = 0;
230
231 start &= PAGE_MASK;
232 addr = start;
233 len = (unsigned long) nr_pages << PAGE_SHIFT;
234 end = start + len;
235 if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
236 start, len)))
237 goto slow_irqon;
238
239 /*
240 * XXX: batch / limit 'nr', to avoid large irq off latency
241 * needs some instrumenting to determine the common sizes used by
242 * important workloads (eg. DB2), and whether limiting the batch size
243 * will decrease performance.
244 *
245 * It seems like we're in the clear for the moment. Direct-IO is
246 * the main guy that batches up lots of get_user_pages, and even
247 * they are limited to 64-at-a-time which is not so many.
248 */
249 /*
250 * This doesn't prevent pagetable teardown, but does prevent
251 * the pagetables and pages from being freed on x86.
252 *
253 * So long as we atomically load page table pointers versus teardown
254 * (which we do on x86, with the above PAE exception), we can follow the
 255 * address down to the page and take a ref on it.
256 */
257 local_irq_disable();
258 pgdp = pgd_offset(mm, addr);
259 do {
260 pgd_t pgd = *pgdp;
261
262 next = pgd_addr_end(addr, end);
263 if (pgd_none(pgd))
264 goto slow;
265 if (!gup_pud_range(pgd, addr, next, write, pages, &nr))
266 goto slow;
267 } while (pgdp++, addr = next, addr != end);
268 local_irq_enable();
269
270 VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT);
271 return nr;
272
273 {
274 int ret;
275
276slow:
277 local_irq_enable();
278slow_irqon:
279 /* Try to get the remaining pages with get_user_pages */
280 start += nr << PAGE_SHIFT;
281 pages += nr;
282
283 down_read(&mm->mmap_sem);
284 ret = get_user_pages(current, mm, start,
285 (end - start) >> PAGE_SHIFT, write, 0, pages, NULL);
286 up_read(&mm->mmap_sem);
287
288 /* Have to be a bit careful with return values */
289 if (nr > 0) {
290 if (ret < 0)
291 ret = nr;
292 else
293 ret += nr;
294 }
295
296 return ret;
297 }
298}
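
A hedged, kernel-context sketch of how a caller might use the new get_user_pages_fast() (headers omitted; the function and variable names below are hypothetical and not part of this patch):

/* Hypothetical caller pinning a user buffer for I/O with the API added
 * in gup.c above; this is an illustration, not code from the patch. */
static int pin_user_buffer(unsigned long uaddr, size_t len,
			   struct page **pages, int max_pages)
{
	int nr_pages = DIV_ROUND_UP(offset_in_page(uaddr) + len, PAGE_SIZE);
	int i, got;

	if (nr_pages > max_pages)
		return -EINVAL;

	/* write=1: the pages will be written to.  The helper may fall back to
	 * the slow get_user_pages() path and reports how many pages it pinned. */
	got = get_user_pages_fast(uaddr, nr_pages, 1, pages);
	if (got < nr_pages) {
		for (i = 0; i < got; i++)
			put_page(pages[i]);          /* unwind a partial pin */
		return got < 0 ? got : -EFAULT;
	}
	return got;
}

The return-value handling mirrors the "Have to be a bit careful with return values" comment in the slow path: the function reports how many pages it actually pinned, so a short return has to be unwound by the caller.
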
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 165c871ba9af..bcc079c282dd 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
137 137
138 return (void*) vaddr; 138 return (void*) vaddr;
139} 139}
140EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
140 141
141struct page *kmap_atomic_to_page(void *ptr) 142struct page *kmap_atomic_to_page(void *ptr)
142{ 143{
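
The only change here is the (temporary) export of kmap_atomic_pfn for the i915 GEM code. A hedged sketch of the sort of short, non-sleeping mapping this enables from a module; the function name is hypothetical and the KM_USER0 slot choice is only an example:

/* Hypothetical module-side use of the newly exported kmap_atomic_pfn():
 * map a page by pfn briefly, copy into it, and unmap.  Must not sleep. */
static void copy_into_pfn(unsigned long pfn, const void *src, size_t len)
{
	char *vaddr = kmap_atomic_pfn(pfn, KM_USER0);

	memcpy(vaddr, src, min(len, (size_t)PAGE_SIZE));
	kunmap_atomic(vaddr, KM_USER0);
}
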
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 0b3d567e686d..8f307d914c2e 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
124 return 1; 124 return 1;
125} 125}
126 126
127pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) 127pte_t *huge_pte_alloc(struct mm_struct *mm,
128 unsigned long addr, unsigned long sz)
128{ 129{
129 pgd_t *pgd; 130 pgd_t *pgd;
130 pud_t *pud; 131 pud_t *pud;
@@ -133,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
133 pgd = pgd_offset(mm, addr); 134 pgd = pgd_offset(mm, addr);
134 pud = pud_alloc(mm, pgd, addr); 135 pud = pud_alloc(mm, pgd, addr);
135 if (pud) { 136 if (pud) {
136 if (pud_none(*pud)) 137 if (sz == PUD_SIZE) {
137 huge_pmd_share(mm, addr, pud); 138 pte = (pte_t *)pud;
138 pte = (pte_t *) pmd_alloc(mm, pud, addr); 139 } else {
140 BUG_ON(sz != PMD_SIZE);
141 if (pud_none(*pud))
142 huge_pmd_share(mm, addr, pud);
143 pte = (pte_t *) pmd_alloc(mm, pud, addr);
144 }
139 } 145 }
140 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); 146 BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
141 147
@@ -151,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
151 pgd = pgd_offset(mm, addr); 157 pgd = pgd_offset(mm, addr);
152 if (pgd_present(*pgd)) { 158 if (pgd_present(*pgd)) {
153 pud = pud_offset(pgd, addr); 159 pud = pud_offset(pgd, addr);
154 if (pud_present(*pud)) 160 if (pud_present(*pud)) {
161 if (pud_large(*pud))
162 return (pte_t *)pud;
155 pmd = pmd_offset(pud, addr); 163 pmd = pmd_offset(pud, addr);
164 }
156 } 165 }
157 return (pte_t *) pmd; 166 return (pte_t *) pmd;
158} 167}
@@ -188,6 +197,11 @@ int pmd_huge(pmd_t pmd)
188 return 0; 197 return 0;
189} 198}
190 199
200int pud_huge(pud_t pud)
201{
202 return 0;
203}
204
191struct page * 205struct page *
192follow_huge_pmd(struct mm_struct *mm, unsigned long address, 206follow_huge_pmd(struct mm_struct *mm, unsigned long address,
193 pmd_t *pmd, int write) 207 pmd_t *pmd, int write)
@@ -208,6 +222,11 @@ int pmd_huge(pmd_t pmd)
208 return !!(pmd_val(pmd) & _PAGE_PSE); 222 return !!(pmd_val(pmd) & _PAGE_PSE);
209} 223}
210 224
225int pud_huge(pud_t pud)
226{
227 return !!(pud_val(pud) & _PAGE_PSE);
228}
229
211struct page * 230struct page *
212follow_huge_pmd(struct mm_struct *mm, unsigned long address, 231follow_huge_pmd(struct mm_struct *mm, unsigned long address,
213 pmd_t *pmd, int write) 232 pmd_t *pmd, int write)
@@ -216,9 +235,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
216 235
217 page = pte_page(*(pte_t *)pmd); 236 page = pte_page(*(pte_t *)pmd);
218 if (page) 237 if (page)
219 page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); 238 page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
239 return page;
240}
241
242struct page *
243follow_huge_pud(struct mm_struct *mm, unsigned long address,
244 pud_t *pud, int write)
245{
246 struct page *page;
247
248 page = pte_page(*(pte_t *)pud);
249 if (page)
250 page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
220 return page; 251 return page;
221} 252}
253
222#endif 254#endif
223 255
224/* x86_64 also uses this file */ 256/* x86_64 also uses this file */
@@ -228,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
228 unsigned long addr, unsigned long len, 260 unsigned long addr, unsigned long len,
229 unsigned long pgoff, unsigned long flags) 261 unsigned long pgoff, unsigned long flags)
230{ 262{
263 struct hstate *h = hstate_file(file);
231 struct mm_struct *mm = current->mm; 264 struct mm_struct *mm = current->mm;
232 struct vm_area_struct *vma; 265 struct vm_area_struct *vma;
233 unsigned long start_addr; 266 unsigned long start_addr;
@@ -240,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file,
240 } 273 }
241 274
242full_search: 275full_search:
243 addr = ALIGN(start_addr, HPAGE_SIZE); 276 addr = ALIGN(start_addr, huge_page_size(h));
244 277
245 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { 278 for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
246 /* At this point: (!vma || addr < vma->vm_end). */ 279 /* At this point: (!vma || addr < vma->vm_end). */
@@ -262,7 +295,7 @@ full_search:
262 } 295 }
263 if (addr + mm->cached_hole_size < vma->vm_start) 296 if (addr + mm->cached_hole_size < vma->vm_start)
264 mm->cached_hole_size = vma->vm_start - addr; 297 mm->cached_hole_size = vma->vm_start - addr;
265 addr = ALIGN(vma->vm_end, HPAGE_SIZE); 298 addr = ALIGN(vma->vm_end, huge_page_size(h));
266 } 299 }
267} 300}
268 301
@@ -270,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file,
270 unsigned long addr0, unsigned long len, 303 unsigned long addr0, unsigned long len,
271 unsigned long pgoff, unsigned long flags) 304 unsigned long pgoff, unsigned long flags)
272{ 305{
306 struct hstate *h = hstate_file(file);
273 struct mm_struct *mm = current->mm; 307 struct mm_struct *mm = current->mm;
274 struct vm_area_struct *vma, *prev_vma; 308 struct vm_area_struct *vma, *prev_vma;
275 unsigned long base = mm->mmap_base, addr = addr0; 309 unsigned long base = mm->mmap_base, addr = addr0;
@@ -290,7 +324,7 @@ try_again:
290 goto fail; 324 goto fail;
291 325
292 /* either no address requested or cant fit in requested address hole */ 326 /* either no address requested or cant fit in requested address hole */
293 addr = (mm->free_area_cache - len) & HPAGE_MASK; 327 addr = (mm->free_area_cache - len) & huge_page_mask(h);
294 do { 328 do {
295 /* 329 /*
296 * Lookup failure means no vma is above this address, 330 * Lookup failure means no vma is above this address,
@@ -321,7 +355,7 @@ try_again:
321 largest_hole = vma->vm_start - addr; 355 largest_hole = vma->vm_start - addr;
322 356
323 /* try just below the current vma->vm_start */ 357 /* try just below the current vma->vm_start */
324 addr = (vma->vm_start - len) & HPAGE_MASK; 358 addr = (vma->vm_start - len) & huge_page_mask(h);
325 } while (len <= vma->vm_start); 359 } while (len <= vma->vm_start);
326 360
327fail: 361fail:
@@ -359,22 +393,23 @@ unsigned long
359hugetlb_get_unmapped_area(struct file *file, unsigned long addr, 393hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
360 unsigned long len, unsigned long pgoff, unsigned long flags) 394 unsigned long len, unsigned long pgoff, unsigned long flags)
361{ 395{
396 struct hstate *h = hstate_file(file);
362 struct mm_struct *mm = current->mm; 397 struct mm_struct *mm = current->mm;
363 struct vm_area_struct *vma; 398 struct vm_area_struct *vma;
364 399
365 if (len & ~HPAGE_MASK) 400 if (len & ~huge_page_mask(h))
366 return -EINVAL; 401 return -EINVAL;
367 if (len > TASK_SIZE) 402 if (len > TASK_SIZE)
368 return -ENOMEM; 403 return -ENOMEM;
369 404
370 if (flags & MAP_FIXED) { 405 if (flags & MAP_FIXED) {
371 if (prepare_hugepage_range(addr, len)) 406 if (prepare_hugepage_range(file, addr, len))
372 return -EINVAL; 407 return -EINVAL;
373 return addr; 408 return addr;
374 } 409 }
375 410
376 if (addr) { 411 if (addr) {
377 addr = ALIGN(addr, HPAGE_SIZE); 412 addr = ALIGN(addr, huge_page_size(h));
378 vma = find_vma(mm, addr); 413 vma = find_vma(mm, addr);
379 if (TASK_SIZE - len >= addr && 414 if (TASK_SIZE - len >= addr &&
380 (!vma || addr + len <= vma->vm_start)) 415 (!vma || addr + len <= vma->vm_start))
@@ -390,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
390 425
391#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ 426#endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/
392 427
428#ifdef CONFIG_X86_64
429static __init int setup_hugepagesz(char *opt)
430{
431 unsigned long ps = memparse(opt, &opt);
432 if (ps == PMD_SIZE) {
433 hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT);
434 } else if (ps == PUD_SIZE && cpu_has_gbpages) {
435 hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
436 } else {
437 printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n",
438 ps >> 20);
439 return 0;
440 }
441 return 1;
442}
443__setup("hugepagesz=", setup_hugepagesz);
444#endif
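
setup_hugepagesz() registers one hstate per accepted hugepagesz= argument: PMD-sized (2 MB) pages are always accepted, PUD-sized (1 GB) pages only when the CPU advertises gbpages. As an illustrative command line (not taken from this patch), booting with

    hugepagesz=1G hugepages=4 hugepagesz=2M hugepages=64

pre-allocates both pool sizes, assuming the machine supports 1 GB pages; otherwise the 1G request falls into the "Unsupported page size" error path above.
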
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 9689a5138e64..8396868e82c5 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -31,6 +31,7 @@
31#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32 32
33#include <asm/asm.h> 33#include <asm/asm.h>
34#include <asm/bios_ebda.h>
34#include <asm/processor.h> 35#include <asm/processor.h>
35#include <asm/system.h> 36#include <asm/system.h>
36#include <asm/uaccess.h> 37#include <asm/uaccess.h>
@@ -47,6 +48,7 @@
47#include <asm/paravirt.h> 48#include <asm/paravirt.h>
48#include <asm/setup.h> 49#include <asm/setup.h>
49#include <asm/cacheflush.h> 50#include <asm/cacheflush.h>
51#include <asm/smp.h>
50 52
51unsigned int __VMALLOC_RESERVE = 128 << 20; 53unsigned int __VMALLOC_RESERVE = 128 << 20;
52 54
@@ -194,11 +196,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
194 pgd_t *pgd; 196 pgd_t *pgd;
195 pmd_t *pmd; 197 pmd_t *pmd;
196 pte_t *pte; 198 pte_t *pte;
197 unsigned pages_2m = 0, pages_4k = 0; 199 unsigned pages_2m, pages_4k;
200 int mapping_iter;
201
202 /*
203 * First iteration will setup identity mapping using large/small pages
204 * based on use_pse, with other attributes same as set by
205 * the early code in head_32.S
206 *
207 * Second iteration will setup the appropriate attributes (NX, GLOBAL..)
208 * as desired for the kernel identity mapping.
209 *
210 * This two pass mechanism conforms to the TLB app note which says:
211 *
212 * "Software should not write to a paging-structure entry in a way
213 * that would change, for any linear address, both the page size
214 * and either the page frame or attributes."
215 */
216 mapping_iter = 1;
198 217
199 if (!cpu_has_pse) 218 if (!cpu_has_pse)
200 use_pse = 0; 219 use_pse = 0;
201 220
221repeat:
222 pages_2m = pages_4k = 0;
202 pfn = start_pfn; 223 pfn = start_pfn;
203 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET); 224 pgd_idx = pgd_index((pfn<<PAGE_SHIFT) + PAGE_OFFSET);
204 pgd = pgd_base + pgd_idx; 225 pgd = pgd_base + pgd_idx;
@@ -224,6 +245,13 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
224 if (use_pse) { 245 if (use_pse) {
225 unsigned int addr2; 246 unsigned int addr2;
226 pgprot_t prot = PAGE_KERNEL_LARGE; 247 pgprot_t prot = PAGE_KERNEL_LARGE;
248 /*
249 * first pass will use the same initial
250 * identity mapping attribute + _PAGE_PSE.
251 */
252 pgprot_t init_prot =
253 __pgprot(PTE_IDENT_ATTR |
254 _PAGE_PSE);
227 255
228 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE + 256 addr2 = (pfn + PTRS_PER_PTE-1) * PAGE_SIZE +
229 PAGE_OFFSET + PAGE_SIZE-1; 257 PAGE_OFFSET + PAGE_SIZE-1;
@@ -233,7 +261,10 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
233 prot = PAGE_KERNEL_LARGE_EXEC; 261 prot = PAGE_KERNEL_LARGE_EXEC;
234 262
235 pages_2m++; 263 pages_2m++;
236 set_pmd(pmd, pfn_pmd(pfn, prot)); 264 if (mapping_iter == 1)
265 set_pmd(pmd, pfn_pmd(pfn, init_prot));
266 else
267 set_pmd(pmd, pfn_pmd(pfn, prot));
237 268
238 pfn += PTRS_PER_PTE; 269 pfn += PTRS_PER_PTE;
239 continue; 270 continue;
@@ -245,17 +276,43 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base,
245 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn; 276 for (; pte_ofs < PTRS_PER_PTE && pfn < end_pfn;
246 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) { 277 pte++, pfn++, pte_ofs++, addr += PAGE_SIZE) {
247 pgprot_t prot = PAGE_KERNEL; 278 pgprot_t prot = PAGE_KERNEL;
279 /*
280 * first pass will use the same initial
281 * identity mapping attribute.
282 */
283 pgprot_t init_prot = __pgprot(PTE_IDENT_ATTR);
248 284
249 if (is_kernel_text(addr)) 285 if (is_kernel_text(addr))
250 prot = PAGE_KERNEL_EXEC; 286 prot = PAGE_KERNEL_EXEC;
251 287
252 pages_4k++; 288 pages_4k++;
253 set_pte(pte, pfn_pte(pfn, prot)); 289 if (mapping_iter == 1)
290 set_pte(pte, pfn_pte(pfn, init_prot));
291 else
292 set_pte(pte, pfn_pte(pfn, prot));
254 } 293 }
255 } 294 }
256 } 295 }
257 update_page_count(PG_LEVEL_2M, pages_2m); 296 if (mapping_iter == 1) {
258 update_page_count(PG_LEVEL_4K, pages_4k); 297 /*
298 * update direct mapping page count only in the first
299 * iteration.
300 */
301 update_page_count(PG_LEVEL_2M, pages_2m);
302 update_page_count(PG_LEVEL_4K, pages_4k);
303
304 /*
305 * local global flush tlb, which will flush the previous
306 * mappings present in both small and large page TLB's.
307 */
308 __flush_tlb_all();
309
310 /*
311 * Second iteration will set the actual desired PTE attributes.
312 */
313 mapping_iter = 2;
314 goto repeat;
315 }
259} 316}
260 317
261/* 318/*
@@ -458,11 +515,7 @@ static void __init pagetable_init(void)
458{ 515{
459 pgd_t *pgd_base = swapper_pg_dir; 516 pgd_t *pgd_base = swapper_pg_dir;
460 517
461 paravirt_pagetable_setup_start(pgd_base);
462
463 permanent_kmaps_init(pgd_base); 518 permanent_kmaps_init(pgd_base);
464
465 paravirt_pagetable_setup_done(pgd_base);
466} 519}
467 520
468#ifdef CONFIG_ACPI_SLEEP 521#ifdef CONFIG_ACPI_SLEEP
@@ -505,7 +558,7 @@ void zap_low_mappings(void)
505 558
506int nx_enabled; 559int nx_enabled;
507 560
508pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); 561pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
509EXPORT_SYMBOL_GPL(__supported_pte_mask); 562EXPORT_SYMBOL_GPL(__supported_pte_mask);
510 563
511#ifdef CONFIG_X86_PAE 564#ifdef CONFIG_X86_PAE
@@ -722,7 +775,7 @@ void __init setup_bootmem_allocator(void)
722 after_init_bootmem = 1; 775 after_init_bootmem = 1;
723} 776}
724 777
725static void __init find_early_table_space(unsigned long end) 778static void __init find_early_table_space(unsigned long end, int use_pse)
726{ 779{
727 unsigned long puds, pmds, ptes, tables, start; 780 unsigned long puds, pmds, ptes, tables, start;
728 781
@@ -732,7 +785,7 @@ static void __init find_early_table_space(unsigned long end)
732 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 785 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
733 tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); 786 tables += PAGE_ALIGN(pmds * sizeof(pmd_t));
734 787
735 if (cpu_has_pse) { 788 if (use_pse) {
736 unsigned long extra; 789 unsigned long extra;
737 790
738 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 791 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
@@ -772,12 +825,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
772 pgd_t *pgd_base = swapper_pg_dir; 825 pgd_t *pgd_base = swapper_pg_dir;
773 unsigned long start_pfn, end_pfn; 826 unsigned long start_pfn, end_pfn;
774 unsigned long big_page_start; 827 unsigned long big_page_start;
828#ifdef CONFIG_DEBUG_PAGEALLOC
829 /*
830 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
831 * This will simplify cpa(), which otherwise needs to support splitting
832 * large pages into small in interrupt context, etc.
833 */
834 int use_pse = 0;
835#else
836 int use_pse = cpu_has_pse;
837#endif
775 838
776 /* 839 /*
777 * Find space for the kernel direct mapping tables. 840 * Find space for the kernel direct mapping tables.
778 */ 841 */
779 if (!after_init_bootmem) 842 if (!after_init_bootmem)
780 find_early_table_space(end); 843 find_early_table_space(end, use_pse);
781 844
782#ifdef CONFIG_X86_PAE 845#ifdef CONFIG_X86_PAE
783 set_nx(); 846 set_nx();
@@ -823,7 +886,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
823 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); 886 end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT);
824 if (start_pfn < end_pfn) 887 if (start_pfn < end_pfn)
825 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, 888 kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn,
826 cpu_has_pse); 889 use_pse);
827 890
828 /* tail is not big page alignment ? */ 891 /* tail is not big page alignment ? */
829 start_pfn = end_pfn; 892 start_pfn = end_pfn;
@@ -844,6 +907,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
844 reserve_early(table_start << PAGE_SHIFT, 907 reserve_early(table_start << PAGE_SHIFT,
845 table_end << PAGE_SHIFT, "PGTABLE"); 908 table_end << PAGE_SHIFT, "PGTABLE");
846 909
910 if (!after_init_bootmem)
911 early_memtest(start, end);
912
847 return end >> PAGE_SHIFT; 913 return end >> PAGE_SHIFT;
848} 914}
849 915
@@ -868,8 +934,6 @@ void __init paging_init(void)
868 */ 934 */
869 sparse_init(); 935 sparse_init();
870 zone_sizes_init(); 936 zone_sizes_init();
871
872 paravirt_post_allocator_init();
873} 937}
874 938
875/* 939/*
@@ -906,6 +970,8 @@ void __init mem_init(void)
906 int codesize, reservedpages, datasize, initsize; 970 int codesize, reservedpages, datasize, initsize;
907 int tmp; 971 int tmp;
908 972
973 start_periodic_check_for_corruption();
974
909#ifdef CONFIG_FLATMEM 975#ifdef CONFIG_FLATMEM
910 BUG_ON(!mem_map); 976 BUG_ON(!mem_map);
911#endif 977#endif
@@ -985,7 +1051,6 @@ void __init mem_init(void)
985 if (boot_cpu_data.wp_works_ok < 0) 1051 if (boot_cpu_data.wp_works_ok < 0)
986 test_wp_bit(); 1052 test_wp_bit();
987 1053
988 cpa_init();
989 save_pg_dir(); 1054 save_pg_dir();
990 zap_low_mappings(); 1055 zap_low_mappings();
991} 1056}
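
The two-pass rewrite of kernel_physical_mapping_init() exists to honour the TLB application note quoted in the hunk: a live translation must not change page size and frame/attributes in a single update. A standalone, deliberately simplified C sketch of that ordering (nothing below is kernel code; the attribute values are invented):

#include <stdio.h>

struct entry { unsigned long frame; unsigned attrs; };

#define INIT_ATTRS  0x1u   /* stand-in for the early head_32.S attributes */
#define FINAL_ATTRS 0x3u   /* stand-in for the final attributes (NX, G, ...) */

static void flush_tlb_all_stub(void) { /* real code flushes the TLB here */ }

int main(void)
{
	struct entry table[4];
	int pass, i;

	for (pass = 1; pass <= 2; pass++) {
		for (i = 0; i < 4; i++) {
			table[i].frame = i;   /* the frame never changes */
			table[i].attrs = (pass == 1) ? INIT_ATTRS : FINAL_ATTRS;
		}
		if (pass == 1)
			flush_tlb_all_stub();   /* drop old translations before attrs change */
	}

	for (i = 0; i < 4; i++)
		printf("entry %d: frame=%lu attrs=%#x\n", i, table[i].frame, table[i].attrs);
	return 0;
}

The point is only the ordering: every entry is written once with the initial attributes, the stale translations are flushed, and only then are the final attributes written over the same frames.
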
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 306049edd553..b8e461d49412 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -31,6 +31,7 @@
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32 32
33#include <asm/processor.h> 33#include <asm/processor.h>
34#include <asm/bios_ebda.h>
34#include <asm/system.h> 35#include <asm/system.h>
35#include <asm/uaccess.h> 36#include <asm/uaccess.h>
36#include <asm/pgtable.h> 37#include <asm/pgtable.h>
@@ -60,7 +61,7 @@ static unsigned long dma_reserve __initdata;
60 61
61DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 62DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
62 63
63int direct_gbpages __meminitdata 64int direct_gbpages
64#ifdef CONFIG_DIRECT_GBPAGES 65#ifdef CONFIG_DIRECT_GBPAGES
65 = 1 66 = 1
66#endif 67#endif
@@ -86,46 +87,69 @@ early_param("gbpages", parse_direct_gbpages_on);
86 * around without checking the pgd every time. 87 * around without checking the pgd every time.
87 */ 88 */
88 89
89void show_mem(void) 90int after_bootmem;
90{
91 long i, total = 0, reserved = 0;
92 long shared = 0, cached = 0;
93 struct page *page;
94 pg_data_t *pgdat;
95
96 printk(KERN_INFO "Mem-info:\n");
97 show_free_areas();
98 for_each_online_pgdat(pgdat) {
99 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
100 /*
101 * This loop can take a while with 256 GB and
102 * 4k pages so defer the NMI watchdog:
103 */
104 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
105 touch_nmi_watchdog();
106 91
107 if (!pfn_valid(pgdat->node_start_pfn + i)) 92pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
108 continue; 93EXPORT_SYMBOL_GPL(__supported_pte_mask);
109 94
110 page = pfn_to_page(pgdat->node_start_pfn + i); 95static int do_not_nx __cpuinitdata;
111 total++; 96
112 if (PageReserved(page)) 97/*
113 reserved++; 98 * noexec=on|off
114 else if (PageSwapCache(page)) 99 * Control non-executable mappings for 64-bit processes.
115 cached++; 100 *
116 else if (page_count(page)) 101 * on Enable (default)
117 shared += page_count(page) - 1; 102 * off Disable
118 } 103 */
104static int __init nonx_setup(char *str)
105{
106 if (!str)
107 return -EINVAL;
108 if (!strncmp(str, "on", 2)) {
109 __supported_pte_mask |= _PAGE_NX;
110 do_not_nx = 0;
111 } else if (!strncmp(str, "off", 3)) {
112 do_not_nx = 1;
113 __supported_pte_mask &= ~_PAGE_NX;
119 } 114 }
120 printk(KERN_INFO "%lu pages of RAM\n", total); 115 return 0;
121 printk(KERN_INFO "%lu reserved pages\n", reserved);
122 printk(KERN_INFO "%lu pages shared\n", shared);
123 printk(KERN_INFO "%lu pages swap cached\n", cached);
124} 116}
117early_param("noexec", nonx_setup);
125 118
126int after_bootmem; 119void __cpuinit check_efer(void)
120{
121 unsigned long efer;
122
123 rdmsrl(MSR_EFER, efer);
124 if (!(efer & EFER_NX) || do_not_nx)
125 __supported_pte_mask &= ~_PAGE_NX;
126}
127
128int force_personality32;
129
130/*
131 * noexec32=on|off
132 * Control non executable heap for 32bit processes.
133 * To control the stack too use noexec=off
134 *
135 * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default)
136 * off PROT_READ implies PROT_EXEC
137 */
138static int __init nonx32_setup(char *str)
139{
140 if (!strcmp(str, "on"))
141 force_personality32 &= ~READ_IMPLIES_EXEC;
142 else if (!strcmp(str, "off"))
143 force_personality32 |= READ_IMPLIES_EXEC;
144 return 1;
145}
146__setup("noexec32=", nonx32_setup);
127 147
128static __init void *spp_getpage(void) 148/*
149 * NOTE: This function is marked __ref because it calls __init function
150 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
151 */
152static __ref void *spp_getpage(void)
129{ 153{
130 void *ptr; 154 void *ptr;
131 155
@@ -172,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte)
172 } 196 }
173 197
174 pte = pte_offset_kernel(pmd, vaddr); 198 pte = pte_offset_kernel(pmd, vaddr);
175 if (!pte_none(*pte) && pte_val(new_pte) &&
176 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
177 pte_ERROR(*pte);
178 set_pte(pte, new_pte); 199 set_pte(pte, new_pte);
179 200
180 /* 201 /*
@@ -258,7 +279,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
258void __init cleanup_highmap(void) 279void __init cleanup_highmap(void)
259{ 280{
260 unsigned long vaddr = __START_KERNEL_map; 281 unsigned long vaddr = __START_KERNEL_map;
261 unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; 282 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1;
262 pmd_t *pmd = level2_kernel_pgt; 283 pmd_t *pmd = level2_kernel_pgt;
263 pmd_t *last_pmd = pmd + PTRS_PER_PMD; 284 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
264 285
@@ -274,7 +295,7 @@ static unsigned long __initdata table_start;
274static unsigned long __meminitdata table_end; 295static unsigned long __meminitdata table_end;
275static unsigned long __meminitdata table_top; 296static unsigned long __meminitdata table_top;
276 297
277static __meminit void *alloc_low_page(unsigned long *phys) 298static __ref void *alloc_low_page(unsigned long *phys)
278{ 299{
279 unsigned long pfn = table_end++; 300 unsigned long pfn = table_end++;
280 void *adr; 301 void *adr;
@@ -289,13 +310,13 @@ static __meminit void *alloc_low_page(unsigned long *phys)
289 if (pfn >= table_top) 310 if (pfn >= table_top)
290 panic("alloc_low_page: ran out of memory"); 311 panic("alloc_low_page: ran out of memory");
291 312
292 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); 313 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
293 memset(adr, 0, PAGE_SIZE); 314 memset(adr, 0, PAGE_SIZE);
294 *phys = pfn * PAGE_SIZE; 315 *phys = pfn * PAGE_SIZE;
295 return adr; 316 return adr;
296} 317}
297 318
298static __meminit void unmap_low_page(void *adr) 319static __ref void unmap_low_page(void *adr)
299{ 320{
300 if (after_bootmem) 321 if (after_bootmem)
301 return; 322 return;
@@ -304,7 +325,8 @@ static __meminit void unmap_low_page(void *adr)
304} 325}
305 326
306static unsigned long __meminit 327static unsigned long __meminit
307phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) 328phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
329 pgprot_t prot)
308{ 330{
309 unsigned pages = 0; 331 unsigned pages = 0;
310 unsigned long last_map_addr = end; 332 unsigned long last_map_addr = end;
@@ -322,32 +344,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end)
322 break; 344 break;
323 } 345 }
324 346
347 /*
348 * We will re-use the existing mapping.
349 * Xen for example has some special requirements, like mapping
350 * pagetable pages as RO. So assume someone who pre-setup
351 * these mappings are more intelligent.
352 */
325 if (pte_val(*pte)) 353 if (pte_val(*pte))
326 continue; 354 continue;
327 355
328 if (0) 356 if (0)
329 printk(" pte=%p addr=%lx pte=%016lx\n", 357 printk(" pte=%p addr=%lx pte=%016lx\n",
330 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); 358 pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte);
331 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL));
332 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
333 pages++; 359 pages++;
360 set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot));
361 last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE;
334 } 362 }
363
335 update_page_count(PG_LEVEL_4K, pages); 364 update_page_count(PG_LEVEL_4K, pages);
336 365
337 return last_map_addr; 366 return last_map_addr;
338} 367}
339 368
340static unsigned long __meminit 369static unsigned long __meminit
341phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) 370phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
371 pgprot_t prot)
342{ 372{
343 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); 373 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
344 374
345 return phys_pte_init(pte, address, end); 375 return phys_pte_init(pte, address, end, prot);
346} 376}
347 377
348static unsigned long __meminit 378static unsigned long __meminit
349phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 379phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
350 unsigned long page_size_mask) 380 unsigned long page_size_mask, pgprot_t prot)
351{ 381{
352 unsigned long pages = 0; 382 unsigned long pages = 0;
353 unsigned long last_map_addr = end; 383 unsigned long last_map_addr = end;
@@ -358,6 +388,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
358 unsigned long pte_phys; 388 unsigned long pte_phys;
359 pmd_t *pmd = pmd_page + pmd_index(address); 389 pmd_t *pmd = pmd_page + pmd_index(address);
360 pte_t *pte; 390 pte_t *pte;
391 pgprot_t new_prot = prot;
361 392
362 if (address >= end) { 393 if (address >= end) {
363 if (!after_bootmem) { 394 if (!after_bootmem) {
@@ -368,25 +399,48 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
368 } 399 }
369 400
370 if (pmd_val(*pmd)) { 401 if (pmd_val(*pmd)) {
371 if (!pmd_large(*pmd)) 402 if (!pmd_large(*pmd)) {
403 spin_lock(&init_mm.page_table_lock);
372 last_map_addr = phys_pte_update(pmd, address, 404 last_map_addr = phys_pte_update(pmd, address,
373 end); 405 end, prot);
374 continue; 406 spin_unlock(&init_mm.page_table_lock);
407 continue;
408 }
409 /*
410 * If we are ok with PG_LEVEL_2M mapping, then we will
411 * use the existing mapping,
412 *
413 * Otherwise, we will split the large page mapping but
414 * use the same existing protection bits except for
415 * large page, so that we don't violate Intel's TLB
416 * Application note (317080) which says, while changing
417 * the page sizes, new and old translations should
418 * not differ with respect to page frame and
419 * attributes.
420 */
421 if (page_size_mask & (1 << PG_LEVEL_2M))
422 continue;
423 new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd));
375 } 424 }
376 425
377 if (page_size_mask & (1<<PG_LEVEL_2M)) { 426 if (page_size_mask & (1<<PG_LEVEL_2M)) {
378 pages++; 427 pages++;
428 spin_lock(&init_mm.page_table_lock);
379 set_pte((pte_t *)pmd, 429 set_pte((pte_t *)pmd,
380 pfn_pte(address >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 430 pfn_pte(address >> PAGE_SHIFT,
431 __pgprot(pgprot_val(prot) | _PAGE_PSE)));
432 spin_unlock(&init_mm.page_table_lock);
381 last_map_addr = (address & PMD_MASK) + PMD_SIZE; 433 last_map_addr = (address & PMD_MASK) + PMD_SIZE;
382 continue; 434 continue;
383 } 435 }
384 436
385 pte = alloc_low_page(&pte_phys); 437 pte = alloc_low_page(&pte_phys);
386 last_map_addr = phys_pte_init(pte, address, end); 438 last_map_addr = phys_pte_init(pte, address, end, new_prot);
387 unmap_low_page(pte); 439 unmap_low_page(pte);
388 440
441 spin_lock(&init_mm.page_table_lock);
389 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); 442 pmd_populate_kernel(&init_mm, pmd, __va(pte_phys));
443 spin_unlock(&init_mm.page_table_lock);
390 } 444 }
391 update_page_count(PG_LEVEL_2M, pages); 445 update_page_count(PG_LEVEL_2M, pages);
392 return last_map_addr; 446 return last_map_addr;
@@ -394,14 +448,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
394 448
395static unsigned long __meminit 449static unsigned long __meminit
396phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, 450phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
397 unsigned long page_size_mask) 451 unsigned long page_size_mask, pgprot_t prot)
398{ 452{
399 pmd_t *pmd = pmd_offset(pud, 0); 453 pmd_t *pmd = pmd_offset(pud, 0);
400 unsigned long last_map_addr; 454 unsigned long last_map_addr;
401 455
402 spin_lock(&init_mm.page_table_lock); 456 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
403 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask);
404 spin_unlock(&init_mm.page_table_lock);
405 __flush_tlb_all(); 457 __flush_tlb_all();
406 return last_map_addr; 458 return last_map_addr;
407} 459}
@@ -418,6 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
418 unsigned long pmd_phys; 470 unsigned long pmd_phys;
419 pud_t *pud = pud_page + pud_index(addr); 471 pud_t *pud = pud_page + pud_index(addr);
420 pmd_t *pmd; 472 pmd_t *pmd;
473 pgprot_t prot = PAGE_KERNEL;
421 474
422 if (addr >= end) 475 if (addr >= end)
423 break; 476 break;
@@ -429,30 +482,49 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
429 } 482 }
430 483
431 if (pud_val(*pud)) { 484 if (pud_val(*pud)) {
432 if (!pud_large(*pud)) 485 if (!pud_large(*pud)) {
433 last_map_addr = phys_pmd_update(pud, addr, end, 486 last_map_addr = phys_pmd_update(pud, addr, end,
434 page_size_mask); 487 page_size_mask, prot);
435 continue; 488 continue;
489 }
490 /*
491 * If we are ok with PG_LEVEL_1G mapping, then we will
492 * use the existing mapping.
493 *
494 * Otherwise, we will split the gbpage mapping but use
495 * the same existing protection bits except for large
496 * page, so that we don't violate Intel's TLB
497 * Application note (317080) which says, while changing
498 * the page sizes, new and old translations should
499 * not differ with respect to page frame and
500 * attributes.
501 */
502 if (page_size_mask & (1 << PG_LEVEL_1G))
503 continue;
504 prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud));
436 } 505 }
437 506
438 if (page_size_mask & (1<<PG_LEVEL_1G)) { 507 if (page_size_mask & (1<<PG_LEVEL_1G)) {
439 pages++; 508 pages++;
509 spin_lock(&init_mm.page_table_lock);
440 set_pte((pte_t *)pud, 510 set_pte((pte_t *)pud,
441 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE)); 511 pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL_LARGE));
512 spin_unlock(&init_mm.page_table_lock);
442 last_map_addr = (addr & PUD_MASK) + PUD_SIZE; 513 last_map_addr = (addr & PUD_MASK) + PUD_SIZE;
443 continue; 514 continue;
444 } 515 }
445 516
446 pmd = alloc_low_page(&pmd_phys); 517 pmd = alloc_low_page(&pmd_phys);
518 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask,
519 prot);
520 unmap_low_page(pmd);
447 521
448 spin_lock(&init_mm.page_table_lock); 522 spin_lock(&init_mm.page_table_lock);
449 last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask);
450 unmap_low_page(pmd);
451 pud_populate(&init_mm, pud, __va(pmd_phys)); 523 pud_populate(&init_mm, pud, __va(pmd_phys));
452 spin_unlock(&init_mm.page_table_lock); 524 spin_unlock(&init_mm.page_table_lock);
453
454 } 525 }
455 __flush_tlb_all(); 526 __flush_tlb_all();
527
456 update_page_count(PG_LEVEL_1G, pages); 528 update_page_count(PG_LEVEL_1G, pages);
457 529
458 return last_map_addr; 530 return last_map_addr;
@@ -469,27 +541,28 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
469 return phys_pud_init(pud, addr, end, page_size_mask); 541 return phys_pud_init(pud, addr, end, page_size_mask);
470} 542}
471 543
472static void __init find_early_table_space(unsigned long end) 544static void __init find_early_table_space(unsigned long end, int use_pse,
545 int use_gbpages)
473{ 546{
474 unsigned long puds, pmds, ptes, tables, start; 547 unsigned long puds, pmds, ptes, tables, start;
475 548
476 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 549 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
477 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); 550 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
478 if (direct_gbpages) { 551 if (use_gbpages) {
479 unsigned long extra; 552 unsigned long extra;
480 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); 553 extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT);
481 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; 554 pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT;
482 } else 555 } else
483 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; 556 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
484 tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); 557 tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE);
485 558
486 if (cpu_has_pse) { 559 if (use_pse) {
487 unsigned long extra; 560 unsigned long extra;
488 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); 561 extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT);
489 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; 562 ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT;
490 } else 563 } else
491 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; 564 ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;
492 tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); 565 tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE);
493 566
494 /* 567 /*
495 * RED-PEN putting page tables only on node 0 could 568 * RED-PEN putting page tables only on node 0 could
@@ -517,118 +590,6 @@ static void __init init_gbpages(void)
517 direct_gbpages = 0; 590 direct_gbpages = 0;
518} 591}
519 592
520#ifdef CONFIG_MEMTEST
521
522static void __init memtest(unsigned long start_phys, unsigned long size,
523 unsigned pattern)
524{
525 unsigned long i;
526 unsigned long *start;
527 unsigned long start_bad;
528 unsigned long last_bad;
529 unsigned long val;
530 unsigned long start_phys_aligned;
531 unsigned long count;
532 unsigned long incr;
533
534 switch (pattern) {
535 case 0:
536 val = 0UL;
537 break;
538 case 1:
539 val = -1UL;
540 break;
541 case 2:
542 val = 0x5555555555555555UL;
543 break;
544 case 3:
545 val = 0xaaaaaaaaaaaaaaaaUL;
546 break;
547 default:
548 return;
549 }
550
551 incr = sizeof(unsigned long);
552 start_phys_aligned = ALIGN(start_phys, incr);
553 count = (size - (start_phys_aligned - start_phys))/incr;
554 start = __va(start_phys_aligned);
555 start_bad = 0;
556 last_bad = 0;
557
558 for (i = 0; i < count; i++)
559 start[i] = val;
560 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
561 if (*start != val) {
562 if (start_phys_aligned == last_bad + incr) {
563 last_bad += incr;
564 } else {
565 if (start_bad) {
566 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
567 val, start_bad, last_bad + incr);
568 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
569 }
570 start_bad = last_bad = start_phys_aligned;
571 }
572 }
573 }
574 if (start_bad) {
575 printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved",
576 val, start_bad, last_bad + incr);
577 reserve_early(start_bad, last_bad - start_bad, "BAD RAM");
578 }
579
580}
581
582/* default is disabled */
583static int memtest_pattern __initdata;
584
585static int __init parse_memtest(char *arg)
586{
587 if (arg)
588 memtest_pattern = simple_strtoul(arg, NULL, 0);
589 return 0;
590}
591
592early_param("memtest", parse_memtest);
593
594static void __init early_memtest(unsigned long start, unsigned long end)
595{
596 u64 t_start, t_size;
597 unsigned pattern;
598
599 if (!memtest_pattern)
600 return;
601
602 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
603 for (pattern = 0; pattern < memtest_pattern; pattern++) {
604 t_start = start;
605 t_size = 0;
606 while (t_start < end) {
607 t_start = find_e820_area_size(t_start, &t_size, 1);
608
609 /* done ? */
610 if (t_start >= end)
611 break;
612 if (t_start + t_size > end)
613 t_size = end - t_start;
614
615 printk(KERN_CONT "\n %016llx - %016llx pattern %d",
616 (unsigned long long)t_start,
617 (unsigned long long)t_start + t_size, pattern);
618
619 memtest(t_start, t_size, pattern);
620
621 t_start += t_size;
622 }
623 }
624 printk(KERN_CONT "\n");
625}
626#else
627static void __init early_memtest(unsigned long start, unsigned long end)
628{
629}
630#endif
631
632static unsigned long __init kernel_physical_mapping_init(unsigned long start, 593static unsigned long __init kernel_physical_mapping_init(unsigned long start,
633 unsigned long end, 594 unsigned long end,
634 unsigned long page_size_mask) 595 unsigned long page_size_mask)
@@ -654,17 +615,16 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start,
654 continue; 615 continue;
655 } 616 }
656 617
657 if (after_bootmem) 618 pud = alloc_low_page(&pud_phys);
658 pud = pud_offset(pgd, start & PGDIR_MASK);
659 else
660 pud = alloc_low_page(&pud_phys);
661
662 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), 619 last_map_addr = phys_pud_init(pud, __pa(start), __pa(next),
663 page_size_mask); 620 page_size_mask);
664 unmap_low_page(pud); 621 unmap_low_page(pud);
665 pgd_populate(&init_mm, pgd_offset_k(start), 622
666 __va(pud_phys)); 623 spin_lock(&init_mm.page_table_lock);
624 pgd_populate(&init_mm, pgd, __va(pud_phys));
625 spin_unlock(&init_mm.page_table_lock);
667 } 626 }
627 __flush_tlb_all();
668 628
669 return last_map_addr; 629 return last_map_addr;
670} 630}
@@ -708,6 +668,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
708 668
709 struct map_range mr[NR_RANGE_MR]; 669 struct map_range mr[NR_RANGE_MR];
710 int nr_range, i; 670 int nr_range, i;
671 int use_pse, use_gbpages;
711 672
712 printk(KERN_INFO "init_memory_mapping\n"); 673 printk(KERN_INFO "init_memory_mapping\n");
713 674
@@ -721,9 +682,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
721 if (!after_bootmem) 682 if (!after_bootmem)
722 init_gbpages(); 683 init_gbpages();
723 684
724 if (direct_gbpages) 685#ifdef CONFIG_DEBUG_PAGEALLOC
686 /*
687 * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages.
688 * This will simplify cpa(), which otherwise needs to support splitting
689 * large pages into small in interrupt context, etc.
690 */
691 use_pse = use_gbpages = 0;
692#else
693 use_pse = cpu_has_pse;
694 use_gbpages = direct_gbpages;
695#endif
696
697 if (use_gbpages)
725 page_size_mask |= 1 << PG_LEVEL_1G; 698 page_size_mask |= 1 << PG_LEVEL_1G;
726 if (cpu_has_pse) 699 if (use_pse)
727 page_size_mask |= 1 << PG_LEVEL_2M; 700 page_size_mask |= 1 << PG_LEVEL_2M;
728 701
729 memset(mr, 0, sizeof(mr)); 702 memset(mr, 0, sizeof(mr));
@@ -773,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
773 old_start = mr[i].start; 746 old_start = mr[i].start;
774 memmove(&mr[i], &mr[i+1], 747 memmove(&mr[i], &mr[i+1],
775 (nr_range - 1 - i) * sizeof (struct map_range)); 748 (nr_range - 1 - i) * sizeof (struct map_range));
776 mr[i].start = old_start; 749 mr[i--].start = old_start;
777 nr_range--; 750 nr_range--;
778 } 751 }
779 752
@@ -784,7 +757,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
784 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k")); 757 (mr[i].page_size_mask & (1<<PG_LEVEL_2M))?"2M":"4k"));
785 758
786 if (!after_bootmem) 759 if (!after_bootmem)
787 find_early_table_space(end); 760 find_early_table_space(end, use_pse, use_gbpages);
788 761
789 for (i = 0; i < nr_range; i++) 762 for (i = 0; i < nr_range; i++)
790 last_map_addr = kernel_physical_mapping_init( 763 last_map_addr = kernel_physical_mapping_init(
@@ -906,6 +879,8 @@ void __init mem_init(void)
906{ 879{
907 long codesize, reservedpages, datasize, initsize; 880 long codesize, reservedpages, datasize, initsize;
908 881
882 start_periodic_check_for_corruption();
883
909 pci_iommu_alloc(); 884 pci_iommu_alloc();
910 885
911 /* clear_bss() already clear the empty_zero_page */ 886 /* clear_bss() already clear the empty_zero_page */
@@ -943,8 +918,6 @@ void __init mem_init(void)
943 reservedpages << (PAGE_SHIFT-10), 918 reservedpages << (PAGE_SHIFT-10),
944 datasize >> 10, 919 datasize >> 10,
945 initsize >> 10); 920 initsize >> 10);
946
947 cpa_init();
948} 921}
949 922
950void free_init_pages(char *what, unsigned long begin, unsigned long end) 923void free_init_pages(char *what, unsigned long begin, unsigned long end)
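
With the noexec=/noexec32= handlers and check_efer() now living in init_64.c, the NX policy reads as follows: noexec=off clears _PAGE_NX from __supported_pte_mask regardless of EFER.NX, noexec32=off makes PROT_READ imply PROT_EXEC for 32-bit processes, and check_efer() drops _PAGE_NX anyway when the CPU's EFER lacks NX support. For example (illustrative, not from this patch), booting with noexec=off disables NX enforcement for 64-bit processes even on NX-capable hardware.
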
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 24c1d3c30186..ae71e11eb3e5 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,18 +24,47 @@
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26 26
27static inline int phys_addr_valid(unsigned long addr)
28{
29 return addr < (1UL << boot_cpu_data.x86_phys_bits);
30}
31
27unsigned long __phys_addr(unsigned long x) 32unsigned long __phys_addr(unsigned long x)
28{ 33{
29 if (x >= __START_KERNEL_map) 34 if (x >= __START_KERNEL_map) {
30 return x - __START_KERNEL_map + phys_base; 35 x -= __START_KERNEL_map;
31 return x - PAGE_OFFSET; 36 VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE);
37 x += phys_base;
38 } else {
39 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
40 x -= PAGE_OFFSET;
41 VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM :
42 !phys_addr_valid(x));
43 }
44 return x;
32} 45}
33EXPORT_SYMBOL(__phys_addr); 46EXPORT_SYMBOL(__phys_addr);
34 47
35static inline int phys_addr_valid(unsigned long addr) 48bool __virt_addr_valid(unsigned long x)
36{ 49{
37 return addr < (1UL << boot_cpu_data.x86_phys_bits); 50 if (x >= __START_KERNEL_map) {
51 x -= __START_KERNEL_map;
52 if (x >= KERNEL_IMAGE_SIZE)
53 return false;
54 x += phys_base;
55 } else {
56 if (x < PAGE_OFFSET)
57 return false;
58 x -= PAGE_OFFSET;
59 if (system_state == SYSTEM_BOOTING ?
60 x > MAXMEM : !phys_addr_valid(x)) {
61 return false;
62 }
63 }
64
65 return pfn_valid(x >> PAGE_SHIFT);
38} 66}
67EXPORT_SYMBOL(__virt_addr_valid);
39 68
40#else 69#else
41 70
@@ -44,6 +73,28 @@ static inline int phys_addr_valid(unsigned long addr)
44 return 1; 73 return 1;
45} 74}
46 75
76#ifdef CONFIG_DEBUG_VIRTUAL
77unsigned long __phys_addr(unsigned long x)
78{
79 /* VMALLOC_* aren't constants; not available at the boot time */
80 VIRTUAL_BUG_ON(x < PAGE_OFFSET);
81 VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING &&
82 is_vmalloc_addr((void *) x));
83 return x - PAGE_OFFSET;
84}
85EXPORT_SYMBOL(__phys_addr);
86#endif
87
88bool __virt_addr_valid(unsigned long x)
89{
90 if (x < PAGE_OFFSET)
91 return false;
92 if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x))
93 return false;
94 return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT);
95}
96EXPORT_SYMBOL(__virt_addr_valid);
97
47#endif 98#endif
48 99
49int page_is_ram(unsigned long pagenr) 100int page_is_ram(unsigned long pagenr)
@@ -83,6 +134,25 @@ int page_is_ram(unsigned long pagenr)
83 return 0; 134 return 0;
84} 135}
85 136
137int pagerange_is_ram(unsigned long start, unsigned long end)
138{
139 int ram_page = 0, not_rampage = 0;
140 unsigned long page_nr;
141
142 for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT);
143 ++page_nr) {
144 if (page_is_ram(page_nr))
145 ram_page = 1;
146 else
147 not_rampage = 1;
148
149 if (ram_page == not_rampage)
150 return -1;
151 }
152
153 return ram_page;
154}
155
86/* 156/*
87 * Fix up the linear direct mapping of the kernel to avoid cache attribute 157 * Fix up the linear direct mapping of the kernel to avoid cache attribute
88 * conflicts. 158 * conflicts.
@@ -150,6 +220,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
150 return (__force void __iomem *)phys_to_virt(phys_addr); 220 return (__force void __iomem *)phys_to_virt(phys_addr);
151 221
152 /* 222 /*
223 * Check if the request spans more than any BAR in the iomem resource
224 * tree.
225 */
226 WARN_ON(iomem_map_sanity_check(phys_addr, size));
227
228 /*
153 * Don't allow anybody to remap normal RAM that we're using.. 229 * Don't allow anybody to remap normal RAM that we're using..
154 */ 230 */
155 for (pfn = phys_addr >> PAGE_SHIFT; 231 for (pfn = phys_addr >> PAGE_SHIFT;
@@ -170,7 +246,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 phys_addr &= PAGE_MASK; 246 phys_addr &= PAGE_MASK;
171 size = PAGE_ALIGN(last_addr+1) - phys_addr; 247 size = PAGE_ALIGN(last_addr+1) - phys_addr;
172 248
173 retval = reserve_memtype(phys_addr, phys_addr + size, 249 retval = reserve_memtype(phys_addr, (u64)phys_addr + size,
174 prot_val, &new_prot_val); 250 prot_val, &new_prot_val);
175 if (retval) { 251 if (retval) {
176 pr_debug("Warning: reserve_memtype returned %d\n", retval); 252 pr_debug("Warning: reserve_memtype returned %d\n", retval);
@@ -204,16 +280,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
204 switch (prot_val) { 280 switch (prot_val) {
205 case _PAGE_CACHE_UC: 281 case _PAGE_CACHE_UC:
206 default: 282 default:
207 prot = PAGE_KERNEL_NOCACHE; 283 prot = PAGE_KERNEL_IO_NOCACHE;
208 break; 284 break;
209 case _PAGE_CACHE_UC_MINUS: 285 case _PAGE_CACHE_UC_MINUS:
210 prot = PAGE_KERNEL_UC_MINUS; 286 prot = PAGE_KERNEL_IO_UC_MINUS;
211 break; 287 break;
212 case _PAGE_CACHE_WC: 288 case _PAGE_CACHE_WC:
213 prot = PAGE_KERNEL_WC; 289 prot = PAGE_KERNEL_IO_WC;
214 break; 290 break;
215 case _PAGE_CACHE_WB: 291 case _PAGE_CACHE_WB:
216 prot = PAGE_KERNEL; 292 prot = PAGE_KERNEL_IO;
217 break; 293 break;
218 } 294 }
219 295
@@ -330,6 +406,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr,
330 return (void __iomem *)ret; 406 return (void __iomem *)ret;
331} 407}
332 408
409void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
410 unsigned long prot_val)
411{
412 return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK),
413 __builtin_return_address(0));
414}
415EXPORT_SYMBOL(ioremap_prot);
416
333/** 417/**
334 * iounmap - Free a IO remapping 418 * iounmap - Free a IO remapping
335 * @addr: virtual address from ioremap_* 419 * @addr: virtual address from ioremap_*
@@ -413,7 +497,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr)
413 return; 497 return;
414} 498}
415 499
416int __initdata early_ioremap_debug; 500static int __initdata early_ioremap_debug;
417 501
418static int __init early_ioremap_debug_setup(char *str) 502static int __init early_ioremap_debug_setup(char *str)
419{ 503{
@@ -522,12 +606,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx,
522} 606}
523 607
524static inline void __init early_set_fixmap(enum fixed_addresses idx, 608static inline void __init early_set_fixmap(enum fixed_addresses idx,
525 unsigned long phys) 609 unsigned long phys, pgprot_t prot)
526{ 610{
527 if (after_paging_init) 611 if (after_paging_init)
528 set_fixmap(idx, phys); 612 __set_fixmap(idx, phys, prot);
529 else 613 else
530 __early_set_fixmap(idx, phys, PAGE_KERNEL); 614 __early_set_fixmap(idx, phys, prot);
531} 615}
532 616
533static inline void __init early_clear_fixmap(enum fixed_addresses idx) 617static inline void __init early_clear_fixmap(enum fixed_addresses idx)
@@ -538,37 +622,56 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
538 __early_set_fixmap(idx, 0, __pgprot(0)); 622 __early_set_fixmap(idx, 0, __pgprot(0));
539} 623}
540 624
541 625static void *prev_map[FIX_BTMAPS_SLOTS] __initdata;
542int __initdata early_ioremap_nested; 626static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
543
544static int __init check_early_ioremap_leak(void) 627static int __init check_early_ioremap_leak(void)
545{ 628{
546 if (!early_ioremap_nested) 629 int count = 0;
547 return 0; 630 int i;
548 631
549 printk(KERN_WARNING 632 for (i = 0; i < FIX_BTMAPS_SLOTS; i++)
633 if (prev_map[i])
634 count++;
635
636 if (!count)
637 return 0;
638 WARN(1, KERN_WARNING
550 "Debug warning: early ioremap leak of %d areas detected.\n", 639 "Debug warning: early ioremap leak of %d areas detected.\n",
551 early_ioremap_nested); 640 count);
552 printk(KERN_WARNING 641 printk(KERN_WARNING
553 "please boot with early_ioremap_debug and report the dmesg.\n"); 642 "please boot with early_ioremap_debug and report the dmesg.\n");
554 WARN_ON(1);
555 643
556 return 1; 644 return 1;
557} 645}
558late_initcall(check_early_ioremap_leak); 646late_initcall(check_early_ioremap_leak);
559 647
560void __init *early_ioremap(unsigned long phys_addr, unsigned long size) 648static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot)
561{ 649{
562 unsigned long offset, last_addr; 650 unsigned long offset, last_addr;
563 unsigned int nrpages, nesting; 651 unsigned int nrpages;
564 enum fixed_addresses idx0, idx; 652 enum fixed_addresses idx0, idx;
653 int i, slot;
565 654
566 WARN_ON(system_state != SYSTEM_BOOTING); 655 WARN_ON(system_state != SYSTEM_BOOTING);
567 656
568 nesting = early_ioremap_nested; 657 slot = -1;
658 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
659 if (!prev_map[i]) {
660 slot = i;
661 break;
662 }
663 }
664
665 if (slot < 0) {
666 printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n",
667 phys_addr, size);
668 WARN_ON(1);
669 return NULL;
670 }
671
569 if (early_ioremap_debug) { 672 if (early_ioremap_debug) {
570 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", 673 printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ",
571 phys_addr, size, nesting); 674 phys_addr, size, slot);
572 dump_stack(); 675 dump_stack();
573 } 676 }
574 677
@@ -579,17 +682,13 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
579 return NULL; 682 return NULL;
580 } 683 }
581 684
582 if (nesting >= FIX_BTMAPS_NESTING) { 685 prev_size[slot] = size;
583 WARN_ON(1);
584 return NULL;
585 }
586 early_ioremap_nested++;
587 /* 686 /*
588 * Mappings have to be page-aligned 687 * Mappings have to be page-aligned
589 */ 688 */
590 offset = phys_addr & ~PAGE_MASK; 689 offset = phys_addr & ~PAGE_MASK;
591 phys_addr &= PAGE_MASK; 690 phys_addr &= PAGE_MASK;
592 size = PAGE_ALIGN(last_addr) - phys_addr; 691 size = PAGE_ALIGN(last_addr + 1) - phys_addr;
593 692
594 /* 693 /*
595 * Mappings have to fit in the FIX_BTMAP area. 694 * Mappings have to fit in the FIX_BTMAP area.
@@ -603,10 +702,10 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
603 /* 702 /*
604 * Ok, go for it.. 703 * Ok, go for it..
605 */ 704 */
606 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 705 idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
607 idx = idx0; 706 idx = idx0;
608 while (nrpages > 0) { 707 while (nrpages > 0) {
609 early_set_fixmap(idx, phys_addr); 708 early_set_fixmap(idx, phys_addr, prot);
610 phys_addr += PAGE_SIZE; 709 phys_addr += PAGE_SIZE;
611 --idx; 710 --idx;
612 --nrpages; 711 --nrpages;
@@ -614,7 +713,20 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
614 if (early_ioremap_debug) 713 if (early_ioremap_debug)
615 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); 714 printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0));
616 715
617 return (void *) (offset + fix_to_virt(idx0)); 716 prev_map[slot] = (void *) (offset + fix_to_virt(idx0));
717 return prev_map[slot];
718}
719
720/* Remap an IO device */
721void __init *early_ioremap(unsigned long phys_addr, unsigned long size)
722{
723 return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO);
724}
725
726/* Remap memory */
727void __init *early_memremap(unsigned long phys_addr, unsigned long size)
728{
729 return __early_ioremap(phys_addr, size, PAGE_KERNEL);
618} 730}
619 731
620void __init early_iounmap(void *addr, unsigned long size) 732void __init early_iounmap(void *addr, unsigned long size)
@@ -623,15 +735,33 @@ void __init early_iounmap(void *addr, unsigned long size)
623 unsigned long offset; 735 unsigned long offset;
624 unsigned int nrpages; 736 unsigned int nrpages;
625 enum fixed_addresses idx; 737 enum fixed_addresses idx;
626 int nesting; 738 int i, slot;
739
740 slot = -1;
741 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
742 if (prev_map[i] == addr) {
743 slot = i;
744 break;
745 }
746 }
627 747
628 nesting = --early_ioremap_nested; 748 if (slot < 0) {
629 if (WARN_ON(nesting < 0)) 749 printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n",
750 addr, size);
751 WARN_ON(1);
630 return; 752 return;
753 }
754
755 if (prev_size[slot] != size) {
756 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n",
757 addr, size, slot, prev_size[slot]);
758 WARN_ON(1);
759 return;
760 }
631 761
632 if (early_ioremap_debug) { 762 if (early_ioremap_debug) {
633 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, 763 printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr,
634 size, nesting); 764 size, slot);
635 dump_stack(); 765 dump_stack();
636 } 766 }
637 767
@@ -643,12 +773,13 @@ void __init early_iounmap(void *addr, unsigned long size)
643 offset = virt_addr & ~PAGE_MASK; 773 offset = virt_addr & ~PAGE_MASK;
644 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; 774 nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT;
645 775
646 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; 776 idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot;
647 while (nrpages > 0) { 777 while (nrpages > 0) {
648 early_clear_fixmap(idx); 778 early_clear_fixmap(idx);
649 --idx; 779 --idx;
650 --nrpages; 780 --nrpages;
651 } 781 }
782 prev_map[slot] = 0;
652} 783}
653 784
654void __this_fixmap_does_not_exist(void) 785void __this_fixmap_does_not_exist(void)
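
The ioremap.c changes above do several things: __phys_addr() grows VIRTUAL_BUG_ON() debug checks and a matching __virt_addr_valid() helper, pagerange_is_ram() is added, __ioremap_caller() switches to the new PAGE_KERNEL_IO_* protections and gains an exported ioremap_prot(), and the boot-time fixmap code replaces its nesting counter with a small table of FIX_BTMAPS_SLOTS slots, adding early_memremap() for cacheable mappings and making early_iounmap() check that the size matches the one recorded at map time. A hedged usage sketch of the slot-based boot mappings follows; the physical address, length, and function name are made up for illustration, and the header locations are approximate.

#include <linux/errno.h>
#include <linux/init.h>
#include <asm/io.h>

/* Hypothetical early-boot consumer of a firmware table. */
static int __init peek_at_firmware_table(void)
{
	unsigned long phys = 0xfd000;	/* hypothetical table address */
	unsigned long len  = 0x400;	/* hypothetical table length  */
	void *virt;

	/* Cacheable mapping of RAM: uses PAGE_KERNEL and one FIX_BTMAPS slot. */
	virt = early_memremap(phys, len);
	if (!virt)
		return -ENOMEM;

	/* ... parse the table through 'virt' ... */

	/* Must pass the same size; the per-slot bookkeeping verifies it. */
	early_iounmap(virt, len);
	return 0;
}
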
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
new file mode 100644
index 000000000000..9cab18b0b857
--- /dev/null
+++ b/arch/x86/mm/memtest.c
@@ -0,0 +1,122 @@
1#include <linux/kernel.h>
2#include <linux/errno.h>
3#include <linux/string.h>
4#include <linux/types.h>
5#include <linux/mm.h>
6#include <linux/smp.h>
7#include <linux/init.h>
8#include <linux/pfn.h>
9
10#include <asm/e820.h>
11
12static void __init memtest(unsigned long start_phys, unsigned long size,
13 unsigned pattern)
14{
15 unsigned long i;
16 unsigned long *start;
17 unsigned long start_bad;
18 unsigned long last_bad;
19 unsigned long val;
20 unsigned long start_phys_aligned;
21 unsigned long count;
22 unsigned long incr;
23
24 switch (pattern) {
25 case 0:
26 val = 0UL;
27 break;
28 case 1:
29 val = -1UL;
30 break;
31 case 2:
32#ifdef CONFIG_X86_64
33 val = 0x5555555555555555UL;
34#else
35 val = 0x55555555UL;
36#endif
37 break;
38 case 3:
39#ifdef CONFIG_X86_64
40 val = 0xaaaaaaaaaaaaaaaaUL;
41#else
42 val = 0xaaaaaaaaUL;
43#endif
44 break;
45 default:
46 return;
47 }
48
49 incr = sizeof(unsigned long);
50 start_phys_aligned = ALIGN(start_phys, incr);
51 count = (size - (start_phys_aligned - start_phys))/incr;
52 start = __va(start_phys_aligned);
53 start_bad = 0;
54 last_bad = 0;
55
56 for (i = 0; i < count; i++)
57 start[i] = val;
58 for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
59 if (*start != val) {
60 if (start_phys_aligned == last_bad + incr) {
61 last_bad += incr;
62 } else {
63 if (start_bad) {
64 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
65 val, start_bad, last_bad + incr);
66 reserve_early(start_bad, last_bad + incr, "BAD RAM");
67 }
68 start_bad = last_bad = start_phys_aligned;
69 }
70 }
71 }
72 if (start_bad) {
73 printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved",
74 val, start_bad, last_bad + incr);
75 reserve_early(start_bad, last_bad + incr, "BAD RAM");
76 }
77}
78
79/* default is disabled */
80static int memtest_pattern __initdata;
81
82static int __init parse_memtest(char *arg)
83{
84 if (arg)
85 memtest_pattern = simple_strtoul(arg, NULL, 0);
86 return 0;
87}
88
89early_param("memtest", parse_memtest);
90
91void __init early_memtest(unsigned long start, unsigned long end)
92{
93 u64 t_start, t_size;
94 unsigned pattern;
95
96 if (!memtest_pattern)
97 return;
98
99 printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern);
100 for (pattern = 0; pattern < memtest_pattern; pattern++) {
101 t_start = start;
102 t_size = 0;
103 while (t_start < end) {
104 t_start = find_e820_area_size(t_start, &t_size, 1);
105
106 /* done ? */
107 if (t_start >= end)
108 break;
109 if (t_start + t_size > end)
110 t_size = end - t_start;
111
112 printk(KERN_CONT "\n %010llx - %010llx pattern %d",
113 (unsigned long long)t_start,
114 (unsigned long long)t_start + t_size, pattern);
115
116 memtest(t_start, t_size, pattern);
117
118 t_start += t_size;
119 }
120 }
121 printk(KERN_CONT "\n");
122}
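
memtest.c is new: booting with memtest=N fills free e820 RAM with up to four patterns (all zeroes, all ones, 0x55..., 0xaa...), reads each back, and reserves any failing range as "BAD RAM". The heart of it is the write/verify/coalesce pass; below is a plain-C rendering of that logic, a sketch only, with printf standing in for the kernel's reserve_early() call and the surrounding e820 iteration omitted.

#include <stdio.h>

/* Sketch of one memtest pass: fill 'count' words at 'start' with 'val',
 * read them back, and coalesce consecutive failing words into ranges. */
void verify_pattern(unsigned long *start, unsigned long count,
		    unsigned long val, unsigned long phys)
{
	unsigned long i, incr = sizeof(unsigned long);
	unsigned long start_bad = 0, last_bad = 0;

	for (i = 0; i < count; i++)
		start[i] = val;

	for (i = 0; i < count; i++, phys += incr) {
		if (start[i] == val)
			continue;
		if (phys == last_bad + incr) {
			/* Extends the bad range we are already tracking. */
			last_bad += incr;
		} else {
			/* New bad range: report the previous one first. */
			if (start_bad)
				printf("bad mem %#lx - %#lx\n",
				       start_bad, last_bad + incr);
			start_bad = last_bad = phys;
		}
	}
	if (start_bad)
		printf("bad mem %#lx - %#lx\n", start_bad, last_bad + incr);
}
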
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index e7397e108beb..2c4baa88f2cb 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -56,13 +56,6 @@ struct remap_trace {
56static DEFINE_PER_CPU(struct trap_reason, pf_reason); 56static DEFINE_PER_CPU(struct trap_reason, pf_reason);
57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); 57static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace);
58 58
59#if 0 /* XXX: no way gather this info anymore */
60/* Access to this is not per-cpu. */
61static DEFINE_PER_CPU(atomic_t, dropped);
62#endif
63
64static struct dentry *marker_file;
65
66static DEFINE_MUTEX(mmiotrace_mutex); 59static DEFINE_MUTEX(mmiotrace_mutex);
67static DEFINE_SPINLOCK(trace_lock); 60static DEFINE_SPINLOCK(trace_lock);
68static atomic_t mmiotrace_enabled; 61static atomic_t mmiotrace_enabled;
@@ -75,7 +68,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */
75 * and trace_lock. 68 * and trace_lock.
76 * - Routines depending on is_enabled() must take trace_lock. 69 * - Routines depending on is_enabled() must take trace_lock.
77 * - trace_list users must hold trace_lock. 70 * - trace_list users must hold trace_lock.
78 * - is_enabled() guarantees that mmio_trace_record is allowed. 71 * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed.
79 * - pre/post callbacks assume the effect of is_enabled() being true. 72 * - pre/post callbacks assume the effect of is_enabled() being true.
80 */ 73 */
81 74
@@ -97,44 +90,6 @@ static bool is_enabled(void)
97 return atomic_read(&mmiotrace_enabled); 90 return atomic_read(&mmiotrace_enabled);
98} 91}
99 92
100#if 0 /* XXX: needs rewrite */
101/*
102 * Write callback for the debugfs entry:
103 * Read a marker and write it to the mmio trace log
104 */
105static ssize_t write_marker(struct file *file, const char __user *buffer,
106 size_t count, loff_t *ppos)
107{
108 char *event = NULL;
109 struct mm_io_header *headp;
110 ssize_t len = (count > 65535) ? 65535 : count;
111
112 event = kzalloc(sizeof(*headp) + len, GFP_KERNEL);
113 if (!event)
114 return -ENOMEM;
115
116 headp = (struct mm_io_header *)event;
117 headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT);
118 headp->data_len = len;
119
120 if (copy_from_user(event + sizeof(*headp), buffer, len)) {
121 kfree(event);
122 return -EFAULT;
123 }
124
125 spin_lock_irq(&trace_lock);
126#if 0 /* XXX: convert this to use tracing */
127 if (is_enabled())
128 relay_write(chan, event, sizeof(*headp) + len);
129 else
130#endif
131 len = -EINVAL;
132 spin_unlock_irq(&trace_lock);
133 kfree(event);
134 return len;
135}
136#endif
137
138static void print_pte(unsigned long address) 93static void print_pte(unsigned long address)
139{ 94{
140 unsigned int level; 95 unsigned int level;
@@ -307,8 +262,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
307 map.map_id = trace->id; 262 map.map_id = trace->id;
308 263
309 spin_lock_irq(&trace_lock); 264 spin_lock_irq(&trace_lock);
310 if (!is_enabled()) 265 if (!is_enabled()) {
266 kfree(trace);
311 goto not_enabled; 267 goto not_enabled;
268 }
312 269
313 mmio_trace_mapping(&map); 270 mmio_trace_mapping(&map);
314 list_add_tail(&trace->list, &trace_list); 271 list_add_tail(&trace->list, &trace_list);
@@ -377,6 +334,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr)
377 iounmap_trace_core(addr); 334 iounmap_trace_core(addr);
378} 335}
379 336
337int mmiotrace_printk(const char *fmt, ...)
338{
339 int ret = 0;
340 va_list args;
341 unsigned long flags;
342 va_start(args, fmt);
343
344 spin_lock_irqsave(&trace_lock, flags);
345 if (is_enabled())
346 ret = mmio_trace_printk(fmt, args);
347 spin_unlock_irqrestore(&trace_lock, flags);
348
349 va_end(args);
350 return ret;
351}
352EXPORT_SYMBOL(mmiotrace_printk);
353
380static void clear_trace_list(void) 354static void clear_trace_list(void)
381{ 355{
382 struct remap_trace *trace; 356 struct remap_trace *trace;
@@ -430,7 +404,9 @@ static void enter_uniprocessor(void)
430 "may miss events.\n"); 404 "may miss events.\n");
431} 405}
432 406
433static void leave_uniprocessor(void) 407/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
408 but this whole function is ifdefed CONFIG_HOTPLUG_CPU */
409static void __ref leave_uniprocessor(void)
434{ 410{
435 int cpu; 411 int cpu;
436 int err; 412 int err;
@@ -460,26 +436,12 @@ static void leave_uniprocessor(void)
460} 436}
461#endif 437#endif
462 438
463#if 0 /* XXX: out of order */
464static struct file_operations fops_marker = {
465 .owner = THIS_MODULE,
466 .write = write_marker
467};
468#endif
469
470void enable_mmiotrace(void) 439void enable_mmiotrace(void)
471{ 440{
472 mutex_lock(&mmiotrace_mutex); 441 mutex_lock(&mmiotrace_mutex);
473 if (is_enabled()) 442 if (is_enabled())
474 goto out; 443 goto out;
475 444
476#if 0 /* XXX: tracing does not support text entries */
477 marker_file = debugfs_create_file("marker", 0660, dir, NULL,
478 &fops_marker);
479 if (!marker_file)
480 pr_err(NAME "marker file creation failed.\n");
481#endif
482
483 if (nommiotrace) 445 if (nommiotrace)
484 pr_info(NAME "MMIO tracing disabled.\n"); 446 pr_info(NAME "MMIO tracing disabled.\n");
485 enter_uniprocessor(); 447 enter_uniprocessor();
@@ -504,11 +466,6 @@ void disable_mmiotrace(void)
504 466
505 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 467 clear_trace_list(); /* guarantees: no more kmmio callbacks */
506 leave_uniprocessor(); 468 leave_uniprocessor();
507 if (marker_file) {
508 debugfs_remove(marker_file);
509 marker_file = NULL;
510 }
511
512 pr_info(NAME "disabled.\n"); 469 pr_info(NAME "disabled.\n");
513out: 470out:
514 mutex_unlock(&mmiotrace_mutex); 471 mutex_unlock(&mmiotrace_mutex);
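
Besides deleting the long-dead debugfs "marker" plumbing and fixing a leak of the remap_trace entry when tracing is disabled between the ioremap and the lock, mmio-mod.c gains mmiotrace_printk(), which lets code drop a text marker into the MMIO trace stream under trace_lock. A hedged usage sketch from a hypothetical driver (the register offset and value are invented):

#include <linux/io.h>
#include <linux/mmiotrace.h>

/* Hypothetical driver snippet: annotate the MMIO trace around an access. */
static void frob_device(void __iomem *bar0)
{
	mmiotrace_printk("frobbing control reg, bar0=%p\n", bar0);
	writel(0x1, bar0 + 0x10);	/* offset/value made up for illustration */
	mmiotrace_printk("frob done\n");
}
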
diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/numa_32.c
index 5dfef9fa061a..847c164725f4 100644
--- a/arch/x86/mm/discontig_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -42,7 +42,6 @@
42 42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data); 44EXPORT_SYMBOL(node_data);
45static bootmem_data_t node0_bdata;
46 45
47/* 46/*
48 * numa interface - we expect the numa architecture specific code to have 47 * numa interface - we expect the numa architecture specific code to have
@@ -329,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn,
329 328
330 get_memcfg_numa(); 329 get_memcfg_numa();
331 330
332 kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); 331 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
333 332
334 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); 333 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
335 do { 334 do {
@@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn,
385 for_each_online_node(nid) 384 for_each_online_node(nid)
386 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 385 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
387 386
388 NODE_DATA(0)->bdata = &node0_bdata; 387 NODE_DATA(0)->bdata = &bootmem_node_data[0];
389 setup_bootmem_allocator(); 388 setup_bootmem_allocator();
390} 389}
391 390
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index b432d5781773..cebcbf152d46 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -20,15 +20,9 @@
20#include <asm/acpi.h> 20#include <asm/acpi.h>
21#include <asm/k8.h> 21#include <asm/k8.h>
22 22
23#ifndef Dprintk
24#define Dprintk(x...)
25#endif
26
27struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
28EXPORT_SYMBOL(node_data); 24EXPORT_SYMBOL(node_data);
29 25
30static bootmem_data_t plat_node_bdata[MAX_NUMNODES];
31
32struct memnode memnode; 26struct memnode memnode;
33 27
34s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { 28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
@@ -85,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void)
85 return 0; 79 return 0;
86 80
87 addr = 0x8000; 81 addr = 0x8000;
88 nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); 82 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT, 83 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
90 nodemap_size, L1_CACHE_BYTES); 84 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == -1UL) { 85 if (nodemap_addr == -1UL) {
@@ -182,10 +176,10 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
182 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 176 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
183 unsigned long bootmap_start, nodedata_phys; 177 unsigned long bootmap_start, nodedata_phys;
184 void *bootmap; 178 void *bootmap;
185 const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE); 179 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
186 int nid; 180 int nid;
187 181
188 start = round_up(start, ZONE_ALIGN); 182 start = roundup(start, ZONE_ALIGN);
189 183
190 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 184 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
191 start, end); 185 start, end);
@@ -202,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
202 nodedata_phys + pgdat_size - 1); 196 nodedata_phys + pgdat_size - 1);
203 197
204 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 198 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
205 NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; 199 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
206 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 200 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
207 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 201 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
208 202
@@ -216,9 +210,9 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
216 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 210 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
217 nid = phys_to_nid(nodedata_phys); 211 nid = phys_to_nid(nodedata_phys);
218 if (nid == nodeid) 212 if (nid == nodeid)
219 bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE); 213 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
220 else 214 else
221 bootmap_start = round_up(start, PAGE_SIZE); 215 bootmap_start = roundup(start, PAGE_SIZE);
222 /* 216 /*
223 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 217 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
224 * to use that to align to PAGE_SIZE 218 * to use that to align to PAGE_SIZE
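
The numa_32.c (renamed from discontig_32.c) and numa_64.c hunks make two mechanical changes: the per-arch static bootmem_data_t objects are dropped in favour of the generic bootmem_node_data[] array, and round_up() calls become roundup(). The distinction matters: round_up() is the mask-based helper and only works for power-of-two alignment, whereas roundup() divides and accepts any positive multiple. A tiny stand-alone illustration follows, with definitions equivalent to the kernel helpers repeated locally so the sketch compiles on its own.

#include <stdio.h>

/* Local copies, equivalent to the kernel helpers. */
#define round_up(x, y)	(((x) + (y) - 1) & ~((y) - 1))	/* y must be a power of two */
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))	/* any positive y */

int main(void)
{
	/* roundup() copes with a non-power-of-two step size. */
	printf("%lu\n", (unsigned long)roundup(1000UL, 192UL));	/* prints 1152 */
	printf("%lu\n", (unsigned long)round_up(1000UL, 256UL));	/* prints 1024 */
	return 0;
}
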
diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 0dcd42eb94e6..e1d106909218 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -32,7 +32,7 @@ enum {
32 GPS = (1<<30) 32 GPS = (1<<30)
33}; 33};
34 34
35#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) 35#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST)
36 36
37static int pte_testbit(pte_t pte) 37static int pte_testbit(pte_t pte)
38{ 38{
@@ -118,6 +118,7 @@ static int pageattr_test(void)
118 unsigned int level; 118 unsigned int level;
119 int i, k; 119 int i, k;
120 int err; 120 int err;
121 unsigned long test_addr;
121 122
122 if (print) 123 if (print)
123 printk(KERN_INFO "CPA self-test:\n"); 124 printk(KERN_INFO "CPA self-test:\n");
@@ -172,7 +173,8 @@ static int pageattr_test(void)
172 continue; 173 continue;
173 } 174 }
174 175
175 err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); 176 test_addr = addr[i];
177 err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0);
176 if (err < 0) { 178 if (err < 0) {
177 printk(KERN_ERR "CPA %d failed %d\n", i, err); 179 printk(KERN_ERR "CPA %d failed %d\n", i, err);
178 failed++; 180 failed++;
@@ -204,7 +206,8 @@ static int pageattr_test(void)
204 failed++; 206 failed++;
205 continue; 207 continue;
206 } 208 }
207 err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); 209 test_addr = addr[i];
210 err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0);
208 if (err < 0) { 211 if (err < 0) {
209 printk(KERN_ERR "CPA reverting failed: %d\n", err); 212 printk(KERN_ERR "CPA reverting failed: %d\n", err);
210 failed++; 213 failed++;
@@ -221,8 +224,7 @@ static int pageattr_test(void)
221 failed += print_split(&sc); 224 failed += print_split(&sc);
222 225
223 if (failed) { 226 if (failed) {
224 printk(KERN_ERR "NOT PASSED. Please report.\n"); 227 WARN(1, KERN_ERR "NOT PASSED. Please report.\n");
225 WARN_ON(1);
226 return -EINVAL; 228 return -EINVAL;
227 } else { 229 } else {
228 if (print) 230 if (print)
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 65c6e46bf059..f1dc1b75d166 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -25,15 +25,27 @@
25 * The current flushing context - we pass it instead of 5 arguments: 25 * The current flushing context - we pass it instead of 5 arguments:
26 */ 26 */
27struct cpa_data { 27struct cpa_data {
28 unsigned long vaddr; 28 unsigned long *vaddr;
29 pgprot_t mask_set; 29 pgprot_t mask_set;
30 pgprot_t mask_clr; 30 pgprot_t mask_clr;
31 int numpages; 31 int numpages;
32 int flushtlb; 32 int flags;
33 unsigned long pfn; 33 unsigned long pfn;
34 unsigned force_split : 1; 34 unsigned force_split : 1;
35 int curpage;
35}; 36};
36 37
38/*
39 * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings)
 40 * using cpa_lock, so that we don't allow any other cpu, with stale large tlb
 41 * entries, to change the page attributes in parallel while another cpu is
 42 * splitting a large page entry and changing its attributes.
43 */
44static DEFINE_SPINLOCK(cpa_lock);
45
46#define CPA_FLUSHTLB 1
47#define CPA_ARRAY 2
48
37#ifdef CONFIG_PROC_FS 49#ifdef CONFIG_PROC_FS
38static unsigned long direct_pages_count[PG_LEVEL_NUM]; 50static unsigned long direct_pages_count[PG_LEVEL_NUM];
39 51
@@ -53,17 +65,22 @@ static void split_page_count(int level)
53 direct_pages_count[level - 1] += PTRS_PER_PTE; 65 direct_pages_count[level - 1] += PTRS_PER_PTE;
54} 66}
55 67
56int arch_report_meminfo(char *page) 68void arch_report_meminfo(struct seq_file *m)
57{ 69{
58 int n = sprintf(page, "DirectMap4k: %8lu\n" 70 seq_printf(m, "DirectMap4k: %8lu kB\n",
59 "DirectMap2M: %8lu\n", 71 direct_pages_count[PG_LEVEL_4K] << 2);
60 direct_pages_count[PG_LEVEL_4K], 72#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
61 direct_pages_count[PG_LEVEL_2M]); 73 seq_printf(m, "DirectMap2M: %8lu kB\n",
74 direct_pages_count[PG_LEVEL_2M] << 11);
75#else
76 seq_printf(m, "DirectMap4M: %8lu kB\n",
77 direct_pages_count[PG_LEVEL_2M] << 12);
78#endif
62#ifdef CONFIG_X86_64 79#ifdef CONFIG_X86_64
63 n += sprintf(page + n, "DirectMap1G: %8lu\n", 80 if (direct_gbpages)
64 direct_pages_count[PG_LEVEL_1G]); 81 seq_printf(m, "DirectMap1G: %8lu kB\n",
82 direct_pages_count[PG_LEVEL_1G] << 20);
65#endif 83#endif
66 return n;
67} 84}
68#else 85#else
69static inline void split_page_count(int level) { } 86static inline void split_page_count(int level) { }
@@ -78,7 +95,7 @@ static inline unsigned long highmap_start_pfn(void)
78 95
79static inline unsigned long highmap_end_pfn(void) 96static inline unsigned long highmap_end_pfn(void)
80{ 97{
81 return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; 98 return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT;
82} 99}
83 100
84#endif 101#endif
@@ -184,6 +201,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
184 } 201 }
185} 202}
186 203
204static void cpa_flush_array(unsigned long *start, int numpages, int cache)
205{
206 unsigned int i, level;
207 unsigned long *addr;
208
209 BUG_ON(irqs_disabled());
210
211 on_each_cpu(__cpa_flush_range, NULL, 1);
212
213 if (!cache)
214 return;
215
216 /* 4M threshold */
217 if (numpages >= 1024) {
218 if (boot_cpu_data.x86_model >= 4)
219 wbinvd();
220 return;
221 }
222 /*
223 * We only need to flush on one CPU,
224 * clflush is a MESI-coherent instruction that
225 * will cause all other CPUs to flush the same
226 * cachelines:
227 */
228 for (i = 0, addr = start; i < numpages; i++, addr++) {
229 pte_t *pte = lookup_address(*addr, &level);
230
231 /*
232 * Only flush present addresses:
233 */
234 if (pte && (pte_val(*pte) & _PAGE_PRESENT))
235 clflush_cache_range((void *) *addr, PAGE_SIZE);
236 }
237}
238
187/* 239/*
188 * Certain areas of memory on x86 require very specific protection flags, 240 * Certain areas of memory on x86 require very specific protection flags,
189 * for example the BIOS area or kernel text. Callers don't always get this 241 * for example the BIOS area or kernel text. Callers don't always get this
@@ -392,7 +444,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
392 */ 444 */
393 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); 445 new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot));
394 __set_pmd_pte(kpte, address, new_pte); 446 __set_pmd_pte(kpte, address, new_pte);
395 cpa->flushtlb = 1; 447 cpa->flags |= CPA_FLUSHTLB;
396 do_split = 0; 448 do_split = 0;
397 } 449 }
398 450
@@ -402,84 +454,6 @@ out_unlock:
402 return do_split; 454 return do_split;
403} 455}
404 456
405static LIST_HEAD(page_pool);
406static unsigned long pool_size, pool_pages, pool_low;
407static unsigned long pool_used, pool_failed;
408
409static void cpa_fill_pool(struct page **ret)
410{
411 gfp_t gfp = GFP_KERNEL;
412 unsigned long flags;
413 struct page *p;
414
415 /*
416 * Avoid recursion (on debug-pagealloc) and also signal
417 * our priority to get to these pagetables:
418 */
419 if (current->flags & PF_MEMALLOC)
420 return;
421 current->flags |= PF_MEMALLOC;
422
423 /*
424 * Allocate atomically from atomic contexts:
425 */
426 if (in_atomic() || irqs_disabled() || debug_pagealloc)
427 gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN;
428
429 while (pool_pages < pool_size || (ret && !*ret)) {
430 p = alloc_pages(gfp, 0);
431 if (!p) {
432 pool_failed++;
433 break;
434 }
435 /*
436 * If the call site needs a page right now, provide it:
437 */
438 if (ret && !*ret) {
439 *ret = p;
440 continue;
441 }
442 spin_lock_irqsave(&pgd_lock, flags);
443 list_add(&p->lru, &page_pool);
444 pool_pages++;
445 spin_unlock_irqrestore(&pgd_lock, flags);
446 }
447
448 current->flags &= ~PF_MEMALLOC;
449}
450
451#define SHIFT_MB (20 - PAGE_SHIFT)
452#define ROUND_MB_GB ((1 << 10) - 1)
453#define SHIFT_MB_GB 10
454#define POOL_PAGES_PER_GB 16
455
456void __init cpa_init(void)
457{
458 struct sysinfo si;
459 unsigned long gb;
460
461 si_meminfo(&si);
462 /*
463 * Calculate the number of pool pages:
464 *
465 * Convert totalram (nr of pages) to MiB and round to the next
466 * GiB. Shift MiB to Gib and multiply the result by
467 * POOL_PAGES_PER_GB:
468 */
469 if (debug_pagealloc) {
470 gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB;
471 pool_size = POOL_PAGES_PER_GB * gb;
472 } else {
473 pool_size = 1;
474 }
475 pool_low = pool_size;
476
477 cpa_fill_pool(NULL);
478 printk(KERN_DEBUG
479 "CPA: page pool initialized %lu of %lu pages preallocated\n",
480 pool_pages, pool_size);
481}
482
483static int split_large_page(pte_t *kpte, unsigned long address) 457static int split_large_page(pte_t *kpte, unsigned long address)
484{ 458{
485 unsigned long flags, pfn, pfninc = 1; 459 unsigned long flags, pfn, pfninc = 1;
@@ -488,28 +462,15 @@ static int split_large_page(pte_t *kpte, unsigned long address)
488 pgprot_t ref_prot; 462 pgprot_t ref_prot;
489 struct page *base; 463 struct page *base;
490 464
491 /* 465 if (!debug_pagealloc)
492 * Get a page from the pool. The pool list is protected by the 466 spin_unlock(&cpa_lock);
493 * pgd_lock, which we have to take anyway for the split 467 base = alloc_pages(GFP_KERNEL, 0);
494 * operation: 468 if (!debug_pagealloc)
495 */ 469 spin_lock(&cpa_lock);
496 spin_lock_irqsave(&pgd_lock, flags); 470 if (!base)
497 if (list_empty(&page_pool)) { 471 return -ENOMEM;
498 spin_unlock_irqrestore(&pgd_lock, flags);
499 base = NULL;
500 cpa_fill_pool(&base);
501 if (!base)
502 return -ENOMEM;
503 spin_lock_irqsave(&pgd_lock, flags);
504 } else {
505 base = list_first_entry(&page_pool, struct page, lru);
506 list_del(&base->lru);
507 pool_pages--;
508
509 if (pool_pages < pool_low)
510 pool_low = pool_pages;
511 }
512 472
473 spin_lock_irqsave(&pgd_lock, flags);
513 /* 474 /*
514 * Check for races, another CPU might have split this page 475 * Check for races, another CPU might have split this page
515 * up for us already: 476 * up for us already:
@@ -566,11 +527,8 @@ out_unlock:
566 * If we dropped out via the lookup_address check under 527 * If we dropped out via the lookup_address check under
567 * pgd_lock then stick the page back into the pool: 528 * pgd_lock then stick the page back into the pool:
568 */ 529 */
569 if (base) { 530 if (base)
570 list_add(&base->lru, &page_pool); 531 __free_page(base);
571 pool_pages++;
572 } else
573 pool_used++;
574 spin_unlock_irqrestore(&pgd_lock, flags); 532 spin_unlock_irqrestore(&pgd_lock, flags);
575 533
576 return 0; 534 return 0;
@@ -578,11 +536,16 @@ out_unlock:
578 536
579static int __change_page_attr(struct cpa_data *cpa, int primary) 537static int __change_page_attr(struct cpa_data *cpa, int primary)
580{ 538{
581 unsigned long address = cpa->vaddr; 539 unsigned long address;
582 int do_split, err; 540 int do_split, err;
583 unsigned int level; 541 unsigned int level;
584 pte_t *kpte, old_pte; 542 pte_t *kpte, old_pte;
585 543
544 if (cpa->flags & CPA_ARRAY)
545 address = cpa->vaddr[cpa->curpage];
546 else
547 address = *cpa->vaddr;
548
586repeat: 549repeat:
587 kpte = lookup_address(address, &level); 550 kpte = lookup_address(address, &level);
588 if (!kpte) 551 if (!kpte)
@@ -592,10 +555,9 @@ repeat:
592 if (!pte_val(old_pte)) { 555 if (!pte_val(old_pte)) {
593 if (!primary) 556 if (!primary)
594 return 0; 557 return 0;
595 printk(KERN_WARNING "CPA: called for zero pte. " 558 WARN(1, KERN_WARNING "CPA: called for zero pte. "
596 "vaddr = %lx cpa->vaddr = %lx\n", address, 559 "vaddr = %lx cpa->vaddr = %lx\n", address,
597 cpa->vaddr); 560 *cpa->vaddr);
598 WARN_ON(1);
599 return -EINVAL; 561 return -EINVAL;
600 } 562 }
601 563
@@ -621,7 +583,7 @@ repeat:
621 */ 583 */
622 if (pte_val(old_pte) != pte_val(new_pte)) { 584 if (pte_val(old_pte) != pte_val(new_pte)) {
623 set_pte_atomic(kpte, new_pte); 585 set_pte_atomic(kpte, new_pte);
624 cpa->flushtlb = 1; 586 cpa->flags |= CPA_FLUSHTLB;
625 } 587 }
626 cpa->numpages = 1; 588 cpa->numpages = 1;
627 return 0; 589 return 0;
@@ -645,7 +607,25 @@ repeat:
645 */ 607 */
646 err = split_large_page(kpte, address); 608 err = split_large_page(kpte, address);
647 if (!err) { 609 if (!err) {
648 cpa->flushtlb = 1; 610 /*
611 * Do a global flush tlb after splitting the large page
612 * and before we do the actual change page attribute in the PTE.
613 *
 614 * Without this, we violate the TLB application note, which says
615 * "The TLBs may contain both ordinary and large-page
616 * translations for a 4-KByte range of linear addresses. This
617 * may occur if software modifies the paging structures so that
618 * the page size used for the address range changes. If the two
619 * translations differ with respect to page frame or attributes
620 * (e.g., permissions), processor behavior is undefined and may
621 * be implementation-specific."
622 *
623 * We do this global tlb flush inside the cpa_lock, so that we
 624 * don't allow any other cpu, with stale tlb entries, to change the
 625 * page attribute in parallel for an address that also falls into the
 626 * just-split large page entry.
627 */
628 flush_tlb_all();
649 goto repeat; 629 goto repeat;
650 } 630 }
651 631
@@ -658,6 +638,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
658{ 638{
659 struct cpa_data alias_cpa; 639 struct cpa_data alias_cpa;
660 int ret = 0; 640 int ret = 0;
641 unsigned long temp_cpa_vaddr, vaddr;
661 642
662 if (cpa->pfn >= max_pfn_mapped) 643 if (cpa->pfn >= max_pfn_mapped)
663 return 0; 644 return 0;
@@ -670,16 +651,24 @@ static int cpa_process_alias(struct cpa_data *cpa)
670 * No need to redo, when the primary call touched the direct 651 * No need to redo, when the primary call touched the direct
671 * mapping already: 652 * mapping already:
672 */ 653 */
673 if (!(within(cpa->vaddr, PAGE_OFFSET, 654 if (cpa->flags & CPA_ARRAY)
655 vaddr = cpa->vaddr[cpa->curpage];
656 else
657 vaddr = *cpa->vaddr;
658
659 if (!(within(vaddr, PAGE_OFFSET,
674 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) 660 PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT))
675#ifdef CONFIG_X86_64 661#ifdef CONFIG_X86_64
676 || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), 662 || within(vaddr, PAGE_OFFSET + (1UL<<32),
677 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) 663 PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))
678#endif 664#endif
679 )) { 665 )) {
680 666
681 alias_cpa = *cpa; 667 alias_cpa = *cpa;
682 alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); 668 temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT);
669 alias_cpa.vaddr = &temp_cpa_vaddr;
670 alias_cpa.flags &= ~CPA_ARRAY;
671
683 672
684 ret = __change_page_attr_set_clr(&alias_cpa, 0); 673 ret = __change_page_attr_set_clr(&alias_cpa, 0);
685 } 674 }
@@ -691,7 +680,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
691 * No need to redo, when the primary call touched the high 680 * No need to redo, when the primary call touched the high
692 * mapping already: 681 * mapping already:
693 */ 682 */
694 if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) 683 if (within(vaddr, (unsigned long) _text, (unsigned long) _end))
695 return 0; 684 return 0;
696 685
697 /* 686 /*
@@ -702,8 +691,9 @@ static int cpa_process_alias(struct cpa_data *cpa)
702 return 0; 691 return 0;
703 692
704 alias_cpa = *cpa; 693 alias_cpa = *cpa;
705 alias_cpa.vaddr = 694 temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base;
706 (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; 695 alias_cpa.vaddr = &temp_cpa_vaddr;
696 alias_cpa.flags &= ~CPA_ARRAY;
707 697
708 /* 698 /*
709 * The high mapping range is imprecise, so ignore the return value. 699 * The high mapping range is imprecise, so ignore the return value.
@@ -723,8 +713,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
723 * preservation check. 713 * preservation check.
724 */ 714 */
725 cpa->numpages = numpages; 715 cpa->numpages = numpages;
716 /* for array changes, we can't use large page */
717 if (cpa->flags & CPA_ARRAY)
718 cpa->numpages = 1;
726 719
720 if (!debug_pagealloc)
721 spin_lock(&cpa_lock);
727 ret = __change_page_attr(cpa, checkalias); 722 ret = __change_page_attr(cpa, checkalias);
723 if (!debug_pagealloc)
724 spin_unlock(&cpa_lock);
728 if (ret) 725 if (ret)
729 return ret; 726 return ret;
730 727
@@ -741,7 +738,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
741 */ 738 */
742 BUG_ON(cpa->numpages > numpages); 739 BUG_ON(cpa->numpages > numpages);
743 numpages -= cpa->numpages; 740 numpages -= cpa->numpages;
744 cpa->vaddr += cpa->numpages * PAGE_SIZE; 741 if (cpa->flags & CPA_ARRAY)
742 cpa->curpage++;
743 else
744 *cpa->vaddr += cpa->numpages * PAGE_SIZE;
745
745 } 746 }
746 return 0; 747 return 0;
747} 748}
@@ -752,9 +753,9 @@ static inline int cache_attr(pgprot_t attr)
752 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); 753 (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD);
753} 754}
754 755
755static int change_page_attr_set_clr(unsigned long addr, int numpages, 756static int change_page_attr_set_clr(unsigned long *addr, int numpages,
756 pgprot_t mask_set, pgprot_t mask_clr, 757 pgprot_t mask_set, pgprot_t mask_clr,
757 int force_split) 758 int force_split, int array)
758{ 759{
759 struct cpa_data cpa; 760 struct cpa_data cpa;
760 int ret, cache, checkalias; 761 int ret, cache, checkalias;
@@ -769,21 +770,40 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
769 return 0; 770 return 0;
770 771
771 /* Ensure we are PAGE_SIZE aligned */ 772 /* Ensure we are PAGE_SIZE aligned */
772 if (addr & ~PAGE_MASK) { 773 if (!array) {
773 addr &= PAGE_MASK; 774 if (*addr & ~PAGE_MASK) {
774 /* 775 *addr &= PAGE_MASK;
775 * People should not be passing in unaligned addresses: 776 /*
776 */ 777 * People should not be passing in unaligned addresses:
777 WARN_ON_ONCE(1); 778 */
779 WARN_ON_ONCE(1);
780 }
781 } else {
782 int i;
783 for (i = 0; i < numpages; i++) {
784 if (addr[i] & ~PAGE_MASK) {
785 addr[i] &= PAGE_MASK;
786 WARN_ON_ONCE(1);
787 }
788 }
778 } 789 }
779 790
791 /* Must avoid aliasing mappings in the highmem code */
792 kmap_flush_unused();
793
794 vm_unmap_aliases();
795
780 cpa.vaddr = addr; 796 cpa.vaddr = addr;
781 cpa.numpages = numpages; 797 cpa.numpages = numpages;
782 cpa.mask_set = mask_set; 798 cpa.mask_set = mask_set;
783 cpa.mask_clr = mask_clr; 799 cpa.mask_clr = mask_clr;
784 cpa.flushtlb = 0; 800 cpa.flags = 0;
801 cpa.curpage = 0;
785 cpa.force_split = force_split; 802 cpa.force_split = force_split;
786 803
804 if (array)
805 cpa.flags |= CPA_ARRAY;
806
787 /* No alias checking for _NX bit modifications */ 807 /* No alias checking for _NX bit modifications */
788 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; 808 checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX;
789 809
@@ -792,7 +812,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
792 /* 812 /*
793 * Check whether we really changed something: 813 * Check whether we really changed something:
794 */ 814 */
795 if (!cpa.flushtlb) 815 if (!(cpa.flags & CPA_FLUSHTLB))
796 goto out; 816 goto out;
797 817
798 /* 818 /*
@@ -807,27 +827,30 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages,
807 * error case we fall back to cpa_flush_all (which uses 827 * error case we fall back to cpa_flush_all (which uses
808 * wbindv): 828 * wbindv):
809 */ 829 */
810 if (!ret && cpu_has_clflush) 830 if (!ret && cpu_has_clflush) {
811 cpa_flush_range(addr, numpages, cache); 831 if (cpa.flags & CPA_ARRAY)
812 else 832 cpa_flush_array(addr, numpages, cache);
833 else
834 cpa_flush_range(*addr, numpages, cache);
835 } else
813 cpa_flush_all(cache); 836 cpa_flush_all(cache);
814 837
815out: 838out:
816 cpa_fill_pool(NULL);
817
818 return ret; 839 return ret;
819} 840}
820 841
821static inline int change_page_attr_set(unsigned long addr, int numpages, 842static inline int change_page_attr_set(unsigned long *addr, int numpages,
822 pgprot_t mask) 843 pgprot_t mask, int array)
823{ 844{
824 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); 845 return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0,
846 array);
825} 847}
826 848
827static inline int change_page_attr_clear(unsigned long addr, int numpages, 849static inline int change_page_attr_clear(unsigned long *addr, int numpages,
828 pgprot_t mask) 850 pgprot_t mask, int array)
829{ 851{
830 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); 852 return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0,
853 array);
831} 854}
832 855
833int _set_memory_uc(unsigned long addr, int numpages) 856int _set_memory_uc(unsigned long addr, int numpages)
@@ -835,8 +858,8 @@ int _set_memory_uc(unsigned long addr, int numpages)
835 /* 858 /*
836 * for now UC MINUS. see comments in ioremap_nocache() 859 * for now UC MINUS. see comments in ioremap_nocache()
837 */ 860 */
838 return change_page_attr_set(addr, numpages, 861 return change_page_attr_set(&addr, numpages,
839 __pgprot(_PAGE_CACHE_UC_MINUS)); 862 __pgprot(_PAGE_CACHE_UC_MINUS), 0);
840} 863}
841 864
842int set_memory_uc(unsigned long addr, int numpages) 865int set_memory_uc(unsigned long addr, int numpages)
@@ -844,7 +867,7 @@ int set_memory_uc(unsigned long addr, int numpages)
844 /* 867 /*
845 * for now UC MINUS. see comments in ioremap_nocache() 868 * for now UC MINUS. see comments in ioremap_nocache()
846 */ 869 */
847 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 870 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
848 _PAGE_CACHE_UC_MINUS, NULL)) 871 _PAGE_CACHE_UC_MINUS, NULL))
849 return -EINVAL; 872 return -EINVAL;
850 873
@@ -852,10 +875,48 @@ int set_memory_uc(unsigned long addr, int numpages)
852} 875}
853EXPORT_SYMBOL(set_memory_uc); 876EXPORT_SYMBOL(set_memory_uc);
854 877
878int set_memory_array_uc(unsigned long *addr, int addrinarray)
879{
880 unsigned long start;
881 unsigned long end;
882 int i;
883 /*
884 * for now UC MINUS. see comments in ioremap_nocache()
885 */
886 for (i = 0; i < addrinarray; i++) {
887 start = __pa(addr[i]);
888 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
889 if (end != __pa(addr[i + 1]))
890 break;
891 i++;
892 }
893 if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
894 goto out;
895 }
896
897 return change_page_attr_set(addr, addrinarray,
898 __pgprot(_PAGE_CACHE_UC_MINUS), 1);
899out:
900 for (i = 0; i < addrinarray; i++) {
901 unsigned long tmp = __pa(addr[i]);
902
903 if (tmp == start)
904 break;
905 for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
906 if (end != __pa(addr[i + 1]))
907 break;
908 i++;
909 }
910 free_memtype(tmp, end);
911 }
912 return -EINVAL;
913}
914EXPORT_SYMBOL(set_memory_array_uc);
915
855int _set_memory_wc(unsigned long addr, int numpages) 916int _set_memory_wc(unsigned long addr, int numpages)
856{ 917{
857 return change_page_attr_set(addr, numpages, 918 return change_page_attr_set(&addr, numpages,
858 __pgprot(_PAGE_CACHE_WC)); 919 __pgprot(_PAGE_CACHE_WC), 0);
859} 920}
860 921
861int set_memory_wc(unsigned long addr, int numpages) 922int set_memory_wc(unsigned long addr, int numpages)
@@ -863,7 +924,7 @@ int set_memory_wc(unsigned long addr, int numpages)
863 if (!pat_enabled) 924 if (!pat_enabled)
864 return set_memory_uc(addr, numpages); 925 return set_memory_uc(addr, numpages);
865 926
866 if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, 927 if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
867 _PAGE_CACHE_WC, NULL)) 928 _PAGE_CACHE_WC, NULL))
868 return -EINVAL; 929 return -EINVAL;
869 930
@@ -873,49 +934,71 @@ EXPORT_SYMBOL(set_memory_wc);
873 934
874int _set_memory_wb(unsigned long addr, int numpages) 935int _set_memory_wb(unsigned long addr, int numpages)
875{ 936{
876 return change_page_attr_clear(addr, numpages, 937 return change_page_attr_clear(&addr, numpages,
877 __pgprot(_PAGE_CACHE_MASK)); 938 __pgprot(_PAGE_CACHE_MASK), 0);
878} 939}
879 940
880int set_memory_wb(unsigned long addr, int numpages) 941int set_memory_wb(unsigned long addr, int numpages)
881{ 942{
882 free_memtype(addr, addr + numpages * PAGE_SIZE); 943 free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
883 944
884 return _set_memory_wb(addr, numpages); 945 return _set_memory_wb(addr, numpages);
885} 946}
886EXPORT_SYMBOL(set_memory_wb); 947EXPORT_SYMBOL(set_memory_wb);
887 948
949int set_memory_array_wb(unsigned long *addr, int addrinarray)
950{
951 int i;
952
953 for (i = 0; i < addrinarray; i++) {
954 unsigned long start = __pa(addr[i]);
955 unsigned long end;
956
957 for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
958 if (end != __pa(addr[i + 1]))
959 break;
960 i++;
961 }
962 free_memtype(start, end);
963 }
964 return change_page_attr_clear(addr, addrinarray,
965 __pgprot(_PAGE_CACHE_MASK), 1);
966}
967EXPORT_SYMBOL(set_memory_array_wb);
968
888int set_memory_x(unsigned long addr, int numpages) 969int set_memory_x(unsigned long addr, int numpages)
889{ 970{
890 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); 971 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
891} 972}
892EXPORT_SYMBOL(set_memory_x); 973EXPORT_SYMBOL(set_memory_x);
893 974
894int set_memory_nx(unsigned long addr, int numpages) 975int set_memory_nx(unsigned long addr, int numpages)
895{ 976{
896 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); 977 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
897} 978}
898EXPORT_SYMBOL(set_memory_nx); 979EXPORT_SYMBOL(set_memory_nx);
899 980
900int set_memory_ro(unsigned long addr, int numpages) 981int set_memory_ro(unsigned long addr, int numpages)
901{ 982{
902 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); 983 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
903} 984}
985EXPORT_SYMBOL_GPL(set_memory_ro);
904 986
905int set_memory_rw(unsigned long addr, int numpages) 987int set_memory_rw(unsigned long addr, int numpages)
906{ 988{
907 return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); 989 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0);
908} 990}
991EXPORT_SYMBOL_GPL(set_memory_rw);
909 992
910int set_memory_np(unsigned long addr, int numpages) 993int set_memory_np(unsigned long addr, int numpages)
911{ 994{
912 return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); 995 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0);
913} 996}
914 997
915int set_memory_4k(unsigned long addr, int numpages) 998int set_memory_4k(unsigned long addr, int numpages)
916{ 999{
917 return change_page_attr_set_clr(addr, numpages, __pgprot(0), 1000 return change_page_attr_set_clr(&addr, numpages, __pgprot(0),
918 __pgprot(0), 1); 1001 __pgprot(0), 1, 0);
919} 1002}
920 1003
921int set_pages_uc(struct page *page, int numpages) 1004int set_pages_uc(struct page *page, int numpages)
@@ -968,22 +1051,38 @@ int set_pages_rw(struct page *page, int numpages)
968 1051
969static int __set_pages_p(struct page *page, int numpages) 1052static int __set_pages_p(struct page *page, int numpages)
970{ 1053{
971 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1054 unsigned long tempaddr = (unsigned long) page_address(page);
1055 struct cpa_data cpa = { .vaddr = &tempaddr,
972 .numpages = numpages, 1056 .numpages = numpages,
973 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), 1057 .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW),
974 .mask_clr = __pgprot(0)}; 1058 .mask_clr = __pgprot(0),
1059 .flags = 0};
975 1060
976 return __change_page_attr_set_clr(&cpa, 1); 1061 /*
 1062 * No alias checking needed for setting the present flag; otherwise,
1063 * we may need to break large pages for 64-bit kernel text
1064 * mappings (this adds to complexity if we want to do this from
1065 * atomic context especially). Let's keep it simple!
1066 */
1067 return __change_page_attr_set_clr(&cpa, 0);
977} 1068}
978 1069
979static int __set_pages_np(struct page *page, int numpages) 1070static int __set_pages_np(struct page *page, int numpages)
980{ 1071{
981 struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), 1072 unsigned long tempaddr = (unsigned long) page_address(page);
1073 struct cpa_data cpa = { .vaddr = &tempaddr,
982 .numpages = numpages, 1074 .numpages = numpages,
983 .mask_set = __pgprot(0), 1075 .mask_set = __pgprot(0),
984 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; 1076 .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW),
1077 .flags = 0};
985 1078
986 return __change_page_attr_set_clr(&cpa, 1); 1079 /*
 1080 * No alias checking needed for clearing the present flag; otherwise,
1081 * we may need to break large pages for 64-bit kernel text
1082 * mappings (this adds to complexity if we want to do this from
1083 * atomic context especially). Let's keep it simple!
1084 */
1085 return __change_page_attr_set_clr(&cpa, 0);
987} 1086}
988 1087
989void kernel_map_pages(struct page *page, int numpages, int enable) 1088void kernel_map_pages(struct page *page, int numpages, int enable)
@@ -1003,11 +1102,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1003 1102
1004 /* 1103 /*
1005 * The return value is ignored as the calls cannot fail. 1104 * The return value is ignored as the calls cannot fail.
1006 * Large pages are kept enabled at boot time, and are 1105 * Large pages for identity mappings are not used at boot time
1007 * split up quickly with DEBUG_PAGEALLOC. If a splitup 1106 * and hence no memory allocations during large page split.
1008 * fails here (due to temporary memory shortage) no damage
1009 * is done because we just keep the largepage intact up
1010 * to the next attempt when it will likely be split up:
1011 */ 1107 */
1012 if (enable) 1108 if (enable)
1013 __set_pages_p(page, numpages); 1109 __set_pages_p(page, numpages);
@@ -1019,53 +1115,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable)
1019 * but that can deadlock->flush only current cpu: 1115 * but that can deadlock->flush only current cpu:
1020 */ 1116 */
1021 __flush_tlb_all(); 1117 __flush_tlb_all();
1022
1023 /*
1024 * Try to refill the page pool here. We can do this only after
1025 * the tlb flush.
1026 */
1027 cpa_fill_pool(NULL);
1028}
1029
1030#ifdef CONFIG_DEBUG_FS
1031static int dpa_show(struct seq_file *m, void *v)
1032{
1033 seq_puts(m, "DEBUG_PAGEALLOC\n");
1034 seq_printf(m, "pool_size : %lu\n", pool_size);
1035 seq_printf(m, "pool_pages : %lu\n", pool_pages);
1036 seq_printf(m, "pool_low : %lu\n", pool_low);
1037 seq_printf(m, "pool_used : %lu\n", pool_used);
1038 seq_printf(m, "pool_failed : %lu\n", pool_failed);
1039
1040 return 0;
1041} 1118}
1042 1119
1043static int dpa_open(struct inode *inode, struct file *filp)
1044{
1045 return single_open(filp, dpa_show, NULL);
1046}
1047
1048static const struct file_operations dpa_fops = {
1049 .open = dpa_open,
1050 .read = seq_read,
1051 .llseek = seq_lseek,
1052 .release = single_release,
1053};
1054
1055static int __init debug_pagealloc_proc_init(void)
1056{
1057 struct dentry *de;
1058
1059 de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL,
1060 &dpa_fops);
1061 if (!de)
1062 return -ENOMEM;
1063
1064 return 0;
1065}
1066__initcall(debug_pagealloc_proc_init);
1067#endif
1068
1069#ifdef CONFIG_HIBERNATION 1120#ifdef CONFIG_HIBERNATION
1070 1121
1071bool kernel_page_present(struct page *page) 1122bool kernel_page_present(struct page *page)
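
pageattr.c sees the biggest rework above: the DEBUG_PAGEALLOC page pool and its debugfs view are removed, split_large_page() simply allocates a page (dropping cpa_lock around the allocation when !debug_pagealloc), a full TLB flush is issued right after a split per the quoted application note, cpa_data carries a vaddr pointer plus CPA_FLUSHTLB/CPA_ARRAY flags, and new set_memory_array_uc()/set_memory_array_wb() helpers change the attributes of a scattered list of pages in one pass with one flush. Note that the non-array path advances *vaddr as it walks, which is why callers such as pageattr-test.c pass a scratch copy rather than a pointer into their own bookkeeping. A hedged usage sketch of the array API follows; the page count, the GFP flags, and the surrounding driver are hypothetical.

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <asm/cacheflush.h>

#define NPAGES 16	/* arbitrary for this sketch */

/* Hypothetical driver setup: make a scattered set of lowmem pages uncached
 * (UC-) in the kernel's direct mapping, e.g. for device-visible buffers. */
static int make_pages_uncached(struct page **pages, unsigned long *addrs)
{
	int i;

	for (i = 0; i < NPAGES; i++) {
		/* GFP_KERNEL keeps the pages in lowmem, so page_address() is valid. */
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto undo;
		addrs[i] = (unsigned long)page_address(pages[i]);
	}

	/* One call handles every entry: internally CPA_ARRAY walks the list
	 * and flushes with clflush where the CPU supports it. */
	return set_memory_array_uc(addrs, NPAGES);

undo:
	while (--i >= 0)
		__free_page(pages[i]);
	return -ENOMEM;
}

/* Teardown: restore write-back before the pages go back to the allocator. */
static void make_pages_cached_again(struct page **pages, unsigned long *addrs)
{
	int i;

	set_memory_array_wb(addrs, NPAGES);
	for (i = 0; i < NPAGES; i++)
		__free_page(pages[i]);
}
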
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index d4585077977a..738fd0f24958 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -7,22 +7,24 @@
7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. 7 * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen.
8 */ 8 */
9 9
10#include <linux/mm.h> 10#include <linux/seq_file.h>
11#include <linux/bootmem.h>
12#include <linux/debugfs.h>
11#include <linux/kernel.h> 13#include <linux/kernel.h>
12#include <linux/gfp.h> 14#include <linux/gfp.h>
15#include <linux/mm.h>
13#include <linux/fs.h> 16#include <linux/fs.h>
14#include <linux/bootmem.h>
15 17
16#include <asm/msr.h> 18#include <asm/cacheflush.h>
17#include <asm/tlbflush.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
19#include <asm/page.h> 20#include <asm/tlbflush.h>
20#include <asm/pgtable.h> 21#include <asm/pgtable.h>
21#include <asm/pat.h>
22#include <asm/e820.h>
23#include <asm/cacheflush.h>
24#include <asm/fcntl.h> 22#include <asm/fcntl.h>
23#include <asm/e820.h>
25#include <asm/mtrr.h> 24#include <asm/mtrr.h>
25#include <asm/page.h>
26#include <asm/msr.h>
27#include <asm/pat.h>
26#include <asm/io.h> 28#include <asm/io.h>
27 29
28#ifdef CONFIG_X86_PAT 30#ifdef CONFIG_X86_PAT
@@ -44,6 +46,7 @@ early_param("nopat", nopat);
44 46
45 47
46static int debug_enable; 48static int debug_enable;
49
47static int __init pat_debug_setup(char *str) 50static int __init pat_debug_setup(char *str)
48{ 51{
49 debug_enable = 1; 52 debug_enable = 1;
@@ -143,14 +146,14 @@ static char *cattr_name(unsigned long flags)
143 */ 146 */
144 147
145struct memtype { 148struct memtype {
146 u64 start; 149 u64 start;
147 u64 end; 150 u64 end;
148 unsigned long type; 151 unsigned long type;
149 struct list_head nd; 152 struct list_head nd;
150}; 153};
151 154
152static LIST_HEAD(memtype_list); 155static LIST_HEAD(memtype_list);
153static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ 156static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */
154 157
155/* 158/*
156 * Does intersection of PAT memory type and MTRR memory type and returns 159 * Does intersection of PAT memory type and MTRR memory type and returns
@@ -178,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type)
178 return req_type; 181 return req_type;
179} 182}
180 183
181static int chk_conflict(struct memtype *new, struct memtype *entry, 184static int
182 unsigned long *type) 185chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type)
183{ 186{
184 if (new->type != entry->type) { 187 if (new->type != entry->type) {
185 if (type) { 188 if (type) {
@@ -205,6 +208,69 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
205 return -EBUSY; 208 return -EBUSY;
206} 209}
207 210
211static struct memtype *cached_entry;
212static u64 cached_start;
213
214/*
215 * For RAM pages, mark the pages as non WB memory type using
216 * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or
217 * set_memory_wc() on a RAM page at a time before marking it as WB again.
218 * This is ok, because only one driver will be owning the page and
219 * doing set_memory_*() calls.
220 *
221 * For now, we use PageNonWB to track that the RAM page is being mapped
222 * as non WB. In future, we will have to use one more flag
223 * (or some other mechanism in page_struct) to distinguish between
224 * UC and WC mapping.
225 */
226static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type,
227 unsigned long *new_type)
228{
229 struct page *page;
230 u64 pfn, end_pfn;
231
232 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
233 page = pfn_to_page(pfn);
234 if (page_mapped(page) || PageNonWB(page))
235 goto out;
236
237 SetPageNonWB(page);
238 }
239 return 0;
240
241out:
242 end_pfn = pfn;
243 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
244 page = pfn_to_page(pfn);
245 ClearPageNonWB(page);
246 }
247
248 return -EINVAL;
249}
250
251static int free_ram_pages_type(u64 start, u64 end)
252{
253 struct page *page;
254 u64 pfn, end_pfn;
255
256 for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) {
257 page = pfn_to_page(pfn);
258 if (page_mapped(page) || !PageNonWB(page))
259 goto out;
260
261 ClearPageNonWB(page);
262 }
263 return 0;
264
265out:
266 end_pfn = pfn;
267 for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) {
268 page = pfn_to_page(pfn);
269 SetPageNonWB(page);
270 }
271 return -EINVAL;
272}
273
208/* 274/*
209 * req_type typically has one of the: 275 * req_type typically has one of the:
210 * - _PAGE_CACHE_WB 276 * - _PAGE_CACHE_WB
@@ -221,14 +287,15 @@ static int chk_conflict(struct memtype *new, struct memtype *entry,
221 * it will return a negative return value. 287 * it will return a negative return value.
222 */ 288 */
223int reserve_memtype(u64 start, u64 end, unsigned long req_type, 289int reserve_memtype(u64 start, u64 end, unsigned long req_type,
224 unsigned long *new_type) 290 unsigned long *new_type)
225{ 291{
226 struct memtype *new, *entry; 292 struct memtype *new, *entry;
227 unsigned long actual_type; 293 unsigned long actual_type;
228 struct list_head *where; 294 struct list_head *where;
295 int is_range_ram;
229 int err = 0; 296 int err = 0;
230 297
231 BUG_ON(start >= end); /* end is exclusive */ 298 BUG_ON(start >= end); /* end is exclusive */
232 299
233 if (!pat_enabled) { 300 if (!pat_enabled) {
234 /* This is identical to page table setting without PAT */ 301 /* This is identical to page table setting without PAT */
@@ -261,28 +328,41 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
261 actual_type = _PAGE_CACHE_WB; 328 actual_type = _PAGE_CACHE_WB;
262 else 329 else
263 actual_type = _PAGE_CACHE_UC_MINUS; 330 actual_type = _PAGE_CACHE_UC_MINUS;
264 } else 331 } else {
265 actual_type = pat_x_mtrr_type(start, end, 332 actual_type = pat_x_mtrr_type(start, end,
266 req_type & _PAGE_CACHE_MASK); 333 req_type & _PAGE_CACHE_MASK);
334 }
335
336 is_range_ram = pagerange_is_ram(start, end);
337 if (is_range_ram == 1)
338 return reserve_ram_pages_type(start, end, req_type, new_type);
339 else if (is_range_ram < 0)
340 return -EINVAL;
267 341
268 new = kmalloc(sizeof(struct memtype), GFP_KERNEL); 342 new = kmalloc(sizeof(struct memtype), GFP_KERNEL);
269 if (!new) 343 if (!new)
270 return -ENOMEM; 344 return -ENOMEM;
271 345
272 new->start = start; 346 new->start = start;
273 new->end = end; 347 new->end = end;
274 new->type = actual_type; 348 new->type = actual_type;
275 349
276 if (new_type) 350 if (new_type)
277 *new_type = actual_type; 351 *new_type = actual_type;
278 352
279 spin_lock(&memtype_lock); 353 spin_lock(&memtype_lock);
280 354
355 if (cached_entry && start >= cached_start)
356 entry = cached_entry;
357 else
358 entry = list_entry(&memtype_list, struct memtype, nd);
359
281 /* Search for existing mapping that overlaps the current range */ 360 /* Search for existing mapping that overlaps the current range */
282 where = NULL; 361 where = NULL;
283 list_for_each_entry(entry, &memtype_list, nd) { 362 list_for_each_entry_continue(entry, &memtype_list, nd) {
284 if (end <= entry->start) { 363 if (end <= entry->start) {
285 where = entry->nd.prev; 364 where = entry->nd.prev;
365 cached_entry = list_entry(where, struct memtype, nd);
286 break; 366 break;
287 } else if (start <= entry->start) { /* end > entry->start */ 367 } else if (start <= entry->start) { /* end > entry->start */
288 err = chk_conflict(new, entry, new_type); 368 err = chk_conflict(new, entry, new_type);
@@ -290,6 +370,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
290 dprintk("Overlap at 0x%Lx-0x%Lx\n", 370 dprintk("Overlap at 0x%Lx-0x%Lx\n",
291 entry->start, entry->end); 371 entry->start, entry->end);
292 where = entry->nd.prev; 372 where = entry->nd.prev;
373 cached_entry = list_entry(where,
374 struct memtype, nd);
293 } 375 }
294 break; 376 break;
295 } else if (start < entry->end) { /* start > entry->start */ 377 } else if (start < entry->end) { /* start > entry->start */
@@ -297,7 +379,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
297 if (!err) { 379 if (!err) {
298 dprintk("Overlap at 0x%Lx-0x%Lx\n", 380 dprintk("Overlap at 0x%Lx-0x%Lx\n",
299 entry->start, entry->end); 381 entry->start, entry->end);
300 where = &entry->nd; 382 cached_entry = list_entry(entry->nd.prev,
383 struct memtype, nd);
384
385 /*
386 * Move to right position in the linked
387 * list to add this new entry
388 */
389 list_for_each_entry_continue(entry,
390 &memtype_list, nd) {
391 if (start <= entry->start) {
392 where = entry->nd.prev;
393 break;
394 }
395 }
301 } 396 }
302 break; 397 break;
303 } 398 }
@@ -309,9 +404,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
309 start, end, cattr_name(new->type), cattr_name(req_type)); 404 start, end, cattr_name(new->type), cattr_name(req_type));
310 kfree(new); 405 kfree(new);
311 spin_unlock(&memtype_lock); 406 spin_unlock(&memtype_lock);
407
312 return err; 408 return err;
313 } 409 }
314 410
411 cached_start = start;
412
315 if (where) 413 if (where)
316 list_add(&new->nd, where); 414 list_add(&new->nd, where);
317 else 415 else
@@ -330,6 +428,7 @@ int free_memtype(u64 start, u64 end)
330{ 428{
331 struct memtype *entry; 429 struct memtype *entry;
332 int err = -EINVAL; 430 int err = -EINVAL;
431 int is_range_ram;
333 432
334 if (!pat_enabled) 433 if (!pat_enabled)
335 return 0; 434 return 0;
@@ -338,9 +437,18 @@ int free_memtype(u64 start, u64 end)
338 if (is_ISA_range(start, end - 1)) 437 if (is_ISA_range(start, end - 1))
339 return 0; 438 return 0;
340 439
440 is_range_ram = pagerange_is_ram(start, end);
441 if (is_range_ram == 1)
442 return free_ram_pages_type(start, end);
443 else if (is_range_ram < 0)
444 return -EINVAL;
445
341 spin_lock(&memtype_lock); 446 spin_lock(&memtype_lock);
342 list_for_each_entry(entry, &memtype_list, nd) { 447 list_for_each_entry(entry, &memtype_list, nd) {
343 if (entry->start == start && entry->end == end) { 448 if (entry->start == start && entry->end == end) {
449 if (cached_entry == entry || cached_start == start)
450 cached_entry = NULL;
451
344 list_del(&entry->nd); 452 list_del(&entry->nd);
345 kfree(entry); 453 kfree(entry);
346 err = 0; 454 err = 0;
@@ -355,26 +463,19 @@ int free_memtype(u64 start, u64 end)
355 } 463 }
356 464
357 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); 465 dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end);
466
358 return err; 467 return err;
359} 468}
360 469
361 470
362/*
363 * /dev/mem mmap interface. The memtype used for mapping varies:
364 * - Use UC for mappings with O_SYNC flag
365 * - Without O_SYNC flag, if there is any conflict in reserve_memtype,
366 * inherit the memtype from existing mapping.
367 * - Else use UC_MINUS memtype (for backward compatibility with existing
368 * X drivers.
369 */
370pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, 471pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn,
371 unsigned long size, pgprot_t vma_prot) 472 unsigned long size, pgprot_t vma_prot)
372{ 473{
373 return vma_prot; 474 return vma_prot;
374} 475}
375 476
376#ifdef CONFIG_NONPROMISC_DEVMEM 477#ifdef CONFIG_STRICT_DEVMEM
377/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ 478/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/
378static inline int range_is_allowed(unsigned long pfn, unsigned long size) 479static inline int range_is_allowed(unsigned long pfn, unsigned long size)
379{ 480{
380 return 1; 481 return 1;
@@ -398,20 +499,20 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size)
398 } 499 }
399 return 1; 500 return 1;
400} 501}
401#endif /* CONFIG_NONPROMISC_DEVMEM */ 502#endif /* CONFIG_STRICT_DEVMEM */
402 503
403int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, 504int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
404 unsigned long size, pgprot_t *vma_prot) 505 unsigned long size, pgprot_t *vma_prot)
405{ 506{
406 u64 offset = ((u64) pfn) << PAGE_SHIFT; 507 u64 offset = ((u64) pfn) << PAGE_SHIFT;
407 unsigned long flags = _PAGE_CACHE_UC_MINUS; 508 unsigned long flags = -1;
408 int retval; 509 int retval;
409 510
410 if (!range_is_allowed(pfn, size)) 511 if (!range_is_allowed(pfn, size))
411 return 0; 512 return 0;
412 513
413 if (file->f_flags & O_SYNC) { 514 if (file->f_flags & O_SYNC) {
414 flags = _PAGE_CACHE_UC; 515 flags = _PAGE_CACHE_UC_MINUS;
415 } 516 }
416 517
417#ifdef CONFIG_X86_32 518#ifdef CONFIG_X86_32
@@ -434,13 +535,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
434#endif 535#endif
435 536
436 /* 537 /*
437 * With O_SYNC, we can only take UC mapping. Fail if we cannot. 538 * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot.
539 *
438 * Without O_SYNC, we want to get 540 * Without O_SYNC, we want to get
439 * - WB for WB-able memory and no other conflicting mappings 541 * - WB for WB-able memory and no other conflicting mappings
440 * - UC_MINUS for non-WB-able memory with no other conflicting mappings 542 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
441 * - Inherit from conflicting mappings otherwise 543
442 */ 544 */
443 if (flags != _PAGE_CACHE_UC_MINUS) { 545 if (flags != -1) {
444 retval = reserve_memtype(offset, offset + size, flags, NULL); 546 retval = reserve_memtype(offset, offset + size, flags, NULL);
445 } else { 547 } else {
446 retval = reserve_memtype(offset, offset + size, -1, &flags); 548 retval = reserve_memtype(offset, offset + size, -1, &flags);
@@ -468,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
468 570
469void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) 571void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
470{ 572{
573 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
471 u64 addr = (u64)pfn << PAGE_SHIFT; 574 u64 addr = (u64)pfn << PAGE_SHIFT;
472 unsigned long flags; 575 unsigned long flags;
473 unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK);
474 576
475 reserve_memtype(addr, addr + size, want_flags, &flags); 577 reserve_memtype(addr, addr + size, want_flags, &flags);
476 if (flags != want_flags) { 578 if (flags != want_flags) {
@@ -489,3 +591,91 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot)
489 591
490 free_memtype(addr, addr + size); 592 free_memtype(addr, addr + size);
491} 593}
594
595#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT)
596
597/* get Nth element of the linked list */
598static struct memtype *memtype_get_idx(loff_t pos)
599{
600 struct memtype *list_node, *print_entry;
601 int i = 1;
602
603 print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL);
604 if (!print_entry)
605 return NULL;
606
607 spin_lock(&memtype_lock);
608 list_for_each_entry(list_node, &memtype_list, nd) {
609 if (pos == i) {
610 *print_entry = *list_node;
611 spin_unlock(&memtype_lock);
612 return print_entry;
613 }
614 ++i;
615 }
616 spin_unlock(&memtype_lock);
617 kfree(print_entry);
618
619 return NULL;
620}
621
622static void *memtype_seq_start(struct seq_file *seq, loff_t *pos)
623{
624 if (*pos == 0) {
625 ++*pos;
626 seq_printf(seq, "PAT memtype list:\n");
627 }
628
629 return memtype_get_idx(*pos);
630}
631
632static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
633{
634 ++*pos;
635 return memtype_get_idx(*pos);
636}
637
638static void memtype_seq_stop(struct seq_file *seq, void *v)
639{
640}
641
642static int memtype_seq_show(struct seq_file *seq, void *v)
643{
644 struct memtype *print_entry = (struct memtype *)v;
645
646 seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type),
647 print_entry->start, print_entry->end);
648 kfree(print_entry);
649
650 return 0;
651}
652
653static struct seq_operations memtype_seq_ops = {
654 .start = memtype_seq_start,
655 .next = memtype_seq_next,
656 .stop = memtype_seq_stop,
657 .show = memtype_seq_show,
658};
659
660static int memtype_seq_open(struct inode *inode, struct file *file)
661{
662 return seq_open(file, &memtype_seq_ops);
663}
664
665static const struct file_operations memtype_fops = {
666 .open = memtype_seq_open,
667 .read = seq_read,
668 .llseek = seq_lseek,
669 .release = seq_release,
670};
671
672static int __init pat_memtype_list_init(void)
673{
674 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir,
675 NULL, &memtype_fops);
676 return 0;
677}
678
679late_initcall(pat_memtype_list_init);
680
681#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */
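
The reserve_memtype() changes above keep memtype_list sorted by start address and remember the last insertion point in cached_entry/cached_start, so bursts of reservations at increasing addresses resume the search there instead of rescanning from the list head each time. A minimal standalone sketch of that cached-cursor insertion (plain C with illustrative names; the conflict checking and locking done by the real code are omitted):

/*
 * Standalone sketch of the cached-cursor insertion used by
 * reserve_memtype(): keep the list sorted by start address and resume
 * the search from the last insertion point when the new range starts
 * at or after it.  Type and function names are illustrative only.
 */
#include <stdio.h>
#include <stdlib.h>

struct range {
        unsigned long long start, end;
        struct range *next;
};

static struct range head;               /* dummy head of the sorted list */
static struct range *cached;            /* node preceding the last insert */
static unsigned long long cached_start;

static void insert_range(unsigned long long start, unsigned long long end)
{
        struct range *prev, *new = malloc(sizeof(*new));

        if (!new)
                return;
        new->start = start;
        new->end = end;

        /* Resume from the cached position when possible, else from the head. */
        prev = (cached && start >= cached_start) ? cached : &head;

        while (prev->next && prev->next->start < start)
                prev = prev->next;

        new->next = prev->next;
        prev->next = new;
        cached = prev;
        cached_start = start;
}

int main(void)
{
        insert_range(0x1000, 0x2000);
        insert_range(0x5000, 0x6000);   /* ascending: reuses the cursor */
        insert_range(0x3000, 0x4000);   /* before the cursor: restart from head */

        for (struct range *r = head.next; r; r = r->next)
                printf("0x%llx-0x%llx\n", r->start, r->end);
        return 0;
}

free_memtype() correspondingly invalidates the cache when the cached entry (or an entry at the cached start) is removed, as the hunk above shows.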
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index efa1911e20ca..df3d5c861cda 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 };
79static unsigned int mw64[] = { 0x89, 0x8B }; 79static unsigned int mw64[] = { 0x89, 0x8B };
80#endif /* not __i386__ */ 80#endif /* not __i386__ */
81 81
82static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, 82struct prefix_bits {
83 int *rexr) 83 unsigned shorted:1;
84 unsigned enlarged:1;
85 unsigned rexr:1;
86 unsigned rex:1;
87};
88
89static int skip_prefix(unsigned char *addr, struct prefix_bits *prf)
84{ 90{
85 int i; 91 int i;
86 unsigned char *p = addr; 92 unsigned char *p = addr;
87 *shorted = 0; 93 prf->shorted = 0;
88 *enlarged = 0; 94 prf->enlarged = 0;
89 *rexr = 0; 95 prf->rexr = 0;
96 prf->rex = 0;
90 97
91restart: 98restart:
92 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { 99 for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) {
93 if (*p == prefix_codes[i]) { 100 if (*p == prefix_codes[i]) {
94 if (*p == 0x66) 101 if (*p == 0x66)
95 *shorted = 1; 102 prf->shorted = 1;
96#ifdef __amd64__ 103#ifdef __amd64__
97 if ((*p & 0xf8) == 0x48) 104 if ((*p & 0xf8) == 0x48)
98 *enlarged = 1; 105 prf->enlarged = 1;
99 if ((*p & 0xf4) == 0x44) 106 if ((*p & 0xf4) == 0x44)
100 *rexr = 1; 107 prf->rexr = 1;
108 if ((*p & 0xf0) == 0x40)
109 prf->rex = 1;
101#endif 110#endif
102 p++; 111 p++;
103 goto restart; 112 goto restart;
@@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr)
135{ 144{
136 unsigned int opcode; 145 unsigned int opcode;
137 unsigned char *p; 146 unsigned char *p;
138 int shorted, enlarged, rexr; 147 struct prefix_bits prf;
139 int i; 148 int i;
140 enum reason_type rv = OTHERS; 149 enum reason_type rv = OTHERS;
141 150
142 p = (unsigned char *)ins_addr; 151 p = (unsigned char *)ins_addr;
143 p += skip_prefix(p, &shorted, &enlarged, &rexr); 152 p += skip_prefix(p, &prf);
144 p += get_opcode(p, &opcode); 153 p += get_opcode(p, &opcode);
145 154
146 CHECK_OP_TYPE(opcode, reg_rop, REG_READ); 155 CHECK_OP_TYPE(opcode, reg_rop, REG_READ);
@@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
156{ 165{
157 unsigned int opcode; 166 unsigned int opcode;
158 unsigned char *p; 167 unsigned char *p;
159 int i, shorted, enlarged, rexr; 168 struct prefix_bits prf;
169 int i;
160 170
161 p = (unsigned char *)ins_addr; 171 p = (unsigned char *)ins_addr;
162 p += skip_prefix(p, &shorted, &enlarged, &rexr); 172 p += skip_prefix(p, &prf);
163 p += get_opcode(p, &opcode); 173 p += get_opcode(p, &opcode);
164 174
165 for (i = 0; i < ARRAY_SIZE(rw8); i++) 175 for (i = 0; i < ARRAY_SIZE(rw8); i++)
@@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr)
168 178
169 for (i = 0; i < ARRAY_SIZE(rw32); i++) 179 for (i = 0; i < ARRAY_SIZE(rw32); i++)
170 if (rw32[i] == opcode) 180 if (rw32[i] == opcode)
171 return (shorted ? 2 : (enlarged ? 8 : 4)); 181 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
172 182
173 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 183 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
174 return 0; 184 return 0;
@@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
178{ 188{
179 unsigned int opcode; 189 unsigned int opcode;
180 unsigned char *p; 190 unsigned char *p;
181 int i, shorted, enlarged, rexr; 191 struct prefix_bits prf;
192 int i;
182 193
183 p = (unsigned char *)ins_addr; 194 p = (unsigned char *)ins_addr;
184 p += skip_prefix(p, &shorted, &enlarged, &rexr); 195 p += skip_prefix(p, &prf);
185 p += get_opcode(p, &opcode); 196 p += get_opcode(p, &opcode);
186 197
187 for (i = 0; i < ARRAY_SIZE(mw8); i++) 198 for (i = 0; i < ARRAY_SIZE(mw8); i++)
@@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr)
194 205
195 for (i = 0; i < ARRAY_SIZE(mw32); i++) 206 for (i = 0; i < ARRAY_SIZE(mw32); i++)
196 if (mw32[i] == opcode) 207 if (mw32[i] == opcode)
197 return shorted ? 2 : 4; 208 return prf.shorted ? 2 : 4;
198 209
199 for (i = 0; i < ARRAY_SIZE(mw64); i++) 210 for (i = 0; i < ARRAY_SIZE(mw64); i++)
200 if (mw64[i] == opcode) 211 if (mw64[i] == opcode)
201 return shorted ? 2 : (enlarged ? 8 : 4); 212 return prf.shorted ? 2 : (prf.enlarged ? 8 : 4);
202 213
203 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); 214 printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode);
204 return 0; 215 return 0;
@@ -238,7 +249,7 @@ enum {
238#endif 249#endif
239}; 250};
240 251
241static unsigned char *get_reg_w8(int no, struct pt_regs *regs) 252static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs)
242{ 253{
243 unsigned char *rv = NULL; 254 unsigned char *rv = NULL;
244 255
@@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
255 case arg_DL: 266 case arg_DL:
256 rv = (unsigned char *)&regs->dx; 267 rv = (unsigned char *)&regs->dx;
257 break; 268 break;
258 case arg_AH:
259 rv = 1 + (unsigned char *)&regs->ax;
260 break;
261 case arg_BH:
262 rv = 1 + (unsigned char *)&regs->bx;
263 break;
264 case arg_CH:
265 rv = 1 + (unsigned char *)&regs->cx;
266 break;
267 case arg_DH:
268 rv = 1 + (unsigned char *)&regs->dx;
269 break;
270#ifdef __amd64__ 269#ifdef __amd64__
271 case arg_R8: 270 case arg_R8:
272 rv = (unsigned char *)&regs->r8; 271 rv = (unsigned char *)&regs->r8;
@@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs)
294 break; 293 break;
295#endif 294#endif
296 default: 295 default:
297 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
298 break; 296 break;
299 } 297 }
298
299 if (rv)
300 return rv;
301
302 if (rex) {
303 /*
304 * If REX prefix exists, access low bytes of SI etc.
305 * instead of AH etc.
306 */
307 switch (no) {
308 case arg_SI:
309 rv = (unsigned char *)&regs->si;
310 break;
311 case arg_DI:
312 rv = (unsigned char *)&regs->di;
313 break;
314 case arg_BP:
315 rv = (unsigned char *)&regs->bp;
316 break;
317 case arg_SP:
318 rv = (unsigned char *)&regs->sp;
319 break;
320 default:
321 break;
322 }
323 } else {
324 switch (no) {
325 case arg_AH:
326 rv = 1 + (unsigned char *)&regs->ax;
327 break;
328 case arg_BH:
329 rv = 1 + (unsigned char *)&regs->bx;
330 break;
331 case arg_CH:
332 rv = 1 + (unsigned char *)&regs->cx;
333 break;
334 case arg_DH:
335 rv = 1 + (unsigned char *)&regs->dx;
336 break;
337 default:
338 break;
339 }
340 }
341
342 if (!rv)
343 printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no);
344
300 return rv; 345 return rv;
301} 346}
302 347
@@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
368 unsigned char mod_rm; 413 unsigned char mod_rm;
369 int reg; 414 int reg;
370 unsigned char *p; 415 unsigned char *p;
371 int i, shorted, enlarged, rexr; 416 struct prefix_bits prf;
417 int i;
372 unsigned long rv; 418 unsigned long rv;
373 419
374 p = (unsigned char *)ins_addr; 420 p = (unsigned char *)ins_addr;
375 p += skip_prefix(p, &shorted, &enlarged, &rexr); 421 p += skip_prefix(p, &prf);
376 p += get_opcode(p, &opcode); 422 p += get_opcode(p, &opcode);
377 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 423 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
378 if (reg_rop[i] == opcode) { 424 if (reg_rop[i] == opcode) {
@@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
392 438
393do_work: 439do_work:
394 mod_rm = *p; 440 mod_rm = *p;
395 reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); 441 reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3);
396 switch (get_ins_reg_width(ins_addr)) { 442 switch (get_ins_reg_width(ins_addr)) {
397 case 1: 443 case 1:
398 return *get_reg_w8(reg, regs); 444 return *get_reg_w8(reg, prf.rex, regs);
399 445
400 case 2: 446 case 2:
401 return *(unsigned short *)get_reg_w32(reg, regs); 447 return *(unsigned short *)get_reg_w32(reg, regs);
@@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
422 unsigned char mod_rm; 468 unsigned char mod_rm;
423 unsigned char mod; 469 unsigned char mod;
424 unsigned char *p; 470 unsigned char *p;
425 int i, shorted, enlarged, rexr; 471 struct prefix_bits prf;
472 int i;
426 unsigned long rv; 473 unsigned long rv;
427 474
428 p = (unsigned char *)ins_addr; 475 p = (unsigned char *)ins_addr;
429 p += skip_prefix(p, &shorted, &enlarged, &rexr); 476 p += skip_prefix(p, &prf);
430 p += get_opcode(p, &opcode); 477 p += get_opcode(p, &opcode);
431 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 478 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
432 if (imm_wop[i] == opcode) { 479 if (imm_wop[i] == opcode) {
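
The pf_in.c rework records whether any REX prefix was seen because the meaning of byte-register encodings 4-7 depends on it: without a REX prefix they select AH/CH/DH/BH, with one they select the low bytes SPL/BPL/SIL/DIL (and 8-15 reach R8B-R15B via the extension bits), which is why get_reg_w8() now takes a rex argument. A small lookup-table sketch of that decoding rule, in hardware encoding order (illustrative only, not the mmiotrace code itself):

/*
 * Byte-register decoding rule that get_reg_w8() now follows: encodings
 * 4-7 mean AH/CH/DH/BH only when no REX prefix is present; with any
 * REX prefix they mean SPL/BPL/SIL/DIL instead.
 */
#include <stdio.h>

static const char *reg8_name(int no, int rex)
{
        static const char *norex[8] = {
                "AL", "CL", "DL", "BL", "AH", "CH", "DH", "BH"
        };
        static const char *withrex[16] = {
                "AL",  "CL",  "DL",   "BL",   "SPL",  "BPL",  "SIL",  "DIL",
                "R8B", "R9B", "R10B", "R11B", "R12B", "R13B", "R14B", "R15B"
        };

        if (rex)
                return (no >= 0 && no < 16) ? withrex[no] : "?";
        return (no >= 0 && no < 8) ? norex[no] : "?";
}

int main(void)
{
        for (int no = 4; no < 8; no++)
                printf("reg %d: no REX -> %s, REX -> %s\n",
                       no, reg8_name(no, 0), reg8_name(no, 1));
        return 0;
}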
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 557b2abceef8..86f2ffc43c3d 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd)
63#define UNSHARED_PTRS_PER_PGD \ 63#define UNSHARED_PTRS_PER_PGD \
64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 64 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
65 65
66static void pgd_ctor(void *p) 66static void pgd_ctor(pgd_t *pgd)
67{ 67{
68 pgd_t *pgd = p;
69
70 /* If the pgd points to a shared pagetable level (either the 68 /* If the pgd points to a shared pagetable level (either the
71 ptes in non-PAE, or shared PMD in PAE), then just copy the 69 ptes in non-PAE, or shared PMD in PAE), then just copy the
72 references from swapper_pg_dir. */ 70 references from swapper_pg_dir. */
@@ -87,7 +85,7 @@ static void pgd_ctor(void *p)
87 pgd_list_add(pgd); 85 pgd_list_add(pgd);
88} 86}
89 87
90static void pgd_dtor(void *pgd) 88static void pgd_dtor(pgd_t *pgd)
91{ 89{
92 unsigned long flags; /* can be called from interrupt context */ 90 unsigned long flags; /* can be called from interrupt context */
93 91
@@ -207,6 +205,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[])
207 unsigned long addr; 205 unsigned long addr;
208 int i; 206 int i;
209 207
208 if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */
209 return;
210
210 pud = pud_offset(pgd, 0); 211 pud = pud_offset(pgd, 0);
211 212
212 for (addr = i = 0; i < PREALLOCATED_PMDS; 213 for (addr = i = 0; i < PREALLOCATED_PMDS;
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index b4becbf8c570..0951db9ee519 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -20,53 +20,6 @@
20#include <asm/tlb.h> 20#include <asm/tlb.h>
21#include <asm/tlbflush.h> 21#include <asm/tlbflush.h>
22 22
23void show_mem(void)
24{
25 int total = 0, reserved = 0;
26 int shared = 0, cached = 0;
27 int highmem = 0;
28 struct page *page;
29 pg_data_t *pgdat;
30 unsigned long i;
31 unsigned long flags;
32
33 printk(KERN_INFO "Mem-info:\n");
34 show_free_areas();
35 for_each_online_pgdat(pgdat) {
36 pgdat_resize_lock(pgdat, &flags);
37 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
38 if (unlikely(i % MAX_ORDER_NR_PAGES == 0))
39 touch_nmi_watchdog();
40 page = pgdat_page_nr(pgdat, i);
41 total++;
42 if (PageHighMem(page))
43 highmem++;
44 if (PageReserved(page))
45 reserved++;
46 else if (PageSwapCache(page))
47 cached++;
48 else if (page_count(page))
49 shared += page_count(page) - 1;
50 }
51 pgdat_resize_unlock(pgdat, &flags);
52 }
53 printk(KERN_INFO "%d pages of RAM\n", total);
54 printk(KERN_INFO "%d pages of HIGHMEM\n", highmem);
55 printk(KERN_INFO "%d reserved pages\n", reserved);
56 printk(KERN_INFO "%d pages shared\n", shared);
57 printk(KERN_INFO "%d pages swap cached\n", cached);
58
59 printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY));
60 printk(KERN_INFO "%lu pages writeback\n",
61 global_page_state(NR_WRITEBACK));
62 printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED));
63 printk(KERN_INFO "%lu pages slab\n",
64 global_page_state(NR_SLAB_RECLAIMABLE) +
65 global_page_state(NR_SLAB_UNRECLAIMABLE));
66 printk(KERN_INFO "%lu pages pagetables\n",
67 global_page_state(NR_PAGETABLE));
68}
69
70/* 23/*
71 * Associate a virtual page frame with a given physical page frame 24 * Associate a virtual page frame with a given physical page frame
72 * and protection flags for that frame. 25 * and protection flags for that frame.
@@ -170,7 +123,8 @@ static int __init parse_vmalloc(char *arg)
170 if (!arg) 123 if (!arg)
171 return -EINVAL; 124 return -EINVAL;
172 125
173 __VMALLOC_RESERVE = memparse(arg, &arg); 126 /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/
127 __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET;
174 return 0; 128 return 0;
175} 129}
176early_param("vmalloc", parse_vmalloc); 130early_param("vmalloc", parse_vmalloc);
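
The pgtable_32.c change adds VMALLOC_OFFSET (the guard hole between the direct mapping and the vmalloc area) on top of the user-supplied size, so a "vmalloc=128M" boot option really yields 128M of usable vmalloc space. A standalone sketch of the arithmetic with a simplified memparse-style helper (parse_size() and the 8 MiB guard value are stand-ins for illustration, not kernel API):

/*
 * Sketch of the vmalloc= fix: parse a size with an optional K/M/G
 * suffix, then add a guard-hole allowance so the requested amount is
 * fully usable.
 */
#include <stdio.h>
#include <stdlib.h>

#define GUARD_HOLE      (8UL << 20)     /* stand-in for VMALLOC_OFFSET */

static unsigned long parse_size(const char *s)
{
        char *end;
        unsigned long val = strtoul(s, &end, 0);

        switch (*end) {
        case 'g': case 'G': val <<= 10; /* fall through */
        case 'm': case 'M': val <<= 10; /* fall through */
        case 'k': case 'K': val <<= 10;
        }
        return val;
}

int main(void)
{
        unsigned long reserve = parse_size("128M") + GUARD_HOLE;

        printf("vmalloc reserve: %lu MiB (includes %lu MiB guard hole)\n",
               reserve >> 20, GUARD_HOLE >> 20);
        return 0;
}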
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 1eb2973a301c..16ae70fc57e7 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void)
178 * start of the node, and that the current "end" address is after 178 * start of the node, and that the current "end" address is after
179 * the previous one. 179 * the previous one.
180 */ 180 */
181static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) 181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{ 182{
183 /* 183 /*
184 * Only add present memory as told by the e820. 184 * Only add present memory as told by the e820.
@@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
189 if (memory_chunk->start_pfn >= max_pfn) { 189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", 190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn); 191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return; 192 return -1;
193 } 193 }
194 if (memory_chunk->nid != nid) 194 if (memory_chunk->nid != nid)
195 return; 195 return -1;
196 196
197 if (!node_has_online_mem(nid)) 197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn; 198 node_start_pfn[nid] = memory_chunk->start_pfn;
@@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c
202 202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn) 203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn; 204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
205} 207}
206 208
207int __init get_memcfg_from_srat(void) 209int __init get_memcfg_from_srat(void)
@@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void)
259 printk(KERN_DEBUG 261 printk(KERN_DEBUG
260 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", 262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
261 j, chunk->nid, chunk->start_pfn, chunk->end_pfn); 263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
262 node_read_chunk(chunk->nid, chunk); 264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
263 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
264 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
265 } 269 }
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index 1b4763e26ea9..51c0a2fc14fe 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
138 return; 138 return;
139 } 139 }
140 140
141 if (is_uv_system()) 141 if (get_uv_system_type() >= UV_X2APIC)
142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; 142 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
143 else 143 else
144 apic_id = pa->apic_id; 144 apic_id = pa->apic_id;
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index d877c5b423ef..ab50a8d7402c 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -3,6 +3,7 @@
3 */ 3 */
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/io.h> 5#include <linux/io.h>
6#include <linux/mmiotrace.h>
6 7
7#define MODULE_NAME "testmmiotrace" 8#define MODULE_NAME "testmmiotrace"
8 9
@@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB.");
13static void do_write_test(void __iomem *p) 14static void do_write_test(void __iomem *p)
14{ 15{
15 unsigned int i; 16 unsigned int i;
17 mmiotrace_printk("Write test.\n");
16 for (i = 0; i < 256; i++) 18 for (i = 0; i < 256; i++)
17 iowrite8(i, p + i); 19 iowrite8(i, p + i);
18 for (i = 1024; i < (5 * 1024); i += 2) 20 for (i = 1024; i < (5 * 1024); i += 2)
@@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p)
24static void do_read_test(void __iomem *p) 26static void do_read_test(void __iomem *p)
25{ 27{
26 unsigned int i; 28 unsigned int i;
29 mmiotrace_printk("Read test.\n");
27 for (i = 0; i < 256; i++) 30 for (i = 0; i < 256; i++)
28 ioread8(p + i); 31 ioread8(p + i);
29 for (i = 1024; i < (5 * 1024); i += 2) 32 for (i = 1024; i < (5 * 1024); i += 2)
@@ -39,6 +42,7 @@ static void do_test(void)
39 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 42 pr_err(MODULE_NAME ": could not ioremap, aborting.\n");
40 return; 43 return;
41 } 44 }
45 mmiotrace_printk("ioremap returned %p.\n", p);
42 do_write_test(p); 46 do_write_test(p);
43 do_read_test(p); 47 do_read_test(p);
44 iounmap(p); 48 iounmap(p);
diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile
index 30f3eb366667..446902b2a6b6 100644
--- a/arch/x86/oprofile/Makefile
+++ b/arch/x86/oprofile/Makefile
@@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \
7 timer_int.o ) 7 timer_int.o )
8 8
9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o 9oprofile-y := $(DRIVER_OBJS) init.o backtrace.o
10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ 10oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \
11 op_model_ppro.o op_model_p4.o 11 op_model_ppro.o op_model_p4.o
12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o 12oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index e2095cba409f..04df67f8a7ba 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -52,8 +52,7 @@ struct frame_head {
52 unsigned long ret; 52 unsigned long ret;
53} __attribute__((packed)); 53} __attribute__((packed));
54 54
55static struct frame_head * 55static struct frame_head *dump_user_backtrace(struct frame_head *head)
56dump_user_backtrace(struct frame_head * head)
57{ 56{
58 struct frame_head bufhead[2]; 57 struct frame_head bufhead[2];
59 58
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index 7f3329b55d2e..022cd41ea9b4 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -1,10 +1,11 @@
1/** 1/**
2 * @file nmi_int.c 2 * @file nmi_int.c
3 * 3 *
4 * @remark Copyright 2002 OProfile authors 4 * @remark Copyright 2002-2008 OProfile authors
5 * @remark Read the file COPYING 5 * @remark Read the file COPYING
6 * 6 *
7 * @author John Levon <levon@movementarian.org> 7 * @author John Levon <levon@movementarian.org>
8 * @author Robert Richter <robert.richter@amd.com>
8 */ 9 */
9 10
10#include <linux/init.h> 11#include <linux/init.h>
@@ -15,6 +16,7 @@
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
17#include <linux/kdebug.h> 18#include <linux/kdebug.h>
19#include <linux/cpu.h>
18#include <asm/nmi.h> 20#include <asm/nmi.h>
19#include <asm/msr.h> 21#include <asm/msr.h>
20#include <asm/apic.h> 22#include <asm/apic.h>
@@ -26,60 +28,9 @@ static struct op_x86_model_spec const *model;
26static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); 28static DEFINE_PER_CPU(struct op_msrs, cpu_msrs);
27static DEFINE_PER_CPU(unsigned long, saved_lvtpc); 29static DEFINE_PER_CPU(unsigned long, saved_lvtpc);
28 30
29static int nmi_start(void);
30static void nmi_stop(void);
31
32/* 0 == registered but off, 1 == registered and on */ 31/* 0 == registered but off, 1 == registered and on */
33static int nmi_enabled = 0; 32static int nmi_enabled = 0;
34 33
35#ifdef CONFIG_PM
36
37static int nmi_suspend(struct sys_device *dev, pm_message_t state)
38{
39 if (nmi_enabled == 1)
40 nmi_stop();
41 return 0;
42}
43
44static int nmi_resume(struct sys_device *dev)
45{
46 if (nmi_enabled == 1)
47 nmi_start();
48 return 0;
49}
50
51static struct sysdev_class oprofile_sysclass = {
52 .name = "oprofile",
53 .resume = nmi_resume,
54 .suspend = nmi_suspend,
55};
56
57static struct sys_device device_oprofile = {
58 .id = 0,
59 .cls = &oprofile_sysclass,
60};
61
62static int __init init_sysfs(void)
63{
64 int error;
65
66 error = sysdev_class_register(&oprofile_sysclass);
67 if (!error)
68 error = sysdev_register(&device_oprofile);
69 return error;
70}
71
72static void exit_sysfs(void)
73{
74 sysdev_unregister(&device_oprofile);
75 sysdev_class_unregister(&oprofile_sysclass);
76}
77
78#else
79#define init_sysfs() do { } while (0)
80#define exit_sysfs() do { } while (0)
81#endif /* CONFIG_PM */
82
83static int profile_exceptions_notify(struct notifier_block *self, 34static int profile_exceptions_notify(struct notifier_block *self,
84 unsigned long val, void *data) 35 unsigned long val, void *data)
85{ 36{
@@ -269,10 +220,12 @@ static void nmi_cpu_shutdown(void *dummy)
269 220
270static void nmi_shutdown(void) 221static void nmi_shutdown(void)
271{ 222{
272 struct op_msrs *msrs = &get_cpu_var(cpu_msrs); 223 struct op_msrs *msrs;
224
273 nmi_enabled = 0; 225 nmi_enabled = 0;
274 on_each_cpu(nmi_cpu_shutdown, NULL, 1); 226 on_each_cpu(nmi_cpu_shutdown, NULL, 1);
275 unregister_die_notifier(&profile_exceptions_nb); 227 unregister_die_notifier(&profile_exceptions_nb);
228 msrs = &get_cpu_var(cpu_msrs);
276 model->shutdown(msrs); 229 model->shutdown(msrs);
277 free_msrs(); 230 free_msrs();
278 put_cpu_var(cpu_msrs); 231 put_cpu_var(cpu_msrs);
@@ -332,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
332 return 0; 285 return 0;
333} 286}
334 287
288#ifdef CONFIG_SMP
289static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action,
290 void *data)
291{
292 int cpu = (unsigned long)data;
293 switch (action) {
294 case CPU_DOWN_FAILED:
295 case CPU_ONLINE:
296 smp_call_function_single(cpu, nmi_cpu_start, NULL, 0);
297 break;
298 case CPU_DOWN_PREPARE:
299 smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1);
300 break;
301 }
302 return NOTIFY_DONE;
303}
304
305static struct notifier_block oprofile_cpu_nb = {
306 .notifier_call = oprofile_cpu_notifier
307};
308#endif
309
310#ifdef CONFIG_PM
311
312static int nmi_suspend(struct sys_device *dev, pm_message_t state)
313{
314 /* Only one CPU left, just stop that one */
315 if (nmi_enabled == 1)
316 nmi_cpu_stop(NULL);
317 return 0;
318}
319
320static int nmi_resume(struct sys_device *dev)
321{
322 if (nmi_enabled == 1)
323 nmi_cpu_start(NULL);
324 return 0;
325}
326
327static struct sysdev_class oprofile_sysclass = {
328 .name = "oprofile",
329 .resume = nmi_resume,
330 .suspend = nmi_suspend,
331};
332
333static struct sys_device device_oprofile = {
334 .id = 0,
335 .cls = &oprofile_sysclass,
336};
337
338static int __init init_sysfs(void)
339{
340 int error;
341
342 error = sysdev_class_register(&oprofile_sysclass);
343 if (!error)
344 error = sysdev_register(&device_oprofile);
345 return error;
346}
347
348static void exit_sysfs(void)
349{
350 sysdev_unregister(&device_oprofile);
351 sysdev_class_unregister(&oprofile_sysclass);
352}
353
354#else
355#define init_sysfs() do { } while (0)
356#define exit_sysfs() do { } while (0)
357#endif /* CONFIG_PM */
358
335static int p4force; 359static int p4force;
336module_param(p4force, int, 0); 360module_param(p4force, int, 0);
337 361
@@ -369,26 +393,47 @@ static int __init ppro_init(char **cpu_type)
369{ 393{
370 __u8 cpu_model = boot_cpu_data.x86_model; 394 __u8 cpu_model = boot_cpu_data.x86_model;
371 395
372 if (cpu_model == 14) 396 switch (cpu_model) {
397 case 0 ... 2:
398 *cpu_type = "i386/ppro";
399 break;
400 case 3 ... 5:
401 *cpu_type = "i386/pii";
402 break;
403 case 6 ... 8:
404 *cpu_type = "i386/piii";
405 break;
406 case 9:
407 *cpu_type = "i386/p6_mobile";
408 break;
409 case 10 ... 13:
410 *cpu_type = "i386/p6";
411 break;
412 case 14:
373 *cpu_type = "i386/core"; 413 *cpu_type = "i386/core";
374 else if (cpu_model == 15 || cpu_model == 23) 414 break;
415 case 15: case 23:
375 *cpu_type = "i386/core_2"; 416 *cpu_type = "i386/core_2";
376 else if (cpu_model > 0xd) 417 break;
418 default:
419 /* Unknown */
377 return 0; 420 return 0;
378 else if (cpu_model == 9) {
379 *cpu_type = "i386/p6_mobile";
380 } else if (cpu_model > 5) {
381 *cpu_type = "i386/piii";
382 } else if (cpu_model > 2) {
383 *cpu_type = "i386/pii";
384 } else {
385 *cpu_type = "i386/ppro";
386 } 421 }
387 422
388 model = &op_ppro_spec; 423 model = &op_ppro_spec;
389 return 1; 424 return 1;
390} 425}
391 426
427static int __init arch_perfmon_init(char **cpu_type)
428{
429 if (!cpu_has_arch_perfmon)
430 return 0;
431 *cpu_type = "i386/arch_perfmon";
432 model = &op_arch_perfmon_spec;
433 arch_perfmon_setup_counters();
434 return 1;
435}
436
392/* in order to get sysfs right */ 437/* in order to get sysfs right */
393static int using_nmi; 438static int using_nmi;
394 439
@@ -396,7 +441,8 @@ int __init op_nmi_init(struct oprofile_operations *ops)
396{ 441{
397 __u8 vendor = boot_cpu_data.x86_vendor; 442 __u8 vendor = boot_cpu_data.x86_vendor;
398 __u8 family = boot_cpu_data.x86; 443 __u8 family = boot_cpu_data.x86;
399 char *cpu_type; 444 char *cpu_type = NULL;
445 int ret = 0;
400 446
401 if (!cpu_has_apic) 447 if (!cpu_has_apic)
402 return -ENODEV; 448 return -ENODEV;
@@ -409,19 +455,23 @@ int __init op_nmi_init(struct oprofile_operations *ops)
409 default: 455 default:
410 return -ENODEV; 456 return -ENODEV;
411 case 6: 457 case 6:
412 model = &op_athlon_spec; 458 model = &op_amd_spec;
413 cpu_type = "i386/athlon"; 459 cpu_type = "i386/athlon";
414 break; 460 break;
415 case 0xf: 461 case 0xf:
416 model = &op_athlon_spec; 462 model = &op_amd_spec;
417 /* Actually it could be i386/hammer too, but give 463 /* Actually it could be i386/hammer too, but give
418 user space a consistent name. */ 464
419 cpu_type = "x86-64/hammer"; 465 cpu_type = "x86-64/hammer";
420 break; 466 break;
421 case 0x10: 467 case 0x10:
422 model = &op_athlon_spec; 468 model = &op_amd_spec;
423 cpu_type = "x86-64/family10"; 469 cpu_type = "x86-64/family10";
424 break; 470 break;
471 case 0x11:
472 model = &op_amd_spec;
473 cpu_type = "x86-64/family11h";
474 break;
425 } 475 }
426 break; 476 break;
427 477
@@ -429,39 +479,56 @@ int __init op_nmi_init(struct oprofile_operations *ops)
429 switch (family) { 479 switch (family) {
430 /* Pentium IV */ 480 /* Pentium IV */
431 case 0xf: 481 case 0xf:
432 if (!p4_init(&cpu_type)) 482 p4_init(&cpu_type);
433 return -ENODEV;
434 break; 483 break;
435 484
436 /* A P6-class processor */ 485 /* A P6-class processor */
437 case 6: 486 case 6:
438 if (!ppro_init(&cpu_type)) 487 ppro_init(&cpu_type);
439 return -ENODEV;
440 break; 488 break;
441 489
442 default: 490 default:
443 return -ENODEV; 491 break;
444 } 492 }
493
494 if (!cpu_type && !arch_perfmon_init(&cpu_type))
495 return -ENODEV;
445 break; 496 break;
446 497
447 default: 498 default:
448 return -ENODEV; 499 return -ENODEV;
449 } 500 }
450 501
451 init_sysfs(); 502#ifdef CONFIG_SMP
452 using_nmi = 1; 503 register_cpu_notifier(&oprofile_cpu_nb);
504#endif
505 /* default values, can be overwritten by model */
453 ops->create_files = nmi_create_files; 506 ops->create_files = nmi_create_files;
454 ops->setup = nmi_setup; 507 ops->setup = nmi_setup;
455 ops->shutdown = nmi_shutdown; 508 ops->shutdown = nmi_shutdown;
456 ops->start = nmi_start; 509 ops->start = nmi_start;
457 ops->stop = nmi_stop; 510 ops->stop = nmi_stop;
458 ops->cpu_type = cpu_type; 511 ops->cpu_type = cpu_type;
512
513 if (model->init)
514 ret = model->init(ops);
515 if (ret)
516 return ret;
517
518 init_sysfs();
519 using_nmi = 1;
459 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 520 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
460 return 0; 521 return 0;
461} 522}
462 523
463void op_nmi_exit(void) 524void op_nmi_exit(void)
464{ 525{
465 if (using_nmi) 526 if (using_nmi) {
466 exit_sysfs(); 527 exit_sysfs();
528#ifdef CONFIG_SMP
529 unregister_cpu_notifier(&oprofile_cpu_nb);
530#endif
531 }
532 if (model->exit)
533 model->exit();
467} 534}
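
Two details of the nmi_int.c rework are worth calling out: CPU detection no longer hard-fails when p4_init()/ppro_init() do not recognise the model, falling back to the architectural perfmon interface instead, and ppro_init() now expresses its model table with GCC's case-range extension. A standalone restatement of that table (case ranges are a GNU C extension, so compile with gcc or clang; the fallback string is illustrative):

/*
 * Same model-to-cpu_type mapping as the reworked ppro_init(); an
 * unknown model returns NULL so the caller can fall back to
 * "i386/arch_perfmon".
 */
#include <stdio.h>

static const char *p6_cpu_type(unsigned int model)
{
        switch (model) {
        case 0 ... 2:
                return "i386/ppro";
        case 3 ... 5:
                return "i386/pii";
        case 6 ... 8:
                return "i386/piii";
        case 9:
                return "i386/p6_mobile";
        case 10 ... 13:
                return "i386/p6";
        case 14:
                return "i386/core";
        case 15: case 23:
                return "i386/core_2";
        default:
                return NULL;            /* unknown: caller falls back */
        }
}

int main(void)
{
        unsigned int models[] = { 2, 9, 14, 23, 42 };

        for (unsigned int i = 0; i < sizeof(models) / sizeof(models[0]); i++) {
                const char *t = p6_cpu_type(models[i]);
                printf("model %2u -> %s\n", models[i],
                       t ? t : "(arch_perfmon fallback)");
        }
        return 0;
}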
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index 2880b15c4675..91b6a116165e 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -6,22 +6,22 @@
6 * 6 *
7 * @author John Levon 7 * @author John Levon
8 */ 8 */
9 9
10#ifndef OP_COUNTER_H 10#ifndef OP_COUNTER_H
11#define OP_COUNTER_H 11#define OP_COUNTER_H
12 12
13#define OP_MAX_COUNTER 8 13#define OP_MAX_COUNTER 8
14 14
15/* Per-perfctr configuration as set via 15/* Per-perfctr configuration as set via
16 * oprofilefs. 16 * oprofilefs.
17 */ 17 */
18struct op_counter_config { 18struct op_counter_config {
19 unsigned long count; 19 unsigned long count;
20 unsigned long enabled; 20 unsigned long enabled;
21 unsigned long event; 21 unsigned long event;
22 unsigned long kernel; 22 unsigned long kernel;
23 unsigned long user; 23 unsigned long user;
24 unsigned long unit_mask; 24 unsigned long unit_mask;
25}; 25};
26 26
27extern struct op_counter_config counter_config[]; 27extern struct op_counter_config counter_config[];
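
The new op_model_amd.c below programs each performance counter with the negated sample count (CTR_WRITE() writes -(unsigned int)count), so the counter wraps, and the NMI fires, after exactly that many events; CTR_OVERFLOWED() then only has to check that bit 31 has cleared. A small arithmetic sketch of that convention, modelling just the low 32 bits the macro inspects:

/*
 * Start a 32-bit up-counter at -count: it reaches 0 (bit 31 clear)
 * after exactly `count` increments, which is what CTR_OVERFLOWED()
 * tests for.  Illustrative arithmetic only.
 */
#include <stdio.h>
#include <stdint.h>

#define CTR_OVERFLOWED(n)       (!((n) & (1U << 31)))

int main(void)
{
        uint32_t count = 5;             /* desired sampling period */
        uint32_t ctr = -count;          /* what CTR_WRITE programs  */

        for (unsigned int ev = 1; ev <= count; ev++) {
                ctr++;                  /* one profiled event */
                printf("event %u: ctr=0x%08x overflowed=%d\n",
                       ev, ctr, CTR_OVERFLOWED(ctr));
        }
        return 0;
}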
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
new file mode 100644
index 000000000000..509513760a6e
--- /dev/null
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -0,0 +1,546 @@
1/*
2 * @file op_model_amd.c
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002-2008 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 * @author Robert Richter <robert.richter@amd.com>
12 * @author Barry Kasindorf
13*/
14
15#include <linux/oprofile.h>
16#include <linux/device.h>
17#include <linux/pci.h>
18
19#include <asm/ptrace.h>
20#include <asm/msr.h>
21#include <asm/nmi.h>
22
23#include "op_x86_model.h"
24#include "op_counter.h"
25
26#define NUM_COUNTERS 4
27#define NUM_CONTROLS 4
28
29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
31#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
32#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
33
34#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
35#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
36#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
37#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
38#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
39#define CTRL_CLEAR_LO(x) (x &= (1<<21))
40#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
41#define CTRL_SET_ENABLE(val) (val |= 1<<20)
42#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
43#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
44#define CTRL_SET_UM(val, m) (val |= (m << 8))
45#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
46#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
47#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
48#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
49
50static unsigned long reset_value[NUM_COUNTERS];
51
52#ifdef CONFIG_OPROFILE_IBS
53
54/* IbsFetchCtl bits/masks */
55#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */
56#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */
57#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */
58
59/*IbsOpCtl bits */
60#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */
61#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */
62
63/* Codes used in cpu_buffer.c */
64/* This produces duplicate code and needs to be fixed */
65#define IBS_FETCH_BEGIN 3
66#define IBS_OP_BEGIN 4
67
68/* The function interface needs to be fixed, something like add
69 data. Should then be added to linux/oprofile.h. */
70extern void
71oprofile_add_ibs_sample(struct pt_regs *const regs,
72 unsigned int *const ibs_sample, int ibs_code);
73
74struct ibs_fetch_sample {
75 /* MSRC001_1031 IBS Fetch Linear Address Register */
76 unsigned int ibs_fetch_lin_addr_low;
77 unsigned int ibs_fetch_lin_addr_high;
78 /* MSRC001_1030 IBS Fetch Control Register */
79 unsigned int ibs_fetch_ctl_low;
80 unsigned int ibs_fetch_ctl_high;
81 /* MSRC001_1032 IBS Fetch Physical Address Register */
82 unsigned int ibs_fetch_phys_addr_low;
83 unsigned int ibs_fetch_phys_addr_high;
84};
85
86struct ibs_op_sample {
87 /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */
88 unsigned int ibs_op_rip_low;
89 unsigned int ibs_op_rip_high;
90 /* MSRC001_1035 IBS Op Data Register */
91 unsigned int ibs_op_data1_low;
92 unsigned int ibs_op_data1_high;
93 /* MSRC001_1036 IBS Op Data 2 Register */
94 unsigned int ibs_op_data2_low;
95 unsigned int ibs_op_data2_high;
96 /* MSRC001_1037 IBS Op Data 3 Register */
97 unsigned int ibs_op_data3_low;
98 unsigned int ibs_op_data3_high;
99 /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */
100 unsigned int ibs_dc_linear_low;
101 unsigned int ibs_dc_linear_high;
102 /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */
103 unsigned int ibs_dc_phys_low;
104 unsigned int ibs_dc_phys_high;
105};
106
107/*
108 * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+
109*/
110static void clear_ibs_nmi(void);
111
112static int ibs_allowed; /* AMD Family10h and later */
113
114struct op_ibs_config {
115 unsigned long op_enabled;
116 unsigned long fetch_enabled;
117 unsigned long max_cnt_fetch;
118 unsigned long max_cnt_op;
119 unsigned long rand_en;
120 unsigned long dispatched_ops;
121};
122
123static struct op_ibs_config ibs_config;
124
125#endif
126
127/* functions for op_amd_spec */
128
129static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
130{
131 int i;
132
133 for (i = 0; i < NUM_COUNTERS; i++) {
134 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
135 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
136 else
137 msrs->counters[i].addr = 0;
138 }
139
140 for (i = 0; i < NUM_CONTROLS; i++) {
141 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
142 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
143 else
144 msrs->controls[i].addr = 0;
145 }
146}
147
148
149static void op_amd_setup_ctrs(struct op_msrs const * const msrs)
150{
151 unsigned int low, high;
152 int i;
153
154 /* clear all counters */
155 for (i = 0 ; i < NUM_CONTROLS; ++i) {
156 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
157 continue;
158 CTRL_READ(low, high, msrs, i);
159 CTRL_CLEAR_LO(low);
160 CTRL_CLEAR_HI(high);
161 CTRL_WRITE(low, high, msrs, i);
162 }
163
164 /* avoid a false detection of ctr overflows in NMI handler */
165 for (i = 0; i < NUM_COUNTERS; ++i) {
166 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
167 continue;
168 CTR_WRITE(1, msrs, i);
169 }
170
171 /* enable active counters */
172 for (i = 0; i < NUM_COUNTERS; ++i) {
173 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
174 reset_value[i] = counter_config[i].count;
175
176 CTR_WRITE(counter_config[i].count, msrs, i);
177
178 CTRL_READ(low, high, msrs, i);
179 CTRL_CLEAR_LO(low);
180 CTRL_CLEAR_HI(high);
181 CTRL_SET_ENABLE(low);
182 CTRL_SET_USR(low, counter_config[i].user);
183 CTRL_SET_KERN(low, counter_config[i].kernel);
184 CTRL_SET_UM(low, counter_config[i].unit_mask);
185 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
186 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
187 CTRL_SET_HOST_ONLY(high, 0);
188 CTRL_SET_GUEST_ONLY(high, 0);
189
190 CTRL_WRITE(low, high, msrs, i);
191 } else {
192 reset_value[i] = 0;
193 }
194 }
195}
196
197#ifdef CONFIG_OPROFILE_IBS
198
199static inline int
200op_amd_handle_ibs(struct pt_regs * const regs,
201 struct op_msrs const * const msrs)
202{
203 unsigned int low, high;
204 struct ibs_fetch_sample ibs_fetch;
205 struct ibs_op_sample ibs_op;
206
207 if (!ibs_allowed)
208 return 1;
209
210 if (ibs_config.fetch_enabled) {
211 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
212 if (high & IBS_FETCH_HIGH_VALID_BIT) {
213 ibs_fetch.ibs_fetch_ctl_high = high;
214 ibs_fetch.ibs_fetch_ctl_low = low;
215 rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high);
216 ibs_fetch.ibs_fetch_lin_addr_high = high;
217 ibs_fetch.ibs_fetch_lin_addr_low = low;
218 rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high);
219 ibs_fetch.ibs_fetch_phys_addr_high = high;
220 ibs_fetch.ibs_fetch_phys_addr_low = low;
221
222 oprofile_add_ibs_sample(regs,
223 (unsigned int *)&ibs_fetch,
224 IBS_FETCH_BEGIN);
225
226 /*reenable the IRQ */
227 rdmsr(MSR_AMD64_IBSFETCHCTL, low, high);
228 high &= ~IBS_FETCH_HIGH_VALID_BIT;
229 high |= IBS_FETCH_HIGH_ENABLE;
230 low &= IBS_FETCH_LOW_MAX_CNT_MASK;
231 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
232 }
233 }
234
235 if (ibs_config.op_enabled) {
236 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
237 if (low & IBS_OP_LOW_VALID_BIT) {
238 rdmsr(MSR_AMD64_IBSOPRIP, low, high);
239 ibs_op.ibs_op_rip_low = low;
240 ibs_op.ibs_op_rip_high = high;
241 rdmsr(MSR_AMD64_IBSOPDATA, low, high);
242 ibs_op.ibs_op_data1_low = low;
243 ibs_op.ibs_op_data1_high = high;
244 rdmsr(MSR_AMD64_IBSOPDATA2, low, high);
245 ibs_op.ibs_op_data2_low = low;
246 ibs_op.ibs_op_data2_high = high;
247 rdmsr(MSR_AMD64_IBSOPDATA3, low, high);
248 ibs_op.ibs_op_data3_low = low;
249 ibs_op.ibs_op_data3_high = high;
250 rdmsr(MSR_AMD64_IBSDCLINAD, low, high);
251 ibs_op.ibs_dc_linear_low = low;
252 ibs_op.ibs_dc_linear_high = high;
253 rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high);
254 ibs_op.ibs_dc_phys_low = low;
255 ibs_op.ibs_dc_phys_high = high;
256
257 /* reenable the IRQ */
258 oprofile_add_ibs_sample(regs,
259 (unsigned int *)&ibs_op,
260 IBS_OP_BEGIN);
261 rdmsr(MSR_AMD64_IBSOPCTL, low, high);
262 high = 0;
263 low &= ~IBS_OP_LOW_VALID_BIT;
264 low |= IBS_OP_LOW_ENABLE;
265 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
266 }
267 }
268
269 return 1;
270}
271
272#endif
273
274static int op_amd_check_ctrs(struct pt_regs * const regs,
275 struct op_msrs const * const msrs)
276{
277 unsigned int low, high;
278 int i;
279
280 for (i = 0 ; i < NUM_COUNTERS; ++i) {
281 if (!reset_value[i])
282 continue;
283 CTR_READ(low, high, msrs, i);
284 if (CTR_OVERFLOWED(low)) {
285 oprofile_add_sample(regs, i);
286 CTR_WRITE(reset_value[i], msrs, i);
287 }
288 }
289
290#ifdef CONFIG_OPROFILE_IBS
291 op_amd_handle_ibs(regs, msrs);
292#endif
293
294 /* See op_model_ppro.c */
295 return 1;
296}
297
298static void op_amd_start(struct op_msrs const * const msrs)
299{
300 unsigned int low, high;
301 int i;
302 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
303 if (reset_value[i]) {
304 CTRL_READ(low, high, msrs, i);
305 CTRL_SET_ACTIVE(low);
306 CTRL_WRITE(low, high, msrs, i);
307 }
308 }
309
310#ifdef CONFIG_OPROFILE_IBS
311 if (ibs_allowed && ibs_config.fetch_enabled) {
312 low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF;
313 high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */
314 + IBS_FETCH_HIGH_ENABLE;
315 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
316 }
317
318 if (ibs_allowed && ibs_config.op_enabled) {
319 low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF)
320 + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */
321 + IBS_OP_LOW_ENABLE;
322 high = 0;
323 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
324 }
325#endif
326}
327
328
329static void op_amd_stop(struct op_msrs const * const msrs)
330{
331 unsigned int low, high;
332 int i;
333
334 /* Subtle: stop on all counters to avoid race with
335 * setting our pm callback */
336 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
337 if (!reset_value[i])
338 continue;
339 CTRL_READ(low, high, msrs, i);
340 CTRL_SET_INACTIVE(low);
341 CTRL_WRITE(low, high, msrs, i);
342 }
343
344#ifdef CONFIG_OPROFILE_IBS
345 if (ibs_allowed && ibs_config.fetch_enabled) {
346 low = 0; /* clear max count and enable */
347 high = 0;
348 wrmsr(MSR_AMD64_IBSFETCHCTL, low, high);
349 }
350
351 if (ibs_allowed && ibs_config.op_enabled) {
352 low = 0; /* clear max count and enable */
353 high = 0;
354 wrmsr(MSR_AMD64_IBSOPCTL, low, high);
355 }
356#endif
357}
358
359static void op_amd_shutdown(struct op_msrs const * const msrs)
360{
361 int i;
362
363 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
364 if (CTR_IS_RESERVED(msrs, i))
365 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
366 }
367 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
368 if (CTRL_IS_RESERVED(msrs, i))
369 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
370 }
371}
372
373#ifndef CONFIG_OPROFILE_IBS
374
375/* no IBS support */
376
377static int op_amd_init(struct oprofile_operations *ops)
378{
379 return 0;
380}
381
382static void op_amd_exit(void) {}
383
384#else
385
386static u8 ibs_eilvt_off;
387
388static inline void apic_init_ibs_nmi_per_cpu(void *arg)
389{
390 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0);
391}
392
393static inline void apic_clear_ibs_nmi_per_cpu(void *arg)
394{
395 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
396}
397
398static int pfm_amd64_setup_eilvt(void)
399{
400#define IBSCTL_LVTOFFSETVAL (1 << 8)
401#define IBSCTL 0x1cc
402 struct pci_dev *cpu_cfg;
403 int nodes;
404 u32 value = 0;
405
406 /* per CPU setup */
407 on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1);
408
409 nodes = 0;
410 cpu_cfg = NULL;
411 do {
412 cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD,
413 PCI_DEVICE_ID_AMD_10H_NB_MISC,
414 cpu_cfg);
415 if (!cpu_cfg)
416 break;
417 ++nodes;
418 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
419 | IBSCTL_LVTOFFSETVAL);
420 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
421 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) {
422 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
423 "IBSCTL = 0x%08x", value);
424 return 1;
425 }
426 } while (1);
427
428 if (!nodes) {
429 printk(KERN_DEBUG "No CPU node configured for IBS");
430 return 1;
431 }
432
433#ifdef CONFIG_NUMA
434 /* Sanity check */
435 /* Works only for 64-bit with a proper NUMA implementation. */
436 if (nodes != num_possible_nodes()) {
437 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
438 "found: %d, expected %d",
439 nodes, num_possible_nodes());
440 return 1;
441 }
442#endif
443 return 0;
444}
445
446/*
447 * initialize the APIC for the IBS interrupts
448 * if available (AMD Family10h rev B0 and later)
449 */
450static void setup_ibs(void)
451{
452 ibs_allowed = boot_cpu_has(X86_FEATURE_IBS);
453
454 if (!ibs_allowed)
455 return;
456
457 if (pfm_amd64_setup_eilvt()) {
458 ibs_allowed = 0;
459 return;
460 }
461
462 printk(KERN_INFO "oprofile: AMD IBS detected\n");
463}
464
465
466/*
467 * uninitialize the APIC for the IBS interrupts if needed on AMD Family10h
468 * rev B0 and later */
469static void clear_ibs_nmi(void)
470{
471 if (ibs_allowed)
472 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
473}
474
475static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
476
477static int setup_ibs_files(struct super_block *sb, struct dentry *root)
478{
479 struct dentry *dir;
480 int ret = 0;
481
482 /* architecture specific files */
483 if (create_arch_files)
484 ret = create_arch_files(sb, root);
485
486 if (ret)
487 return ret;
488
489 if (!ibs_allowed)
490 return ret;
491
492 /* model specific files */
493
494 /* setup some reasonable defaults */
495 ibs_config.max_cnt_fetch = 250000;
496 ibs_config.fetch_enabled = 0;
497 ibs_config.max_cnt_op = 250000;
498 ibs_config.op_enabled = 0;
499 ibs_config.dispatched_ops = 1;
500
501 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
502 oprofilefs_create_ulong(sb, dir, "enable",
503 &ibs_config.fetch_enabled);
504 oprofilefs_create_ulong(sb, dir, "max_count",
505 &ibs_config.max_cnt_fetch);
506 oprofilefs_create_ulong(sb, dir, "rand_enable",
507 &ibs_config.rand_en);
508
509 dir = oprofilefs_mkdir(sb, root, "ibs_op");
510 oprofilefs_create_ulong(sb, dir, "enable",
511 &ibs_config.op_enabled);
512 oprofilefs_create_ulong(sb, dir, "max_count",
513 &ibs_config.max_cnt_op);
514 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
515 &ibs_config.dispatched_ops);
516
517 return 0;
518}
519
520static int op_amd_init(struct oprofile_operations *ops)
521{
522 setup_ibs();
523 create_arch_files = ops->create_files;
524 ops->create_files = setup_ibs_files;
525 return 0;
526}
527
528static void op_amd_exit(void)
529{
530 clear_ibs_nmi();
531}
532
533#endif
534
535struct op_x86_model_spec const op_amd_spec = {
536 .init = op_amd_init,
537 .exit = op_amd_exit,
538 .num_counters = NUM_COUNTERS,
539 .num_controls = NUM_CONTROLS,
540 .fill_in_addresses = &op_amd_fill_in_addresses,
541 .setup_ctrs = &op_amd_setup_ctrs,
542 .check_ctrs = &op_amd_check_ctrs,
543 .start = &op_amd_start,
544 .stop = &op_amd_stop,
545 .shutdown = &op_amd_shutdown
546};
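For reference, the op_amd_start() hunk above builds the IBSFETCHCTL value in two halves before handing them to wrmsr(). A minimal sketch of that composition, assuming only the layout the code itself implies (fetch max count in the low 16 bits, apparently in units of 16 fetches given the >> 4, randomization enable at bit 57, i.e. bit 25 of the high half); fetch_enable_high stands in for IBS_FETCH_HIGH_ENABLE, whose exact value lives in the driver headers:

#include <stdint.h>

/* sketch only: mirrors the low/high halves computed in op_amd_start() */
static uint64_t ibs_fetch_ctl_image(unsigned long max_cnt_fetch,
				    unsigned long rand_en,
				    uint32_t fetch_enable_high)
{
	uint32_t low  = (max_cnt_fetch >> 4) & 0xFFFF;	/* count, 16-fetch units */
	uint32_t high = ((rand_en & 0x1) << 25)		/* bit 57 of the MSR */
			+ fetch_enable_high;

	return ((uint64_t)high << 32) | low;	/* what wrmsr(msr, low, high) writes */
}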
diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c
deleted file mode 100644
index 3d534879a9dc..000000000000
--- a/arch/x86/oprofile/op_model_athlon.c
+++ /dev/null
@@ -1,190 +0,0 @@
1/*
2 * @file op_model_athlon.h
3 * athlon / K7 / K8 / Family 10h model-specific MSR operations
4 *
5 * @remark Copyright 2002 OProfile authors
6 * @remark Read the file COPYING
7 *
8 * @author John Levon
9 * @author Philippe Elie
10 * @author Graydon Hoare
11 */
12
13#include <linux/oprofile.h>
14#include <asm/ptrace.h>
15#include <asm/msr.h>
16#include <asm/nmi.h>
17
18#include "op_x86_model.h"
19#include "op_counter.h"
20
21#define NUM_COUNTERS 4
22#define NUM_CONTROLS 4
23
24#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
25#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
26#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0)
27#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
28
29#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
30#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
31#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0)
32#define CTRL_SET_ACTIVE(n) (n |= (1<<22))
33#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22))
34#define CTRL_CLEAR_LO(x) (x &= (1<<21))
35#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0)
36#define CTRL_SET_ENABLE(val) (val |= 1<<20)
37#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16))
38#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17))
39#define CTRL_SET_UM(val, m) (val |= (m << 8))
40#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff))
41#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf))
42#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9))
43#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8))
44
45static unsigned long reset_value[NUM_COUNTERS];
46
47static void athlon_fill_in_addresses(struct op_msrs * const msrs)
48{
49 int i;
50
51 for (i = 0; i < NUM_COUNTERS; i++) {
52 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
53 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
54 else
55 msrs->counters[i].addr = 0;
56 }
57
58 for (i = 0; i < NUM_CONTROLS; i++) {
59 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
60 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
61 else
62 msrs->controls[i].addr = 0;
63 }
64}
65
66
67static void athlon_setup_ctrs(struct op_msrs const * const msrs)
68{
69 unsigned int low, high;
70 int i;
71
72 /* clear all counters */
73 for (i = 0 ; i < NUM_CONTROLS; ++i) {
74 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
75 continue;
76 CTRL_READ(low, high, msrs, i);
77 CTRL_CLEAR_LO(low);
78 CTRL_CLEAR_HI(high);
79 CTRL_WRITE(low, high, msrs, i);
80 }
81
82 /* avoid a false detection of ctr overflows in NMI handler */
83 for (i = 0; i < NUM_COUNTERS; ++i) {
84 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
85 continue;
86 CTR_WRITE(1, msrs, i);
87 }
88
89 /* enable active counters */
90 for (i = 0; i < NUM_COUNTERS; ++i) {
91 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
92 reset_value[i] = counter_config[i].count;
93
94 CTR_WRITE(counter_config[i].count, msrs, i);
95
96 CTRL_READ(low, high, msrs, i);
97 CTRL_CLEAR_LO(low);
98 CTRL_CLEAR_HI(high);
99 CTRL_SET_ENABLE(low);
100 CTRL_SET_USR(low, counter_config[i].user);
101 CTRL_SET_KERN(low, counter_config[i].kernel);
102 CTRL_SET_UM(low, counter_config[i].unit_mask);
103 CTRL_SET_EVENT_LOW(low, counter_config[i].event);
104 CTRL_SET_EVENT_HIGH(high, counter_config[i].event);
105 CTRL_SET_HOST_ONLY(high, 0);
106 CTRL_SET_GUEST_ONLY(high, 0);
107
108 CTRL_WRITE(low, high, msrs, i);
109 } else {
110 reset_value[i] = 0;
111 }
112 }
113}
114
115
116static int athlon_check_ctrs(struct pt_regs * const regs,
117 struct op_msrs const * const msrs)
118{
119 unsigned int low, high;
120 int i;
121
122 for (i = 0 ; i < NUM_COUNTERS; ++i) {
123 if (!reset_value[i])
124 continue;
125 CTR_READ(low, high, msrs, i);
126 if (CTR_OVERFLOWED(low)) {
127 oprofile_add_sample(regs, i);
128 CTR_WRITE(reset_value[i], msrs, i);
129 }
130 }
131
132 /* See op_model_ppro.c */
133 return 1;
134}
135
136
137static void athlon_start(struct op_msrs const * const msrs)
138{
139 unsigned int low, high;
140 int i;
141 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
142 if (reset_value[i]) {
143 CTRL_READ(low, high, msrs, i);
144 CTRL_SET_ACTIVE(low);
145 CTRL_WRITE(low, high, msrs, i);
146 }
147 }
148}
149
150
151static void athlon_stop(struct op_msrs const * const msrs)
152{
153 unsigned int low, high;
154 int i;
155
156 /* Subtle: stop on all counters to avoid race with
157 * setting our pm callback */
158 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
159 if (!reset_value[i])
160 continue;
161 CTRL_READ(low, high, msrs, i);
162 CTRL_SET_INACTIVE(low);
163 CTRL_WRITE(low, high, msrs, i);
164 }
165}
166
167static void athlon_shutdown(struct op_msrs const * const msrs)
168{
169 int i;
170
171 for (i = 0 ; i < NUM_COUNTERS ; ++i) {
172 if (CTR_IS_RESERVED(msrs, i))
173 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
174 }
175 for (i = 0 ; i < NUM_CONTROLS ; ++i) {
176 if (CTRL_IS_RESERVED(msrs, i))
177 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
178 }
179}
180
181struct op_x86_model_spec const op_athlon_spec = {
182 .num_counters = NUM_COUNTERS,
183 .num_controls = NUM_CONTROLS,
184 .fill_in_addresses = &athlon_fill_in_addresses,
185 .setup_ctrs = &athlon_setup_ctrs,
186 .check_ctrs = &athlon_check_ctrs,
187 .start = &athlon_start,
188 .stop = &athlon_stop,
189 .shutdown = &athlon_shutdown
190};
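A quick note on the arming scheme the CTR_WRITE()/CTR_OVERFLOWED() macros above encode (the new op_model_amd.c relies on the same pattern in op_amd_check_ctrs()): each counter is seeded with the two's complement of the sample period, so it counts up, wraps after roughly that many events, and the wrap is what the NMI handler detects. A small sketch with hypothetical helper names:

#include <stdint.h>

/* mirrors CTR_WRITE()'s -(unsigned int)(l): the counter overflows
 * (raising the NMI) after about "period" events */
static inline uint32_t perfctr_seed(uint32_t period)
{
	return (uint32_t)0 - period;
}

/* mirrors CTR_OVERFLOWED(): bit 31 clear means the count has wrapped */
static inline int perfctr_overflowed(uint32_t ctr)
{
	return !(ctr & 0x80000000u);
}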
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 56b4757a1f47..4c4a51c90bc2 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -10,11 +10,12 @@
10 10
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h>
14#include <linux/nmi.h>
13#include <asm/msr.h> 15#include <asm/msr.h>
14#include <asm/ptrace.h>
15#include <asm/fixmap.h> 16#include <asm/fixmap.h>
16#include <asm/apic.h> 17#include <asm/apic.h>
17#include <asm/nmi.h> 18
18 19
19#include "op_x86_model.h" 20#include "op_x86_model.h"
20#include "op_counter.h" 21#include "op_counter.h"
@@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT;
40static inline void setup_num_counters(void) 41static inline void setup_num_counters(void)
41{ 42{
42#ifdef CONFIG_SMP 43#ifdef CONFIG_SMP
43 if (smp_num_siblings == 2){ 44 if (smp_num_siblings == 2) {
44 num_counters = NUM_COUNTERS_HT2; 45 num_counters = NUM_COUNTERS_HT2;
45 num_controls = NUM_CONTROLS_HT2; 46 num_controls = NUM_CONTROLS_HT2;
46 } 47 }
@@ -86,7 +87,7 @@ struct p4_event_binding {
86#define CTR_FLAME_2 (1 << 6) 87#define CTR_FLAME_2 (1 << 6)
87#define CTR_IQ_5 (1 << 7) 88#define CTR_IQ_5 (1 << 7)
88 89
89static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { 90static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = {
90 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, 91 { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 },
91 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, 92 { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 },
92 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, 93 { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 },
@@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = {
97 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } 98 { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 }
98}; 99};
99 100
100#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT 101#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT)
101 102
102/* p4 event codes in libop/op_event.h are indices into this table. */ 103/* p4 event codes in libop/op_event.h are indices into this table. */
103 104
104static struct p4_event_binding p4_events[NUM_EVENTS] = { 105static struct p4_event_binding p4_events[NUM_EVENTS] = {
105 106
106 { /* BRANCH_RETIRED */ 107 { /* BRANCH_RETIRED */
107 0x05, 0x06, 108 0x05, 0x06,
108 { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, 109 { {CTR_IQ_4, MSR_P4_CRU_ESCR2},
109 {CTR_IQ_5, MSR_P4_CRU_ESCR3} } 110 {CTR_IQ_5, MSR_P4_CRU_ESCR3} }
110 }, 111 },
111 112
112 { /* MISPRED_BRANCH_RETIRED */ 113 { /* MISPRED_BRANCH_RETIRED */
113 0x04, 0x03, 114 0x04, 0x03,
114 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 115 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
115 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 116 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
116 }, 117 },
117 118
118 { /* TC_DELIVER_MODE */ 119 { /* TC_DELIVER_MODE */
119 0x01, 0x01, 120 0x01, 0x01,
120 { { CTR_MS_0, MSR_P4_TC_ESCR0}, 121 { { CTR_MS_0, MSR_P4_TC_ESCR0},
121 { CTR_MS_2, MSR_P4_TC_ESCR1} } 122 { CTR_MS_2, MSR_P4_TC_ESCR1} }
122 }, 123 },
123 124
124 { /* BPU_FETCH_REQUEST */ 125 { /* BPU_FETCH_REQUEST */
125 0x00, 0x03, 126 0x00, 0x03,
126 { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, 127 { { CTR_BPU_0, MSR_P4_BPU_ESCR0},
127 { CTR_BPU_2, MSR_P4_BPU_ESCR1} } 128 { CTR_BPU_2, MSR_P4_BPU_ESCR1} }
128 }, 129 },
@@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
146 }, 147 },
147 148
148 { /* LOAD_PORT_REPLAY */ 149 { /* LOAD_PORT_REPLAY */
149 0x02, 0x04, 150 0x02, 0x04,
150 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, 151 { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0},
151 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } 152 { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} }
152 }, 153 },
@@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
170 }, 171 },
171 172
172 { /* BSQ_CACHE_REFERENCE */ 173 { /* BSQ_CACHE_REFERENCE */
173 0x07, 0x0c, 174 0x07, 0x0c,
174 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 175 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
175 { CTR_BPU_2, MSR_P4_BSU_ESCR1} } 176 { CTR_BPU_2, MSR_P4_BSU_ESCR1} }
176 }, 177 },
177 178
178 { /* IOQ_ALLOCATION */ 179 { /* IOQ_ALLOCATION */
179 0x06, 0x03, 180 0x06, 0x03,
180 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 181 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
181 { 0, 0 } } 182 { 0, 0 } }
182 }, 183 },
183 184
184 { /* IOQ_ACTIVE_ENTRIES */ 185 { /* IOQ_ACTIVE_ENTRIES */
185 0x06, 0x1a, 186 0x06, 0x1a,
186 { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, 187 { { CTR_BPU_2, MSR_P4_FSB_ESCR1},
187 { 0, 0 } } 188 { 0, 0 } }
188 }, 189 },
189 190
190 { /* FSB_DATA_ACTIVITY */ 191 { /* FSB_DATA_ACTIVITY */
191 0x06, 0x17, 192 0x06, 0x17,
192 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 193 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
193 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 194 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
194 }, 195 },
195 196
196 { /* BSQ_ALLOCATION */ 197 { /* BSQ_ALLOCATION */
197 0x07, 0x05, 198 0x07, 0x05,
198 { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, 199 { { CTR_BPU_0, MSR_P4_BSU_ESCR0},
199 { 0, 0 } } 200 { 0, 0 } }
200 }, 201 },
201 202
202 { /* BSQ_ACTIVE_ENTRIES */ 203 { /* BSQ_ACTIVE_ENTRIES */
203 0x07, 0x06, 204 0x07, 0x06,
204 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, 205 { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */},
205 { 0, 0 } } 206 { 0, 0 } }
206 }, 207 },
207 208
208 { /* X87_ASSIST */ 209 { /* X87_ASSIST */
209 0x05, 0x03, 210 0x05, 0x03,
210 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 211 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
211 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 212 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
212 }, 213 },
@@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
216 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 217 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
217 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 218 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
218 }, 219 },
219 220
220 { /* PACKED_SP_UOP */ 221 { /* PACKED_SP_UOP */
221 0x01, 0x08, 222 0x01, 0x08,
222 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 223 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
223 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 224 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
224 }, 225 },
225 226
226 { /* PACKED_DP_UOP */ 227 { /* PACKED_DP_UOP */
227 0x01, 0x0c, 228 0x01, 0x0c,
228 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 229 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
229 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 230 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
230 }, 231 },
231 232
232 { /* SCALAR_SP_UOP */ 233 { /* SCALAR_SP_UOP */
233 0x01, 0x0a, 234 0x01, 0x0a,
234 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 235 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
235 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 236 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
236 }, 237 },
@@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
242 }, 243 },
243 244
244 { /* 64BIT_MMX_UOP */ 245 { /* 64BIT_MMX_UOP */
245 0x01, 0x02, 246 0x01, 0x02,
246 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 247 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
247 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 248 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
248 }, 249 },
249 250
250 { /* 128BIT_MMX_UOP */ 251 { /* 128BIT_MMX_UOP */
251 0x01, 0x1a, 252 0x01, 0x1a,
252 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 253 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
253 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 254 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
254 }, 255 },
255 256
256 { /* X87_FP_UOP */ 257 { /* X87_FP_UOP */
257 0x01, 0x04, 258 0x01, 0x04,
258 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 259 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
259 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 260 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
260 }, 261 },
261 262
262 { /* X87_SIMD_MOVES_UOP */ 263 { /* X87_SIMD_MOVES_UOP */
263 0x01, 0x2e, 264 0x01, 0x2e,
264 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, 265 { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0},
265 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } 266 { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} }
266 }, 267 },
267 268
268 { /* MACHINE_CLEAR */ 269 { /* MACHINE_CLEAR */
269 0x05, 0x02, 270 0x05, 0x02,
270 { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, 271 { { CTR_IQ_4, MSR_P4_CRU_ESCR2},
271 { CTR_IQ_5, MSR_P4_CRU_ESCR3} } 272 { CTR_IQ_5, MSR_P4_CRU_ESCR3} }
272 }, 273 },
@@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
276 { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, 277 { { CTR_BPU_0, MSR_P4_FSB_ESCR0},
277 { CTR_BPU_2, MSR_P4_FSB_ESCR1} } 278 { CTR_BPU_2, MSR_P4_FSB_ESCR1} }
278 }, 279 },
279 280
280 { /* TC_MS_XFER */ 281 { /* TC_MS_XFER */
281 0x00, 0x05, 282 0x00, 0x05,
282 { { CTR_MS_0, MSR_P4_MS_ESCR0}, 283 { { CTR_MS_0, MSR_P4_MS_ESCR0},
283 { CTR_MS_2, MSR_P4_MS_ESCR1} } 284 { CTR_MS_2, MSR_P4_MS_ESCR1} }
284 }, 285 },
@@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
308 }, 309 },
309 310
310 { /* INSTR_RETIRED */ 311 { /* INSTR_RETIRED */
311 0x04, 0x02, 312 0x04, 0x02,
312 { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, 313 { { CTR_IQ_4, MSR_P4_CRU_ESCR0},
313 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 314 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
314 }, 315 },
@@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
319 { CTR_IQ_5, MSR_P4_CRU_ESCR1} } 320 { CTR_IQ_5, MSR_P4_CRU_ESCR1} }
320 }, 321 },
321 322
322 { /* UOP_TYPE */ 323 { /* UOP_TYPE */
323 0x02, 0x02, 324 0x02, 0x02,
324 { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, 325 { { CTR_IQ_4, MSR_P4_RAT_ESCR0},
325 { CTR_IQ_5, MSR_P4_RAT_ESCR1} } 326 { CTR_IQ_5, MSR_P4_RAT_ESCR1} }
326 }, 327 },
327 328
328 { /* RETIRED_MISPRED_BRANCH_TYPE */ 329 { /* RETIRED_MISPRED_BRANCH_TYPE */
329 0x02, 0x05, 330 0x02, 0x05,
330 { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, 331 { { CTR_MS_0, MSR_P4_TBPU_ESCR0},
331 { CTR_MS_2, MSR_P4_TBPU_ESCR1} } 332 { CTR_MS_2, MSR_P4_TBPU_ESCR1} }
332 }, 333 },
@@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
349#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) 350#define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1))
350#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) 351#define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25))
351#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) 352#define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9))
352#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 353#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
353#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) 354#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0)
354 355
355#define CCCR_RESERVED_BITS 0x38030FFF 356#define CCCR_RESERVED_BITS 0x38030FFF
356#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) 357#define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS)
@@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = {
360#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) 361#define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27))
361#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) 362#define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12))
362#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) 363#define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12))
363#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 364#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
364#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) 365#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0)
365#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) 366#define CCCR_OVF_P(cccr) ((cccr) & (1U<<31))
366#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) 367#define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31)))
367 368
368#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) 369#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
369#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) 370#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
370#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) 371#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0)
371#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) 372#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0)
372#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) 373#define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000))
373 374
374 375
@@ -380,7 +381,7 @@ static unsigned int get_stagger(void)
380#ifdef CONFIG_SMP 381#ifdef CONFIG_SMP
381 int cpu = smp_processor_id(); 382 int cpu = smp_processor_id();
382 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); 383 return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu)));
383#endif 384#endif
384 return 0; 385 return 0;
385} 386}
386 387
@@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT];
395 396
396static void p4_fill_in_addresses(struct op_msrs * const msrs) 397static void p4_fill_in_addresses(struct op_msrs * const msrs)
397{ 398{
398 unsigned int i; 399 unsigned int i;
399 unsigned int addr, cccraddr, stag; 400 unsigned int addr, cccraddr, stag;
400 401
401 setup_num_counters(); 402 setup_num_counters();
402 stag = get_stagger(); 403 stag = get_stagger();
403 404
404 /* initialize some registers */ 405 /* initialize some registers */
405 for (i = 0; i < num_counters; ++i) { 406 for (i = 0; i < num_counters; ++i)
406 msrs->counters[i].addr = 0; 407 msrs->counters[i].addr = 0;
407 } 408 for (i = 0; i < num_controls; ++i)
408 for (i = 0; i < num_controls; ++i) {
409 msrs->controls[i].addr = 0; 409 msrs->controls[i].addr = 0;
410 } 410
411
412 /* the counter & cccr registers we pay attention to */ 411 /* the counter & cccr registers we pay attention to */
413 for (i = 0; i < num_counters; ++i) { 412 for (i = 0; i < num_counters; ++i) {
414 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 413 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
415 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; 414 cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address;
416 if (reserve_perfctr_nmi(addr)){ 415 if (reserve_perfctr_nmi(addr)) {
417 msrs->counters[i].addr = addr; 416 msrs->counters[i].addr = addr;
418 msrs->controls[i].addr = cccraddr; 417 msrs->controls[i].addr = cccraddr;
419 } 418 }
@@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
447 if (reserve_evntsel_nmi(addr)) 446 if (reserve_evntsel_nmi(addr))
448 msrs->controls[i].addr = addr; 447 msrs->controls[i].addr = addr;
449 } 448 }
450 449
451 for (addr = MSR_P4_MS_ESCR0 + stag; 450 for (addr = MSR_P4_MS_ESCR0 + stag;
452 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { 451 addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) {
453 if (reserve_evntsel_nmi(addr)) 452 if (reserve_evntsel_nmi(addr))
454 msrs->controls[i].addr = addr; 453 msrs->controls[i].addr = addr;
455 } 454 }
456 455
457 for (addr = MSR_P4_IX_ESCR0 + stag; 456 for (addr = MSR_P4_IX_ESCR0 + stag;
458 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { 457 addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) {
459 if (reserve_evntsel_nmi(addr)) 458 if (reserve_evntsel_nmi(addr))
460 msrs->controls[i].addr = addr; 459 msrs->controls[i].addr = addr;
461 } 460 }
462 461
463 /* there are 2 remaining non-contiguously located ESCRs */ 462 /* there are 2 remaining non-contiguously located ESCRs */
464 463
465 if (num_counters == NUM_COUNTERS_NON_HT) { 464 if (num_counters == NUM_COUNTERS_NON_HT) {
466 /* standard non-HT CPUs handle both remaining ESCRs*/ 465 /* standard non-HT CPUs handle both remaining ESCRs*/
467 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) 466 if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5))
468 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; 467 msrs->controls[i++].addr = MSR_P4_CRU_ESCR5;
@@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
498 unsigned int stag; 497 unsigned int stag;
499 498
500 stag = get_stagger(); 499 stag = get_stagger();
501 500
502 /* convert from counter *number* to counter *bit* */ 501 /* convert from counter *number* to counter *bit* */
503 counter_bit = 1 << VIRT_CTR(stag, ctr); 502 counter_bit = 1 << VIRT_CTR(stag, ctr);
504 503
505 /* find our event binding structure. */ 504 /* find our event binding structure. */
506 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { 505 if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) {
507 printk(KERN_ERR 506 printk(KERN_ERR
508 "oprofile: P4 event code 0x%lx out of range\n", 507 "oprofile: P4 event code 0x%lx out of range\n",
509 counter_config[ctr].event); 508 counter_config[ctr].event);
510 return; 509 return;
511 } 510 }
512 511
513 ev = &(p4_events[counter_config[ctr].event - 1]); 512 ev = &(p4_events[counter_config[ctr].event - 1]);
514 513
515 for (i = 0; i < maxbind; i++) { 514 for (i = 0; i < maxbind; i++) {
516 if (ev->bindings[i].virt_counter & counter_bit) { 515 if (ev->bindings[i].virt_counter & counter_bit) {
517 516
@@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr)
526 ESCR_SET_OS_1(escr, counter_config[ctr].kernel); 525 ESCR_SET_OS_1(escr, counter_config[ctr].kernel);
527 } 526 }
528 ESCR_SET_EVENT_SELECT(escr, ev->event_select); 527 ESCR_SET_EVENT_SELECT(escr, ev->event_select);
529 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); 528 ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask);
530 ESCR_WRITE(escr, high, ev, i); 529 ESCR_WRITE(escr, high, ev, i);
531 530
532 /* modify CCCR */ 531 /* modify CCCR */
533 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); 532 CCCR_READ(cccr, high, VIRT_CTR(stag, ctr));
534 CCCR_CLEAR(cccr); 533 CCCR_CLEAR(cccr);
535 CCCR_SET_REQUIRED_BITS(cccr); 534 CCCR_SET_REQUIRED_BITS(cccr);
536 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); 535 CCCR_SET_ESCR_SELECT(cccr, ev->escr_select);
537 if (stag == 0) { 536 if (stag == 0)
538 CCCR_SET_PMI_OVF_0(cccr); 537 CCCR_SET_PMI_OVF_0(cccr);
539 } else { 538 else
540 CCCR_SET_PMI_OVF_1(cccr); 539 CCCR_SET_PMI_OVF_1(cccr);
541 }
542 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); 540 CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr));
543 return; 541 return;
544 } 542 }
545 } 543 }
546 544
547 printk(KERN_ERR 545 printk(KERN_ERR
548 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", 546 "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n",
549 counter_config[ctr].event, stag, ctr); 547 counter_config[ctr].event, stag, ctr);
550} 548}
@@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
559 stag = get_stagger(); 557 stag = get_stagger();
560 558
561 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 559 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
562 if (! MISC_PMC_ENABLED_P(low)) { 560 if (!MISC_PMC_ENABLED_P(low)) {
563 printk(KERN_ERR "oprofile: P4 PMC not available\n"); 561 printk(KERN_ERR "oprofile: P4 PMC not available\n");
564 return; 562 return;
565 } 563 }
566 564
567 /* clear the cccrs we will use */ 565 /* clear the cccrs we will use */
568 for (i = 0 ; i < num_counters ; i++) { 566 for (i = 0 ; i < num_counters ; i++) {
569 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 567 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
570 continue; 568 continue;
571 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); 569 rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high);
572 CCCR_CLEAR(low); 570 CCCR_CLEAR(low);
@@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs)
576 574
577 /* clear all escrs (including those outside our concern) */ 575 /* clear all escrs (including those outside our concern) */
578 for (i = num_counters; i < num_controls; i++) { 576 for (i = num_counters; i < num_controls; i++) {
579 if (unlikely(!CTRL_IS_RESERVED(msrs,i))) 577 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
580 continue; 578 continue;
581 wrmsr(msrs->controls[i].addr, 0, 0); 579 wrmsr(msrs->controls[i].addr, 0, 0);
582 } 580 }
583 581
584 /* setup all counters */ 582 /* setup all counters */
585 for (i = 0 ; i < num_counters ; ++i) { 583 for (i = 0 ; i < num_counters ; ++i) {
586 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { 584 if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) {
587 reset_value[i] = counter_config[i].count; 585 reset_value[i] = counter_config[i].count;
588 pmc_setup_one_p4_counter(i); 586 pmc_setup_one_p4_counter(i);
589 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); 587 CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i));
@@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs,
603 stag = get_stagger(); 601 stag = get_stagger();
604 602
605 for (i = 0; i < num_counters; ++i) { 603 for (i = 0; i < num_counters; ++i) {
606 604
607 if (!reset_value[i]) 605 if (!reset_value[i])
608 continue; 606 continue;
609 607
610 /* 608 /*
611 * there is some eccentricity in the hardware which 609 * there is some eccentricity in the hardware which
612 * requires that we perform 2 extra corrections: 610 * requires that we perform 2 extra corrections:
613 * 611 *
@@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs,
616 * 614 *
617 * - write the counter back twice to ensure it gets 615 * - write the counter back twice to ensure it gets
618 * updated properly. 616 * updated properly.
619 * 617 *
620 * the former seems to be related to extra NMIs happening 618 * the former seems to be related to extra NMIs happening
621 * during the current NMI; the latter is reported as errata 619 * during the current NMI; the latter is reported as errata
622 * N15 in intel doc 249199-029, pentium 4 specification 620 * N15 in intel doc 249199-029, pentium 4 specification
623 * update, though their suggested work-around does not 621 * update, though their suggested work-around does not
624 * appear to solve the problem. 622 * appear to solve the problem.
625 */ 623 */
626 624
627 real = VIRT_CTR(stag, i); 625 real = VIRT_CTR(stag, i);
628 626
629 CCCR_READ(low, high, real); 627 CCCR_READ(low, high, real);
630 CTR_READ(ctr, high, real); 628 CTR_READ(ctr, high, real);
631 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { 629 if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) {
632 oprofile_add_sample(regs, i); 630 oprofile_add_sample(regs, i);
633 CTR_WRITE(reset_value[i], real); 631 CTR_WRITE(reset_value[i], real);
634 CCCR_CLEAR_OVF(low); 632 CCCR_CLEAR_OVF(low);
635 CCCR_WRITE(low, high, real); 633 CCCR_WRITE(low, high, real);
636 CTR_WRITE(reset_value[i], real); 634 CTR_WRITE(reset_value[i], real);
637 } 635 }
638 } 636 }
639 637
@@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs)
683 int i; 681 int i;
684 682
685 for (i = 0 ; i < num_counters ; ++i) { 683 for (i = 0 ; i < num_counters ; ++i) {
686 if (CTR_IS_RESERVED(msrs,i)) 684 if (CTR_IS_RESERVED(msrs, i))
687 release_perfctr_nmi(msrs->counters[i].addr); 685 release_perfctr_nmi(msrs->counters[i].addr);
688 } 686 }
689 /* some of the control registers are specially reserved in 687 /*
688 * some of the control registers are specially reserved in
690 * conjunction with the counter registers (hence the starting offset). 689 * conjunction with the counter registers (hence the starting offset).
691 * This saves a few bits. 690 * This saves a few bits.
692 */ 691 */
693 for (i = num_counters ; i < num_controls ; ++i) { 692 for (i = num_counters ; i < num_controls ; ++i) {
694 if (CTRL_IS_RESERVED(msrs,i)) 693 if (CTRL_IS_RESERVED(msrs, i))
695 release_evntsel_nmi(msrs->controls[i].addr); 694 release_evntsel_nmi(msrs->controls[i].addr);
696 } 695 }
697} 696}
@@ -699,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs)
699 698
700#ifdef CONFIG_SMP 699#ifdef CONFIG_SMP
701struct op_x86_model_spec const op_p4_ht2_spec = { 700struct op_x86_model_spec const op_p4_ht2_spec = {
702 .num_counters = NUM_COUNTERS_HT2, 701 .num_counters = NUM_COUNTERS_HT2,
703 .num_controls = NUM_CONTROLS_HT2, 702 .num_controls = NUM_CONTROLS_HT2,
704 .fill_in_addresses = &p4_fill_in_addresses, 703 .fill_in_addresses = &p4_fill_in_addresses,
705 .setup_ctrs = &p4_setup_ctrs, 704 .setup_ctrs = &p4_setup_ctrs,
706 .check_ctrs = &p4_check_ctrs, 705 .check_ctrs = &p4_check_ctrs,
707 .start = &p4_start, 706 .start = &p4_start,
708 .stop = &p4_stop, 707 .stop = &p4_stop,
709 .shutdown = &p4_shutdown 708 .shutdown = &p4_shutdown
710}; 709};
711#endif 710#endif
712 711
713struct op_x86_model_spec const op_p4_spec = { 712struct op_x86_model_spec const op_p4_spec = {
714 .num_counters = NUM_COUNTERS_NON_HT, 713 .num_counters = NUM_COUNTERS_NON_HT,
715 .num_controls = NUM_CONTROLS_NON_HT, 714 .num_controls = NUM_CONTROLS_NON_HT,
716 .fill_in_addresses = &p4_fill_in_addresses, 715 .fill_in_addresses = &p4_fill_in_addresses,
717 .setup_ctrs = &p4_setup_ctrs, 716 .setup_ctrs = &p4_setup_ctrs,
718 .check_ctrs = &p4_check_ctrs, 717 .check_ctrs = &p4_check_ctrs,
719 .start = &p4_start, 718 .start = &p4_start,
720 .stop = &p4_stop, 719 .stop = &p4_stop,
721 .shutdown = &p4_shutdown 720 .shutdown = &p4_shutdown
722}; 721};
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index eff431f6c57b..0620d6d45f7d 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -1,32 +1,34 @@
1/* 1/*
2 * @file op_model_ppro.h 2 * @file op_model_ppro.h
3 * pentium pro / P6 model-specific MSR operations 3 * Family 6 perfmon and architectural perfmon MSR operations
4 * 4 *
5 * @remark Copyright 2002 OProfile authors 5 * @remark Copyright 2002 OProfile authors
6 * @remark Copyright 2008 Intel Corporation
6 * @remark Read the file COPYING 7 * @remark Read the file COPYING
7 * 8 *
8 * @author John Levon 9 * @author John Levon
9 * @author Philippe Elie 10 * @author Philippe Elie
10 * @author Graydon Hoare 11 * @author Graydon Hoare
12 * @author Andi Kleen
11 */ 13 */
12 14
13#include <linux/oprofile.h> 15#include <linux/oprofile.h>
16#include <linux/slab.h>
14#include <asm/ptrace.h> 17#include <asm/ptrace.h>
15#include <asm/msr.h> 18#include <asm/msr.h>
16#include <asm/apic.h> 19#include <asm/apic.h>
17#include <asm/nmi.h> 20#include <asm/nmi.h>
21#include <asm/intel_arch_perfmon.h>
18 22
19#include "op_x86_model.h" 23#include "op_x86_model.h"
20#include "op_counter.h" 24#include "op_counter.h"
21 25
22#define NUM_COUNTERS 2 26static int num_counters = 2;
23#define NUM_CONTROLS 2 27static int counter_width = 32;
24 28
25#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) 29#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0)
26#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) 30#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0)
27#define CTR_32BIT_WRITE(l, msrs, c) \ 31#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1))))
28 do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0)
29#define CTR_OVERFLOWED(n) (!((n) & (1U<<31)))
30 32
31#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) 33#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0)
32#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) 34#define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0)
@@ -40,20 +42,20 @@
40#define CTRL_SET_UM(val, m) (val |= (m << 8)) 42#define CTRL_SET_UM(val, m) (val |= (m << 8))
41#define CTRL_SET_EVENT(val, e) (val |= e) 43#define CTRL_SET_EVENT(val, e) (val |= e)
42 44
43static unsigned long reset_value[NUM_COUNTERS]; 45static u64 *reset_value;
44 46
45static void ppro_fill_in_addresses(struct op_msrs * const msrs) 47static void ppro_fill_in_addresses(struct op_msrs * const msrs)
46{ 48{
47 int i; 49 int i;
48 50
49 for (i = 0; i < NUM_COUNTERS; i++) { 51 for (i = 0; i < num_counters; i++) {
50 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 52 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
51 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 53 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
52 else 54 else
53 msrs->counters[i].addr = 0; 55 msrs->counters[i].addr = 0;
54 } 56 }
55 57
56 for (i = 0; i < NUM_CONTROLS; i++) { 58 for (i = 0; i < num_counters; i++) {
57 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 59 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
58 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 60 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
59 else 61 else
@@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
67 unsigned int low, high; 69 unsigned int low, high;
68 int i; 70 int i;
69 71
72 if (!reset_value) {
73 reset_value = kmalloc(sizeof(unsigned) * num_counters,
74 GFP_ATOMIC);
75 if (!reset_value)
76 return;
77 }
78
79 if (cpu_has_arch_perfmon) {
80 union cpuid10_eax eax;
81 eax.full = cpuid_eax(0xa);
82 if (counter_width < eax.split.bit_width)
83 counter_width = eax.split.bit_width;
84 }
85
70 /* clear all counters */ 86 /* clear all counters */
71 for (i = 0 ; i < NUM_CONTROLS; ++i) { 87 for (i = 0 ; i < num_counters; ++i) {
72 if (unlikely(!CTRL_IS_RESERVED(msrs, i))) 88 if (unlikely(!CTRL_IS_RESERVED(msrs, i)))
73 continue; 89 continue;
74 CTRL_READ(low, high, msrs, i); 90 CTRL_READ(low, high, msrs, i);
@@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs)
77 } 93 }
78 94
79 /* avoid a false detection of ctr overflows in NMI handler */ 95 /* avoid a false detection of ctr overflows in NMI handler */
80 for (i = 0; i < NUM_COUNTERS; ++i) { 96 for (i = 0; i < num_counters; ++i) {
81 if (unlikely(!CTR_IS_RESERVED(msrs, i))) 97 if (unlikely(!CTR_IS_RESERVED(msrs, i)))
82 continue; 98 continue;
83 CTR_32BIT_WRITE(1, msrs, i); 99 wrmsrl(msrs->counters[i].addr, -1LL);
84 } 100 }
85 101
86 /* enable active counters */ 102 /* enable active counters */
87 for (i = 0; i < NUM_COUNTERS; ++i) { 103 for (i = 0; i < num_counters; ++i) {
88 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { 104 if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) {
89 reset_value[i] = counter_config[i].count; 105 reset_value[i] = counter_config[i].count;
90 106
91 CTR_32BIT_WRITE(counter_config[i].count, msrs, i); 107 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
92 108
93 CTRL_READ(low, high, msrs, i); 109 CTRL_READ(low, high, msrs, i);
94 CTRL_CLEAR(low); 110 CTRL_CLEAR(low);
@@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,
111 unsigned int low, high; 127 unsigned int low, high;
112 int i; 128 int i;
113 129
114 for (i = 0 ; i < NUM_COUNTERS; ++i) { 130 for (i = 0 ; i < num_counters; ++i) {
115 if (!reset_value[i]) 131 if (!reset_value[i])
116 continue; 132 continue;
117 CTR_READ(low, high, msrs, i); 133 CTR_READ(low, high, msrs, i);
118 if (CTR_OVERFLOWED(low)) { 134 if (CTR_OVERFLOWED(low)) {
119 oprofile_add_sample(regs, i); 135 oprofile_add_sample(regs, i);
120 CTR_32BIT_WRITE(reset_value[i], msrs, i); 136 wrmsrl(msrs->counters[i].addr, -reset_value[i]);
121 } 137 }
122 } 138 }
123 139
@@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs)
141 unsigned int low, high; 157 unsigned int low, high;
142 int i; 158 int i;
143 159
144 for (i = 0; i < NUM_COUNTERS; ++i) { 160 for (i = 0; i < num_counters; ++i) {
145 if (reset_value[i]) { 161 if (reset_value[i]) {
146 CTRL_READ(low, high, msrs, i); 162 CTRL_READ(low, high, msrs, i);
147 CTRL_SET_ACTIVE(low); 163 CTRL_SET_ACTIVE(low);
@@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
156 unsigned int low, high; 172 unsigned int low, high;
157 int i; 173 int i;
158 174
159 for (i = 0; i < NUM_COUNTERS; ++i) { 175 for (i = 0; i < num_counters; ++i) {
160 if (!reset_value[i]) 176 if (!reset_value[i])
161 continue; 177 continue;
162 CTRL_READ(low, high, msrs, i); 178 CTRL_READ(low, high, msrs, i);
@@ -169,24 +185,70 @@ static void ppro_shutdown(struct op_msrs const * const msrs)
169{ 185{
170 int i; 186 int i;
171 187
172 for (i = 0 ; i < NUM_COUNTERS ; ++i) { 188 for (i = 0 ; i < num_counters ; ++i) {
173 if (CTR_IS_RESERVED(msrs, i)) 189 if (CTR_IS_RESERVED(msrs, i))
174 release_perfctr_nmi(MSR_P6_PERFCTR0 + i); 190 release_perfctr_nmi(MSR_P6_PERFCTR0 + i);
175 } 191 }
176 for (i = 0 ; i < NUM_CONTROLS ; ++i) { 192 for (i = 0 ; i < num_counters ; ++i) {
177 if (CTRL_IS_RESERVED(msrs, i)) 193 if (CTRL_IS_RESERVED(msrs, i))
178 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); 194 release_evntsel_nmi(MSR_P6_EVNTSEL0 + i);
179 } 195 }
196 if (reset_value) {
197 kfree(reset_value);
198 reset_value = NULL;
199 }
180} 200}
181 201
182 202
183struct op_x86_model_spec const op_ppro_spec = { 203struct op_x86_model_spec op_ppro_spec = {
184 .num_counters = NUM_COUNTERS, 204 .num_counters = 2, /* can be overridden */
185 .num_controls = NUM_CONTROLS, 205 .num_controls = 2, /* ditto */
186 .fill_in_addresses = &ppro_fill_in_addresses, 206 .fill_in_addresses = &ppro_fill_in_addresses,
187 .setup_ctrs = &ppro_setup_ctrs, 207 .setup_ctrs = &ppro_setup_ctrs,
188 .check_ctrs = &ppro_check_ctrs, 208 .check_ctrs = &ppro_check_ctrs,
189 .start = &ppro_start, 209 .start = &ppro_start,
190 .stop = &ppro_stop, 210 .stop = &ppro_stop,
191 .shutdown = &ppro_shutdown 211 .shutdown = &ppro_shutdown
212};
213
214/*
215 * Architectural performance monitoring.
216 *
217 * Newer Intel CPUs (Core1+) have support for architectural
218 * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details.
219 * The advantage of this is that it can be done without knowing about
220 * the specific CPU.
221 */
222
223void arch_perfmon_setup_counters(void)
224{
225 union cpuid10_eax eax;
226
227 eax.full = cpuid_eax(0xa);
228
229 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
230 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 &&
231 current_cpu_data.x86_model == 15) {
232 eax.split.version_id = 2;
233 eax.split.num_counters = 2;
234 eax.split.bit_width = 40;
235 }
236
237 num_counters = eax.split.num_counters;
238
239 op_arch_perfmon_spec.num_counters = num_counters;
240 op_arch_perfmon_spec.num_controls = num_counters;
241 op_ppro_spec.num_counters = num_counters;
242 op_ppro_spec.num_controls = num_counters;
243}
244
245struct op_x86_model_spec op_arch_perfmon_spec = {
246 /* num_counters/num_controls filled in at runtime */
247 .fill_in_addresses = &ppro_fill_in_addresses,
248 /* user space does the cpuid check for available events */
249 .setup_ctrs = &ppro_setup_ctrs,
250 .check_ctrs = &ppro_check_ctrs,
251 .start = &ppro_start,
252 .stop = &ppro_stop,
253 .shutdown = &ppro_shutdown
192}; 254};
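The arch_perfmon_setup_counters() hunk above sizes the driver from CPUID leaf 0xA, the same leaf union cpuid10_eax decodes (version id in bits 7:0, counter count in bits 15:8, counter width in bits 23:16). A user-space sketch, using GCC's <cpuid.h>, for inspecting the values the driver would pick up:

#include <stdio.h>
#include <cpuid.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(0xa, &eax, &ebx, &ecx, &edx)) {
		puts("CPUID leaf 0xA not available");
		return 1;
	}

	/* same split as union cpuid10_eax in the kernel */
	printf("version id:   %u\n", eax & 0xff);
	printf("num counters: %u\n", (eax >> 8) & 0xff);
	printf("bit width:    %u\n", (eax >> 16) & 0xff);
	return 0;
}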
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 45b605fa71d0..825e79064d64 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -22,8 +22,8 @@ struct op_msr {
22}; 22};
23 23
24struct op_msrs { 24struct op_msrs {
25 struct op_msr * counters; 25 struct op_msr *counters;
26 struct op_msr * controls; 26 struct op_msr *controls;
27}; 27};
28 28
29struct pt_regs; 29struct pt_regs;
@@ -32,8 +32,10 @@ struct pt_regs;
32 * various x86 CPU models' perfctr support. 32 * various x86 CPU models' perfctr support.
33 */ 33 */
34struct op_x86_model_spec { 34struct op_x86_model_spec {
35 unsigned int const num_counters; 35 int (*init)(struct oprofile_operations *ops);
36 unsigned int const num_controls; 36 void (*exit)(void);
37 unsigned int num_counters;
38 unsigned int num_controls;
37 void (*fill_in_addresses)(struct op_msrs * const msrs); 39 void (*fill_in_addresses)(struct op_msrs * const msrs);
38 void (*setup_ctrs)(struct op_msrs const * const msrs); 40 void (*setup_ctrs)(struct op_msrs const * const msrs);
39 int (*check_ctrs)(struct pt_regs * const regs, 41 int (*check_ctrs)(struct pt_regs * const regs,
@@ -43,9 +45,12 @@ struct op_x86_model_spec {
43 void (*shutdown)(struct op_msrs const * const msrs); 45 void (*shutdown)(struct op_msrs const * const msrs);
44}; 46};
45 47
46extern struct op_x86_model_spec const op_ppro_spec; 48extern struct op_x86_model_spec op_ppro_spec;
47extern struct op_x86_model_spec const op_p4_spec; 49extern struct op_x86_model_spec const op_p4_spec;
48extern struct op_x86_model_spec const op_p4_ht2_spec; 50extern struct op_x86_model_spec const op_p4_ht2_spec;
49extern struct op_x86_model_spec const op_athlon_spec; 51extern struct op_x86_model_spec const op_amd_spec;
52extern struct op_x86_model_spec op_arch_perfmon_spec;
53
54extern void arch_perfmon_setup_counters(void);
50 55
51#endif /* OP_X86_MODEL_H */ 56#endif /* OP_X86_MODEL_H */
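With init/exit added to op_x86_model_spec and num_counters/num_controls no longer const, a model can hook driver setup/teardown and have its sizes patched at runtime, as op_amd_spec and op_arch_perfmon_spec do above. A sketch with hypothetical names (the remaining callbacks would be filled in as in the real models):

#include <linux/oprofile.h>
#include "op_x86_model.h"

static int my_model_init(struct oprofile_operations *ops)
{
	/* e.g. wrap ops->create_files or probe optional hardware */
	return 0;
}

static void my_model_exit(void)
{
	/* undo whatever my_model_init() set up */
}

struct op_x86_model_spec op_my_model_spec = {
	.init		= my_model_init,
	.exit		= my_model_exit,
	.num_counters	= 2,	/* may be rewritten before use */
	.num_controls	= 2,
	/* .fill_in_addresses, .setup_ctrs, ... as in the real models */
};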
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index e515e8db842a..d49202e740ea 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7 7
8pci-y := fixup.o 8obj-y += fixup.o
9pci-$(CONFIG_ACPI) += acpi.o 9obj-$(CONFIG_ACPI) += acpi.o
10pci-y += legacy.o irq.o 10obj-y += legacy.o irq.o
11 11
12pci-$(CONFIG_X86_VISWS) += visws.o 12obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14pci-$(CONFIG_X86_NUMAQ) += numa.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-y += $(pci-y) common.o early.o 16obj-y += common.o early.o
17obj-y += amd_bus.o 17obj-y += amd_bus.o
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 19af06927fbc..1d88d2b39771 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -250,10 +250,5 @@ int __init pci_acpi_init(void)
250 acpi_pci_irq_enable(dev); 250 acpi_pci_irq_enable(dev);
251 } 251 }
252 252
253#ifdef CONFIG_X86_IO_APIC
254 if (acpi_ioapic)
255 print_IO_APIC();
256#endif
257
258 return 0; 253 return 0;
259} 254}
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index dbf532369711..22e057665e55 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -1,6 +1,7 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h>
4#include "pci.h" 5#include "pci.h"
5 6
6#ifdef CONFIG_X86_64 7#ifdef CONFIG_X86_64
@@ -555,15 +556,17 @@ static int __init early_fill_mp_bus_info(void)
555 return 0; 556 return 0;
556} 557}
557 558
558postcore_initcall(early_fill_mp_bus_info); 559#else /* !CONFIG_X86_64 */
559 560
560#endif 561static int __init early_fill_mp_bus_info(void) { return 0; }
562
563#endif /* !CONFIG_X86_64 */
561 564
562/* common 32/64 bit code */ 565/* common 32/64 bit code */
563 566
564#define ENABLE_CF8_EXT_CFG (1ULL << 46) 567#define ENABLE_CF8_EXT_CFG (1ULL << 46)
565 568
566static void enable_pci_io_ecs_per_cpu(void *unused) 569static void enable_pci_io_ecs(void *unused)
567{ 570{
568 u64 reg; 571 u64 reg;
569 rdmsrl(MSR_AMD64_NB_CFG, reg); 572 rdmsrl(MSR_AMD64_NB_CFG, reg);
@@ -573,14 +576,51 @@ static void enable_pci_io_ecs_per_cpu(void *unused)
573 } 576 }
574} 577}
575 578
576static int __init enable_pci_io_ecs(void) 579static int __cpuinit amd_cpu_notify(struct notifier_block *self,
580 unsigned long action, void *hcpu)
577{ 581{
582 int cpu = (long)hcpu;
583 switch (action) {
584 case CPU_ONLINE:
585 case CPU_ONLINE_FROZEN:
586 smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0);
587 break;
588 default:
589 break;
590 }
591 return NOTIFY_OK;
592}
593
594static struct notifier_block __cpuinitdata amd_cpu_notifier = {
595 .notifier_call = amd_cpu_notify,
596};
597
598static int __init pci_io_ecs_init(void)
599{
600 int cpu;
601
578 /* assume all cpus from fam10h have IO ECS */ 602 /* assume all cpus from fam10h have IO ECS */
579 if (boot_cpu_data.x86 < 0x10) 603 if (boot_cpu_data.x86 < 0x10)
580 return 0; 604 return 0;
581 on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); 605
606 register_cpu_notifier(&amd_cpu_notifier);
607 for_each_online_cpu(cpu)
608 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
609 (void *)(long)cpu);
582 pci_probe |= PCI_HAS_IO_ECS; 610 pci_probe |= PCI_HAS_IO_ECS;
611
612 return 0;
613}
614
615static int __init amd_postcore_init(void)
616{
617 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
618 return 0;
619
620 early_fill_mp_bus_info();
621 pci_io_ecs_init();
622
583 return 0; 623 return 0;
584} 624}
585 625
586postcore_initcall(enable_pci_io_ecs); 626postcore_initcall(amd_postcore_init);
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index 858dbe3399f9..86631ccbc25a 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -7,15 +7,13 @@
7/* Direct PCI access. This is used for PCI accesses in early boot before 7/* Direct PCI access. This is used for PCI accesses in early boot before
8 the PCI subsystem works. */ 8 the PCI subsystem works. */
9 9
10#define PDprintk(x...)
11
12u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) 10u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
13{ 11{
14 u32 v; 12 u32 v;
15 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
16 v = inl(0xcfc); 14 v = inl(0xcfc);
17 if (v != 0xffffffff) 15 if (v != 0xffffffff)
18 PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); 16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
19 return v; 17 return v;
20} 18}
21 19
@@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
24 u8 v; 22 u8 v;
25 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
26 v = inb(0xcfc + (offset&3)); 24 v = inb(0xcfc + (offset&3));
27 PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); 25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
28 return v; 26 return v;
29} 27}
30 28
@@ -33,28 +31,28 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
33 u16 v; 31 u16 v;
34 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
35 v = inw(0xcfc + (offset&2)); 33 v = inw(0xcfc + (offset&2));
36 PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); 34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
37 return v; 35 return v;
38} 36}
39 37
40void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
41 u32 val) 39 u32 val)
42{ 40{
43 PDprintk("%x writing to %x: %x\n", slot, offset, val); 41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
44 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
45 outl(val, 0xcfc); 43 outl(val, 0xcfc);
46} 44}
47 45
48void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
49{ 47{
50 PDprintk("%x writing to %x: %x\n", slot, offset, val); 48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
51 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
52 outb(val, 0xcfc + (offset&3)); 50 outb(val, 0xcfc + (offset&3));
53} 51}
54 52
55void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
56{ 54{
57 PDprintk("%x writing to %x: %x\n", slot, offset, val); 55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
58 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
59 outw(val, 0xcfc + (offset&2)); 57 outw(val, 0xcfc + (offset&2));
60} 58}
@@ -71,7 +69,7 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func)
71 int j; 69 int j;
72 u32 val; 70 u32 val;
73 71
74 printk("PCI: %02x:%02x:%02x", bus, slot, func); 72 printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func);
75 73
76 for (i = 0; i < 256; i += 4) { 74 for (i = 0; i < 256; i += 4) {
77 if (!(i & 0x0f)) 75 if (!(i & 0x0f))
diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c
index ff3a6a336342..3c27a809393b 100644
--- a/arch/x86/pci/fixup.c
+++ b/arch/x86/pci/fixup.c
@@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
23 pci_read_config_byte(d, reg++, &busno); 23 pci_read_config_byte(d, reg++, &busno);
24 pci_read_config_byte(d, reg++, &suba); 24 pci_read_config_byte(d, reg++, &suba);
25 pci_read_config_byte(d, reg++, &subb); 25 pci_read_config_byte(d, reg++, &subb);
26 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 26 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno,
27 suba, subb);
27 if (busno) 28 if (busno)
28 pci_scan_bus_with_sysdata(busno); /* Bus A */ 29 pci_scan_bus_with_sysdata(busno); /* Bus A */
29 if (suba < subb) 30 if (suba < subb)
@@ -510,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size);
510DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); 511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size);
511DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); 512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size);
512DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); 513DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size);
514
515/*
516 * SB600: Disable BAR1 on device 14.0 to keep HPET resources from
517 * confusing the PCI engine:
518 */
519static void sb600_disable_hpet_bar(struct pci_dev *dev)
520{
521 u8 val;
522
523 /*
524 * The SB600 and SB700 both share the same device
525 * ID, but the PM register 0x55 does something different
526 * for the SB700, so make sure we are dealing with the
527 * SB600 before touching the bit:
528 */
529
530 pci_read_config_byte(dev, 0x08, &val);
531
532 if (val < 0x2F) {
533 outb(0x55, 0xCD6);
534 val = inb(0xCD7);
535
536 /* Set bit 7 in PM register 0x55 */
537 outb(0x55, 0xCD6);
538 outb(val | 0x80, 0xCD7);
539 }
540}
541DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar);
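The SB600 quirk above reaches the PM register block through an index/data pair: the register index is written to port 0xCD6 and the register contents are accessed at port 0xCD7. A small sketch of that access pattern (helper names hypothetical; from user space this needs ioperm() or iopl()):

#include <stdint.h>
#include <sys/io.h>	/* outb()/inb(); requires ioperm() or iopl() */

static uint8_t sb600_pm_read(uint8_t index)
{
	outb(index, 0xCD6);	/* select PM register */
	return inb(0xCD7);	/* read its contents */
}

static void sb600_pm_write(uint8_t index, uint8_t val)
{
	outb(index, 0xCD6);
	outb(val, 0xCD7);
}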
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2aafb67dc5f1..844df0cbbd3e 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -33,6 +33,7 @@
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34 34
35#include <asm/pat.h> 35#include <asm/pat.h>
36#include <asm/e820.h>
36 37
37#include "pci.h" 38#include "pci.h"
38 39
@@ -128,10 +129,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
128 pr = pci_find_parent_resource(dev, r); 129 pr = pci_find_parent_resource(dev, r);
129 if (!r->start || !pr || 130 if (!r->start || !pr ||
130 request_resource(pr, r) < 0) { 131 request_resource(pr, r) < 0) {
131 printk(KERN_ERR "PCI: Cannot allocate " 132 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
132 "resource region %d "
133 "of bridge %s\n",
134 idx, pci_name(dev));
135 /* 133 /*
136 * Something is wrong with the region. 134 * Something is wrong with the region.
137 * Invalidate the resource to prevent 135 * Invalidate the resource to prevent
@@ -166,15 +164,13 @@ static void __init pcibios_allocate_resources(int pass)
166 else 164 else
167 disabled = !(command & PCI_COMMAND_MEMORY); 165 disabled = !(command & PCI_COMMAND_MEMORY);
168 if (pass == disabled) { 166 if (pass == disabled) {
169 DBG("PCI: Resource %08lx-%08lx " 167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n",
170 "(f=%lx, d=%d, p=%d)\n", 168 (unsigned long long) r->start,
171 r->start, r->end, r->flags, disabled, pass); 169 (unsigned long long) r->end,
170 r->flags, disabled, pass);
172 pr = pci_find_parent_resource(dev, r); 171 pr = pci_find_parent_resource(dev, r);
173 if (!pr || request_resource(pr, r) < 0) { 172 if (!pr || request_resource(pr, r) < 0) {
174 printk(KERN_ERR "PCI: Cannot allocate " 173 dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx);
175 "resource region %d "
176 "of device %s\n",
177 idx, pci_name(dev));
178 /* We'll assign a new address later */ 174 /* We'll assign a new address later */
179 r->end -= r->start; 175 r->end -= r->start;
180 r->start = 0; 176 r->start = 0;
@@ -187,8 +183,7 @@ static void __init pcibios_allocate_resources(int pass)
187 /* Turn the ROM off, leave the resource region, 183 /* Turn the ROM off, leave the resource region,
188 * but keep it unregistered. */ 184 * but keep it unregistered. */
189 u32 reg; 185 u32 reg;
190 DBG("PCI: Switching off ROM of %s\n", 186 dev_dbg(&dev->dev, "disabling ROM\n");
191 pci_name(dev));
192 r->flags &= ~IORESOURCE_ROM_ENABLE; 187 r->flags &= ~IORESOURCE_ROM_ENABLE;
193 pci_read_config_dword(dev, 188 pci_read_config_dword(dev,
194 dev->rom_base_reg, &reg); 189 dev->rom_base_reg, &reg);
@@ -233,6 +228,8 @@ void __init pcibios_resource_survey(void)
233 pcibios_allocate_bus_resources(&pci_root_buses); 228 pcibios_allocate_bus_resources(&pci_root_buses);
234 pcibios_allocate_resources(0); 229 pcibios_allocate_resources(0);
235 pcibios_allocate_resources(1); 230 pcibios_allocate_resources(1);
231
232 e820_reserve_resources_late();
236} 233}
237 234
238/** 235/**
@@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev)
257 lat = pcibios_max_latency; 254 lat = pcibios_max_latency;
258 else 255 else
259 return; 256 return;
260 printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", 257 dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat);
261 pci_name(dev), lat);
262 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); 258 pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat);
263} 259}
264 260
@@ -280,6 +276,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma)
280static struct vm_operations_struct pci_mmap_ops = { 276static struct vm_operations_struct pci_mmap_ops = {
281 .open = pci_track_mmap_page_range, 277 .open = pci_track_mmap_page_range,
282 .close = pci_unmap_page_range, 278 .close = pci_unmap_page_range,
279 .access = generic_access_phys,
283}; 280};
284 281
285int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, 282int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
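Note on the i386.c hunks above: the `if (pass == disabled)` test is what splits pcibios_allocate_resources() into two passes; pass 0 claims the resources of devices whose decode is already enabled (so their firmware-assigned ranges are pinned first) and pass 1 handles the disabled ones, which can safely be reassigned later. A toy, self-contained sketch of that ordering follows; the device table and claim() helper are invented for illustration and only the pass/disabled comparison mirrors the kernel code.

/* two_pass.c - toy model of the enabled-first/disabled-second claiming
 * order used by pcibios_allocate_resources(). All names here are
 * hypothetical; only the (pass == disabled) ordering mirrors the patch. */
#include <stdio.h>

struct toy_dev {
	const char *name;
	int enabled;		/* decode already enabled by firmware? */
	unsigned long start, end;
};

static struct toy_dev devs[] = {
	{ "00:01.0", 1, 0xe0000000UL, 0xe0ffffffUL },
	{ "00:02.0", 0, 0xe0800000UL, 0xe08fffffUL },	/* overlaps, but disabled */
	{ "00:03.0", 1, 0xf0000000UL, 0xf0003fffUL },
};

static void claim(unsigned int pass, struct toy_dev *d)
{
	unsigned int disabled = !d->enabled;

	if (pass != disabled)
		return;		/* not handled in this pass */
	printf("pass %u: claiming %s %#lx-%#lx%s\n", pass, d->name,
	       d->start, d->end, disabled ? " (may be reassigned)" : "");
}

int main(void)
{
	unsigned int i, pass;

	for (pass = 0; pass < 2; pass++)
		for (i = 0; i < sizeof(devs) / sizeof(devs[0]); i++)
			claim(pass, &devs[i]);
	return 0;
}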
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 6a06a2eb0597..bf69dbe08bff 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq)
436{ 436{
437 WARN_ON_ONCE(pirq >= 9); 437 WARN_ON_ONCE(pirq >= 9);
438 if (pirq > 8) { 438 if (pirq > 8) {
439 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 439 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
440 return 0; 440 return 0;
441 } 441 }
442 return read_config_nybble(router, 0x74, pirq-1); 442 return read_config_nybble(router, 0x74, pirq-1);
@@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq,
446{ 446{
447 WARN_ON_ONCE(pirq >= 9); 447 WARN_ON_ONCE(pirq >= 9);
448 if (pirq > 8) { 448 if (pirq > 8) {
449 printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); 449 dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq);
450 return 0; 450 return 0;
451 } 451 }
452 write_config_nybble(router, 0x74, pirq-1, irq); 452 write_config_nybble(router, 0x74, pirq-1, irq);
@@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq
492 irq = 0; 492 irq = 0;
493 if (pirq <= 4) 493 if (pirq <= 4)
494 irq = read_config_nybble(router, 0x56, pirq - 1); 494 irq = read_config_nybble(router, 0x56, pirq - 1);
495 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", 495 dev_info(&dev->dev,
496 dev->vendor, dev->device, pirq, irq); 496 "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n",
497 dev->vendor, dev->device, pirq, irq);
497 return irq; 498 return irq;
498} 499}
499 500
500static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) 501static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq)
501{ 502{
502 printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", 503 dev_info(&dev->dev,
503 dev->vendor, dev->device, pirq, irq); 504 "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n",
505 dev->vendor, dev->device, pirq, irq);
504 if (pirq <= 4) 506 if (pirq <= 4)
505 write_config_nybble(router, 0x56, pirq - 1, irq); 507 write_config_nybble(router, 0x56, pirq - 1, irq);
506 return 1; 508 return 1;
@@ -593,6 +595,15 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
593 r->set = pirq_piix_set; 595 r->set = pirq_piix_set;
594 return 1; 596 return 1;
595 } 597 }
598
599 if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) &&
600 (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) {
601 r->name = "PIIX/ICH";
602 r->get = pirq_piix_get;
603 r->set = pirq_piix_set;
604 return 1;
605 }
606
596 return 0; 607 return 0;
597} 608}
598 609
@@ -730,7 +741,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router,
730 switch (device) { 741 switch (device) {
731 case PCI_DEVICE_ID_AL_M1533: 742 case PCI_DEVICE_ID_AL_M1533:
732 case PCI_DEVICE_ID_AL_M1563: 743 case PCI_DEVICE_ID_AL_M1563:
733 printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n");
734 r->name = "ALI"; 744 r->name = "ALI";
735 r->get = pirq_ali_get; 745 r->get = pirq_ali_get;
736 r->set = pirq_ali_set; 746 r->set = pirq_ali_set;
@@ -820,7 +830,7 @@ static void __init pirq_find_router(struct irq_router *r)
820 r->get = NULL; 830 r->get = NULL;
821 r->set = NULL; 831 r->set = NULL;
822 832
823 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", 833 DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n",
824 rt->rtr_vendor, rt->rtr_device); 834 rt->rtr_vendor, rt->rtr_device);
825 835
826 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); 836 pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn);
@@ -840,11 +850,9 @@ static void __init pirq_find_router(struct irq_router *r)
840 h->probe(r, pirq_router_dev, pirq_router_dev->device)) 850 h->probe(r, pirq_router_dev, pirq_router_dev->device))
841 break; 851 break;
842 } 852 }
843 printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", 853 dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n",
844 pirq_router.name, 854 pirq_router.name,
845 pirq_router_dev->vendor, 855 pirq_router_dev->vendor, pirq_router_dev->device);
846 pirq_router_dev->device,
847 pci_name(pirq_router_dev));
848 856
849 /* The device remains referenced for the kernel lifetime */ 857 /* The device remains referenced for the kernel lifetime */
850} 858}
@@ -877,7 +885,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
877 /* Find IRQ pin */ 885 /* Find IRQ pin */
878 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); 886 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
879 if (!pin) { 887 if (!pin) {
880 DBG(KERN_DEBUG " -> no interrupt pin\n"); 888 dev_dbg(&dev->dev, "no interrupt pin\n");
881 return 0; 889 return 0;
882 } 890 }
883 pin = pin - 1; 891 pin = pin - 1;
@@ -887,20 +895,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
887 if (!pirq_table) 895 if (!pirq_table)
888 return 0; 896 return 0;
889 897
890 DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin);
891 info = pirq_get_info(dev); 898 info = pirq_get_info(dev);
892 if (!info) { 899 if (!info) {
893 DBG(" -> not found in routing table\n" KERN_DEBUG); 900 dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n",
901 'A' + pin);
894 return 0; 902 return 0;
895 } 903 }
896 pirq = info->irq[pin].link; 904 pirq = info->irq[pin].link;
897 mask = info->irq[pin].bitmap; 905 mask = info->irq[pin].bitmap;
898 if (!pirq) { 906 if (!pirq) {
899 DBG(" -> not routed\n" KERN_DEBUG); 907 dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin);
900 return 0; 908 return 0;
901 } 909 }
902 DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, 910 dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x",
903 pirq_table->exclusive_irqs); 911 'A' + pin, pirq, mask, pirq_table->exclusive_irqs);
904 mask &= pcibios_irq_mask; 912 mask &= pcibios_irq_mask;
905 913
906 /* Work around broken HP Pavilion Notebooks which assign USB to 914 /* Work around broken HP Pavilion Notebooks which assign USB to
@@ -930,10 +938,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
930 if (pci_probe & PCI_USE_PIRQ_MASK) 938 if (pci_probe & PCI_USE_PIRQ_MASK)
931 newirq = 0; 939 newirq = 0;
932 else 940 else
933 printk("\n" KERN_WARNING 941 dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask "
934 "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" 942 "%#x; try pci=usepirqmask\n", newirq, mask);
935 KERN_DEBUG, newirq,
936 pci_name(dev));
937 } 943 }
938 if (!newirq && assign) { 944 if (!newirq && assign) {
939 for (i = 0; i < 16; i++) { 945 for (i = 0; i < 16; i++) {
@@ -944,39 +950,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
944 newirq = i; 950 newirq = i;
945 } 951 }
946 } 952 }
947 DBG(" -> newirq=%d", newirq); 953 dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq);
948 954
949 /* Check if it is hardcoded */ 955 /* Check if it is hardcoded */
950 if ((pirq & 0xf0) == 0xf0) { 956 if ((pirq & 0xf0) == 0xf0) {
951 irq = pirq & 0xf; 957 irq = pirq & 0xf;
952 DBG(" -> hardcoded IRQ %d\n", irq); 958 msg = "hardcoded";
953 msg = "Hardcoded";
954 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ 959 } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \
955 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { 960 ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) {
956 DBG(" -> got IRQ %d\n", irq); 961 msg = "found";
957 msg = "Found";
958 eisa_set_level_irq(irq); 962 eisa_set_level_irq(irq);
959 } else if (newirq && r->set && 963 } else if (newirq && r->set &&
960 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { 964 (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) {
961 DBG(" -> assigning IRQ %d", newirq);
962 if (r->set(pirq_router_dev, dev, pirq, newirq)) { 965 if (r->set(pirq_router_dev, dev, pirq, newirq)) {
963 eisa_set_level_irq(newirq); 966 eisa_set_level_irq(newirq);
964 DBG(" ... OK\n"); 967 msg = "assigned";
965 msg = "Assigned";
966 irq = newirq; 968 irq = newirq;
967 } 969 }
968 } 970 }
969 971
970 if (!irq) { 972 if (!irq) {
971 DBG(" ... failed\n");
972 if (newirq && mask == (1 << newirq)) { 973 if (newirq && mask == (1 << newirq)) {
973 msg = "Guessed"; 974 msg = "guessed";
974 irq = newirq; 975 irq = newirq;
975 } else 976 } else {
977 dev_dbg(&dev->dev, "can't route interrupt\n");
976 return 0; 978 return 0;
979 }
977 } 980 }
978 printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, 981 dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq);
979 pci_name(dev));
980 982
981 /* Update IRQ for all devices with the same pirq value */ 983 /* Update IRQ for all devices with the same pirq value */
982 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { 984 while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) {
@@ -996,17 +998,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
996 (!(pci_probe & PCI_USE_PIRQ_MASK) || \ 998 (!(pci_probe & PCI_USE_PIRQ_MASK) || \
997 ((1 << dev2->irq) & mask))) { 999 ((1 << dev2->irq) & mask))) {
998#ifndef CONFIG_PCI_MSI 1000#ifndef CONFIG_PCI_MSI
999 printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", 1001 dev_info(&dev2->dev, "IRQ routing conflict: "
1000 pci_name(dev2), dev2->irq, irq); 1002 "have IRQ %d, want IRQ %d\n",
1003 dev2->irq, irq);
1001#endif 1004#endif
1002 continue; 1005 continue;
1003 } 1006 }
1004 dev2->irq = irq; 1007 dev2->irq = irq;
1005 pirq_penalty[irq]++; 1008 pirq_penalty[irq]++;
1006 if (dev != dev2) 1009 if (dev != dev2)
1007 printk(KERN_INFO 1010 dev_info(&dev->dev, "sharing IRQ %d with %s\n",
1008 "PCI: Sharing IRQ %d with %s\n", 1011 irq, pci_name(dev2));
1009 irq, pci_name(dev2));
1010 } 1012 }
1011 } 1013 }
1012 return 1; 1014 return 1;
@@ -1025,8 +1027,7 @@ static void __init pcibios_fixup_irqs(void)
1025 * already in use. 1027 * already in use.
1026 */ 1028 */
1027 if (dev->irq >= 16) { 1029 if (dev->irq >= 16) {
1028 DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", 1030 dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq);
1029 pci_name(dev), dev->irq);
1030 dev->irq = 0; 1031 dev->irq = 0;
1031 } 1032 }
1032 /* 1033 /*
@@ -1049,35 +1050,44 @@ static void __init pcibios_fixup_irqs(void)
1049 if (io_apic_assign_pci_irqs) { 1050 if (io_apic_assign_pci_irqs) {
1050 int irq; 1051 int irq;
1051 1052
1052 if (pin) { 1053 if (!pin)
1053 /* 1054 continue;
1054 * interrupt pins are numbered starting 1055
1055 * from 1 1056 /*
1056 */ 1057 * interrupt pins are numbered starting from 1
1057 pin--; 1058 */
1058 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, 1059 pin--;
1059 PCI_SLOT(dev->devfn), pin); 1060 irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
1060 /* 1061 PCI_SLOT(dev->devfn), pin);
1061 * Busses behind bridges are typically not listed in the MP-table. 1062 /*
1062 * In this case we have to look up the IRQ based on the parent bus, 1063 * Busses behind bridges are typically not listed in the
1063 * parent slot, and pin number. The SMP code detects such bridged 1064 * MP-table. In this case we have to look up the IRQ
1064 * busses itself so we should get into this branch reliably. 1065 * based on the parent bus, parent slot, and pin number.
1065 */ 1066 * The SMP code detects such bridged busses itself so we
1066 if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ 1067 * should get into this branch reliably.
1067 struct pci_dev *bridge = dev->bus->self; 1068 */
1068 1069 if (irq < 0 && dev->bus->parent) {
1069 pin = (pin + PCI_SLOT(dev->devfn)) % 4; 1070 /* go back to the bridge */
1070 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1071 struct pci_dev *bridge = dev->bus->self;
1071 PCI_SLOT(bridge->devfn), pin); 1072 int bus;
1072 if (irq >= 0) 1073
1073 printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", 1074 pin = (pin + PCI_SLOT(dev->devfn)) % 4;
1074 pci_name(bridge), 'A' + pin, irq); 1075 bus = bridge->bus->number;
1075 } 1076 irq = IO_APIC_get_PCI_irq_vector(bus,
1076 if (irq >= 0) { 1077 PCI_SLOT(bridge->devfn), pin);
1077 printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1078 if (irq >= 0)
1078 pci_name(dev), 'A' + pin, irq); 1079 dev_warn(&dev->dev,
1079 dev->irq = irq; 1080 "using bridge %s INT %c to "
1080 } 1081 "get IRQ %d\n",
1082 pci_name(bridge),
1083 'A' + pin, irq);
1084 }
1085 if (irq >= 0) {
1086 dev_info(&dev->dev,
1087 "PCI->APIC IRQ transform: INT %c "
1088 "-> IRQ %d\n",
1089 'A' + pin, irq);
1090 dev->irq = irq;
1081 } 1091 }
1082 } 1092 }
1083#endif 1093#endif
@@ -1231,25 +1241,24 @@ static int pirq_enable_irq(struct pci_dev *dev)
1231 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, 1241 irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
1232 PCI_SLOT(bridge->devfn), pin); 1242 PCI_SLOT(bridge->devfn), pin);
1233 if (irq >= 0) 1243 if (irq >= 0)
1234 printk(KERN_WARNING 1244 dev_warn(&dev->dev, "using bridge %s "
1235 "PCI: using PPB %s[%c] to get irq %d\n", 1245 "INT %c to get IRQ %d\n",
1236 pci_name(bridge), 1246 pci_name(bridge), 'A' + pin,
1237 'A' + pin, irq); 1247 irq);
1238 dev = bridge; 1248 dev = bridge;
1239 } 1249 }
1240 dev = temp_dev; 1250 dev = temp_dev;
1241 if (irq >= 0) { 1251 if (irq >= 0) {
1242 printk(KERN_INFO 1252 dev_info(&dev->dev, "PCI->APIC IRQ transform: "
1243 "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", 1253 "INT %c -> IRQ %d\n", 'A' + pin, irq);
1244 pci_name(dev), 'A' + pin, irq);
1245 dev->irq = irq; 1254 dev->irq = irq;
1246 return 0; 1255 return 0;
1247 } else 1256 } else
1248 msg = " Probably buggy MP table."; 1257 msg = "; probably buggy MP table";
1249 } else if (pci_probe & PCI_BIOS_IRQ_SCAN) 1258 } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
1250 msg = ""; 1259 msg = "";
1251 else 1260 else
1252 msg = " Please try using pci=biosirq."; 1261 msg = "; please try using pci=biosirq";
1253 1262
1254 /* 1263 /*
1255 * With IDE legacy devices the IRQ lookup failure is not 1264 * With IDE legacy devices the IRQ lookup failure is not
@@ -1259,9 +1268,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
1259 !(dev->class & 0x5)) 1268 !(dev->class & 0x5))
1260 return 0; 1269 return 0;
1261 1270
1262 printk(KERN_WARNING 1271 dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n",
1263 "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", 1272 'A' + pin, msg);
1264 'A' + pin, pci_name(dev), msg);
1265 } 1273 }
1266 return 0; 1274 return 0;
1267} 1275}
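Note on the irq.c hunks above: the VLSI and AMD756 router hooks route interrupts by reading and writing 4-bit PIRQ link values packed two per configuration byte (read_config_nybble()/write_config_nybble() at offsets such as 0x74 and 0x56, with odd link indices in the high nybble). A small self-contained sketch of that nybble packing, using an invented register value, is below.

/* nybble.c - how a 4-bit PIRQ link value is packed two per byte, roughly
 * mirroring read_config_nybble()/write_config_nybble(). The sample byte
 * value is invented. */
#include <stdio.h>

static unsigned char get_nybble(unsigned char byte, int index)
{
	/* even index -> low nybble, odd index -> high nybble */
	return (index & 1) ? (byte >> 4) : (byte & 0x0f);
}

static unsigned char set_nybble(unsigned char byte, int index, unsigned char val)
{
	if (index & 1)
		return (byte & 0x0f) | ((val & 0x0f) << 4);
	return (byte & 0xf0) | (val & 0x0f);
}

int main(void)
{
	unsigned char reg = 0x5a;	/* pretend config byte at offset 0x74 */

	printf("link 0 = %u, link 1 = %u\n", get_nybble(reg, 0), get_nybble(reg, 1));
	reg = set_nybble(reg, 0, 0xb);	/* route link 0 to IRQ 11 */
	printf("after set: reg = %#04x\n", reg);
	return 0;
}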
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 132876cc6fca..b722dd481b39 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void)
14 int n, devfn; 14 int n, devfn;
15 long node; 15 long node;
16 16
17 if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) 17 if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff)
18 return; 18 return;
19 DBG("PCI: Peer bridge fixup\n"); 19 DBG("PCI: Peer bridge fixup\n");
20 20
@@ -57,14 +57,17 @@ static int __init pci_legacy_init(void)
57 57
58int __init pci_subsys_init(void) 58int __init pci_subsys_init(void)
59{ 59{
60#ifdef CONFIG_X86_NUMAQ
61 pci_numaq_init();
62#endif
60#ifdef CONFIG_ACPI 63#ifdef CONFIG_ACPI
61 pci_acpi_init(); 64 pci_acpi_init();
62#endif 65#endif
66#ifdef CONFIG_X86_VISWS
67 pci_visws_init();
68#endif
63 pci_legacy_init(); 69 pci_legacy_init();
64 pcibios_irq_init(); 70 pcibios_irq_init();
65#ifdef CONFIG_X86_NUMAQ
66 pci_numa_init();
67#endif
68 pcibios_init(); 71 pcibios_init();
69 72
70 return 0; 73 return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 23faaa890ffc..654a2234f8f3 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
209 return name != NULL; 209 return name != NULL;
210} 210}
211 211
212static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) 212static void __init pci_mmcfg_insert_resources(void)
213{ 213{
214#define PCI_MMCFG_RESOURCE_NAME_LEN 19 214#define PCI_MMCFG_RESOURCE_NAME_LEN 19
215 int i; 215 int i;
@@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags)
233 cfg->pci_segment); 233 cfg->pci_segment);
234 res->start = cfg->address; 234 res->start = cfg->address;
235 res->end = res->start + (num_buses << 20) - 1; 235 res->end = res->start + (num_buses << 20) - 1;
236 res->flags = IORESOURCE_MEM | resource_flags; 236 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
237 insert_resource(&iomem_resource, res); 237 insert_resource(&iomem_resource, res);
238 names += PCI_MMCFG_RESOURCE_NAME_LEN; 238 names += PCI_MMCFG_RESOURCE_NAME_LEN;
239 } 239 }
@@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl,
293 return AE_OK; 293 return AE_OK;
294} 294}
295 295
296static int __init is_acpi_reserved(unsigned long start, unsigned long end) 296static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
297{ 297{
298 struct resource mcfg_res; 298 struct resource mcfg_res;
299 299
@@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end)
310 return mcfg_res.flags; 310 return mcfg_res.flags;
311} 311}
312 312
313typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
314
315static int __init is_mmconf_reserved(check_reserved_t is_reserved,
316 u64 addr, u64 size, int i,
317 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
318{
319 u64 old_size = size;
320 int valid = 0;
321
322 while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) {
323 size >>= 1;
324 if (size < (16UL<<20))
325 break;
326 }
327
328 if (size >= (16UL<<20) || size == old_size) {
329 printk(KERN_NOTICE
330 "PCI: MCFG area at %Lx reserved in %s\n",
331 addr, with_e820?"E820":"ACPI motherboard resources");
332 valid = 1;
333
334 if (old_size != size) {
335 /* update end_bus_number */
336 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1);
337 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx "
338 "segment %hu buses %u - %u\n",
339 i, (unsigned long)cfg->address, cfg->pci_segment,
340 (unsigned int)cfg->start_bus_number,
341 (unsigned int)cfg->end_bus_number);
342 }
343 }
344
345 return valid;
346}
347
313static void __init pci_mmcfg_reject_broken(int early) 348static void __init pci_mmcfg_reject_broken(int early)
314{ 349{
315 typeof(pci_mmcfg_config[0]) *cfg; 350 typeof(pci_mmcfg_config[0]) *cfg;
@@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early)
324 359
325 for (i = 0; i < pci_mmcfg_config_num; i++) { 360 for (i = 0; i < pci_mmcfg_config_num; i++) {
326 int valid = 0; 361 int valid = 0;
327 u32 size = (cfg->end_bus_number + 1) << 20; 362 u64 addr, size;
363
328 cfg = &pci_mmcfg_config[i]; 364 cfg = &pci_mmcfg_config[i];
365 addr = cfg->start_bus_number;
366 addr <<= 20;
367 addr += cfg->address;
368 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
369 size <<= 20;
329 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " 370 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
330 "segment %hu buses %u - %u\n", 371 "segment %hu buses %u - %u\n",
331 i, (unsigned long)cfg->address, cfg->pci_segment, 372 i, (unsigned long)cfg->address, cfg->pci_segment,
332 (unsigned int)cfg->start_bus_number, 373 (unsigned int)cfg->start_bus_number,
333 (unsigned int)cfg->end_bus_number); 374 (unsigned int)cfg->end_bus_number);
334 375
335 if (!early && 376 if (!early)
336 is_acpi_reserved(cfg->address, cfg->address + size - 1)) { 377 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0);
337 printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved "
338 "in ACPI motherboard resources\n",
339 cfg->address);
340 valid = 1;
341 }
342 378
343 if (valid) 379 if (valid)
344 continue; 380 continue;
@@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early)
347 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 383 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not"
348 " reserved in ACPI motherboard resources\n", 384 " reserved in ACPI motherboard resources\n",
349 cfg->address); 385 cfg->address);
386
350 /* Don't try to do this check unless configuration 387 /* Don't try to do this check unless configuration
351 type 1 is available. how about type 2 ?*/ 388 type 1 is available. how about type 2 ?*/
352 if (raw_pci_ops && e820_all_mapped(cfg->address, 389 if (raw_pci_ops)
353 cfg->address + size - 1, 390 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1);
354 E820_RESERVED)) {
355 printk(KERN_NOTICE
356 "PCI: MCFG area at %Lx reserved in E820\n",
357 cfg->address);
358 valid = 1;
359 }
360 391
361 if (!valid) 392 if (!valid)
362 goto reject; 393 goto reject;
@@ -365,7 +396,7 @@ static void __init pci_mmcfg_reject_broken(int early)
365 return; 396 return;
366 397
367reject: 398reject:
368 printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); 399 printk(KERN_INFO "PCI: Not using MMCONFIG.\n");
369 pci_mmcfg_arch_free(); 400 pci_mmcfg_arch_free();
370 kfree(pci_mmcfg_config); 401 kfree(pci_mmcfg_config);
371 pci_mmcfg_config = NULL; 402 pci_mmcfg_config = NULL;
@@ -403,11 +434,9 @@ static void __init __pci_mmcfg_init(int early)
403 (pci_mmcfg_config[0].address == 0)) 434 (pci_mmcfg_config[0].address == 0))
404 return; 435 return;
405 436
406 if (pci_mmcfg_arch_init()) { 437 if (pci_mmcfg_arch_init())
407 if (known_bridge)
408 pci_mmcfg_insert_resources(IORESOURCE_BUSY);
409 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 438 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
410 } else { 439 else {
411 /* 440 /*
412 * Signal not to attempt to insert mmcfg resources because 441 * Signal not to attempt to insert mmcfg resources because
413 * the architecture mmcfg setup could not initialize. 442 * the architecture mmcfg setup could not initialize.
@@ -444,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
444 * marked so it won't cause request errors when __request_region is 473 * marked so it won't cause request errors when __request_region is
445 * called. 474 * called.
446 */ 475 */
447 pci_mmcfg_insert_resources(0); 476 pci_mmcfg_insert_resources();
448 477
449 return 0; 478 return 0;
450} 479}
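Note on the mmconfig-shared.c hunks above: is_mmconf_reserved() keeps halving the MMCONFIG window until the remaining range is reported as reserved (by ACPI motherboard resources, or by E820 when with_e820 is set), giving up below 16MB; if it had to shrink, end_bus_number is trimmed to match the usable size. A standalone sketch of just that halving loop, with a stub reservation check standing in for is_acpi_reserved()/e820_all_mapped() and invented addresses, follows.

/* mcfg_shrink.c - the size-halving loop from is_mmconf_reserved(), with a
 * stub is_reserved() instead of the real ACPI/E820 checks. The base address
 * and the 64MB reserved window are invented for the example. */
#include <stdio.h>
#include <stdint.h>

#define MB(x) ((uint64_t)(x) << 20)

static const uint64_t mcfg_base     = 0xe0000000ULL;
static const uint64_t reserved_size = MB(64);	/* firmware only reserved 64MB */

/* Stub: the range counts as reserved only if it fits inside the window. */
static int is_reserved(uint64_t start, uint64_t end)
{
	return start >= mcfg_base && end < mcfg_base + reserved_size;
}

int main(void)
{
	uint64_t addr = mcfg_base;
	uint64_t size = MB(256);	/* MCFG claims buses 0-255, 1MB per bus */
	uint64_t old_size = size;

	while (!is_reserved(addr, addr + size - 1)) {
		size >>= 1;
		if (size < MB(16))
			break;
	}

	if (size >= MB(16) || size == old_size)
		printf("usable MMCONFIG window: %llu MB (buses 0-%llu)\n",
		       (unsigned long long)(size >> 20),
		       (unsigned long long)((size >> 20) - 1));
	else
		printf("MMCONFIG not reserved, rejecting\n");
	return 0;
}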
diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numaq_32.c
index 8b5ca1966731..1177845d3186 100644
--- a/arch/x86/pci/numa.c
+++ b/arch/x86/pci/numaq_32.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * numa.c - Low-level PCI access for NUMA-Q machines 2 * numaq_32.c - Low-level PCI access for NUMA-Q machines
3 */ 3 */
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
@@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
131 u8 busno, suba, subb; 131 u8 busno, suba, subb;
132 int quad = BUS2QUAD(d->bus->number); 132 int quad = BUS2QUAD(d->bus->number);
133 133
134 printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); 134 dev_info(&d->dev, "searching for i450NX host bridges\n");
135 reg = 0xd0; 135 reg = 0xd0;
136 for(pxb=0; pxb<2; pxb++) { 136 for(pxb=0; pxb<2; pxb++) {
137 pci_read_config_byte(d, reg++, &busno); 137 pci_read_config_byte(d, reg++, &busno);
138 pci_read_config_byte(d, reg++, &suba); 138 pci_read_config_byte(d, reg++, &suba);
139 pci_read_config_byte(d, reg++, &subb); 139 pci_read_config_byte(d, reg++, &subb);
140 DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); 140 dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n",
141 pxb, busno, suba, subb);
141 if (busno) { 142 if (busno) {
142 /* Bus A */ 143 /* Bus A */
143 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); 144 pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno));
@@ -151,7 +152,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d)
151} 152}
152DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); 153DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx);
153 154
154int __init pci_numa_init(void) 155int __init pci_numaq_init(void)
155{ 156{
156 int quad; 157 int quad;
157 158
diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h
index 3e25deb821ac..15b9cf6be729 100644
--- a/arch/x86/pci/pci.h
+++ b/arch/x86/pci/pci.h
@@ -108,7 +108,8 @@ extern void __init dmi_check_skip_isa_align(void);
108/* some common used subsys_initcalls */ 108/* some common used subsys_initcalls */
109extern int __init pci_acpi_init(void); 109extern int __init pci_acpi_init(void);
110extern int __init pcibios_irq_init(void); 110extern int __init pcibios_irq_init(void);
111extern int __init pci_numa_init(void); 111extern int __init pci_visws_init(void);
112extern int __init pci_numaq_init(void);
112extern int __init pcibios_init(void); 113extern int __init pcibios_init(void);
113 114
114/* pci-mmconfig.c */ 115/* pci-mmconfig.c */
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index 1a7bed492bb1..42f4cb19faca 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); 86 pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq);
87} 87}
88 88
89static int __init pci_visws_init(void) 89int __init pci_visws_init(void)
90{ 90{
91 if (!is_visws_box())
92 return -1;
93
94 pcibios_enable_irq = &pci_visws_enable_irq;
95 pcibios_disable_irq = &pci_visws_disable_irq;
96
91 /* The VISWS supports configuration access type 1 only */ 97 /* The VISWS supports configuration access type 1 only */
92 pci_probe = (pci_probe | PCI_PROBE_CONF1) & 98 pci_probe = (pci_probe | PCI_PROBE_CONF1) &
93 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); 99 ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2);
@@ -105,18 +111,3 @@ static int __init pci_visws_init(void)
105 pcibios_resource_survey(); 111 pcibios_resource_survey();
106 return 0; 112 return 0;
107} 113}
108
109static __init int pci_subsys_init(void)
110{
111 if (!is_visws_box())
112 return -1;
113
114 pcibios_enable_irq = &pci_visws_enable_irq;
115 pcibios_disable_irq = &pci_visws_disable_irq;
116
117 pci_visws_init();
118 pcibios_init();
119
120 return 0;
121}
122subsys_initcall(pci_subsys_init);
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 7dc5d5cf50a2..274d06082f48 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -11,6 +11,7 @@
11#include <linux/suspend.h> 11#include <linux/suspend.h>
12#include <asm/mtrr.h> 12#include <asm/mtrr.h>
13#include <asm/mce.h> 13#include <asm/mce.h>
14#include <asm/xcr.h>
14 15
15static struct saved_context saved_context; 16static struct saved_context saved_context;
16 17
@@ -45,7 +46,7 @@ static void __save_processor_state(struct saved_context *ctxt)
45 ctxt->cr0 = read_cr0(); 46 ctxt->cr0 = read_cr0();
46 ctxt->cr2 = read_cr2(); 47 ctxt->cr2 = read_cr2();
47 ctxt->cr3 = read_cr3(); 48 ctxt->cr3 = read_cr3();
48 ctxt->cr4 = read_cr4(); 49 ctxt->cr4 = read_cr4_safe();
49} 50}
50 51
51/* Needed by apm.c */ 52/* Needed by apm.c */
@@ -98,7 +99,9 @@ static void __restore_processor_state(struct saved_context *ctxt)
98 /* 99 /*
99 * control registers 100 * control registers
100 */ 101 */
101 write_cr4(ctxt->cr4); 102 /* cr4 was introduced in the Pentium CPU */
103 if (ctxt->cr4)
104 write_cr4(ctxt->cr4);
102 write_cr3(ctxt->cr3); 105 write_cr3(ctxt->cr3);
103 write_cr2(ctxt->cr2); 106 write_cr2(ctxt->cr2);
104 write_cr0(ctxt->cr0); 107 write_cr0(ctxt->cr0);
@@ -124,6 +127,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
124 if (boot_cpu_has(X86_FEATURE_SEP)) 127 if (boot_cpu_has(X86_FEATURE_SEP))
125 enable_sep_cpu(); 128 enable_sep_cpu();
126 129
130 /*
131 * restore XCR0 for xsave capable cpu's.
132 */
133 if (cpu_has_xsave)
134 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
135
127 fix_processor_context(); 136 fix_processor_context();
128 do_fpu_end(); 137 do_fpu_end();
129 mtrr_ap_init(); 138 mtrr_ap_init();
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index 66bdfb591fd8..e3b6cf70d62c 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -14,6 +14,7 @@
14#include <asm/page.h> 14#include <asm/page.h>
15#include <asm/pgtable.h> 15#include <asm/pgtable.h>
16#include <asm/mtrr.h> 16#include <asm/mtrr.h>
17#include <asm/xcr.h>
17 18
18static void fix_processor_context(void); 19static void fix_processor_context(void);
19 20
@@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt)
122 wrmsrl(MSR_GS_BASE, ctxt->gs_base); 123 wrmsrl(MSR_GS_BASE, ctxt->gs_base);
123 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); 124 wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base);
124 125
126 /*
127 * restore XCR0 for xsave capable cpu's.
128 */
129 if (cpu_has_xsave)
130 xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
131
125 fix_processor_context(); 132 fix_processor_context();
126 133
127 do_fpu_end(); 134 do_fpu_end();
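Note on the two cpu_32.c/cpu_64.c hunks above: both restore XCR0 (XCR_XFEATURE_ENABLED_MASK) on resume for XSAVE-capable CPUs, because that register's contents are lost across suspend. xsetbv is a privileged instruction, so the closest userspace illustration is reading XCR0 back with xgetbv after confirming via CPUID that the OS has enabled XSAVE; the sketch below does only that and assumes an x86/x86-64 compiler with GNU-style inline asm.

/* xcr0_peek.c - read XCR0 from userspace with xgetbv, the same state the
 * patch restores with xsetbv on resume. Illustration only. */
#include <stdio.h>
#include <stdint.h>

static void cpuid(uint32_t leaf, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
	__asm__ volatile("cpuid"
			 : "=a"(*a), "=b"(*b), "=c"(*c), "=d"(*d)
			 : "a"(leaf), "c"(0));
}

static uint64_t xgetbv(uint32_t index)
{
	uint32_t lo, hi;

	/* xgetbv, spelled as raw bytes so very old assemblers also accept it */
	__asm__ volatile(".byte 0x0f, 0x01, 0xd0"
			 : "=a"(lo), "=d"(hi) : "c"(index));
	return ((uint64_t)hi << 32) | lo;
}

int main(void)
{
	uint32_t a, b, c, d;

	cpuid(1, &a, &b, &c, &d);
	if (!(c & (1u << 27))) {	/* OSXSAVE: OS has set CR4.OSXSAVE */
		printf("xgetbv not available (no OSXSAVE)\n");
		return 1;
	}
	printf("XCR0 = %#llx (bit 0 = x87, bit 1 = SSE, bit 2 = AVX)\n",
	       (unsigned long long)xgetbv(0));
	return 0;
}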
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b95aa6cfe3cb..d1e9b53f9d33 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -1,5 +1,3 @@
1.text
2
3/* 1/*
4 * This may not use any stack, nor any variable that is not "NoSave": 2 * This may not use any stack, nor any variable that is not "NoSave":
5 * 3 *
@@ -12,25 +10,26 @@
12#include <asm/segment.h> 10#include <asm/segment.h>
13#include <asm/page.h> 11#include <asm/page.h>
14#include <asm/asm-offsets.h> 12#include <asm/asm-offsets.h>
13#include <asm/processor-flags.h>
15 14
16 .text 15.text
17 16
18ENTRY(swsusp_arch_suspend) 17ENTRY(swsusp_arch_suspend)
19
20 movl %esp, saved_context_esp 18 movl %esp, saved_context_esp
21 movl %ebx, saved_context_ebx 19 movl %ebx, saved_context_ebx
22 movl %ebp, saved_context_ebp 20 movl %ebp, saved_context_ebp
23 movl %esi, saved_context_esi 21 movl %esi, saved_context_esi
24 movl %edi, saved_context_edi 22 movl %edi, saved_context_edi
25 pushfl ; popl saved_context_eflags 23 pushfl
24 popl saved_context_eflags
26 25
27 call swsusp_save 26 call swsusp_save
28 ret 27 ret
29 28
30ENTRY(restore_image) 29ENTRY(restore_image)
31 movl resume_pg_dir, %ecx 30 movl resume_pg_dir, %eax
32 subl $__PAGE_OFFSET, %ecx 31 subl $__PAGE_OFFSET, %eax
33 movl %ecx, %cr3 32 movl %eax, %cr3
34 33
35 movl restore_pblist, %edx 34 movl restore_pblist, %edx
36 .p2align 4,,7 35 .p2align 4,,7
@@ -52,17 +51,21 @@ copy_loop:
52 51
53done: 52done:
54 /* go back to the original page tables */ 53 /* go back to the original page tables */
55 movl $swapper_pg_dir, %ecx 54 movl $swapper_pg_dir, %eax
56 subl $__PAGE_OFFSET, %ecx 55 subl $__PAGE_OFFSET, %eax
57 movl %ecx, %cr3 56 movl %eax, %cr3
58 /* Flush TLB, including "global" things (vmalloc) */ 57 /* Flush TLB, including "global" things (vmalloc) */
59 movl mmu_cr4_features, %eax 58 movl mmu_cr4_features, %ecx
60 movl %eax, %edx 59 jecxz 1f # cr4 Pentium and higher, skip if zero
61 andl $~(1<<7), %edx; # PGE 60 movl %ecx, %edx
61 andl $~(X86_CR4_PGE), %edx
62 movl %edx, %cr4; # turn off PGE 62 movl %edx, %cr4; # turn off PGE
63 movl %cr3, %ecx; # flush TLB 631:
64 movl %ecx, %cr3 64 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr4; # turn PGE back on 65 movl %eax, %cr3
66 jecxz 1f # cr4 Pentium and higher, skip if zero
67 movl %ecx, %cr4; # turn PGE back on
681:
66 69
67 movl saved_context_esp, %esp 70 movl saved_context_esp, %esp
68 movl saved_context_ebp, %ebp 71 movl saved_context_ebp, %ebp
@@ -70,7 +73,8 @@ done:
70 movl saved_context_esi, %esi 73 movl saved_context_esi, %esi
71 movl saved_context_edi, %edi 74 movl saved_context_edi, %edi
72 75
73 pushl saved_context_eflags ; popfl 76 pushl saved_context_eflags
77 popfl
74 78
75 xorl %eax, %eax 79 xorl %eax, %eax
76 80
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b7ad9f89d21f..4d6ef0a336d6 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE
62# Build multiple 32-bit vDSO images to choose from at boot time. 62# Build multiple 32-bit vDSO images to choose from at boot time.
63# 63#
64obj-$(VDSO32-y) += vdso32-syms.lds 64obj-$(VDSO32-y) += vdso32-syms.lds
65vdso32.so-$(CONFIG_X86_32) += int80 65vdso32.so-$(VDSO32-y) += int80
66vdso32.so-$(CONFIG_COMPAT) += syscall 66vdso32.so-$(CONFIG_COMPAT) += syscall
67vdso32.so-$(VDSO32-y) += sysenter 67vdso32.so-$(VDSO32-y) += sysenter
68 68
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 0bce5429a515..513f330c5832 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr)
193 } 193 }
194} 194}
195 195
196/*
197 * These symbols are defined by vdso32.S to mark the bounds
198 * of the ELF DSO images included therein.
199 */
200extern const char vdso32_default_start, vdso32_default_end;
201extern const char vdso32_sysenter_start, vdso32_sysenter_end;
202static struct page *vdso32_pages[1]; 196static struct page *vdso32_pages[1];
203 197
204#ifdef CONFIG_X86_64 198#ifdef CONFIG_X86_64
205 199
206#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) 200#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32))
201#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32))
207 202
208/* May not be __init: called during resume */ 203/* May not be __init: called during resume */
209void syscall32_cpu_init(void) 204void syscall32_cpu_init(void)
@@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map)
226#else /* CONFIG_X86_32 */ 221#else /* CONFIG_X86_32 */
227 222
228#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) 223#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP))
224#define vdso32_syscall() (0)
229 225
230void enable_sep_cpu(void) 226void enable_sep_cpu(void)
231{ 227{
@@ -296,12 +292,15 @@ int __init sysenter_setup(void)
296 gate_vma_init(); 292 gate_vma_init();
297#endif 293#endif
298 294
299 if (!vdso32_sysenter()) { 295 if (vdso32_syscall()) {
300 vsyscall = &vdso32_default_start; 296 vsyscall = &vdso32_syscall_start;
301 vsyscall_len = &vdso32_default_end - &vdso32_default_start; 297 vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start;
302 } else { 298 } else if (vdso32_sysenter()){
303 vsyscall = &vdso32_sysenter_start; 299 vsyscall = &vdso32_sysenter_start;
304 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; 300 vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start;
301 } else {
302 vsyscall = &vdso32_int80_start;
303 vsyscall_len = &vdso32_int80_end - &vdso32_int80_start;
305 } 304 }
306 305
307 memcpy(syscall_page, vsyscall, vsyscall_len); 306 memcpy(syscall_page, vsyscall, vsyscall_len);
diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S
index 1e36f72cab86..2ce5f82c333b 100644
--- a/arch/x86/vdso/vdso32.S
+++ b/arch/x86/vdso/vdso32.S
@@ -2,14 +2,17 @@
2 2
3__INITDATA 3__INITDATA
4 4
5 .globl vdso32_default_start, vdso32_default_end 5 .globl vdso32_int80_start, vdso32_int80_end
6vdso32_default_start: 6vdso32_int80_start:
7#ifdef CONFIG_X86_32
8 .incbin "arch/x86/vdso/vdso32-int80.so" 7 .incbin "arch/x86/vdso/vdso32-int80.so"
9#else 8vdso32_int80_end:
9
10 .globl vdso32_syscall_start, vdso32_syscall_end
11vdso32_syscall_start:
12#ifdef CONFIG_COMPAT
10 .incbin "arch/x86/vdso/vdso32-syscall.so" 13 .incbin "arch/x86/vdso/vdso32-syscall.so"
11#endif 14#endif
12vdso32_default_end: 15vdso32_syscall_end:
13 16
14 .globl vdso32_sysenter_start, vdso32_sysenter_end 17 .globl vdso32_sysenter_start, vdso32_sysenter_end
15vdso32_sysenter_start: 18vdso32_sysenter_start:
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 19a6cfaf5db9..257ba4a10abf 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1;
21extern char vdso_start[], vdso_end[]; 21extern char vdso_start[], vdso_end[];
22extern unsigned short vdso_sync_cpuid; 22extern unsigned short vdso_sync_cpuid;
23 23
24struct page **vdso_pages; 24static struct page **vdso_pages;
25static unsigned vdso_size;
25 26
26static inline void *var_ref(void *p, char *name) 27static inline void *var_ref(void *p, char *name)
27{ 28{
@@ -38,6 +39,7 @@ static int __init init_vdso_vars(void)
38 int i; 39 int i;
39 char *vbase; 40 char *vbase;
40 41
42 vdso_size = npages << PAGE_SHIFT;
41 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 43 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
42 if (!vdso_pages) 44 if (!vdso_pages)
43 goto oom; 45 goto oom;
@@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack)
101 struct mm_struct *mm = current->mm; 103 struct mm_struct *mm = current->mm;
102 unsigned long addr; 104 unsigned long addr;
103 int ret; 105 int ret;
104 unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE);
105 106
106 if (!vdso_enabled) 107 if (!vdso_enabled)
107 return 0; 108 return 0;
108 109
109 down_write(&mm->mmap_sem); 110 down_write(&mm->mmap_sem);
110 addr = vdso_addr(mm->start_stack, len); 111 addr = vdso_addr(mm->start_stack, vdso_size);
111 addr = get_unmapped_area(NULL, addr, len, 0, 0); 112 addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0);
112 if (IS_ERR_VALUE(addr)) { 113 if (IS_ERR_VALUE(addr)) {
113 ret = addr; 114 ret = addr;
114 goto up_fail; 115 goto up_fail;
115 } 116 }
116 117
117 ret = install_special_mapping(mm, addr, len, 118 ret = install_special_mapping(mm, addr, vdso_size,
118 VM_READ|VM_EXEC| 119 VM_READ|VM_EXEC|
119 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| 120 VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|
120 VM_ALWAYSDUMP, 121 VM_ALWAYSDUMP,
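Note on the vma.c hunk above: the mapping size is now computed once at init time (vdso_size = npages << PAGE_SHIFT) instead of re-deriving round_up(vdso_end - vdso_start, PAGE_SIZE) on every arch_setup_additional_pages() call; the two agree whenever npages is the page-rounded image length. A tiny check of that equivalence, with an invented image length and a locally defined round_up, follows.

/* vdso_size.c - check that npages << PAGE_SHIFT equals
 * round_up(len, PAGE_SIZE). The image length is invented. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define round_up(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned long len = 5000;	/* pretend vdso_end - vdso_start */
	unsigned long npages = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;

	printf("round_up = %lu, npages << PAGE_SHIFT = %lu\n",
	       round_up(len, PAGE_SIZE), npages << PAGE_SHIFT);
	return 0;
}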
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index c2cc99580871..87b9ab166423 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -6,8 +6,8 @@ config XEN
6 bool "Xen guest support" 6 bool "Xen guest support"
7 select PARAVIRT 7 select PARAVIRT
8 select PARAVIRT_CLOCK 8 select PARAVIRT_CLOCK
9 depends on X86_32 9 depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER))
10 depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) 10 depends on X86_CMPXCHG && X86_TSC
11 help 11 help
12 This is the Linux Xen port. Enabling this will allow the 12 This is the Linux Xen port. Enabling this will allow the
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
@@ -15,10 +15,24 @@ config XEN
15 15
16config XEN_MAX_DOMAIN_MEMORY 16config XEN_MAX_DOMAIN_MEMORY
17 int "Maximum allowed size of a domain in gigabytes" 17 int "Maximum allowed size of a domain in gigabytes"
18 default 8 18 default 8 if X86_32
19 default 32 if X86_64
19 depends on XEN 20 depends on XEN
20 help 21 help
21 The pseudo-physical to machine address array is sized 22 The pseudo-physical to machine address array is sized
22 according to the maximum possible memory size of a Xen 23 according to the maximum possible memory size of a Xen
23 domain. This array uses 1 page per gigabyte, so there's no 24 domain. This array uses 1 page per gigabyte, so there's no
24 need to be too stingy here. \ No newline at end of file 25 need to be too stingy here.
26
27config XEN_SAVE_RESTORE
28 bool
29 depends on XEN && PM
30 default y
31
32config XEN_DEBUG_FS
33 bool "Enable Xen debug and tuning parameters in debugfs"
34 depends on XEN && DEBUG_FS
35 default n
36 help
37 Enable statistics output and various tuning options in debugfs.
38 Enabling this option may incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 2ba2d1649131..313947940a1a 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -1,4 +1,12 @@
1obj-y := enlighten.o setup.o multicalls.o mmu.o \ 1ifdef CONFIG_FTRACE
2 time.o xen-asm.o grant-table.o suspend.o 2# Do not profile debug and lowlevel utilities
3CFLAGS_REMOVE_spinlock.o = -pg
4CFLAGS_REMOVE_time.o = -pg
5CFLAGS_REMOVE_irq.o = -pg
6endif
3 7
4obj-$(CONFIG_SMP) += smp.o 8obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
9 time.o xen-asm_$(BITS).o grant-table.o suspend.o
10
11obj-$(CONFIG_SMP) += smp.o spinlock.o
12obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
new file mode 100644
index 000000000000..b53225d2cac3
--- /dev/null
+++ b/arch/x86/xen/debugfs.c
@@ -0,0 +1,123 @@
1#include <linux/init.h>
2#include <linux/debugfs.h>
3#include <linux/module.h>
4
5#include "debugfs.h"
6
7static struct dentry *d_xen_debug;
8
9struct dentry * __init xen_init_debugfs(void)
10{
11 if (!d_xen_debug) {
12 d_xen_debug = debugfs_create_dir("xen", NULL);
13
14 if (!d_xen_debug)
15 pr_warning("Could not create 'xen' debugfs directory\n");
16 }
17
18 return d_xen_debug;
19}
20
21struct array_data
22{
23 void *array;
24 unsigned elements;
25};
26
27static int u32_array_open(struct inode *inode, struct file *file)
28{
29 file->private_data = NULL;
30 return nonseekable_open(inode, file);
31}
32
33static size_t format_array(char *buf, size_t bufsize, const char *fmt,
34 u32 *array, unsigned array_size)
35{
36 size_t ret = 0;
37 unsigned i;
38
39 for(i = 0; i < array_size; i++) {
40 size_t len;
41
42 len = snprintf(buf, bufsize, fmt, array[i]);
43 len++; /* ' ' or '\n' */
44 ret += len;
45
46 if (buf) {
47 buf += len;
48 bufsize -= len;
49 buf[-1] = (i == array_size-1) ? '\n' : ' ';
50 }
51 }
52
53 ret++; /* \0 */
54 if (buf)
55 *buf = '\0';
56
57 return ret;
58}
59
60static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size)
61{
62 size_t len = format_array(NULL, 0, fmt, array, array_size);
63 char *ret;
64
65 ret = kmalloc(len, GFP_KERNEL);
66 if (ret == NULL)
67 return NULL;
68
69 format_array(ret, len, fmt, array, array_size);
70 return ret;
71}
72
73static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len,
74 loff_t *ppos)
75{
76 struct inode *inode = file->f_path.dentry->d_inode;
77 struct array_data *data = inode->i_private;
78 size_t size;
79
80 if (*ppos == 0) {
81 if (file->private_data) {
82 kfree(file->private_data);
83 file->private_data = NULL;
84 }
85
86 file->private_data = format_array_alloc("%u", data->array, data->elements);
87 }
88
89 size = 0;
90 if (file->private_data)
91 size = strlen(file->private_data);
92
93 return simple_read_from_buffer(buf, len, ppos, file->private_data, size);
94}
95
96static int xen_array_release(struct inode *inode, struct file *file)
97{
98 kfree(file->private_data);
99
100 return 0;
101}
102
103static struct file_operations u32_array_fops = {
104 .owner = THIS_MODULE,
105 .open = u32_array_open,
106 .release= xen_array_release,
107 .read = u32_array_read,
108};
109
110struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
111 struct dentry *parent,
112 u32 *array, unsigned elements)
113{
114 struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL);
115
116 if (data == NULL)
117 return NULL;
118
119 data->array = array;
120 data->elements = elements;
121
122 return debugfs_create_file(name, mode, parent, data, &u32_array_fops);
123}
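Note on the new debugfs.c above: format_array() is called twice, first with a NULL buffer purely to measure how much space the formatted u32 array needs (digits plus separators plus the trailing NUL), then again into a buffer of exactly that size allocated by format_array_alloc(). A self-contained userspace sketch of the same measure-then-fill pattern follows; it keeps the function shape but swaps kmalloc for malloc and uses an invented sample array.

/* u32_array.c - the measure-then-fill pattern of format_array()/
 * format_array_alloc() from xen/debugfs.c, ported to userspace. */
#include <stdio.h>
#include <stdlib.h>

static size_t format_array(char *buf, size_t bufsize, const char *fmt,
			   const unsigned *array, unsigned array_size)
{
	size_t ret = 0;
	unsigned i;

	for (i = 0; i < array_size; i++) {
		size_t len = snprintf(buf, bufsize, fmt, array[i]);

		len++;			/* room for ' ' or '\n' */
		ret += len;
		if (buf) {
			buf += len;
			bufsize -= len;
			buf[-1] = (i == array_size - 1) ? '\n' : ' ';
		}
	}
	ret++;				/* trailing '\0' */
	if (buf)
		*buf = '\0';
	return ret;
}

int main(void)
{
	unsigned samples[] = { 3, 14, 159, 2653 };
	size_t len = format_array(NULL, 0, "%u", samples, 4);	/* pass 1: size only */
	char *out = malloc(len);

	if (!out)
		return 1;
	format_array(out, len, "%u", samples, 4);		/* pass 2: fill */
	fputs(out, stdout);
	free(out);
	return 0;
}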
diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h
new file mode 100644
index 000000000000..e28132084832
--- /dev/null
+++ b/arch/x86/xen/debugfs.h
@@ -0,0 +1,10 @@
1#ifndef _XEN_DEBUGFS_H
2#define _XEN_DEBUGFS_H
3
4struct dentry * __init xen_init_debugfs(void);
5
6struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
7 struct dentry *parent,
8 u32 *array, unsigned elements);
9
10#endif /* _XEN_DEBUGFS_H */
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index bb508456ef52..b61534c7a4c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,22 +30,23 @@
30#include <xen/interface/xen.h> 30#include <xen/interface/xen.h>
31#include <xen/interface/physdev.h> 31#include <xen/interface/physdev.h>
32#include <xen/interface/vcpu.h> 32#include <xen/interface/vcpu.h>
33#include <xen/interface/sched.h>
34#include <xen/features.h> 33#include <xen/features.h>
35#include <xen/page.h> 34#include <xen/page.h>
35#include <xen/hvc-console.h>
36 36
37#include <asm/paravirt.h> 37#include <asm/paravirt.h>
38#include <asm/apic.h>
38#include <asm/page.h> 39#include <asm/page.h>
39#include <asm/xen/hypercall.h> 40#include <asm/xen/hypercall.h>
40#include <asm/xen/hypervisor.h> 41#include <asm/xen/hypervisor.h>
41#include <asm/fixmap.h> 42#include <asm/fixmap.h>
42#include <asm/processor.h> 43#include <asm/processor.h>
44#include <asm/msr-index.h>
43#include <asm/setup.h> 45#include <asm/setup.h>
44#include <asm/desc.h> 46#include <asm/desc.h>
45#include <asm/pgtable.h> 47#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 48#include <asm/tlbflush.h>
47#include <asm/reboot.h> 49#include <asm/reboot.h>
48#include <asm/pgalloc.h>
49 50
50#include "xen-ops.h" 51#include "xen-ops.h"
51#include "mmu.h" 52#include "mmu.h"
@@ -56,6 +57,21 @@ EXPORT_SYMBOL_GPL(hypercall_page);
56DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); 57DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu);
57DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); 58DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
58 59
60enum xen_domain_type xen_domain_type = XEN_NATIVE;
61EXPORT_SYMBOL_GPL(xen_domain_type);
62
63/*
64 * Identity map, in addition to plain kernel map. This needs to be
65 * large enough to allocate page table pages to allocate the rest.
66 * Each page can map 2MB.
67 */
68static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss;
69
70#ifdef CONFIG_X86_64
71/* l3 pud for userspace vsyscall mapping */
72static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss;
73#endif /* CONFIG_X86_64 */
74
59/* 75/*
60 * Note about cr3 (pagetable base) values: 76 * Note about cr3 (pagetable base) values:
61 * 77 *
@@ -97,7 +113,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
97 * 113 *
98 * 0: not available, 1: available 114 * 0: not available, 1: available
99 */ 115 */
100static int have_vcpu_info_placement = 1; 116static int have_vcpu_info_placement =
117#ifdef CONFIG_X86_32
118 1
119#else
120 0
121#endif
122 ;
123
101 124
102static void xen_vcpu_setup(int cpu) 125static void xen_vcpu_setup(int cpu)
103{ 126{
@@ -167,10 +190,14 @@ void xen_vcpu_restore(void)
167 190
168static void __init xen_banner(void) 191static void __init xen_banner(void)
169{ 192{
193 unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL);
194 struct xen_extraversion extra;
195 HYPERVISOR_xen_version(XENVER_extraversion, &extra);
196
170 printk(KERN_INFO "Booting paravirtualized kernel on %s\n", 197 printk(KERN_INFO "Booting paravirtualized kernel on %s\n",
171 pv_info.name); 198 pv_info.name);
172 printk(KERN_INFO "Hypervisor signature: %s%s\n", 199 printk(KERN_INFO "Xen version: %d.%d%s%s\n",
173 xen_start_info->magic, 200 version >> 16, version & 0xffff, extra.extraversion,
174 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); 201 xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
175} 202}
176 203
@@ -209,103 +236,68 @@ static unsigned long xen_get_debugreg(int reg)
209 return HYPERVISOR_get_debugreg(reg); 236 return HYPERVISOR_get_debugreg(reg);
210} 237}
211 238
212static unsigned long xen_save_fl(void) 239static void xen_leave_lazy(void)
213{ 240{
214 struct vcpu_info *vcpu; 241 paravirt_leave_lazy(paravirt_get_lazy_mode());
215 unsigned long flags; 242 xen_mc_flush();
216
217 vcpu = x86_read_percpu(xen_vcpu);
218
219 /* flag has opposite sense of mask */
220 flags = !vcpu->evtchn_upcall_mask;
221
222 /* convert to IF type flag
223 -0 -> 0x00000000
224 -1 -> 0xffffffff
225 */
226 return (-flags) & X86_EFLAGS_IF;
227} 243}
228 244
229static void xen_restore_fl(unsigned long flags) 245static unsigned long xen_store_tr(void)
230{ 246{
231 struct vcpu_info *vcpu; 247 return 0;
232
233 /* convert from IF type flag */
234 flags = !(flags & X86_EFLAGS_IF);
235
236 /* There's a one instruction preempt window here. We need to
237 make sure we're don't switch CPUs between getting the vcpu
238 pointer and updating the mask. */
239 preempt_disable();
240 vcpu = x86_read_percpu(xen_vcpu);
241 vcpu->evtchn_upcall_mask = flags;
242 preempt_enable_no_resched();
243
244 /* Doesn't matter if we get preempted here, because any
245 pending event will get dealt with anyway. */
246
247 if (flags == 0) {
248 preempt_check_resched();
249 barrier(); /* unmask then check (avoid races) */
250 if (unlikely(vcpu->evtchn_upcall_pending))
251 force_evtchn_callback();
252 }
253} 248}
254 249
255static void xen_irq_disable(void) 250/*
251 * Set the page permissions for a particular virtual address. If the
252 * address is a vmalloc mapping (or other non-linear mapping), then
253 * find the linear mapping of the page and also set its protections to
254 * match.
255 */
256static void set_aliased_prot(void *v, pgprot_t prot)
256{ 257{
257 /* There's a one instruction preempt window here. We need to 258 int level;
258 make sure we're don't switch CPUs between getting the vcpu 259 pte_t *ptep;
259 pointer and updating the mask. */ 260 pte_t pte;
260 preempt_disable(); 261 unsigned long pfn;
261 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; 262 struct page *page;
262 preempt_enable_no_resched();
263}
264 263
265static void xen_irq_enable(void) 264 ptep = lookup_address((unsigned long)v, &level);
266{ 265 BUG_ON(ptep == NULL);
267 struct vcpu_info *vcpu;
268 266
269 /* We don't need to worry about being preempted here, since 267 pfn = pte_pfn(*ptep);
270 either a) interrupts are disabled, so no preemption, or b) 268 page = pfn_to_page(pfn);
271 the caller is confused and is trying to re-enable interrupts
272 on an indeterminate processor. */
273 269
274 vcpu = x86_read_percpu(xen_vcpu); 270 pte = pfn_pte(pfn, prot);
275 vcpu->evtchn_upcall_mask = 0;
276 271
277 /* Doesn't matter if we get preempted here, because any 272 if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0))
278 pending event will get dealt with anyway. */ 273 BUG();
279 274
280 barrier(); /* unmask then check (avoid races) */ 275 if (!PageHighMem(page)) {
281 if (unlikely(vcpu->evtchn_upcall_pending)) 276 void *av = __va(PFN_PHYS(pfn));
282 force_evtchn_callback();
283}
284 277
285static void xen_safe_halt(void) 278 if (av != v)
286{ 279 if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0))
287 /* Blocking includes an implicit local_irq_enable(). */ 280 BUG();
288 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) 281 } else
289 BUG(); 282 kmap_flush_unused();
290} 283}
291 284
292static void xen_halt(void) 285static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries)
293{ 286{
294 if (irqs_disabled()) 287 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
295 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 288 int i;
296 else
297 xen_safe_halt();
298}
299 289
300static void xen_leave_lazy(void) 290 for(i = 0; i < entries; i += entries_per_page)
301{ 291 set_aliased_prot(ldt + i, PAGE_KERNEL_RO);
302 paravirt_leave_lazy(paravirt_get_lazy_mode());
303 xen_mc_flush();
304} 292}
305 293
306static unsigned long xen_store_tr(void) 294static void xen_free_ldt(struct desc_struct *ldt, unsigned entries)
307{ 295{
308 return 0; 296 const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE;
297 int i;
298
299 for(i = 0; i < entries; i += entries_per_page)
300 set_aliased_prot(ldt + i, PAGE_KERNEL);
309} 301}
310 302
311static void xen_set_ldt(const void *addr, unsigned entries) 303static void xen_set_ldt(const void *addr, unsigned entries)
@@ -363,14 +355,6 @@ static void load_TLS_descriptor(struct thread_struct *t,
363 355
364static void xen_load_tls(struct thread_struct *t, unsigned int cpu) 356static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
365{ 357{
366 xen_mc_batch();
367
368 load_TLS_descriptor(t, cpu, 0);
369 load_TLS_descriptor(t, cpu, 1);
370 load_TLS_descriptor(t, cpu, 2);
371
372 xen_mc_issue(PARAVIRT_LAZY_CPU);
373
374 /* 358 /*
375 * XXX sleazy hack: If we're being called in a lazy-cpu zone, 359 * XXX sleazy hack: If we're being called in a lazy-cpu zone,
376 * it means we're in a context switch, and %gs has just been 360 * it means we're in a context switch, and %gs has just been
@@ -379,16 +363,44 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu)
379 * Either way, it has been saved, and the new value will get 363 * Either way, it has been saved, and the new value will get
380 * loaded properly. This will go away as soon as Xen has been 364 * loaded properly. This will go away as soon as Xen has been
381 * modified to not save/restore %gs for normal hypercalls. 365 * modified to not save/restore %gs for normal hypercalls.
366 *
367 * On x86_64, this hack is not used for %gs, because gs points
368 * to KERNEL_GS_BASE (and uses it for PDA references), so we
369 * must not zero %gs on x86_64
370 *
371 * For x86_64, we need to zero %fs, otherwise we may get an
372 * exception between the new %fs descriptor being loaded and
373 * %fs being effectively cleared at __switch_to().
382 */ 374 */
383 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) 375 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
376#ifdef CONFIG_X86_32
384 loadsegment(gs, 0); 377 loadsegment(gs, 0);
378#else
379 loadsegment(fs, 0);
380#endif
381 }
382
383 xen_mc_batch();
384
385 load_TLS_descriptor(t, cpu, 0);
386 load_TLS_descriptor(t, cpu, 1);
387 load_TLS_descriptor(t, cpu, 2);
388
389 xen_mc_issue(PARAVIRT_LAZY_CPU);
385} 390}
386 391
392#ifdef CONFIG_X86_64
393static void xen_load_gs_index(unsigned int idx)
394{
395 if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx))
396 BUG();
397}
398#endif
399
387static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, 400static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
388 const void *ptr) 401 const void *ptr)
389{ 402{
390 unsigned long lp = (unsigned long)&dt[entrynum]; 403 xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]);
391 xmaddr_t mach_lp = virt_to_machine(lp);
392 u64 entry = *(u64 *)ptr; 404 u64 entry = *(u64 *)ptr;
393 405
394 preempt_disable(); 406 preempt_disable();
@@ -400,23 +412,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
400 preempt_enable(); 412 preempt_enable();
401} 413}
402 414
403static int cvt_gate_to_trap(int vector, u32 low, u32 high, 415static int cvt_gate_to_trap(int vector, const gate_desc *val,
404 struct trap_info *info) 416 struct trap_info *info)
405{ 417{
406 u8 type, dpl; 418 if (val->type != 0xf && val->type != 0xe)
407
408 type = (high >> 8) & 0x1f;
409 dpl = (high >> 13) & 3;
410
411 if (type != 0xf && type != 0xe)
412 return 0; 419 return 0;
413 420
414 info->vector = vector; 421 info->vector = vector;
415 info->address = (high & 0xffff0000) | (low & 0x0000ffff); 422 info->address = gate_offset(*val);
416 info->cs = low >> 16; 423 info->cs = gate_segment(*val);
417 info->flags = dpl; 424 info->flags = val->dpl;
418 /* interrupt gates clear IF */ 425 /* interrupt gates clear IF */
419 if (type == 0xe) 426 if (val->type == 0xe)
420 info->flags |= 4; 427 info->flags |= 4;
421 428
 422 return 1; 429 return 1;
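An aside on the rewritten cvt_gate_to_trap() above: it now reads the type, DPL, selector and offset through the gate_desc accessors rather than the removed open-coded shifts. The bit layout those shifts assumed can be exercised on its own; the sketch below mirrors the removed version with invented names and a made-up descriptor value:

/* Illustrative sketch, not from the patch: unpacking the fields that
 * cvt_gate_to_trap() needs from the two 32-bit words of a legacy gate
 * descriptor, as the removed open-coded version did. */
#include <stdint.h>
#include <stdio.h>

struct trap_info_ex {
	uint8_t  vector;
	uint8_t  flags;		/* DPL; bit 2 set for interrupt gates */
	uint16_t cs;
	uint32_t address;
};

static int gate_to_trap_ex(int vector, uint32_t low, uint32_t high,
			   struct trap_info_ex *info)
{
	uint8_t type = (high >> 8) & 0x1f;
	uint8_t dpl  = (high >> 13) & 3;

	if (type != 0xf && type != 0xe)	/* only trap/interrupt gates */
		return 0;

	info->vector  = vector;
	info->address = (high & 0xffff0000) | (low & 0x0000ffff);
	info->cs      = low >> 16;
	info->flags   = dpl;
	if (type == 0xe)		/* interrupt gates clear IF */
		info->flags |= 4;
	return 1;
}

int main(void)
{
	struct trap_info_ex ti;
	/* made-up interrupt gate: selector 0x60, offset 0xc0101234, DPL 0 */
	uint32_t low = 0x00601234, high = 0xc0108e00;

	if (gate_to_trap_ex(14, low, high, &ti))
		printf("vec=%u cs=%#x addr=%#x flags=%#x\n",
		       (unsigned)ti.vector, (unsigned)ti.cs,
		       (unsigned)ti.address, (unsigned)ti.flags);
	return 0;
}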
@@ -443,11 +450,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
443 450
444 if (p >= start && (p + 8) <= end) { 451 if (p >= start && (p + 8) <= end) {
445 struct trap_info info[2]; 452 struct trap_info info[2];
446 u32 *desc = (u32 *)g;
447 453
448 info[1].address = 0; 454 info[1].address = 0;
449 455
450 if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) 456 if (cvt_gate_to_trap(entrynum, g, &info[0]))
451 if (HYPERVISOR_set_trap_table(info)) 457 if (HYPERVISOR_set_trap_table(info))
452 BUG(); 458 BUG();
453 } 459 }
@@ -460,13 +466,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc,
460{ 466{
461 unsigned in, out, count; 467 unsigned in, out, count;
462 468
463 count = (desc->size+1) / 8; 469 count = (desc->size+1) / sizeof(gate_desc);
464 BUG_ON(count > 256); 470 BUG_ON(count > 256);
465 471
466 for (in = out = 0; in < count; in++) { 472 for (in = out = 0; in < count; in++) {
467 const u32 *entry = (u32 *)(desc->address + in * 8); 473 gate_desc *entry = (gate_desc*)(desc->address) + in;
468 474
469 if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) 475 if (cvt_gate_to_trap(in, entry, &traps[out]))
470 out++; 476 out++;
471 } 477 }
472 traps[out].address = 0; 478 traps[out].address = 0;
@@ -527,7 +533,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
527} 533}
528 534
529static void xen_load_sp0(struct tss_struct *tss, 535static void xen_load_sp0(struct tss_struct *tss,
530 struct thread_struct *thread) 536 struct thread_struct *thread)
531{ 537{
532 struct multicall_space mcs = xen_mc_entry(0); 538 struct multicall_space mcs = xen_mc_entry(0);
533 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); 539 MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
@@ -548,16 +554,47 @@ static void xen_io_delay(void)
548} 554}
549 555
550#ifdef CONFIG_X86_LOCAL_APIC 556#ifdef CONFIG_X86_LOCAL_APIC
551static u32 xen_apic_read(unsigned long reg) 557static u32 xen_apic_read(u32 reg)
552{ 558{
553 return 0; 559 return 0;
554} 560}
555 561
556static void xen_apic_write(unsigned long reg, u32 val) 562static void xen_apic_write(u32 reg, u32 val)
557{ 563{
558	/* Warn to see if there are any stray references */	564	/* Warn to see if there are any stray references */
559 WARN_ON(1); 565 WARN_ON(1);
560} 566}
567
568static u64 xen_apic_icr_read(void)
569{
570 return 0;
571}
572
573static void xen_apic_icr_write(u32 low, u32 id)
574{
575	/* Warn to see if there are any stray references */
576 WARN_ON(1);
577}
578
579static void xen_apic_wait_icr_idle(void)
580{
581 return;
582}
583
584static u32 xen_safe_apic_wait_icr_idle(void)
585{
586 return 0;
587}
588
589static struct apic_ops xen_basic_apic_ops = {
590 .read = xen_apic_read,
591 .write = xen_apic_write,
592 .icr_read = xen_apic_icr_read,
593 .icr_write = xen_apic_icr_write,
594 .wait_icr_idle = xen_apic_wait_icr_idle,
595 .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle,
596};
597
561#endif 598#endif
562 599
563static void xen_flush_tlb(void) 600static void xen_flush_tlb(void)
@@ -695,36 +732,105 @@ static void set_current_cr3(void *v)
695 x86_write_percpu(xen_current_cr3, (unsigned long)v); 732 x86_write_percpu(xen_current_cr3, (unsigned long)v);
696} 733}
697 734
698static void xen_write_cr3(unsigned long cr3) 735static void __xen_write_cr3(bool kernel, unsigned long cr3)
699{ 736{
700 struct mmuext_op *op; 737 struct mmuext_op *op;
701 struct multicall_space mcs; 738 struct multicall_space mcs;
702 unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); 739 unsigned long mfn;
703 740
704 BUG_ON(preemptible()); 741 if (cr3)
742 mfn = pfn_to_mfn(PFN_DOWN(cr3));
743 else
744 mfn = 0;
705 745
706 mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ 746 WARN_ON(mfn == 0 && kernel);
707 747
708	/* Update while interrupts are disabled, so it's atomic with	748	mcs = __xen_mc_entry(sizeof(*op));
709	   respect to IPIs */
710 x86_write_percpu(xen_cr3, cr3);
711 749
712 op = mcs.args; 750 op = mcs.args;
713 op->cmd = MMUEXT_NEW_BASEPTR; 751 op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR;
714 op->arg1.mfn = mfn; 752 op->arg1.mfn = mfn;
715 753
716 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 754 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
717 755
718 /* Update xen_update_cr3 once the batch has actually 756 if (kernel) {
719 been submitted. */ 757 x86_write_percpu(xen_cr3, cr3);
720 xen_mc_callback(set_current_cr3, (void *)cr3); 758
759 /* Update xen_current_cr3 once the batch has actually
760 been submitted. */
761 xen_mc_callback(set_current_cr3, (void *)cr3);
762 }
763}
764
765static void xen_write_cr3(unsigned long cr3)
766{
767 BUG_ON(preemptible());
768
769 xen_mc_batch(); /* disables interrupts */
770
771	/* Update while interrupts are disabled, so it's atomic with
772	   respect to IPIs */
773 x86_write_percpu(xen_cr3, cr3);
774
775 __xen_write_cr3(true, cr3);
776
777#ifdef CONFIG_X86_64
778 {
779 pgd_t *user_pgd = xen_get_user_pgd(__va(cr3));
780 if (user_pgd)
781 __xen_write_cr3(false, __pa(user_pgd));
782 else
783 __xen_write_cr3(false, 0);
784 }
785#endif
721 786
722 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ 787 xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */
723} 788}
724 789
790static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
791{
792 int ret;
793
794 ret = 0;
795
796 switch(msr) {
797#ifdef CONFIG_X86_64
798 unsigned which;
799 u64 base;
800
801 case MSR_FS_BASE: which = SEGBASE_FS; goto set;
802 case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set;
803 case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set;
804
805 set:
806 base = ((u64)high << 32) | low;
807 if (HYPERVISOR_set_segment_base(which, base) != 0)
808 ret = -EFAULT;
809 break;
810#endif
811
812 case MSR_STAR:
813 case MSR_CSTAR:
814 case MSR_LSTAR:
815 case MSR_SYSCALL_MASK:
816 case MSR_IA32_SYSENTER_CS:
817 case MSR_IA32_SYSENTER_ESP:
818 case MSR_IA32_SYSENTER_EIP:
819 /* Fast syscall setup is all done in hypercalls, so
820 these are all ignored. Stub them out here to stop
821 Xen console noise. */
822 break;
823
824 default:
825 ret = native_write_msr_safe(msr, low, high);
826 }
827
828 return ret;
829}
830
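An aside on xen_write_msr_safe() above: for the segment-base MSRs it first reassembles the 64-bit base from the low/high halves that the wrmsr interface passes separately, then hands the result to HYPERVISOR_set_segment_base(). A minimal standalone sketch of that reassembly, with invented names:

/* Illustrative sketch, not from the patch: rebuilding a 64-bit value
 * from the low/high halves a wrmsr-style interface supplies. */
#include <stdint.h>
#include <stdio.h>

static uint64_t combine_msr_halves(uint32_t low, uint32_t high)
{
	return ((uint64_t)high << 32) | low;
}

int main(void)
{
	/* example: a hypothetical FS base of 0x00007f0012340000 */
	uint32_t low = 0x12340000, high = 0x00007f00;

	printf("base = %#llx\n",
	       (unsigned long long)combine_msr_halves(low, high));
	return 0;
}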
725/* Early in boot, while setting up the initial pagetable, assume 831/* Early in boot, while setting up the initial pagetable, assume
726 everything is pinned. */ 832 everything is pinned. */
727static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) 833static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
728{ 834{
729#ifdef CONFIG_FLATMEM 835#ifdef CONFIG_FLATMEM
730 BUG_ON(mem_map); /* should only be used early */ 836 BUG_ON(mem_map); /* should only be used early */
@@ -734,7 +840,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn)
734 840
735/* Early release_pte assumes that all pts are pinned, since there's 841/* Early release_pte assumes that all pts are pinned, since there's
736 only init_mm and anything attached to that is pinned. */ 842 only init_mm and anything attached to that is pinned. */
737static void xen_release_pte_init(u32 pfn) 843static void xen_release_pte_init(unsigned long pfn)
738{ 844{
739 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 845 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
740} 846}
@@ -750,7 +856,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
750 856
751/* This needs to make sure the new pte page is pinned iff its being 857/* This needs to make sure the new pte page is pinned iff its being
752 attached to a pinned pagetable. */ 858 attached to a pinned pagetable. */
753static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) 859static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level)
754{ 860{
755 struct page *page = pfn_to_page(pfn); 861 struct page *page = pfn_to_page(pfn);
756 862
@@ -758,34 +864,77 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level)
758 SetPagePinned(page); 864 SetPagePinned(page);
759 865
760 if (!PageHighMem(page)) { 866 if (!PageHighMem(page)) {
761 make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); 867 make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn)));
762 if (level == PT_PTE) 868 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
763 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); 869 pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn);
764 } else 870 } else
765 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
766 this page */ 872 this page */
767 kmap_flush_unused(); 873 kmap_flush_unused();
874 vm_unmap_aliases();
768 } 875 }
769} 876}
770 877
771static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) 878static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn)
772{ 879{
773 xen_alloc_ptpage(mm, pfn, PT_PTE); 880 xen_alloc_ptpage(mm, pfn, PT_PTE);
774} 881}
775 882
776static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) 883static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
777{ 884{
778 xen_alloc_ptpage(mm, pfn, PT_PMD); 885 xen_alloc_ptpage(mm, pfn, PT_PMD);
779} 886}
780 887
888static int xen_pgd_alloc(struct mm_struct *mm)
889{
890 pgd_t *pgd = mm->pgd;
891 int ret = 0;
892
893 BUG_ON(PagePinned(virt_to_page(pgd)));
894
895#ifdef CONFIG_X86_64
896 {
897 struct page *page = virt_to_page(pgd);
898 pgd_t *user_pgd;
899
900 BUG_ON(page->private != 0);
901
902 ret = -ENOMEM;
903
904 user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
905 page->private = (unsigned long)user_pgd;
906
907 if (user_pgd != NULL) {
908 user_pgd[pgd_index(VSYSCALL_START)] =
909 __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE);
910 ret = 0;
911 }
912
913 BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd))));
914 }
915#endif
916
917 return ret;
918}
919
920static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
921{
922#ifdef CONFIG_X86_64
923 pgd_t *user_pgd = xen_get_user_pgd(pgd);
924
925 if (user_pgd)
926 free_page((unsigned long)user_pgd);
927#endif
928}
929
781/* This should never happen until we're OK to use struct page */ 930/* This should never happen until we're OK to use struct page */
782static void xen_release_ptpage(u32 pfn, unsigned level) 931static void xen_release_ptpage(unsigned long pfn, unsigned level)
783{ 932{
784 struct page *page = pfn_to_page(pfn); 933 struct page *page = pfn_to_page(pfn);
785 934
786 if (PagePinned(page)) { 935 if (PagePinned(page)) {
787 if (!PageHighMem(page)) { 936 if (!PageHighMem(page)) {
788 if (level == PT_PTE) 937 if (level == PT_PTE && USE_SPLIT_PTLOCKS)
789 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 938 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
790 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 939 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
791 } 940 }
@@ -793,16 +942,28 @@ static void xen_release_ptpage(u32 pfn, unsigned level)
793 } 942 }
794} 943}
795 944
796static void xen_release_pte(u32 pfn) 945static void xen_release_pte(unsigned long pfn)
797{ 946{
798 xen_release_ptpage(pfn, PT_PTE); 947 xen_release_ptpage(pfn, PT_PTE);
799} 948}
800 949
801static void xen_release_pmd(u32 pfn) 950static void xen_release_pmd(unsigned long pfn)
802{ 951{
803 xen_release_ptpage(pfn, PT_PMD); 952 xen_release_ptpage(pfn, PT_PMD);
804} 953}
805 954
955#if PAGETABLE_LEVELS == 4
956static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
957{
958 xen_alloc_ptpage(mm, pfn, PT_PUD);
959}
960
961static void xen_release_pud(unsigned long pfn)
962{
963 xen_release_ptpage(pfn, PT_PUD);
964}
965#endif
966
806#ifdef CONFIG_HIGHPTE 967#ifdef CONFIG_HIGHPTE
807static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) 968static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
808{ 969{
@@ -820,6 +981,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
820} 981}
821#endif 982#endif
822 983
984#ifdef CONFIG_X86_32
823static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 985static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
824{ 986{
825 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 987 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
@@ -838,71 +1000,20 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte)
838 1000
839 xen_set_pte(ptep, pte); 1001 xen_set_pte(ptep, pte);
840} 1002}
1003#endif
841 1004
842static __init void xen_pagetable_setup_start(pgd_t *base) 1005static __init void xen_pagetable_setup_start(pgd_t *base)
843{ 1006{
844 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base;
845 int i;
846
847 /* special set_pte for pagetable initialization */
848 pv_mmu_ops.set_pte = xen_set_pte_init;
849
850 init_mm.pgd = base;
851 /*
852 * copy top-level of Xen-supplied pagetable into place. This
853 * is a stand-in while we copy the pmd pages.
854 */
855 memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t));
856
857 /*
858 * For PAE, need to allocate new pmds, rather than
859 * share Xen's, since Xen doesn't like pmd's being
860 * shared between address spaces.
861 */
862 for (i = 0; i < PTRS_PER_PGD; i++) {
863 if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) {
864 pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE);
865
866 memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]),
867 PAGE_SIZE);
868
869 make_lowmem_page_readonly(pmd);
870
871 set_pgd(&base[i], __pgd(1 + __pa(pmd)));
872 } else
873 pgd_clear(&base[i]);
874 }
875
876 /* make sure zero_page is mapped RO so we can use it in pagetables */
877 make_lowmem_page_readonly(empty_zero_page);
878 make_lowmem_page_readonly(base);
879 /*
880 * Switch to new pagetable. This is done before
881 * pagetable_init has done anything so that the new pages
882 * added to the table can be prepared properly for Xen.
883 */
884 xen_write_cr3(__pa(base));
885
886 /* Unpin initial Xen pagetable */
887 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
888 PFN_DOWN(__pa(xen_start_info->pt_base)));
889} 1007}
890 1008
891void xen_setup_shared_info(void) 1009void xen_setup_shared_info(void)
892{ 1010{
893 if (!xen_feature(XENFEAT_auto_translated_physmap)) { 1011 if (!xen_feature(XENFEAT_auto_translated_physmap)) {
894 unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); 1012 set_fixmap(FIX_PARAVIRT_BOOTMAP,
895 1013 xen_start_info->shared_info);
896 /* 1014
897 * Create a mapping for the shared info page. 1015 HYPERVISOR_shared_info =
898 * Should be set_fixmap(), but shared_info is a machine 1016 (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
899 * address with no corresponding pseudo-phys address.
900 */
901 set_pte_mfn(addr,
902 PFN_DOWN(xen_start_info->shared_info),
903 PAGE_KERNEL);
904
905 HYPERVISOR_shared_info = (struct shared_info *)addr;
906 } else 1017 } else
907 HYPERVISOR_shared_info = 1018 HYPERVISOR_shared_info =
908 (struct shared_info *)__va(xen_start_info->shared_info); 1019 (struct shared_info *)__va(xen_start_info->shared_info);
@@ -917,26 +1028,32 @@ void xen_setup_shared_info(void)
917 1028
918static __init void xen_pagetable_setup_done(pgd_t *base) 1029static __init void xen_pagetable_setup_done(pgd_t *base)
919{ 1030{
920 /* This will work as long as patching hasn't happened yet
921 (which it hasn't) */
922 pv_mmu_ops.alloc_pte = xen_alloc_pte;
923 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
924 pv_mmu_ops.release_pte = xen_release_pte;
925 pv_mmu_ops.release_pmd = xen_release_pmd;
926 pv_mmu_ops.set_pte = xen_set_pte;
927
928 xen_setup_shared_info(); 1031 xen_setup_shared_info();
929
930 /* Actually pin the pagetable down, but we can't set PG_pinned
931 yet because the page structures don't exist yet. */
932 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base)));
933} 1032}
934 1033
935static __init void xen_post_allocator_init(void) 1034static __init void xen_post_allocator_init(void)
936{ 1035{
1036 pv_mmu_ops.set_pte = xen_set_pte;
937 pv_mmu_ops.set_pmd = xen_set_pmd; 1037 pv_mmu_ops.set_pmd = xen_set_pmd;
938 pv_mmu_ops.set_pud = xen_set_pud; 1038 pv_mmu_ops.set_pud = xen_set_pud;
1039#if PAGETABLE_LEVELS == 4
1040 pv_mmu_ops.set_pgd = xen_set_pgd;
1041#endif
939 1042
1043 /* This will work as long as patching hasn't happened yet
1044 (which it hasn't) */
1045 pv_mmu_ops.alloc_pte = xen_alloc_pte;
1046 pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
1047 pv_mmu_ops.release_pte = xen_release_pte;
1048 pv_mmu_ops.release_pmd = xen_release_pmd;
1049#if PAGETABLE_LEVELS == 4
1050 pv_mmu_ops.alloc_pud = xen_alloc_pud;
1051 pv_mmu_ops.release_pud = xen_release_pud;
1052#endif
1053
1054#ifdef CONFIG_X86_64
1055 SetPagePinned(virt_to_page(level3_user_vsyscall));
1056#endif
940 xen_mark_init_mm_pinned(); 1057 xen_mark_init_mm_pinned();
941} 1058}
942 1059
@@ -1025,8 +1142,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1025#ifdef CONFIG_X86_F00F_BUG 1142#ifdef CONFIG_X86_F00F_BUG
1026 case FIX_F00F_IDT: 1143 case FIX_F00F_IDT:
1027#endif 1144#endif
1145#ifdef CONFIG_X86_32
1028 case FIX_WP_TEST: 1146 case FIX_WP_TEST:
1029 case FIX_VDSO: 1147 case FIX_VDSO:
1148# ifdef CONFIG_HIGHMEM
1149 case FIX_KMAP_BEGIN ... FIX_KMAP_END:
1150# endif
1151#else
1152 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1153#endif
1030#ifdef CONFIG_X86_LOCAL_APIC 1154#ifdef CONFIG_X86_LOCAL_APIC
1031 case FIX_APIC_BASE: /* maps dummy local APIC */ 1155 case FIX_APIC_BASE: /* maps dummy local APIC */
1032#endif 1156#endif
@@ -1039,6 +1163,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
1039 } 1163 }
1040 1164
1041 __native_set_fixmap(idx, pte); 1165 __native_set_fixmap(idx, pte);
1166
1167#ifdef CONFIG_X86_64
1168 /* Replicate changes to map the vsyscall page into the user
1169 pagetable vsyscall mapping. */
1170 if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) {
1171 unsigned long vaddr = __fix_to_virt(idx);
1172 set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
1173 }
1174#endif
1042} 1175}
1043 1176
1044static const struct pv_info xen_info __initdata = { 1177static const struct pv_info xen_info __initdata = {
@@ -1084,18 +1217,28 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1084 .wbinvd = native_wbinvd, 1217 .wbinvd = native_wbinvd,
1085 1218
1086 .read_msr = native_read_msr_safe, 1219 .read_msr = native_read_msr_safe,
1087 .write_msr = native_write_msr_safe, 1220 .write_msr = xen_write_msr_safe,
1088 .read_tsc = native_read_tsc, 1221 .read_tsc = native_read_tsc,
1089 .read_pmc = native_read_pmc, 1222 .read_pmc = native_read_pmc,
1090 1223
1091 .iret = xen_iret, 1224 .iret = xen_iret,
1092 .irq_enable_sysexit = xen_sysexit, 1225 .irq_enable_sysexit = xen_sysexit,
1226#ifdef CONFIG_X86_64
1227 .usergs_sysret32 = xen_sysret32,
1228 .usergs_sysret64 = xen_sysret64,
1229#endif
1093 1230
1094 .load_tr_desc = paravirt_nop, 1231 .load_tr_desc = paravirt_nop,
1095 .set_ldt = xen_set_ldt, 1232 .set_ldt = xen_set_ldt,
1096 .load_gdt = xen_load_gdt, 1233 .load_gdt = xen_load_gdt,
1097 .load_idt = xen_load_idt, 1234 .load_idt = xen_load_idt,
1098 .load_tls = xen_load_tls, 1235 .load_tls = xen_load_tls,
1236#ifdef CONFIG_X86_64
1237 .load_gs_index = xen_load_gs_index,
1238#endif
1239
1240 .alloc_ldt = xen_alloc_ldt,
1241 .free_ldt = xen_free_ldt,
1099 1242
1100 .store_gdt = native_store_gdt, 1243 .store_gdt = native_store_gdt,
1101 .store_idt = native_store_idt, 1244 .store_idt = native_store_idt,
@@ -1109,30 +1252,17 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1109 .set_iopl_mask = xen_set_iopl_mask, 1252 .set_iopl_mask = xen_set_iopl_mask,
1110 .io_delay = xen_io_delay, 1253 .io_delay = xen_io_delay,
1111 1254
1255 /* Xen takes care of %gs when switching to usermode for us */
1256 .swapgs = paravirt_nop,
1257
1112 .lazy_mode = { 1258 .lazy_mode = {
1113 .enter = paravirt_enter_lazy_cpu, 1259 .enter = paravirt_enter_lazy_cpu,
1114 .leave = xen_leave_lazy, 1260 .leave = xen_leave_lazy,
1115 }, 1261 },
1116}; 1262};
1117 1263
1118static const struct pv_irq_ops xen_irq_ops __initdata = {
1119 .init_IRQ = xen_init_IRQ,
1120 .save_fl = xen_save_fl,
1121 .restore_fl = xen_restore_fl,
1122 .irq_disable = xen_irq_disable,
1123 .irq_enable = xen_irq_enable,
1124 .safe_halt = xen_safe_halt,
1125 .halt = xen_halt,
1126#ifdef CONFIG_X86_64
1127 .adjust_exception_frame = paravirt_nop,
1128#endif
1129};
1130
1131static const struct pv_apic_ops xen_apic_ops __initdata = { 1264static const struct pv_apic_ops xen_apic_ops __initdata = {
1132#ifdef CONFIG_X86_LOCAL_APIC 1265#ifdef CONFIG_X86_LOCAL_APIC
1133 .apic_write = xen_apic_write,
1134 .apic_write_atomic = xen_apic_write,
1135 .apic_read = xen_apic_read,
1136 .setup_boot_clock = paravirt_nop, 1266 .setup_boot_clock = paravirt_nop,
1137 .setup_secondary_clock = paravirt_nop, 1267 .setup_secondary_clock = paravirt_nop,
1138 .startup_ipi_hook = paravirt_nop, 1268 .startup_ipi_hook = paravirt_nop,
@@ -1157,8 +1287,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1157 .pte_update = paravirt_nop, 1287 .pte_update = paravirt_nop,
1158 .pte_update_defer = paravirt_nop, 1288 .pte_update_defer = paravirt_nop,
1159 1289
1160 .pgd_alloc = __paravirt_pgd_alloc, 1290 .pgd_alloc = xen_pgd_alloc,
1161 .pgd_free = paravirt_nop, 1291 .pgd_free = xen_pgd_free,
1162 1292
1163 .alloc_pte = xen_alloc_pte_init, 1293 .alloc_pte = xen_alloc_pte_init,
1164 .release_pte = xen_release_pte_init, 1294 .release_pte = xen_release_pte_init,
@@ -1170,7 +1300,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1170 .kmap_atomic_pte = xen_kmap_atomic_pte, 1300 .kmap_atomic_pte = xen_kmap_atomic_pte,
1171#endif 1301#endif
1172 1302
1173 .set_pte = NULL, /* see xen_pagetable_setup_* */ 1303#ifdef CONFIG_X86_64
1304 .set_pte = xen_set_pte,
1305#else
1306 .set_pte = xen_set_pte_init,
1307#endif
1174 .set_pte_at = xen_set_pte_at, 1308 .set_pte_at = xen_set_pte_at,
1175 .set_pmd = xen_set_pmd_hyper, 1309 .set_pmd = xen_set_pmd_hyper,
1176 1310
@@ -1178,21 +1312,32 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1178 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 1312 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
1179 1313
1180 .pte_val = xen_pte_val, 1314 .pte_val = xen_pte_val,
1181 .pte_flags = native_pte_val, 1315 .pte_flags = native_pte_flags,
1182 .pgd_val = xen_pgd_val, 1316 .pgd_val = xen_pgd_val,
1183 1317
1184 .make_pte = xen_make_pte, 1318 .make_pte = xen_make_pte,
1185 .make_pgd = xen_make_pgd, 1319 .make_pgd = xen_make_pgd,
1186 1320
1321#ifdef CONFIG_X86_PAE
1187 .set_pte_atomic = xen_set_pte_atomic, 1322 .set_pte_atomic = xen_set_pte_atomic,
1188 .set_pte_present = xen_set_pte_at, 1323 .set_pte_present = xen_set_pte_at,
1189 .set_pud = xen_set_pud_hyper,
1190 .pte_clear = xen_pte_clear, 1324 .pte_clear = xen_pte_clear,
1191 .pmd_clear = xen_pmd_clear, 1325 .pmd_clear = xen_pmd_clear,
1326#endif /* CONFIG_X86_PAE */
1327 .set_pud = xen_set_pud_hyper,
1192 1328
1193 .make_pmd = xen_make_pmd, 1329 .make_pmd = xen_make_pmd,
1194 .pmd_val = xen_pmd_val, 1330 .pmd_val = xen_pmd_val,
1195 1331
1332#if PAGETABLE_LEVELS == 4
1333 .pud_val = xen_pud_val,
1334 .make_pud = xen_make_pud,
1335 .set_pgd = xen_set_pgd_hyper,
1336
1337 .alloc_pud = xen_alloc_pte_init,
1338 .release_pud = xen_release_pte_init,
1339#endif /* PAGETABLE_LEVELS == 4 */
1340
1196 .activate_mm = xen_activate_mm, 1341 .activate_mm = xen_activate_mm,
1197 .dup_mmap = xen_dup_mmap, 1342 .dup_mmap = xen_dup_mmap,
1198 .exit_mmap = xen_exit_mmap, 1343 .exit_mmap = xen_exit_mmap,
@@ -1205,21 +1350,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1205 .set_fixmap = xen_set_fixmap, 1350 .set_fixmap = xen_set_fixmap,
1206}; 1351};
1207 1352
1208#ifdef CONFIG_SMP
1209static const struct smp_ops xen_smp_ops __initdata = {
1210 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
1211 .smp_prepare_cpus = xen_smp_prepare_cpus,
1212 .cpu_up = xen_cpu_up,
1213 .smp_cpus_done = xen_smp_cpus_done,
1214
1215 .smp_send_stop = xen_smp_send_stop,
1216 .smp_send_reschedule = xen_smp_send_reschedule,
1217
1218 .send_call_func_ipi = xen_smp_send_call_function_ipi,
1219 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
1220};
1221#endif /* CONFIG_SMP */
1222
1223static void xen_reboot(int reason) 1353static void xen_reboot(int reason)
1224{ 1354{
1225 struct sched_shutdown r = { .reason = reason }; 1355 struct sched_shutdown r = { .reason = reason };
@@ -1264,15 +1394,219 @@ static const struct machine_ops __initdata xen_machine_ops = {
1264 1394
1265static void __init xen_reserve_top(void) 1395static void __init xen_reserve_top(void)
1266{ 1396{
1397#ifdef CONFIG_X86_32
1267 unsigned long top = HYPERVISOR_VIRT_START; 1398 unsigned long top = HYPERVISOR_VIRT_START;
1268 struct xen_platform_parameters pp; 1399 struct xen_platform_parameters pp;
1269 1400
1270 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) 1401 if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0)
1271 top = pp.virt_start; 1402 top = pp.virt_start;
1272 1403
1273 reserve_top_address(-top + 2 * PAGE_SIZE); 1404 reserve_top_address(-top);
1405#endif /* CONFIG_X86_32 */
1406}
1407
1408/*
1409 * Like __va(), but returns the address in the kernel mapping (which is
1410 * all we have until the physical memory mapping has been set up).
1411 */
1412static void *__ka(phys_addr_t paddr)
1413{
1414#ifdef CONFIG_X86_64
1415 return (void *)(paddr + __START_KERNEL_map);
1416#else
1417 return __va(paddr);
1418#endif
1419}
1420
1421/* Convert a machine address to physical address */
1422static unsigned long m2p(phys_addr_t maddr)
1423{
1424 phys_addr_t paddr;
1425
1426 maddr &= PTE_PFN_MASK;
1427 paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT;
1428
1429 return paddr;
1274} 1430}
1275 1431
1432/* Convert a machine address to kernel virtual */
1433static void *m2v(phys_addr_t maddr)
1434{
1435 return __ka(m2p(maddr));
1436}
1437
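An aside on m2p()/m2v() above: m2p() masks off the non-frame bits, translates the machine frame number to a pseudo-physical one, and shifts it back into an address; m2v() then feeds the result to __ka(). The sketch below shows the same arithmetic with a one-entry stand-in for the real machine-to-physical table (all names are invented):

/* Illustrative sketch, not from the patch: the machine-to-physical
 * translation arithmetic behind m2p(). */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_EX 12
#define PAGE_SIZE_EX  (1ULL << PAGE_SHIFT_EX)
#define PFN_MASK_EX   (~(PAGE_SIZE_EX - 1))	/* stand-in for PTE_PFN_MASK
						   (clears the low bits only) */

/* pretend machine frame 0x1000 maps to pseudo-physical frame 0x20 */
static uint64_t mfn_to_pfn_ex(uint64_t mfn)
{
	return mfn == 0x1000 ? 0x20 : 0;
}

static uint64_t m2p_ex(uint64_t maddr)
{
	maddr &= PFN_MASK_EX;		/* drop the in-page offset */
	return mfn_to_pfn_ex(maddr >> PAGE_SHIFT_EX) << PAGE_SHIFT_EX;
}

int main(void)
{
	uint64_t maddr = (0x1000ULL << PAGE_SHIFT_EX) | 0x7ff;

	printf("maddr %#llx -> paddr %#llx\n",
	       (unsigned long long)maddr, (unsigned long long)m2p_ex(maddr));
	return 0;
}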
1438static void set_page_prot(void *addr, pgprot_t prot)
1439{
1440 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
1441 pte_t pte = pfn_pte(pfn, prot);
1442
1443 if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0))
1444 BUG();
1445}
1446
1447static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1448{
1449 unsigned pmdidx, pteidx;
1450 unsigned ident_pte;
1451 unsigned long pfn;
1452
1453 ident_pte = 0;
1454 pfn = 0;
1455 for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
1456 pte_t *pte_page;
1457
1458 /* Reuse or allocate a page of ptes */
1459 if (pmd_present(pmd[pmdidx]))
1460 pte_page = m2v(pmd[pmdidx].pmd);
1461 else {
1462 /* Check for free pte pages */
1463 if (ident_pte == ARRAY_SIZE(level1_ident_pgt))
1464 break;
1465
1466 pte_page = &level1_ident_pgt[ident_pte];
1467 ident_pte += PTRS_PER_PTE;
1468
1469 pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE);
1470 }
1471
1472 /* Install mappings */
1473 for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1474 pte_t pte;
1475
1476 if (pfn > max_pfn_mapped)
1477 max_pfn_mapped = pfn;
1478
1479 if (!pte_none(pte_page[pteidx]))
1480 continue;
1481
1482 pte = pfn_pte(pfn, PAGE_KERNEL_EXEC);
1483 pte_page[pteidx] = pte;
1484 }
1485 }
1486
1487 for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE)
1488 set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO);
1489
1490 set_page_prot(pmd, PAGE_KERNEL_RO);
1491}
1492
1493#ifdef CONFIG_X86_64
1494static void convert_pfn_mfn(void *v)
1495{
1496 pte_t *pte = v;
1497 int i;
1498
1499 /* All levels are converted the same way, so just treat them
1500 as ptes. */
1501 for(i = 0; i < PTRS_PER_PTE; i++)
1502 pte[i] = xen_make_pte(pte[i].pte);
1503}
1504
1505/*
1506 * Set up the initial kernel pagetable.
1507 *
1508 * We can construct this by grafting the Xen provided pagetable into
1509 * head_64.S's preconstructed pagetables. We copy the Xen L2s into
1510 * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This
1511 * means that only the kernel has a physical mapping to start with -
1512 * but that's enough to get __va working. We need to fill in the rest
1513 * of the physical mapping once some sort of allocator has been set
1514 * up.
1515 */
1516static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1517{
1518 pud_t *l3;
1519 pmd_t *l2;
1520
1521 /* Zap identity mapping */
1522 init_level4_pgt[0] = __pgd(0);
1523
1524 /* Pre-constructed entries are in pfn, so convert to mfn */
1525 convert_pfn_mfn(init_level4_pgt);
1526 convert_pfn_mfn(level3_ident_pgt);
1527 convert_pfn_mfn(level3_kernel_pgt);
1528
1529 l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
1530 l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);
1531
1532 memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1533 memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1534
1535 l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd);
1536 l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud);
1537 memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD);
1538
1539 /* Set up identity map */
1540 xen_map_identity_early(level2_ident_pgt, max_pfn);
1541
1542 /* Make pagetable pieces RO */
1543 set_page_prot(init_level4_pgt, PAGE_KERNEL_RO);
1544 set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
1545 set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
1546 set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
1547 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1548 set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);
1549
1550 /* Pin down new L4 */
1551 pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
1552 PFN_DOWN(__pa_symbol(init_level4_pgt)));
1553
1554 /* Unpin Xen-provided one */
1555 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1556
1557 /* Switch over */
1558 pgd = init_level4_pgt;
1559
1560 /*
1561 * At this stage there can be no user pgd, and no page
1562 * structure to attach it to, so make sure we just set kernel
1563 * pgd.
1564 */
1565 xen_mc_batch();
1566 __xen_write_cr3(true, __pa(pgd));
1567 xen_mc_issue(PARAVIRT_LAZY_CPU);
1568
1569 reserve_early(__pa(xen_start_info->pt_base),
1570 __pa(xen_start_info->pt_base +
1571 xen_start_info->nr_pt_frames * PAGE_SIZE),
1572 "XEN PAGETABLES");
1573
1574 return pgd;
1575}
1576#else /* !CONFIG_X86_64 */
1577static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss;
1578
1579static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
1580{
1581 pmd_t *kernel_pmd;
1582
1583 init_pg_tables_start = __pa(pgd);
1584 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE;
1585 max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024);
1586
1587 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1588 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1589
1590 xen_map_identity_early(level2_kernel_pgt, max_pfn);
1591
1592 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1593 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY],
1594 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT));
1595
1596 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
1597 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1598 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1599
1600 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1601
1602 xen_write_cr3(__pa(swapper_pg_dir));
1603
1604 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
1605
1606 return swapper_pg_dir;
1607}
1608#endif /* CONFIG_X86_64 */
1609
1276/* First C function to be called on Xen boot */ 1610/* First C function to be called on Xen boot */
1277asmlinkage void __init xen_start_kernel(void) 1611asmlinkage void __init xen_start_kernel(void)
1278{ 1612{
@@ -1281,6 +1615,8 @@ asmlinkage void __init xen_start_kernel(void)
1281 if (!xen_start_info) 1615 if (!xen_start_info)
1282 return; 1616 return;
1283 1617
1618 xen_domain_type = XEN_PV_DOMAIN;
1619
1284 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); 1620 BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0);
1285 1621
1286 xen_setup_features(); 1622 xen_setup_features();
@@ -1290,10 +1626,18 @@ asmlinkage void __init xen_start_kernel(void)
1290 pv_init_ops = xen_init_ops; 1626 pv_init_ops = xen_init_ops;
1291 pv_time_ops = xen_time_ops; 1627 pv_time_ops = xen_time_ops;
1292 pv_cpu_ops = xen_cpu_ops; 1628 pv_cpu_ops = xen_cpu_ops;
1293 pv_irq_ops = xen_irq_ops;
1294 pv_apic_ops = xen_apic_ops; 1629 pv_apic_ops = xen_apic_ops;
1295 pv_mmu_ops = xen_mmu_ops; 1630 pv_mmu_ops = xen_mmu_ops;
1296 1631
1632 xen_init_irq_ops();
1633
1634#ifdef CONFIG_X86_LOCAL_APIC
1635 /*
1636 * set up the basic apic ops.
1637 */
1638 apic_ops = &xen_basic_apic_ops;
1639#endif
1640
1297 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { 1641 if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) {
1298 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; 1642 pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start;
1299 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; 1643 pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit;
@@ -1301,60 +1645,69 @@ asmlinkage void __init xen_start_kernel(void)
1301 1645
1302 machine_ops = xen_machine_ops; 1646 machine_ops = xen_machine_ops;
1303 1647
1304#ifdef CONFIG_SMP 1648#ifdef CONFIG_X86_64
1305 smp_ops = xen_smp_ops; 1649 /* Disable until direct per-cpu data access. */
1650 have_vcpu_info_placement = 0;
1651 x86_64_init_pda();
1306#endif 1652#endif
1307 1653
1654 xen_smp_init();
1655
1308 /* Get mfn list */ 1656 /* Get mfn list */
1309 if (!xen_feature(XENFEAT_auto_translated_physmap)) 1657 if (!xen_feature(XENFEAT_auto_translated_physmap))
1310 xen_build_dynamic_phys_to_machine(); 1658 xen_build_dynamic_phys_to_machine();
1311 1659
1312 pgd = (pgd_t *)xen_start_info->pt_base; 1660 pgd = (pgd_t *)xen_start_info->pt_base;
1313 1661
1314 init_pg_tables_start = __pa(pgd); 1662 /* Prevent unwanted bits from being set in PTEs. */
1315 init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; 1663 __supported_pte_mask &= ~_PAGE_GLOBAL;
1316 max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; 1664 if (!xen_initial_domain())
1317 1665 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1318 init_mm.pgd = pgd; /* use the Xen pagetables to start */
1319
1320 /* keep using Xen gdt for now; no urgent need to change it */
1321
1322 x86_write_percpu(xen_cr3, __pa(pgd));
1323 x86_write_percpu(xen_current_cr3, __pa(pgd));
1324 1666
1325 /* Don't do the full vcpu_info placement stuff until we have a 1667 /* Don't do the full vcpu_info placement stuff until we have a
1326 possible map and a non-dummy shared_info. */ 1668 possible map and a non-dummy shared_info. */
1327 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1669 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1328 1670
1671 xen_raw_console_write("mapping kernel into physical memory\n");
1672 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1673
1674 init_mm.pgd = pgd;
1675
1676 /* keep using Xen gdt for now; no urgent need to change it */
1677
1329 pv_info.kernel_rpl = 1; 1678 pv_info.kernel_rpl = 1;
1330 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1679 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1331 pv_info.kernel_rpl = 0; 1680 pv_info.kernel_rpl = 0;
1332 1681
1333 /* Prevent unwanted bits from being set in PTEs. */
1334 __supported_pte_mask &= ~_PAGE_GLOBAL;
1335 if (!is_initial_xendomain())
1336 __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
1337
1338 /* set the limit of our address space */ 1682 /* set the limit of our address space */
1339 xen_reserve_top(); 1683 xen_reserve_top();
1340 1684
1685#ifdef CONFIG_X86_32
1341 /* set up basic CPUID stuff */ 1686 /* set up basic CPUID stuff */
1342 cpu_detect(&new_cpu_data); 1687 cpu_detect(&new_cpu_data);
1343 new_cpu_data.hard_math = 1; 1688 new_cpu_data.hard_math = 1;
1344 new_cpu_data.x86_capability[0] = cpuid_edx(1); 1689 new_cpu_data.x86_capability[0] = cpuid_edx(1);
1690#endif
1345 1691
1346 /* Poke various useful things into boot_params */ 1692 /* Poke various useful things into boot_params */
1347 boot_params.hdr.type_of_loader = (9 << 4) | 0; 1693 boot_params.hdr.type_of_loader = (9 << 4) | 0;
1348 boot_params.hdr.ramdisk_image = xen_start_info->mod_start 1694 boot_params.hdr.ramdisk_image = xen_start_info->mod_start
1349 ? __pa(xen_start_info->mod_start) : 0; 1695 ? __pa(xen_start_info->mod_start) : 0;
1350 boot_params.hdr.ramdisk_size = xen_start_info->mod_len; 1696 boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
1697 boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
1351 1698
1352 if (!is_initial_xendomain()) { 1699 if (!xen_initial_domain()) {
1353 add_preferred_console("xenboot", 0, NULL); 1700 add_preferred_console("xenboot", 0, NULL);
1354 add_preferred_console("tty", 0, NULL); 1701 add_preferred_console("tty", 0, NULL);
1355 add_preferred_console("hvc", 0, NULL); 1702 add_preferred_console("hvc", 0, NULL);
1356 } 1703 }
1357 1704
1705 xen_raw_console_write("about to get started...\n");
1706
1358 /* Start the world */ 1707 /* Start the world */
1708#ifdef CONFIG_X86_32
1359 i386_start_kernel(); 1709 i386_start_kernel();
1710#else
1711 x86_64_start_reservations((char *)__pa_symbol(&boot_params));
1712#endif
1360} 1713}
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
new file mode 100644
index 000000000000..bb042608c602
--- /dev/null
+++ b/arch/x86/xen/irq.c
@@ -0,0 +1,141 @@
1#include <linux/hardirq.h>
2
3#include <xen/interface/xen.h>
4#include <xen/interface/sched.h>
5#include <xen/interface/vcpu.h>
6
7#include <asm/xen/hypercall.h>
8#include <asm/xen/hypervisor.h>
9
10#include "xen-ops.h"
11
12/*
13 * Force a proper event-channel callback from Xen after clearing the
14 * callback mask. We do this in a very simple manner, by making a call
15 * down into Xen. The pending flag will be checked by Xen on return.
16 */
17void xen_force_evtchn_callback(void)
18{
19 (void)HYPERVISOR_xen_version(0, NULL);
20}
21
22static void __init __xen_init_IRQ(void)
23{
24 int i;
25
26 /* Create identity vector->irq map */
27 for(i = 0; i < NR_VECTORS; i++) {
28 int cpu;
29
30 for_each_possible_cpu(cpu)
31 per_cpu(vector_irq, cpu)[i] = i;
32 }
33
34 xen_init_IRQ();
35}
36
37static unsigned long xen_save_fl(void)
38{
39 struct vcpu_info *vcpu;
40 unsigned long flags;
41
42 vcpu = x86_read_percpu(xen_vcpu);
43
44 /* flag has opposite sense of mask */
45 flags = !vcpu->evtchn_upcall_mask;
46
47 /* convert to IF type flag
48 -0 -> 0x00000000
49 -1 -> 0xffffffff
50 */
51 return (-flags) & X86_EFLAGS_IF;
52}
53
54static void xen_restore_fl(unsigned long flags)
55{
56 struct vcpu_info *vcpu;
57
58 /* convert from IF type flag */
59 flags = !(flags & X86_EFLAGS_IF);
60
61 /* There's a one instruction preempt window here. We need to
62	   make sure we don't switch CPUs between getting the vcpu
63 pointer and updating the mask. */
64 preempt_disable();
65 vcpu = x86_read_percpu(xen_vcpu);
66 vcpu->evtchn_upcall_mask = flags;
67 preempt_enable_no_resched();
68
69 /* Doesn't matter if we get preempted here, because any
70 pending event will get dealt with anyway. */
71
72 if (flags == 0) {
73 preempt_check_resched();
74 barrier(); /* unmask then check (avoid races) */
75 if (unlikely(vcpu->evtchn_upcall_pending))
76 xen_force_evtchn_callback();
77 }
78}
79
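An aside on xen_save_fl()/xen_restore_fl() above: they translate between Xen's evtchn_upcall_mask byte (1 = events masked) and an eflags-style value by negating a 0/1 boolean to get an all-zeros/all-ones word and masking it with X86_EFLAGS_IF. The standalone sketch below exercises only that conversion; X86_EFLAGS_IF is the real bit value, everything else is invented:

/* Illustrative sketch, not from the patch: the mask <-> IF-flag
 * conversion used by xen_save_fl()/xen_restore_fl(). */
#include <stdio.h>

#define X86_EFLAGS_IF 0x200

static unsigned long mask_to_flags(unsigned char upcall_mask)
{
	unsigned long enabled = !upcall_mask;	/* opposite sense of mask */

	/* 0 -> 0x00000000, 1 -> all ones, then keep only IF */
	return (-enabled) & X86_EFLAGS_IF;
}

static unsigned char flags_to_mask(unsigned long flags)
{
	return !(flags & X86_EFLAGS_IF);	/* IF clear => mask events */
}

int main(void)
{
	printf("mask=1 -> flags %#lx\n", mask_to_flags(1));
	printf("mask=0 -> flags %#lx\n", mask_to_flags(0));
	printf("flags=IF -> mask %u\n", flags_to_mask(X86_EFLAGS_IF));
	return 0;
}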
80static void xen_irq_disable(void)
81{
82 /* There's a one instruction preempt window here. We need to
83	   make sure we don't switch CPUs between getting the vcpu
84 pointer and updating the mask. */
85 preempt_disable();
86 x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
87 preempt_enable_no_resched();
88}
89
90static void xen_irq_enable(void)
91{
92 struct vcpu_info *vcpu;
93
94 /* We don't need to worry about being preempted here, since
95 either a) interrupts are disabled, so no preemption, or b)
96 the caller is confused and is trying to re-enable interrupts
97 on an indeterminate processor. */
98
99 vcpu = x86_read_percpu(xen_vcpu);
100 vcpu->evtchn_upcall_mask = 0;
101
102 /* Doesn't matter if we get preempted here, because any
103 pending event will get dealt with anyway. */
104
105 barrier(); /* unmask then check (avoid races) */
106 if (unlikely(vcpu->evtchn_upcall_pending))
107 xen_force_evtchn_callback();
108}
109
110static void xen_safe_halt(void)
111{
112 /* Blocking includes an implicit local_irq_enable(). */
113 if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0)
114 BUG();
115}
116
117static void xen_halt(void)
118{
119 if (irqs_disabled())
120 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
121 else
122 xen_safe_halt();
123}
124
125static const struct pv_irq_ops xen_irq_ops __initdata = {
126 .init_IRQ = __xen_init_IRQ,
127 .save_fl = xen_save_fl,
128 .restore_fl = xen_restore_fl,
129 .irq_disable = xen_irq_disable,
130 .irq_enable = xen_irq_enable,
131 .safe_halt = xen_safe_halt,
132 .halt = xen_halt,
133#ifdef CONFIG_X86_64
134 .adjust_exception_frame = xen_adjust_exception_frame,
135#endif
136};
137
138void __init xen_init_irq_ops()
139{
140 pv_irq_ops = xen_irq_ops;
141}
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ff0aa74afaa1..d4d52f5a1cf7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -40,12 +40,15 @@
40 */ 40 */
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/highmem.h> 42#include <linux/highmem.h>
43#include <linux/debugfs.h>
43#include <linux/bug.h> 44#include <linux/bug.h>
44 45
45#include <asm/pgtable.h> 46#include <asm/pgtable.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/fixmap.h>
47#include <asm/mmu_context.h> 49#include <asm/mmu_context.h>
48#include <asm/paravirt.h> 50#include <asm/paravirt.h>
51#include <asm/linkage.h>
49 52
50#include <asm/xen/hypercall.h> 53#include <asm/xen/hypercall.h>
51#include <asm/xen/hypervisor.h> 54#include <asm/xen/hypervisor.h>
@@ -55,27 +58,85 @@
55 58
56#include "multicalls.h" 59#include "multicalls.h"
57#include "mmu.h" 60#include "mmu.h"
61#include "debugfs.h"
62
63#define MMU_UPDATE_HISTO 30
64
65#ifdef CONFIG_XEN_DEBUG_FS
66
67static struct {
68 u32 pgd_update;
69 u32 pgd_update_pinned;
70 u32 pgd_update_batched;
71
72 u32 pud_update;
73 u32 pud_update_pinned;
74 u32 pud_update_batched;
75
76 u32 pmd_update;
77 u32 pmd_update_pinned;
78 u32 pmd_update_batched;
79
80 u32 pte_update;
81 u32 pte_update_pinned;
82 u32 pte_update_batched;
83
84 u32 mmu_update;
85 u32 mmu_update_extended;
86 u32 mmu_update_histo[MMU_UPDATE_HISTO];
87
88 u32 prot_commit;
89 u32 prot_commit_batched;
90
91 u32 set_pte_at;
92 u32 set_pte_at_batched;
93 u32 set_pte_at_pinned;
94 u32 set_pte_at_current;
95 u32 set_pte_at_kernel;
96} mmu_stats;
97
98static u8 zero_stats;
99
100static inline void check_zero(void)
101{
102 if (unlikely(zero_stats)) {
103 memset(&mmu_stats, 0, sizeof(mmu_stats));
104 zero_stats = 0;
105 }
106}
107
108#define ADD_STATS(elem, val) \
109 do { check_zero(); mmu_stats.elem += (val); } while(0)
110
111#else /* !CONFIG_XEN_DEBUG_FS */
112
113#define ADD_STATS(elem, val) do { (void)(val); } while(0)
114
115#endif /* CONFIG_XEN_DEBUG_FS */
116
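An aside on the ADD_STATS()/check_zero() pair above: together they implement a zero-on-next-update counter scheme, so setting the zero_stats flag (written from debugfs in the real code) clears every counter the next time one is bumped. A minimal standalone sketch of the same pattern with a trimmed-down stats struct:

/* Illustrative sketch, not from the patch: zero-on-next-update
 * counters in the style of the CONFIG_XEN_DEBUG_FS mmu_stats. */
#include <stdio.h>
#include <string.h>

static struct {
	unsigned pte_update;
	unsigned pmd_update;
} stats;

static unsigned char zero_stats;

static void check_zero(void)
{
	if (zero_stats) {
		memset(&stats, 0, sizeof(stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val) \
	do { check_zero(); stats.elem += (val); } while (0)

int main(void)
{
	ADD_STATS(pte_update, 1);
	ADD_STATS(pte_update, 1);
	zero_stats = 1;			/* request a reset */
	ADD_STATS(pmd_update, 1);	/* reset happens here, then count */
	printf("pte=%u pmd=%u\n", stats.pte_update, stats.pmd_update);
	return 0;
}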
117/*
118 * Just beyond the highest usermode address. STACK_TOP_MAX has a
119 * redzone above it, so round it up to a PGD boundary.
120 */
121#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
122
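An aside on the USER_LIMIT definition above: it rounds STACK_TOP_MAX up to the next PGD boundary with the usual ((x + size - 1) & mask) idiom. A tiny standalone sketch of that round-up, using a made-up 1 GiB span instead of the real PGDIR_SIZE:

/* Illustrative sketch, not from the patch: rounding an address up to
 * the next pgd-entry boundary. */
#include <stdio.h>

#define PGDIR_SIZE_EX (1ULL << 30)	/* made-up: 1 GiB per pgd entry */
#define PGDIR_MASK_EX (~(PGDIR_SIZE_EX - 1))

static unsigned long long round_up_to_pgd(unsigned long long addr)
{
	return (addr + PGDIR_SIZE_EX - 1) & PGDIR_MASK_EX;
}

int main(void)
{
	unsigned long long stack_top = (3ULL << 30) + 0x1234; /* just past 3 GiB */

	printf("%#llx rounds up to %#llx\n",
	       stack_top, round_up_to_pgd(stack_top));
	return 0;
}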
58 123
59#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) 124#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
60#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) 125#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
61 126
62/* Placeholder for holes in the address space */ 127/* Placeholder for holes in the address space */
63static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] 128static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
64 __attribute__((section(".data.page_aligned"))) =
65 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; 129 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
66 130
67 /* Array of pointers to pages containing p2m entries */ 131 /* Array of pointers to pages containing p2m entries */
68static unsigned long *p2m_top[TOP_ENTRIES] 132static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
69 __attribute__((section(".data.page_aligned"))) =
70 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; 133 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
71 134
72/* Arrays of p2m arrays expressed in mfns used for save/restore */ 135/* Arrays of p2m arrays expressed in mfns used for save/restore */
73static unsigned long p2m_top_mfn[TOP_ENTRIES] 136static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
74 __attribute__((section(".bss.page_aligned")));
75 137
76static unsigned long p2m_top_mfn_list[ 138static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
77 PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] 139 __page_aligned_bss;
78 __attribute__((section(".bss.page_aligned")));
79 140
80static inline unsigned p2m_top_index(unsigned long pfn) 141static inline unsigned p2m_top_index(unsigned long pfn)
81{ 142{
@@ -181,15 +242,16 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
181 p2m_top[topidx][idx] = mfn; 242 p2m_top[topidx][idx] = mfn;
182} 243}
183 244
184xmaddr_t arbitrary_virt_to_machine(unsigned long address) 245xmaddr_t arbitrary_virt_to_machine(void *vaddr)
185{ 246{
247 unsigned long address = (unsigned long)vaddr;
186 unsigned int level; 248 unsigned int level;
187 pte_t *pte = lookup_address(address, &level); 249 pte_t *pte = lookup_address(address, &level);
188 unsigned offset = address & ~PAGE_MASK; 250 unsigned offset = address & ~PAGE_MASK;
189 251
190 BUG_ON(pte == NULL); 252 BUG_ON(pte == NULL);
191 253
192 return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); 254 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
193} 255}
194 256
195void make_lowmem_page_readonly(void *vaddr) 257void make_lowmem_page_readonly(void *vaddr)
@@ -223,25 +285,35 @@ void make_lowmem_page_readwrite(void *vaddr)
223} 285}
224 286
225 287
226static bool page_pinned(void *ptr) 288static bool xen_page_pinned(void *ptr)
227{ 289{
228 struct page *page = virt_to_page(ptr); 290 struct page *page = virt_to_page(ptr);
229 291
230 return PagePinned(page); 292 return PagePinned(page);
231} 293}
232 294
233static void extend_mmu_update(const struct mmu_update *update) 295static void xen_extend_mmu_update(const struct mmu_update *update)
234{ 296{
235 struct multicall_space mcs; 297 struct multicall_space mcs;
236 struct mmu_update *u; 298 struct mmu_update *u;
237 299
238 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 300 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
239 301
240 if (mcs.mc != NULL) 302 if (mcs.mc != NULL) {
303 ADD_STATS(mmu_update_extended, 1);
304 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
305
241 mcs.mc->args[1]++; 306 mcs.mc->args[1]++;
242 else { 307
308 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
309 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
310 else
311 ADD_STATS(mmu_update_histo[0], 1);
312 } else {
313 ADD_STATS(mmu_update, 1);
243 mcs = __xen_mc_entry(sizeof(*u)); 314 mcs = __xen_mc_entry(sizeof(*u));
244 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 315 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
316 ADD_STATS(mmu_update_histo[1], 1);
245 } 317 }
246 318
247 u = mcs.args; 319 u = mcs.args;
@@ -256,9 +328,12 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
256 328
257 xen_mc_batch(); 329 xen_mc_batch();
258 330
259 u.ptr = virt_to_machine(ptr).maddr; 331 /* ptr may be ioremapped for 64-bit pagetable setup */
332 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
260 u.val = pmd_val_ma(val); 333 u.val = pmd_val_ma(val);
261 extend_mmu_update(&u); 334 xen_extend_mmu_update(&u);
335
336 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
262 337
263 xen_mc_issue(PARAVIRT_LAZY_MMU); 338 xen_mc_issue(PARAVIRT_LAZY_MMU);
264 339
@@ -267,13 +342,17 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
267 342
268void xen_set_pmd(pmd_t *ptr, pmd_t val) 343void xen_set_pmd(pmd_t *ptr, pmd_t val)
269{ 344{
345 ADD_STATS(pmd_update, 1);
346
270 /* If page is not pinned, we can just update the entry 347 /* If page is not pinned, we can just update the entry
271 directly */ 348 directly */
272 if (!page_pinned(ptr)) { 349 if (!xen_page_pinned(ptr)) {
273 *ptr = val; 350 *ptr = val;
274 return; 351 return;
275 } 352 }
276 353
354 ADD_STATS(pmd_update_pinned, 1);
355
277 xen_set_pmd_hyper(ptr, val); 356 xen_set_pmd_hyper(ptr, val);
278} 357}
279 358
@@ -283,35 +362,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
283 */ 362 */
284void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) 363void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
285{ 364{
286 pgd_t *pgd; 365 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
287 pud_t *pud;
288 pmd_t *pmd;
289 pte_t *pte;
290
291 pgd = swapper_pg_dir + pgd_index(vaddr);
292 if (pgd_none(*pgd)) {
293 BUG();
294 return;
295 }
296 pud = pud_offset(pgd, vaddr);
297 if (pud_none(*pud)) {
298 BUG();
299 return;
300 }
301 pmd = pmd_offset(pud, vaddr);
302 if (pmd_none(*pmd)) {
303 BUG();
304 return;
305 }
306 pte = pte_offset_kernel(pmd, vaddr);
307 /* <mfn,flags> stored as-is, to permit clearing entries */
308 xen_set_pte(pte, mfn_pte(mfn, flags));
309
310 /*
311 * It's enough to flush this one mapping.
312 * (PGE mappings get flushed as well)
313 */
314 __flush_tlb_one(vaddr);
315} 366}
316 367
317void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 368void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
@@ -321,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
321 if (mm == &init_mm) 372 if (mm == &init_mm)
322 preempt_disable(); 373 preempt_disable();
323 374
375 ADD_STATS(set_pte_at, 1);
376// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
377 ADD_STATS(set_pte_at_current, mm == current->mm);
378 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
379
324 if (mm == current->mm || mm == &init_mm) { 380 if (mm == current->mm || mm == &init_mm) {
325 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 381 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
326 struct multicall_space mcs; 382 struct multicall_space mcs;
327 mcs = xen_mc_entry(0); 383 mcs = xen_mc_entry(0);
328 384
329 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 385 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0);
386 ADD_STATS(set_pte_at_batched, 1);
330 xen_mc_issue(PARAVIRT_LAZY_MMU); 387 xen_mc_issue(PARAVIRT_LAZY_MMU);
331 goto out; 388 goto out;
332 } else 389 } else
@@ -355,7 +412,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
355 412
356 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 413 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
357 u.val = pte_val_ma(pte); 414 u.val = pte_val_ma(pte);
358 extend_mmu_update(&u); 415 xen_extend_mmu_update(&u);
416
417 ADD_STATS(prot_commit, 1);
418 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
359 419
360 xen_mc_issue(PARAVIRT_LAZY_MMU); 420 xen_mc_issue(PARAVIRT_LAZY_MMU);
361} 421}
@@ -364,8 +424,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
364static pteval_t pte_mfn_to_pfn(pteval_t val) 424static pteval_t pte_mfn_to_pfn(pteval_t val)
365{ 425{
366 if (val & _PAGE_PRESENT) { 426 if (val & _PAGE_PRESENT) {
367 unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; 427 unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
368 pteval_t flags = val & ~PTE_MASK; 428 pteval_t flags = val & PTE_FLAGS_MASK;
369 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; 429 val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags;
370 } 430 }
371 431
@@ -375,8 +435,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val)
375static pteval_t pte_pfn_to_mfn(pteval_t val) 435static pteval_t pte_pfn_to_mfn(pteval_t val)
376{ 436{
377 if (val & _PAGE_PRESENT) { 437 if (val & _PAGE_PRESENT) {
378 unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; 438 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
379 pteval_t flags = val & ~PTE_MASK; 439 pteval_t flags = val & PTE_FLAGS_MASK;
380 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 440 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags;
381 } 441 }
382 442
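An aside on the hunk above: pte_mfn_to_pfn()/pte_pfn_to_mfn() now split a pte with the complementary PTE_PFN_MASK/PTE_FLAGS_MASK pair instead of ~PTE_MASK, separating the frame-number bits from the flag bits before the frame is remapped and the value reassembled. A standalone sketch of that split, with invented mask values:

/* Illustrative sketch, not from the patch: separating a pte-style
 * value into frame bits and flag bits with complementary masks. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT_EX     12
#define PTE_PFN_MASK_EX   0x000ffffffffff000ULL	/* frame-number bits */
#define PTE_FLAGS_MASK_EX (~PTE_PFN_MASK_EX)	/* everything else */

int main(void)
{
	uint64_t pte = (0x1234ULL << PAGE_SHIFT_EX) | 0x63; /* frame | flags */
	uint64_t frame = (pte & PTE_PFN_MASK_EX) >> PAGE_SHIFT_EX;
	uint64_t flags = pte & PTE_FLAGS_MASK_EX;

	/* a real conversion remaps "frame" (pfn <-> mfn) and reassembles
	   the entry as (new_frame << PAGE_SHIFT) | flags */
	printf("frame %#llx flags %#llx\n",
	       (unsigned long long)frame, (unsigned long long)flags);
	return 0;
}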
@@ -418,9 +478,12 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
418 478
419 xen_mc_batch(); 479 xen_mc_batch();
420 480
421 u.ptr = virt_to_machine(ptr).maddr; 481 /* ptr may be ioremapped for 64-bit pagetable setup */
482 u.ptr = arbitrary_virt_to_machine(ptr).maddr;
422 u.val = pud_val_ma(val); 483 u.val = pud_val_ma(val);
423 extend_mmu_update(&u); 484 xen_extend_mmu_update(&u);
485
486 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
424 487
425 xen_mc_issue(PARAVIRT_LAZY_MMU); 488 xen_mc_issue(PARAVIRT_LAZY_MMU);
426 489
@@ -429,26 +492,39 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
429 492
430void xen_set_pud(pud_t *ptr, pud_t val) 493void xen_set_pud(pud_t *ptr, pud_t val)
431{ 494{
495 ADD_STATS(pud_update, 1);
496
432 /* If page is not pinned, we can just update the entry 497 /* If page is not pinned, we can just update the entry
433 directly */ 498 directly */
434 if (!page_pinned(ptr)) { 499 if (!xen_page_pinned(ptr)) {
435 *ptr = val; 500 *ptr = val;
436 return; 501 return;
437 } 502 }
438 503
504 ADD_STATS(pud_update_pinned, 1);
505
439 xen_set_pud_hyper(ptr, val); 506 xen_set_pud_hyper(ptr, val);
440} 507}
441 508
442void xen_set_pte(pte_t *ptep, pte_t pte) 509void xen_set_pte(pte_t *ptep, pte_t pte)
443{ 510{
511 ADD_STATS(pte_update, 1);
512// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
513 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
514
515#ifdef CONFIG_X86_PAE
444 ptep->pte_high = pte.pte_high; 516 ptep->pte_high = pte.pte_high;
445 smp_wmb(); 517 smp_wmb();
446 ptep->pte_low = pte.pte_low; 518 ptep->pte_low = pte.pte_low;
519#else
520 *ptep = pte;
521#endif
447} 522}
448 523
524#ifdef CONFIG_X86_PAE
449void xen_set_pte_atomic(pte_t *ptep, pte_t pte) 525void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
450{ 526{
451 set_64bit((u64 *)ptep, pte_val_ma(pte)); 527 set_64bit((u64 *)ptep, native_pte_val(pte));
452} 528}
453 529
454void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 530void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
@@ -462,6 +538,7 @@ void xen_pmd_clear(pmd_t *pmdp)
462{ 538{
463 set_pmd(pmdp, __pmd(0)); 539 set_pmd(pmdp, __pmd(0));
464} 540}
541#endif /* CONFIG_X86_PAE */
465 542
466pmd_t xen_make_pmd(pmdval_t pmd) 543pmd_t xen_make_pmd(pmdval_t pmd)
467{ 544{
@@ -469,95 +546,218 @@ pmd_t xen_make_pmd(pmdval_t pmd)
469 return native_make_pmd(pmd); 546 return native_make_pmd(pmd);
470} 547}
471 548
549#if PAGETABLE_LEVELS == 4
550pudval_t xen_pud_val(pud_t pud)
551{
552 return pte_mfn_to_pfn(pud.pud);
553}
554
555pud_t xen_make_pud(pudval_t pud)
556{
557 pud = pte_pfn_to_mfn(pud);
558
559 return native_make_pud(pud);
560}
561
562pgd_t *xen_get_user_pgd(pgd_t *pgd)
563{
564 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
565 unsigned offset = pgd - pgd_page;
566 pgd_t *user_ptr = NULL;
567
568 if (offset < pgd_index(USER_LIMIT)) {
569 struct page *page = virt_to_page(pgd_page);
570 user_ptr = (pgd_t *)page->private;
571 if (user_ptr)
572 user_ptr += offset;
573 }
574
575 return user_ptr;
576}
577
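An aside on xen_get_user_pgd() above: it masks the entry pointer down to its containing page to find the kernel pgd page, reads the companion user pgd pointer stashed in page->private, and re-applies the entry's offset when it falls below USER_LIMIT. The standalone sketch below shows just the pointer arithmetic; the companion table here is an ordinary array rather than page->private:

/* Illustrative sketch, not from the patch: recovering a page-aligned
 * table base from an entry pointer and indexing a companion table at
 * the same offset. */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE_EX 4096UL
#define PAGE_MASK_EX (~(PAGE_SIZE_EX - 1))

int main(void)
{
	static uint64_t kernel_tbl[PAGE_SIZE_EX / sizeof(uint64_t)]
		__attribute__((aligned(4096)));
	static uint64_t user_tbl[PAGE_SIZE_EX / sizeof(uint64_t)]
		__attribute__((aligned(4096)));

	uint64_t *entry = &kernel_tbl[42];	/* some entry in the table */
	uint64_t *base = (uint64_t *)((uintptr_t)entry & PAGE_MASK_EX);
	size_t offset = (size_t)(entry - base);

	printf("offset %zu -> companion entry %p\n",
	       offset, (void *)&user_tbl[offset]);
	return 0;
}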
578static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
579{
580 struct mmu_update u;
581
582 u.ptr = virt_to_machine(ptr).maddr;
583 u.val = pgd_val_ma(val);
584 xen_extend_mmu_update(&u);
585}
586
472/* 587/*
473 (Yet another) pagetable walker. This one is intended for pinning a 588 * Raw hypercall-based set_pgd, intended for in early boot before
474 pagetable. This means that it walks a pagetable and calls the 589 * there's a page structure. This implies:
475 callback function on each page it finds making up the page table, 590 * 1. The only existing pagetable is the kernel's
476 at every level. It walks the entire pagetable, but it only bothers 591 * 2. It is always pinned
477 pinning pte pages which are below pte_limit. In the normal case 592 * 3. It has no user pagetable attached to it
478 this will be TASK_SIZE, but at boot we need to pin up to 593 */
479 FIXADDR_TOP. But the important bit is that we don't pin beyond 594void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
480 there, because then we start getting into Xen's ptes. 595{
481*/ 596 preempt_disable();
482static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), 597
483 unsigned long limit) 598 xen_mc_batch();
484{ 599
485 pgd_t *pgd = pgd_base; 600 __xen_set_pgd_hyper(ptr, val);
601
602 xen_mc_issue(PARAVIRT_LAZY_MMU);
603
604 preempt_enable();
605}
606
607void xen_set_pgd(pgd_t *ptr, pgd_t val)
608{
609 pgd_t *user_ptr = xen_get_user_pgd(ptr);
610
611 ADD_STATS(pgd_update, 1);
612
613 /* If page is not pinned, we can just update the entry
614 directly */
615 if (!xen_page_pinned(ptr)) {
616 *ptr = val;
617 if (user_ptr) {
618 WARN_ON(xen_page_pinned(user_ptr));
619 *user_ptr = val;
620 }
621 return;
622 }
623
624 ADD_STATS(pgd_update_pinned, 1);
625 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
626
627 /* If it's pinned, then we can at least batch the kernel and
628 user updates together. */
629 xen_mc_batch();
630
631 __xen_set_pgd_hyper(ptr, val);
632 if (user_ptr)
633 __xen_set_pgd_hyper(user_ptr, val);
634
635 xen_mc_issue(PARAVIRT_LAZY_MMU);
636}
637#endif /* PAGETABLE_LEVELS == 4 */
638
639/*
640 * (Yet another) pagetable walker. This one is intended for pinning a
641 * pagetable. This means that it walks a pagetable and calls the
642 * callback function on each page it finds making up the page table,
643 * at every level. It walks the entire pagetable, but it only bothers
644 * pinning pte pages which are below limit. In the normal case this
645 * will be STACK_TOP_MAX, but at boot we need to pin up to
646 * FIXADDR_TOP.
647 *
648 * For 32-bit the important bit is that we don't pin beyond there,
649 * because then we start getting into Xen's ptes.
650 *
651 * For 64-bit, we must skip the Xen hole in the middle of the address
652 * space, just after the big x86-64 virtual hole.
653 */
654static int xen_pgd_walk(struct mm_struct *mm,
655 int (*func)(struct mm_struct *mm, struct page *,
656 enum pt_level),
657 unsigned long limit)
658{
659 pgd_t *pgd = mm->pgd;
486 int flush = 0; 660 int flush = 0;
487 unsigned long addr = 0; 661 unsigned hole_low, hole_high;
488 unsigned long pgd_next; 662 unsigned pgdidx_limit, pudidx_limit, pmdidx_limit;
663 unsigned pgdidx, pudidx, pmdidx;
489 664
490 BUG_ON(limit > FIXADDR_TOP); 665 /* The limit is the last byte to be touched */
666 limit--;
667 BUG_ON(limit >= FIXADDR_TOP);
491 668
492 if (xen_feature(XENFEAT_auto_translated_physmap)) 669 if (xen_feature(XENFEAT_auto_translated_physmap))
493 return 0; 670 return 0;
494 671
495 for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { 672 /*
673 * 64-bit has a great big hole in the middle of the address
674 * space, which contains the Xen mappings. On 32-bit these
 675 * will end up making a zero-sized hole, so this is a no-op.
676 */
677 hole_low = pgd_index(USER_LIMIT);
678 hole_high = pgd_index(PAGE_OFFSET);
679
680 pgdidx_limit = pgd_index(limit);
681#if PTRS_PER_PUD > 1
682 pudidx_limit = pud_index(limit);
683#else
684 pudidx_limit = 0;
685#endif
686#if PTRS_PER_PMD > 1
687 pmdidx_limit = pmd_index(limit);
688#else
689 pmdidx_limit = 0;
690#endif
691
692 for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) {
496 pud_t *pud; 693 pud_t *pud;
497 unsigned long pud_limit, pud_next;
498 694
499 pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); 695 if (pgdidx >= hole_low && pgdidx < hole_high)
696 continue;
500 697
501 if (!pgd_val(*pgd)) 698 if (!pgd_val(pgd[pgdidx]))
502 continue; 699 continue;
503 700
504 pud = pud_offset(pgd, 0); 701 pud = pud_offset(&pgd[pgdidx], 0);
505 702
506 if (PTRS_PER_PUD > 1) /* not folded */ 703 if (PTRS_PER_PUD > 1) /* not folded */
507 flush |= (*func)(virt_to_page(pud), PT_PUD); 704 flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
508 705
509 for (; addr != pud_limit; pud++, addr = pud_next) { 706 for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) {
510 pmd_t *pmd; 707 pmd_t *pmd;
511 unsigned long pmd_limit;
512
513 pud_next = pud_addr_end(addr, pud_limit);
514 708
515 if (pud_next < limit) 709 if (pgdidx == pgdidx_limit &&
516 pmd_limit = pud_next; 710 pudidx > pudidx_limit)
517 else 711 goto out;
518 pmd_limit = limit;
519 712
520 if (pud_none(*pud)) 713 if (pud_none(pud[pudidx]))
521 continue; 714 continue;
522 715
523 pmd = pmd_offset(pud, 0); 716 pmd = pmd_offset(&pud[pudidx], 0);
524 717
525 if (PTRS_PER_PMD > 1) /* not folded */ 718 if (PTRS_PER_PMD > 1) /* not folded */
526 flush |= (*func)(virt_to_page(pmd), PT_PMD); 719 flush |= (*func)(mm, virt_to_page(pmd), PT_PMD);
720
721 for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) {
722 struct page *pte;
527 723
528 for (; addr != pmd_limit; pmd++) { 724 if (pgdidx == pgdidx_limit &&
529 addr += (PAGE_SIZE * PTRS_PER_PTE); 725 pudidx == pudidx_limit &&
530 if ((pmd_limit-1) < (addr-1)) { 726 pmdidx > pmdidx_limit)
531 addr = pmd_limit; 727 goto out;
532 break;
533 }
534 728
535 if (pmd_none(*pmd)) 729 if (pmd_none(pmd[pmdidx]))
536 continue; 730 continue;
537 731
538 flush |= (*func)(pmd_page(*pmd), PT_PTE); 732 pte = pmd_page(pmd[pmdidx]);
733 flush |= (*func)(mm, pte, PT_PTE);
539 } 734 }
540 } 735 }
541 } 736 }
542 737
543 flush |= (*func)(virt_to_page(pgd_base), PT_PGD); 738out:
739 /* Do the top level last, so that the callbacks can use it as
740 a cue to do final things like tlb flushes. */
741 flush |= (*func)(mm, virt_to_page(pgd), PT_PGD);
544 742
545 return flush; 743 return flush;
546} 744}
547 745
548static spinlock_t *lock_pte(struct page *page) 746/* If we're using split pte locks, then take the page's lock and
747 return a pointer to it. Otherwise return NULL. */
748static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm)
549{ 749{
550 spinlock_t *ptl = NULL; 750 spinlock_t *ptl = NULL;
551 751
552#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS 752#if USE_SPLIT_PTLOCKS
553 ptl = __pte_lockptr(page); 753 ptl = __pte_lockptr(page);
554 spin_lock(ptl); 754 spin_lock_nest_lock(ptl, &mm->page_table_lock);
555#endif 755#endif
556 756
557 return ptl; 757 return ptl;
558} 758}
559 759
560static void do_unlock(void *v) 760static void xen_pte_unlock(void *v)
561{ 761{
562 spinlock_t *ptl = v; 762 spinlock_t *ptl = v;
563 spin_unlock(ptl); 763 spin_unlock(ptl);
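
The rewritten walker above iterates by index over the four levels instead of by virtual address, skipping the pgd slots that fall inside the 64-bit Xen hole between USER_LIMIT and PAGE_OFFSET. A minimal userspace sketch of that index arithmetic; the shift constant and hole boundaries below are assumptions standing in for the kernel macros, not the real values pulled from headers:

/* Sketch only: reproduces xen_pgd_walk()'s hole-skipping index loop in
 * userspace. All constants below are assumptions, not kernel macros. */
#include <stdio.h>

#define PGDIR_SHIFT  39
#define PTRS_PER_PGD 512

static unsigned pgd_index(unsigned long va)
{
	return (va >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
}

int main(void)
{
	unsigned long user_limit  = 0x00007fffffffffffUL; /* assumed USER_LIMIT */
	unsigned long page_offset = 0xffff880000000000UL; /* assumed PAGE_OFFSET */
	unsigned hole_low  = pgd_index(user_limit);
	unsigned hole_high = pgd_index(page_offset);
	unsigned pgdidx, visited = 0;

	for (pgdidx = 0; pgdidx < PTRS_PER_PGD; pgdidx++) {
		if (pgdidx >= hole_low && pgdidx < hole_high)
			continue;   /* skip the Xen mappings, as the walker does */
		visited++;          /* the real walker would descend into puds here */
	}
	printf("hole: pgd slots [%u, %u), %u slots walked\n",
	       hole_low, hole_high, visited);
	return 0;
}
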
@@ -575,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn)
575 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); 775 MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
576} 776}
577 777
578static int pin_page(struct page *page, enum pt_level level) 778static int xen_pin_page(struct mm_struct *mm, struct page *page,
779 enum pt_level level)
579{ 780{
580 unsigned pgfl = TestSetPagePinned(page); 781 unsigned pgfl = TestSetPagePinned(page);
581 int flush; 782 int flush;
@@ -594,21 +795,40 @@ static int pin_page(struct page *page, enum pt_level level)
594 795
595 flush = 0; 796 flush = 0;
596 797
798 /*
799 * We need to hold the pagetable lock between the time
800 * we make the pagetable RO and when we actually pin
801 * it. If we don't, then other users may come in and
802 * attempt to update the pagetable by writing it,
803 * which will fail because the memory is RO but not
804 * pinned, so Xen won't do the trap'n'emulate.
805 *
806 * If we're using split pte locks, we can't hold the
807 * entire pagetable's worth of locks during the
808 * traverse, because we may wrap the preempt count (8
809 * bits). The solution is to mark RO and pin each PTE
810 * page while holding the lock. This means the number
811 * of locks we end up holding is never more than a
812 * batch size (~32 entries, at present).
813 *
814 * If we're not using split pte locks, we needn't pin
815 * the PTE pages independently, because we're
816 * protected by the overall pagetable lock.
817 */
597 ptl = NULL; 818 ptl = NULL;
598 if (level == PT_PTE) 819 if (level == PT_PTE)
599 ptl = lock_pte(page); 820 ptl = xen_pte_lock(page, mm);
600 821
601 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, 822 MULTI_update_va_mapping(mcs.mc, (unsigned long)pt,
602 pfn_pte(pfn, PAGE_KERNEL_RO), 823 pfn_pte(pfn, PAGE_KERNEL_RO),
603 level == PT_PGD ? UVMF_TLB_FLUSH : 0); 824 level == PT_PGD ? UVMF_TLB_FLUSH : 0);
604 825
605 if (level == PT_PTE) 826 if (ptl) {
606 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); 827 xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn);
607 828
608 if (ptl) {
609 /* Queue a deferred unlock for when this batch 829 /* Queue a deferred unlock for when this batch
610 is completed. */ 830 is completed. */
611 xen_mc_callback(do_unlock, ptl); 831 xen_mc_callback(xen_pte_unlock, ptl);
612 } 832 }
613 } 833 }
614 834
@@ -618,26 +838,54 @@ static int pin_page(struct page *page, enum pt_level level)
618/* This is called just after a mm has been created, but it has not 838/* This is called just after a mm has been created, but it has not
619 been used yet. We need to make sure that its pagetable is all 839 been used yet. We need to make sure that its pagetable is all
620 read-only, and can be pinned. */ 840 read-only, and can be pinned. */
621void xen_pgd_pin(pgd_t *pgd) 841static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
622{ 842{
623 xen_mc_batch(); 843 xen_mc_batch();
624 844
625 if (pgd_walk(pgd, pin_page, TASK_SIZE)) { 845 if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) {
626 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
627 xen_mc_issue(0); 847 xen_mc_issue(0);
628 kmap_flush_unused(); 848 kmap_flush_unused();
849 vm_unmap_aliases();
629 xen_mc_batch(); 850 xen_mc_batch();
630 } 851 }
631 852
853#ifdef CONFIG_X86_64
854 {
855 pgd_t *user_pgd = xen_get_user_pgd(pgd);
856
857 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd)));
858
859 if (user_pgd) {
860 xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD);
861 xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd)));
862 }
863 }
864#else /* CONFIG_X86_32 */
865#ifdef CONFIG_X86_PAE
866 /* Need to make sure unshared kernel PMD is pinnable */
867 xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
868 PT_PMD);
869#endif
632 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); 870 xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd)));
871#endif /* CONFIG_X86_64 */
633 xen_mc_issue(0); 872 xen_mc_issue(0);
634} 873}
635 874
875static void xen_pgd_pin(struct mm_struct *mm)
876{
877 __xen_pgd_pin(mm, mm->pgd);
878}
879
636/* 880/*
637 * On save, we need to pin all pagetables to make sure they get their 881 * On save, we need to pin all pagetables to make sure they get their
638 * mfns turned into pfns. Search the list for any unpinned pgds and pin 882 * mfns turned into pfns. Search the list for any unpinned pgds and pin
639 * them (unpinned pgds are not currently in use, probably because the 883 * them (unpinned pgds are not currently in use, probably because the
640 * process is under construction or destruction). 884 * process is under construction or destruction).
885 *
886 * Expected to be called in stop_machine() ("equivalent to taking
887 * every spinlock in the system"), so the locking doesn't really
888 * matter all that much.
641 */ 889 */
642void xen_mm_pin_all(void) 890void xen_mm_pin_all(void)
643{ 891{
@@ -648,7 +896,7 @@ void xen_mm_pin_all(void)
648 896
649 list_for_each_entry(page, &pgd_list, lru) { 897 list_for_each_entry(page, &pgd_list, lru) {
650 if (!PagePinned(page)) { 898 if (!PagePinned(page)) {
651 xen_pgd_pin((pgd_t *)page_address(page)); 899 __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page));
652 SetPageSavePinned(page); 900 SetPageSavePinned(page);
653 } 901 }
654 } 902 }
@@ -656,10 +904,13 @@ void xen_mm_pin_all(void)
656 spin_unlock_irqrestore(&pgd_lock, flags); 904 spin_unlock_irqrestore(&pgd_lock, flags);
657} 905}
658 906
 659/* The init_mm pagetable is really pinned as soon as it's created, but 907/*
 660 that's before we have page structures to store the bits. So do all 908 * The init_mm pagetable is really pinned as soon as it's created, but
661 the book-keeping now. */ 909 * that's before we have page structures to store the bits. So do all
662static __init int mark_pinned(struct page *page, enum pt_level level) 910 * the book-keeping now.
911 */
912static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page,
913 enum pt_level level)
663{ 914{
664 SetPagePinned(page); 915 SetPagePinned(page);
665 return 0; 916 return 0;
@@ -667,10 +918,11 @@ static __init int mark_pinned(struct page *page, enum pt_level level)
667 918
668void __init xen_mark_init_mm_pinned(void) 919void __init xen_mark_init_mm_pinned(void)
669{ 920{
670 pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); 921 xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP);
671} 922}
672 923
673static int unpin_page(struct page *page, enum pt_level level) 924static int xen_unpin_page(struct mm_struct *mm, struct page *page,
925 enum pt_level level)
674{ 926{
675 unsigned pgfl = TestClearPagePinned(page); 927 unsigned pgfl = TestClearPagePinned(page);
676 928
@@ -680,10 +932,18 @@ static int unpin_page(struct page *page, enum pt_level level)
680 spinlock_t *ptl = NULL; 932 spinlock_t *ptl = NULL;
681 struct multicall_space mcs; 933 struct multicall_space mcs;
682 934
935 /*
936 * Do the converse to pin_page. If we're using split
 937 * pte locks, we must hold the lock while
938 * the pte page is unpinned but still RO to prevent
939 * concurrent updates from seeing it in this
940 * partially-pinned state.
941 */
683 if (level == PT_PTE) { 942 if (level == PT_PTE) {
684 ptl = lock_pte(page); 943 ptl = xen_pte_lock(page, mm);
685 944
686 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); 945 if (ptl)
946 xen_do_pin(MMUEXT_UNPIN_TABLE, pfn);
687 } 947 }
688 948
689 mcs = __xen_mc_entry(0); 949 mcs = __xen_mc_entry(0);
@@ -694,7 +954,7 @@ static int unpin_page(struct page *page, enum pt_level level)
694 954
695 if (ptl) { 955 if (ptl) {
696 /* unlock when batch completed */ 956 /* unlock when batch completed */
697 xen_mc_callback(do_unlock, ptl); 957 xen_mc_callback(xen_pte_unlock, ptl);
698 } 958 }
699 } 959 }
700 960
@@ -702,17 +962,39 @@ static int unpin_page(struct page *page, enum pt_level level)
702} 962}
703 963
 704/* Release a pagetable's pages back as normal RW */ 964static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
705static void xen_pgd_unpin(pgd_t *pgd) 965static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd)
706{ 966{
707 xen_mc_batch(); 967 xen_mc_batch();
708 968
709 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 969 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
710 970
711 pgd_walk(pgd, unpin_page, TASK_SIZE); 971#ifdef CONFIG_X86_64
972 {
973 pgd_t *user_pgd = xen_get_user_pgd(pgd);
974
975 if (user_pgd) {
976 xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd)));
977 xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD);
978 }
979 }
980#endif
981
982#ifdef CONFIG_X86_PAE
983 /* Need to make sure unshared kernel PMD is unpinned */
984 xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])),
985 PT_PMD);
986#endif
987
988 xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT);
712 989
713 xen_mc_issue(0); 990 xen_mc_issue(0);
714} 991}
715 992
993static void xen_pgd_unpin(struct mm_struct *mm)
994{
995 __xen_pgd_unpin(mm, mm->pgd);
996}
997
716/* 998/*
717 * On resume, undo any pinning done at save, so that the rest of the 999 * On resume, undo any pinning done at save, so that the rest of the
718 * kernel doesn't see any unexpected pinned pagetables. 1000 * kernel doesn't see any unexpected pinned pagetables.
@@ -727,8 +1009,7 @@ void xen_mm_unpin_all(void)
727 list_for_each_entry(page, &pgd_list, lru) { 1009 list_for_each_entry(page, &pgd_list, lru) {
728 if (PageSavePinned(page)) { 1010 if (PageSavePinned(page)) {
729 BUG_ON(!PagePinned(page)); 1011 BUG_ON(!PagePinned(page));
730 printk("unpinning pinned %p\n", page_address(page)); 1012 __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page));
731 xen_pgd_unpin((pgd_t *)page_address(page));
732 ClearPageSavePinned(page); 1013 ClearPageSavePinned(page);
733 } 1014 }
734 } 1015 }
@@ -739,14 +1020,14 @@ void xen_mm_unpin_all(void)
739void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1020void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
740{ 1021{
741 spin_lock(&next->page_table_lock); 1022 spin_lock(&next->page_table_lock);
742 xen_pgd_pin(next->pgd); 1023 xen_pgd_pin(next);
743 spin_unlock(&next->page_table_lock); 1024 spin_unlock(&next->page_table_lock);
744} 1025}
745 1026
746void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1027void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
747{ 1028{
748 spin_lock(&mm->page_table_lock); 1029 spin_lock(&mm->page_table_lock);
749 xen_pgd_pin(mm->pgd); 1030 xen_pgd_pin(mm);
750 spin_unlock(&mm->page_table_lock); 1031 spin_unlock(&mm->page_table_lock);
751} 1032}
752 1033
@@ -757,8 +1038,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
757static void drop_other_mm_ref(void *info) 1038static void drop_other_mm_ref(void *info)
758{ 1039{
759 struct mm_struct *mm = info; 1040 struct mm_struct *mm = info;
1041 struct mm_struct *active_mm;
760 1042
761 if (__get_cpu_var(cpu_tlbstate).active_mm == mm) 1043#ifdef CONFIG_X86_64
1044 active_mm = read_pda(active_mm);
1045#else
1046 active_mm = __get_cpu_var(cpu_tlbstate).active_mm;
1047#endif
1048
1049 if (active_mm == mm)
762 leave_mm(smp_processor_id()); 1050 leave_mm(smp_processor_id());
763 1051
764 /* If this cpu still has a stale cr3 reference, then make sure 1052 /* If this cpu still has a stale cr3 reference, then make sure
@@ -769,7 +1057,7 @@ static void drop_other_mm_ref(void *info)
769 } 1057 }
770} 1058}
771 1059
772static void drop_mm_ref(struct mm_struct *mm) 1060static void xen_drop_mm_ref(struct mm_struct *mm)
773{ 1061{
774 cpumask_t mask; 1062 cpumask_t mask;
775 unsigned cpu; 1063 unsigned cpu;
@@ -799,7 +1087,7 @@ static void drop_mm_ref(struct mm_struct *mm)
799 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); 1087 smp_call_function_mask(mask, drop_other_mm_ref, mm, 1);
800} 1088}
801#else 1089#else
802static void drop_mm_ref(struct mm_struct *mm) 1090static void xen_drop_mm_ref(struct mm_struct *mm)
803{ 1091{
804 if (current->active_mm == mm) 1092 if (current->active_mm == mm)
805 load_cr3(swapper_pg_dir); 1093 load_cr3(swapper_pg_dir);
@@ -823,14 +1111,77 @@ static void drop_mm_ref(struct mm_struct *mm)
823void xen_exit_mmap(struct mm_struct *mm) 1111void xen_exit_mmap(struct mm_struct *mm)
824{ 1112{
825 get_cpu(); /* make sure we don't move around */ 1113 get_cpu(); /* make sure we don't move around */
826 drop_mm_ref(mm); 1114 xen_drop_mm_ref(mm);
827 put_cpu(); 1115 put_cpu();
828 1116
829 spin_lock(&mm->page_table_lock); 1117 spin_lock(&mm->page_table_lock);
830 1118
831 /* pgd may not be pinned in the error exit path of execve */ 1119 /* pgd may not be pinned in the error exit path of execve */
832 if (page_pinned(mm->pgd)) 1120 if (xen_page_pinned(mm->pgd))
833 xen_pgd_unpin(mm->pgd); 1121 xen_pgd_unpin(mm);
834 1122
835 spin_unlock(&mm->page_table_lock); 1123 spin_unlock(&mm->page_table_lock);
836} 1124}
1125
1126#ifdef CONFIG_XEN_DEBUG_FS
1127
1128static struct dentry *d_mmu_debug;
1129
1130static int __init xen_mmu_debugfs(void)
1131{
1132 struct dentry *d_xen = xen_init_debugfs();
1133
1134 if (d_xen == NULL)
1135 return -ENOMEM;
1136
1137 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
1138
1139 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
1140
1141 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
1142 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
1143 &mmu_stats.pgd_update_pinned);
1144 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
 1145 &mmu_stats.pgd_update_batched);
1146
1147 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
1148 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
1149 &mmu_stats.pud_update_pinned);
1150 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
 1151 &mmu_stats.pud_update_batched);
1152
1153 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
1154 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
1155 &mmu_stats.pmd_update_pinned);
1156 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
 1157 &mmu_stats.pmd_update_batched);
1158
1159 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
1160// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
1161// &mmu_stats.pte_update_pinned);
1162 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
 1163 &mmu_stats.pte_update_batched);
1164
1165 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
1166 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
1167 &mmu_stats.mmu_update_extended);
1168 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
1169 mmu_stats.mmu_update_histo, 20);
1170
1171 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
1172 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
1173 &mmu_stats.set_pte_at_batched);
1174 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
1175 &mmu_stats.set_pte_at_current);
1176 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
1177 &mmu_stats.set_pte_at_kernel);
1178
1179 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
1180 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
1181 &mmu_stats.prot_commit_batched);
1182
1183 return 0;
1184}
1185fs_initcall(xen_mmu_debugfs);
1186
1187#endif /* CONFIG_XEN_DEBUG_FS */
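
The debugfs counters registered above rely on the same reset-on-next-update idiom that spinlock.c and multicalls.c use: a writable zero_stats byte that, once set, clears the whole stats struct before the next increment. A small self-contained sketch of that idiom; the struct layout and member names here are assumptions for illustration:

/* Sketch of the zero_stats/check_zero idiom used by the Xen debugfs stats. */
#include <string.h>
#include <stdio.h>

static struct {
	unsigned pgd_update;
	unsigned pgd_update_pinned;
} mmu_stats;

static unsigned char zero_stats;    /* exported as a writable debugfs u8 */

static void check_zero(void)
{
	if (zero_stats) {                /* user wrote 1: reset lazily */
		memset(&mmu_stats, 0, sizeof(mmu_stats));
		zero_stats = 0;
	}
}

#define ADD_STATS(elem, val) do { check_zero(); mmu_stats.elem += (val); } while (0)

int main(void)
{
	ADD_STATS(pgd_update, 1);
	zero_stats = 1;                  /* as if written via debugfs */
	ADD_STATS(pgd_update_pinned, 1); /* triggers the reset first */
	printf("%u %u\n", mmu_stats.pgd_update, mmu_stats.pgd_update_pinned);
	return 0;
}
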
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 297bf9f5b8bc..98d71659da5a 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -10,18 +10,6 @@ enum pt_level {
10 PT_PTE 10 PT_PTE
11}; 11};
12 12
13/*
14 * Page-directory addresses above 4GB do not fit into architectural %cr3.
15 * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
16 * must use the following accessor macros to pack/unpack valid MFNs.
17 *
18 * Note that Xen is using the fact that the pagetable base is always
19 * page-aligned, and putting the 12 MSB of the address into the 12 LSB
20 * of cr3.
21 */
22#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
23#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
24
25 13
26void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 14void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
27 15
@@ -30,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
30void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); 18void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
31void xen_exit_mmap(struct mm_struct *mm); 19void xen_exit_mmap(struct mm_struct *mm);
32 20
33void xen_pgd_pin(pgd_t *pgd);
34//void xen_pgd_unpin(pgd_t *pgd);
35
36pteval_t xen_pte_val(pte_t); 21pteval_t xen_pte_val(pte_t);
37pmdval_t xen_pmd_val(pmd_t); 22pmdval_t xen_pmd_val(pmd_t);
38pgdval_t xen_pgd_val(pgd_t); 23pgdval_t xen_pgd_val(pgd_t);
@@ -44,13 +29,26 @@ pgd_t xen_make_pgd(pgdval_t);
44void xen_set_pte(pte_t *ptep, pte_t pteval); 29void xen_set_pte(pte_t *ptep, pte_t pteval);
45void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 30void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
46 pte_t *ptep, pte_t pteval); 31 pte_t *ptep, pte_t pteval);
32
33#ifdef CONFIG_X86_PAE
47void xen_set_pte_atomic(pte_t *ptep, pte_t pte); 34void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
35void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
36void xen_pmd_clear(pmd_t *pmdp);
37#endif /* CONFIG_X86_PAE */
38
48void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); 39void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
49void xen_set_pud(pud_t *ptr, pud_t val); 40void xen_set_pud(pud_t *ptr, pud_t val);
50void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); 41void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
51void xen_set_pud_hyper(pud_t *ptr, pud_t val); 42void xen_set_pud_hyper(pud_t *ptr, pud_t val);
52void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 43
53void xen_pmd_clear(pmd_t *pmdp); 44#if PAGETABLE_LEVELS == 4
45pudval_t xen_pud_val(pud_t pud);
46pud_t xen_make_pud(pudval_t pudval);
47void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
48void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
49#endif
50
51pgd_t *xen_get_user_pgd(pgd_t *pgd);
54 52
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 53pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 54void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 3c63c4da7ed1..8ea8a0d0b0de 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -21,16 +21,20 @@
21 */ 21 */
22#include <linux/percpu.h> 22#include <linux/percpu.h>
23#include <linux/hardirq.h> 23#include <linux/hardirq.h>
24#include <linux/debugfs.h>
24 25
25#include <asm/xen/hypercall.h> 26#include <asm/xen/hypercall.h>
26 27
27#include "multicalls.h" 28#include "multicalls.h"
29#include "debugfs.h"
30
31#define MC_BATCH 32
28 32
29#define MC_DEBUG 1 33#define MC_DEBUG 1
30 34
31#define MC_BATCH 32
32#define MC_ARGS (MC_BATCH * 16) 35#define MC_ARGS (MC_BATCH * 16)
33 36
37
34struct mc_buffer { 38struct mc_buffer {
35 struct multicall_entry entries[MC_BATCH]; 39 struct multicall_entry entries[MC_BATCH];
36#if MC_DEBUG 40#if MC_DEBUG
@@ -47,6 +51,76 @@ struct mc_buffer {
47static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); 51static DEFINE_PER_CPU(struct mc_buffer, mc_buffer);
48DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); 52DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags);
49 53
54/* flush reasons 0- slots, 1- args, 2- callbacks */
55enum flush_reasons
56{
57 FL_SLOTS,
58 FL_ARGS,
59 FL_CALLBACKS,
60
61 FL_N_REASONS
62};
63
64#ifdef CONFIG_XEN_DEBUG_FS
65#define NHYPERCALLS 40 /* not really */
66
67static struct {
68 unsigned histo[MC_BATCH+1];
69
70 unsigned issued;
71 unsigned arg_total;
72 unsigned hypercalls;
73 unsigned histo_hypercalls[NHYPERCALLS];
74
75 unsigned flush[FL_N_REASONS];
76} mc_stats;
77
78static u8 zero_stats;
79
80static inline void check_zero(void)
81{
82 if (unlikely(zero_stats)) {
83 memset(&mc_stats, 0, sizeof(mc_stats));
84 zero_stats = 0;
85 }
86}
87
88static void mc_add_stats(const struct mc_buffer *mc)
89{
90 int i;
91
92 check_zero();
93
94 mc_stats.issued++;
95 mc_stats.hypercalls += mc->mcidx;
96 mc_stats.arg_total += mc->argidx;
97
98 mc_stats.histo[mc->mcidx]++;
99 for(i = 0; i < mc->mcidx; i++) {
100 unsigned op = mc->entries[i].op;
101 if (op < NHYPERCALLS)
102 mc_stats.histo_hypercalls[op]++;
103 }
104}
105
106static void mc_stats_flush(enum flush_reasons idx)
107{
108 check_zero();
109
110 mc_stats.flush[idx]++;
111}
112
113#else /* !CONFIG_XEN_DEBUG_FS */
114
115static inline void mc_add_stats(const struct mc_buffer *mc)
116{
117}
118
119static inline void mc_stats_flush(enum flush_reasons idx)
120{
121}
122#endif /* CONFIG_XEN_DEBUG_FS */
123
50void xen_mc_flush(void) 124void xen_mc_flush(void)
51{ 125{
52 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 126 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
@@ -60,6 +134,8 @@ void xen_mc_flush(void)
60 something in the middle */ 134 something in the middle */
61 local_irq_save(flags); 135 local_irq_save(flags);
62 136
137 mc_add_stats(b);
138
63 if (b->mcidx) { 139 if (b->mcidx) {
64#if MC_DEBUG 140#if MC_DEBUG
65 memcpy(b->debug, b->entries, 141 memcpy(b->debug, b->entries,
@@ -76,6 +152,7 @@ void xen_mc_flush(void)
76 if (ret) { 152 if (ret) {
77 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", 153 printk(KERN_ERR "%d multicall(s) failed: cpu %d\n",
78 ret, smp_processor_id()); 154 ret, smp_processor_id());
155 dump_stack();
79 for (i = 0; i < b->mcidx; i++) { 156 for (i = 0; i < b->mcidx; i++) {
80 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", 157 printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n",
81 i+1, b->mcidx, 158 i+1, b->mcidx,
@@ -114,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args)
114 191
115 if (b->mcidx == MC_BATCH || 192 if (b->mcidx == MC_BATCH ||
116 (argidx + args) > MC_ARGS) { 193 (argidx + args) > MC_ARGS) {
194 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
117 xen_mc_flush(); 195 xen_mc_flush();
118 argidx = roundup(b->argidx, sizeof(u64)); 196 argidx = roundup(b->argidx, sizeof(u64));
119 } 197 }
@@ -157,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data)
157 struct mc_buffer *b = &__get_cpu_var(mc_buffer); 235 struct mc_buffer *b = &__get_cpu_var(mc_buffer);
158 struct callback *cb; 236 struct callback *cb;
159 237
160 if (b->cbidx == MC_BATCH) 238 if (b->cbidx == MC_BATCH) {
239 mc_stats_flush(FL_CALLBACKS);
161 xen_mc_flush(); 240 xen_mc_flush();
241 }
162 242
163 cb = &b->callbacks[b->cbidx++]; 243 cb = &b->callbacks[b->cbidx++];
164 cb->fn = fn; 244 cb->fn = fn;
165 cb->data = data; 245 cb->data = data;
166} 246}
247
248#ifdef CONFIG_XEN_DEBUG_FS
249
250static struct dentry *d_mc_debug;
251
252static int __init xen_mc_debugfs(void)
253{
254 struct dentry *d_xen = xen_init_debugfs();
255
256 if (d_xen == NULL)
257 return -ENOMEM;
258
259 d_mc_debug = debugfs_create_dir("multicalls", d_xen);
260
261 debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats);
262
263 debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued);
264 debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls);
265 debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total);
266
267 xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug,
268 mc_stats.histo, MC_BATCH);
269 xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug,
270 mc_stats.histo_hypercalls, NHYPERCALLS);
271 xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug,
272 mc_stats.flush, FL_N_REASONS);
273
274 return 0;
275}
276fs_initcall(xen_mc_debugfs);
277
278#endif /* CONFIG_XEN_DEBUG_FS */
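
The statistics added to multicalls.c distinguish why a batch had to be flushed: it ran out of entry slots, argument space, or callback slots. A compact userspace model of that bookkeeping; the limits and names roughly mirror MC_BATCH/MC_ARGS but are assumptions, not the kernel's buffer:

/* Toy model of the multicall flush accounting; not the kernel's buffer. */
#include <stdio.h>

enum flush_reason { FL_SLOTS, FL_ARGS, FL_CALLBACKS, FL_N_REASONS };

#define BATCH 32
#define ARGS  (BATCH * 16)

static unsigned flush_count[FL_N_REASONS];
static unsigned mcidx, argidx;

static void flush(enum flush_reason why)
{
	flush_count[why]++;
	mcidx = argidx = 0;              /* pretend the hypercall was issued */
}

static void queue_call(unsigned args)
{
	if (mcidx == BATCH || argidx + args > ARGS)
		flush(mcidx == BATCH ? FL_SLOTS : FL_ARGS);
	mcidx++;
	argidx += args;
}

int main(void)
{
	for (int i = 0; i < 100; i++)
		queue_call(20);              /* arg space runs out before slots do */
	printf("slot flushes: %u, arg flushes: %u\n",
	       flush_count[FL_SLOTS], flush_count[FL_ARGS]);
	return 0;
}
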
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index e0a39595bde3..d67901083888 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -42,7 +42,7 @@ char * __init xen_memory_setup(void)
42 42
43 e820.nr_map = 0; 43 e820.nr_map = 0;
44 44
45 e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); 45 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM);
46 46
47 /* 47 /*
48 * Even though this is normal, usable memory under Xen, reserve 48 * Even though this is normal, usable memory under Xen, reserve
@@ -83,30 +83,72 @@ static void xen_idle(void)
83 83
84/* 84/*
85 * Set the bit indicating "nosegneg" library variants should be used. 85 * Set the bit indicating "nosegneg" library variants should be used.
86 * We only need to bother in pure 32-bit mode; compat 32-bit processes
87 * can have un-truncated segments, so wrapping around is allowed.
86 */ 88 */
87static void __init fiddle_vdso(void) 89static void __init fiddle_vdso(void)
88{ 90{
89 extern const char vdso32_default_start; 91#ifdef CONFIG_X86_32
90 u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); 92 u32 *mask;
93 mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK);
91 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; 94 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
95 mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK);
96 *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT;
97#endif
92} 98}
93 99
94void xen_enable_sysenter(void) 100static __cpuinit int register_callback(unsigned type, const void *func)
95{ 101{
96 int cpu = smp_processor_id(); 102 struct callback_register callback = {
97 extern void xen_sysenter_target(void); 103 .type = type,
98 /* Mask events on entry, even though they get enabled immediately */ 104 .address = XEN_CALLBACK(__KERNEL_CS, func),
99 static struct callback_register sysenter = {
100 .type = CALLBACKTYPE_sysenter,
101 .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target },
102 .flags = CALLBACKF_mask_events, 105 .flags = CALLBACKF_mask_events,
103 }; 106 };
104 107
105 if (!boot_cpu_has(X86_FEATURE_SEP) || 108 return HYPERVISOR_callback_op(CALLBACKOP_register, &callback);
106 HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { 109}
107 clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); 110
108 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); 111void __cpuinit xen_enable_sysenter(void)
112{
113 extern void xen_sysenter_target(void);
114 int ret;
115 unsigned sysenter_feature;
116
117#ifdef CONFIG_X86_32
118 sysenter_feature = X86_FEATURE_SEP;
119#else
120 sysenter_feature = X86_FEATURE_SYSENTER32;
121#endif
122
123 if (!boot_cpu_has(sysenter_feature))
124 return;
125
126 ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target);
 127 if (ret != 0)
128 setup_clear_cpu_cap(sysenter_feature);
129}
130
131void __cpuinit xen_enable_syscall(void)
132{
133#ifdef CONFIG_X86_64
134 int ret;
135 extern void xen_syscall_target(void);
136 extern void xen_syscall32_target(void);
137
138 ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target);
139 if (ret != 0) {
140 printk(KERN_ERR "Failed to set syscall callback: %d\n", ret);
141 /* Pretty fatal; 64-bit userspace has no other
142 mechanism for syscalls. */
109 } 143 }
144
145 if (boot_cpu_has(X86_FEATURE_SYSCALL32)) {
146 ret = register_callback(CALLBACKTYPE_syscall32,
147 xen_syscall32_target);
148 if (ret != 0)
149 setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
150 }
151#endif /* CONFIG_X86_64 */
110} 152}
111 153
112void __init xen_arch_setup(void) 154void __init xen_arch_setup(void)
@@ -120,10 +162,12 @@ void __init xen_arch_setup(void)
120 if (!xen_feature(XENFEAT_auto_translated_physmap)) 162 if (!xen_feature(XENFEAT_auto_translated_physmap))
121 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); 163 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3);
122 164
123 HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, 165 if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) ||
124 __KERNEL_CS, (unsigned long)xen_failsafe_callback); 166 register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback))
167 BUG();
125 168
126 xen_enable_sysenter(); 169 xen_enable_sysenter();
170 xen_enable_syscall();
127 171
128 set_iopl.iopl = 1; 172 set_iopl.iopl = 1;
129 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); 173 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
@@ -143,11 +187,6 @@ void __init xen_arch_setup(void)
143 187
144 pm_idle = xen_idle; 188 pm_idle = xen_idle;
145 189
146#ifdef CONFIG_SMP
147 /* fill cpus_possible with all available cpus */
148 xen_fill_possible_map();
149#endif
150
151 paravirt_disable_iospace(); 190 paravirt_disable_iospace();
152 191
153 fiddle_vdso(); 192 fiddle_vdso();
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 233156f39b7f..d77da613b1d2 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -11,8 +11,6 @@
11 * useful topology information for the kernel to make use of. As a 11 * useful topology information for the kernel to make use of. As a
12 * result, all CPUs are treated as if they're single-core and 12 * result, all CPUs are treated as if they're single-core and
13 * single-threaded. 13 * single-threaded.
14 *
15 * This does not handle HOTPLUG_CPU yet.
16 */ 14 */
17#include <linux/sched.h> 15#include <linux/sched.h>
18#include <linux/err.h> 16#include <linux/err.h>
@@ -61,22 +59,37 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
61 return IRQ_HANDLED; 59 return IRQ_HANDLED;
62} 60}
63 61
64static __cpuinit void cpu_bringup_and_idle(void) 62static __cpuinit void cpu_bringup(void)
65{ 63{
66 int cpu = smp_processor_id(); 64 int cpu = smp_processor_id();
67 65
68 cpu_init(); 66 cpu_init();
67 touch_softlockup_watchdog();
68 preempt_disable();
69
69 xen_enable_sysenter(); 70 xen_enable_sysenter();
71 xen_enable_syscall();
70 72
71 preempt_disable(); 73 cpu = smp_processor_id();
72 per_cpu(cpu_state, cpu) = CPU_ONLINE; 74 smp_store_cpu_info(cpu);
75 cpu_data(cpu).x86_max_cores = 1;
76 set_cpu_sibling_map(cpu);
73 77
74 xen_setup_cpu_clockevents(); 78 xen_setup_cpu_clockevents();
75 79
80 cpu_set(cpu, cpu_online_map);
81 x86_write_percpu(cpu_state, CPU_ONLINE);
82 wmb();
83
76 /* We can take interrupts now: we're officially "up". */ 84 /* We can take interrupts now: we're officially "up". */
77 local_irq_enable(); 85 local_irq_enable();
78 86
79 wmb(); /* make sure everything is out */ 87 wmb(); /* make sure everything is out */
88}
89
90static __cpuinit void cpu_bringup_and_idle(void)
91{
92 cpu_bringup();
80 cpu_idle(); 93 cpu_idle();
81} 94}
82 95
@@ -141,56 +154,39 @@ static int xen_smp_intr_init(unsigned int cpu)
141 return rc; 154 return rc;
142} 155}
143 156
144void __init xen_fill_possible_map(void) 157static void __init xen_fill_possible_map(void)
145{ 158{
146 int i, rc; 159 int i, rc;
147 160
148 for (i = 0; i < NR_CPUS; i++) { 161 for (i = 0; i < NR_CPUS; i++) {
149 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 162 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
150 if (rc >= 0) 163 if (rc >= 0) {
164 num_processors++;
151 cpu_set(i, cpu_possible_map); 165 cpu_set(i, cpu_possible_map);
166 }
152 } 167 }
153} 168}
154 169
155void __init xen_smp_prepare_boot_cpu(void) 170static void __init xen_smp_prepare_boot_cpu(void)
156{ 171{
157 int cpu;
158
159 BUG_ON(smp_processor_id() != 0); 172 BUG_ON(smp_processor_id() != 0);
160 native_smp_prepare_boot_cpu(); 173 native_smp_prepare_boot_cpu();
161 174
162 /* We've switched to the "real" per-cpu gdt, so make sure the 175 /* We've switched to the "real" per-cpu gdt, so make sure the
163 old memory can be recycled */ 176 old memory can be recycled */
164 make_lowmem_page_readwrite(&per_cpu__gdt_page); 177 make_lowmem_page_readwrite(&per_cpu_var(gdt_page));
165
166 for_each_possible_cpu(cpu) {
167 cpus_clear(per_cpu(cpu_sibling_map, cpu));
168 /*
169 * cpu_core_map lives in a per cpu area that is cleared
170 * when the per cpu array is allocated.
171 *
172 * cpus_clear(per_cpu(cpu_core_map, cpu));
173 */
174 }
175 178
176 xen_setup_vcpu_info_placement(); 179 xen_setup_vcpu_info_placement();
177} 180}
178 181
179void __init xen_smp_prepare_cpus(unsigned int max_cpus) 182static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
180{ 183{
181 unsigned cpu; 184 unsigned cpu;
182 185
183 for_each_possible_cpu(cpu) { 186 xen_init_lock_cpu(0);
184 cpus_clear(per_cpu(cpu_sibling_map, cpu));
185 /*
186 * cpu_core_ map will be zeroed when the per
187 * cpu area is allocated.
188 *
189 * cpus_clear(per_cpu(cpu_core_map, cpu));
190 */
191 }
192 187
193 smp_store_cpu_info(0); 188 smp_store_cpu_info(0);
189 cpu_data(0).x86_max_cores = 1;
194 set_cpu_sibling_map(0); 190 set_cpu_sibling_map(0);
195 191
196 if (xen_smp_intr_init(0)) 192 if (xen_smp_intr_init(0))
@@ -217,15 +213,13 @@ void __init xen_smp_prepare_cpus(unsigned int max_cpus)
217 213
218 cpu_set(cpu, cpu_present_map); 214 cpu_set(cpu, cpu_present_map);
219 } 215 }
220
221 //init_xenbus_allowed_cpumask();
222} 216}
223 217
224static __cpuinit int 218static __cpuinit int
225cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 219cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
226{ 220{
227 struct vcpu_guest_context *ctxt; 221 struct vcpu_guest_context *ctxt;
228 struct gdt_page *gdt = &per_cpu(gdt_page, cpu); 222 struct desc_struct *gdt;
229 223
230 if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) 224 if (cpu_test_and_set(cpu, xen_cpu_initialized_map))
231 return 0; 225 return 0;
@@ -234,12 +228,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
234 if (ctxt == NULL) 228 if (ctxt == NULL)
235 return -ENOMEM; 229 return -ENOMEM;
236 230
231 gdt = get_cpu_gdt_table(cpu);
232
237 ctxt->flags = VGCF_IN_KERNEL; 233 ctxt->flags = VGCF_IN_KERNEL;
238 ctxt->user_regs.ds = __USER_DS; 234 ctxt->user_regs.ds = __USER_DS;
239 ctxt->user_regs.es = __USER_DS; 235 ctxt->user_regs.es = __USER_DS;
240 ctxt->user_regs.fs = __KERNEL_PERCPU;
241 ctxt->user_regs.gs = 0;
242 ctxt->user_regs.ss = __KERNEL_DS; 236 ctxt->user_regs.ss = __KERNEL_DS;
237#ifdef CONFIG_X86_32
238 ctxt->user_regs.fs = __KERNEL_PERCPU;
239#endif
243 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; 240 ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
244 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ 241 ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
245 242
@@ -249,11 +246,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
249 246
250 ctxt->ldt_ents = 0; 247 ctxt->ldt_ents = 0;
251 248
252 BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); 249 BUG_ON((unsigned long)gdt & ~PAGE_MASK);
253 make_lowmem_page_readonly(gdt->gdt); 250 make_lowmem_page_readonly(gdt);
254 251
255 ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); 252 ctxt->gdt_frames[0] = virt_to_mfn(gdt);
256 ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); 253 ctxt->gdt_ents = GDT_ENTRIES;
257 254
258 ctxt->user_regs.cs = __KERNEL_CS; 255 ctxt->user_regs.cs = __KERNEL_CS;
259 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); 256 ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
@@ -261,9 +258,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
261 ctxt->kernel_ss = __KERNEL_DS; 258 ctxt->kernel_ss = __KERNEL_DS;
262 ctxt->kernel_sp = idle->thread.sp0; 259 ctxt->kernel_sp = idle->thread.sp0;
263 260
261#ifdef CONFIG_X86_32
264 ctxt->event_callback_cs = __KERNEL_CS; 262 ctxt->event_callback_cs = __KERNEL_CS;
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
266 ctxt->failsafe_callback_cs = __KERNEL_CS; 263 ctxt->failsafe_callback_cs = __KERNEL_CS;
264#endif
265 ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback;
267 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; 266 ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback;
268 267
269 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); 268 per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
@@ -276,21 +275,33 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
276 return 0; 275 return 0;
277} 276}
278 277
279int __cpuinit xen_cpu_up(unsigned int cpu) 278static int __cpuinit xen_cpu_up(unsigned int cpu)
280{ 279{
281 struct task_struct *idle = idle_task(cpu); 280 struct task_struct *idle = idle_task(cpu);
282 int rc; 281 int rc;
283 282
284#if 0 283#ifdef CONFIG_X86_64
285 rc = cpu_up_check(cpu); 284 /* Allocate node local memory for AP pdas */
286 if (rc) 285 WARN_ON(cpu == 0);
287 return rc; 286 if (cpu > 0) {
287 rc = get_local_pda(cpu);
288 if (rc)
289 return rc;
290 }
288#endif 291#endif
289 292
293#ifdef CONFIG_X86_32
290 init_gdt(cpu); 294 init_gdt(cpu);
291 per_cpu(current_task, cpu) = idle; 295 per_cpu(current_task, cpu) = idle;
292 irq_ctx_init(cpu); 296 irq_ctx_init(cpu);
297#else
298 cpu_pda(cpu)->pcurrent = idle;
299 clear_tsk_thread_flag(idle, TIF_FORK);
300#endif
293 xen_setup_timer(cpu); 301 xen_setup_timer(cpu);
302 xen_init_lock_cpu(cpu);
303
304 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
294 305
295 /* make sure interrupts start blocked */ 306 /* make sure interrupts start blocked */
296 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; 307 per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -306,23 +317,75 @@ int __cpuinit xen_cpu_up(unsigned int cpu)
306 if (rc) 317 if (rc)
307 return rc; 318 return rc;
308 319
309 smp_store_cpu_info(cpu);
310 set_cpu_sibling_map(cpu);
311 /* This must be done before setting cpu_online_map */
312 wmb();
313
314 cpu_set(cpu, cpu_online_map);
315
316 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); 320 rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
317 BUG_ON(rc); 321 BUG_ON(rc);
318 322
 323 while (per_cpu(cpu_state, cpu) != CPU_ONLINE) {
324 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
325 barrier();
326 }
327
328 return 0;
329}
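
xen_cpu_up() now waits for the new vcpu to report CPU_ONLINE, yielding to the hypervisor between checks rather than busy-spinning. A rough pthread analogue of that handshake, where sched_yield() stands in for SCHEDOP_yield and everything else is illustrative:

/* Illustrative handshake: boot thread yields until the "AP" reports online. */
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static volatile int cpu_state;       /* 0 = up-prepare, 1 = online */

static void *ap_bringup(void *unused)
{
	/* real code: cpu_init(), sibling maps, clockevents, ... */
	cpu_state = 1;                   /* x86_write_percpu(cpu_state, CPU_ONLINE) */
	return NULL;
}

int main(void)
{
	pthread_t ap;

	pthread_create(&ap, NULL, ap_bringup, NULL);
	while (cpu_state != 1)
		sched_yield();               /* kernel: HYPERVISOR_sched_op(SCHEDOP_yield, 0) */
	pthread_join(ap, NULL);
	printf("secondary online\n");
	return 0;
}
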
330
331static void xen_smp_cpus_done(unsigned int max_cpus)
332{
333}
334
335#ifdef CONFIG_HOTPLUG_CPU
336static int xen_cpu_disable(void)
337{
338 unsigned int cpu = smp_processor_id();
339 if (cpu == 0)
340 return -EBUSY;
341
342 cpu_disable_common();
343
344 load_cr3(swapper_pg_dir);
319 return 0; 345 return 0;
320} 346}
321 347
322void xen_smp_cpus_done(unsigned int max_cpus) 348static void xen_cpu_die(unsigned int cpu)
349{
350 while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) {
351 current->state = TASK_UNINTERRUPTIBLE;
352 schedule_timeout(HZ/10);
353 }
354 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL);
355 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL);
356 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL);
357 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL);
358 xen_uninit_lock_cpu(cpu);
359 xen_teardown_timer(cpu);
360
361 if (num_online_cpus() == 1)
362 alternatives_smp_switch(0);
363}
364
365static void xen_play_dead(void)
366{
367 play_dead_common();
368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
369 cpu_bringup();
370}
371
372#else /* !CONFIG_HOTPLUG_CPU */
373static int xen_cpu_disable(void)
374{
375 return -ENOSYS;
376}
377
378static void xen_cpu_die(unsigned int cpu)
323{ 379{
380 BUG();
324} 381}
325 382
383static void xen_play_dead(void)
384{
385 BUG();
386}
387
388#endif
326static void stop_self(void *v) 389static void stop_self(void *v)
327{ 390{
328 int cpu = smp_processor_id(); 391 int cpu = smp_processor_id();
@@ -335,12 +398,12 @@ static void stop_self(void *v)
335 BUG(); 398 BUG();
336} 399}
337 400
338void xen_smp_send_stop(void) 401static void xen_smp_send_stop(void)
339{ 402{
340 smp_call_function(stop_self, NULL, 0); 403 smp_call_function(stop_self, NULL, 0);
341} 404}
342 405
343void xen_smp_send_reschedule(int cpu) 406static void xen_smp_send_reschedule(int cpu)
344{ 407{
345 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); 408 xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR);
346} 409}
@@ -351,18 +414,18 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector)
351 414
352 cpus_and(mask, mask, cpu_online_map); 415 cpus_and(mask, mask, cpu_online_map);
353 416
354 for_each_cpu_mask(cpu, mask) 417 for_each_cpu_mask_nr(cpu, mask)
355 xen_send_IPI_one(cpu, vector); 418 xen_send_IPI_one(cpu, vector);
356} 419}
357 420
358void xen_smp_send_call_function_ipi(cpumask_t mask) 421static void xen_smp_send_call_function_ipi(cpumask_t mask)
359{ 422{
360 int cpu; 423 int cpu;
361 424
362 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR); 425 xen_send_IPI_mask(mask, XEN_CALL_FUNCTION_VECTOR);
363 426
364 /* Make sure other vcpus get a chance to run if they need to. */ 427 /* Make sure other vcpus get a chance to run if they need to. */
365 for_each_cpu_mask(cpu, mask) { 428 for_each_cpu_mask_nr(cpu, mask) {
366 if (xen_vcpu_stolen(cpu)) { 429 if (xen_vcpu_stolen(cpu)) {
367 HYPERVISOR_sched_op(SCHEDOP_yield, 0); 430 HYPERVISOR_sched_op(SCHEDOP_yield, 0);
368 break; 431 break;
@@ -370,7 +433,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask)
370 } 433 }
371} 434}
372 435
373void xen_smp_send_call_function_single_ipi(int cpu) 436static void xen_smp_send_call_function_single_ipi(int cpu)
374{ 437{
375 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); 438 xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR);
376} 439}
@@ -379,7 +442,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id)
379{ 442{
380 irq_enter(); 443 irq_enter();
381 generic_smp_call_function_interrupt(); 444 generic_smp_call_function_interrupt();
445#ifdef CONFIG_X86_32
382 __get_cpu_var(irq_stat).irq_call_count++; 446 __get_cpu_var(irq_stat).irq_call_count++;
447#else
448 add_pda(irq_call_count, 1);
449#endif
383 irq_exit(); 450 irq_exit();
384 451
385 return IRQ_HANDLED; 452 return IRQ_HANDLED;
@@ -389,8 +456,36 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
389{ 456{
390 irq_enter(); 457 irq_enter();
391 generic_smp_call_function_single_interrupt(); 458 generic_smp_call_function_single_interrupt();
459#ifdef CONFIG_X86_32
392 __get_cpu_var(irq_stat).irq_call_count++; 460 __get_cpu_var(irq_stat).irq_call_count++;
461#else
462 add_pda(irq_call_count, 1);
463#endif
393 irq_exit(); 464 irq_exit();
394 465
395 return IRQ_HANDLED; 466 return IRQ_HANDLED;
396} 467}
468
469static const struct smp_ops xen_smp_ops __initdata = {
470 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
471 .smp_prepare_cpus = xen_smp_prepare_cpus,
472 .smp_cpus_done = xen_smp_cpus_done,
473
474 .cpu_up = xen_cpu_up,
475 .cpu_die = xen_cpu_die,
476 .cpu_disable = xen_cpu_disable,
477 .play_dead = xen_play_dead,
478
479 .smp_send_stop = xen_smp_send_stop,
480 .smp_send_reschedule = xen_smp_send_reschedule,
481
482 .send_call_func_ipi = xen_smp_send_call_function_ipi,
483 .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi,
484};
485
486void __init xen_smp_init(void)
487{
488 smp_ops = xen_smp_ops;
489 xen_fill_possible_map();
490 xen_init_spinlocks();
491}
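
xen_smp_init() simply replaces the whole smp_ops vector with Xen-specific implementations before any secondary CPU is touched. The shape of that ops-vector override, sketched in plain C with invented operation names (this is not the smp_ops structure itself):

/* Sketch of an ops-vector override; names are invented, not smp_ops. */
#include <stdio.h>

struct cpu_ops {
	void (*prepare)(void);
	int  (*bring_up)(unsigned cpu);
};

static void generic_prepare(void)    { puts("generic prepare"); }
static int  generic_up(unsigned cpu) { printf("generic up %u\n", cpu); return 0; }
static void xen_prepare(void)        { puts("xen prepare"); }
static int  xen_up(unsigned cpu)     { printf("xen up %u\n", cpu); return 0; }

static struct cpu_ops ops = { generic_prepare, generic_up };

static void xen_init(void)
{
	static const struct cpu_ops xen_ops = { xen_prepare, xen_up };
	ops = xen_ops;                   /* like smp_ops = xen_smp_ops */
}

int main(void)
{
	xen_init();
	ops.prepare();
	return ops.bring_up(1);
}
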
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
new file mode 100644
index 000000000000..5601506f2dd9
--- /dev/null
+++ b/arch/x86/xen/spinlock.c
@@ -0,0 +1,428 @@
1/*
2 * Split spinlock implementation out into its own file, so it can be
3 * compiled in a FTRACE-compatible way.
4 */
5#include <linux/kernel_stat.h>
6#include <linux/spinlock.h>
7#include <linux/debugfs.h>
8#include <linux/log2.h>
9
10#include <asm/paravirt.h>
11
12#include <xen/interface/xen.h>
13#include <xen/events.h>
14
15#include "xen-ops.h"
16#include "debugfs.h"
17
18#ifdef CONFIG_XEN_DEBUG_FS
19static struct xen_spinlock_stats
20{
21 u64 taken;
22 u32 taken_slow;
23 u32 taken_slow_nested;
24 u32 taken_slow_pickup;
25 u32 taken_slow_spurious;
26 u32 taken_slow_irqenable;
27
28 u64 released;
29 u32 released_slow;
30 u32 released_slow_kicked;
31
32#define HISTO_BUCKETS 30
33 u32 histo_spin_total[HISTO_BUCKETS+1];
34 u32 histo_spin_spinning[HISTO_BUCKETS+1];
35 u32 histo_spin_blocked[HISTO_BUCKETS+1];
36
37 u64 time_total;
38 u64 time_spinning;
39 u64 time_blocked;
40} spinlock_stats;
41
42static u8 zero_stats;
43
44static unsigned lock_timeout = 1 << 10;
45#define TIMEOUT lock_timeout
46
47static inline void check_zero(void)
48{
49 if (unlikely(zero_stats)) {
50 memset(&spinlock_stats, 0, sizeof(spinlock_stats));
51 zero_stats = 0;
52 }
53}
54
55#define ADD_STATS(elem, val) \
56 do { check_zero(); spinlock_stats.elem += (val); } while(0)
57
58static inline u64 spin_time_start(void)
59{
60 return xen_clocksource_read();
61}
62
63static void __spin_time_accum(u64 delta, u32 *array)
64{
65 unsigned index = ilog2(delta);
66
67 check_zero();
68
69 if (index < HISTO_BUCKETS)
70 array[index]++;
71 else
72 array[HISTO_BUCKETS]++;
73}
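
__spin_time_accum() buckets each measured delta by its base-2 logarithm, so a handful of histogram slots covers many orders of magnitude. The same bucketing in standalone C, using a portable bit-scan loop in place of the kernel's ilog2():

/* Sketch of log2 bucketing as used for the spinlock time histograms. */
#include <stdio.h>
#include <stdint.h>

#define HISTO_BUCKETS 30

static unsigned histo[HISTO_BUCKETS + 1];

static unsigned ilog2_u64(uint64_t v)    /* portable stand-in for ilog2() */
{
	unsigned r = 0;
	while (v >>= 1)
		r++;
	return r;
}

static void time_accum(uint64_t delta)
{
	unsigned index = ilog2_u64(delta);

	if (index < HISTO_BUCKETS)
		histo[index]++;
	else
		histo[HISTO_BUCKETS]++;      /* everything huge lands in the last bucket */
}

int main(void)
{
	time_accum(3);            /* bucket 1 */
	time_accum(1000);         /* bucket 9 */
	time_accum(1ULL << 40);   /* overflow bucket */
	printf("%u %u %u\n", histo[1], histo[9], histo[HISTO_BUCKETS]);
	return 0;
}
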
74
75static inline void spin_time_accum_spinning(u64 start)
76{
77 u32 delta = xen_clocksource_read() - start;
78
79 __spin_time_accum(delta, spinlock_stats.histo_spin_spinning);
80 spinlock_stats.time_spinning += delta;
81}
82
83static inline void spin_time_accum_total(u64 start)
84{
85 u32 delta = xen_clocksource_read() - start;
86
87 __spin_time_accum(delta, spinlock_stats.histo_spin_total);
88 spinlock_stats.time_total += delta;
89}
90
91static inline void spin_time_accum_blocked(u64 start)
92{
93 u32 delta = xen_clocksource_read() - start;
94
95 __spin_time_accum(delta, spinlock_stats.histo_spin_blocked);
96 spinlock_stats.time_blocked += delta;
97}
98#else /* !CONFIG_XEN_DEBUG_FS */
99#define TIMEOUT (1 << 10)
100#define ADD_STATS(elem, val) do { (void)(val); } while(0)
101
102static inline u64 spin_time_start(void)
103{
104 return 0;
105}
106
107static inline void spin_time_accum_total(u64 start)
108{
109}
110static inline void spin_time_accum_spinning(u64 start)
111{
112}
113static inline void spin_time_accum_blocked(u64 start)
114{
115}
116#endif /* CONFIG_XEN_DEBUG_FS */
117
118struct xen_spinlock {
119 unsigned char lock; /* 0 -> free; 1 -> locked */
120 unsigned short spinners; /* count of waiting cpus */
121};
122
123static int xen_spin_is_locked(struct raw_spinlock *lock)
124{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126
127 return xl->lock != 0;
128}
129
130static int xen_spin_is_contended(struct raw_spinlock *lock)
131{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133
134 /* Not strictly true; this is only the count of contended
135 lock-takers entering the slow path. */
136 return xl->spinners != 0;
137}
138
139static int xen_spin_trylock(struct raw_spinlock *lock)
140{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1;
143
144 asm("xchgb %b0,%1"
145 : "+q" (old), "+m" (xl->lock) : : "memory");
146
147 return old == 0;
148}
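
xen_spin_trylock() claims the byte lock with a single exchange: it writes 1 and succeeds only if the previous value was 0. The same operation in self-contained C, using GCC's __sync_lock_test_and_set builtin rather than inline assembly (a toolchain assumption, not the kernel's exact code):

/* Sketch: byte-lock trylock via an atomic exchange. */
#include <stdio.h>

struct byte_lock {
	volatile unsigned char lock;     /* 0 -> free, 1 -> locked */
};

static int trylock(struct byte_lock *l)
{
	/* atomically store 1 and fetch the old value, like "xchgb %b0,%1" */
	unsigned char old = __sync_lock_test_and_set(&l->lock, 1);
	return old == 0;
}

static void unlock(struct byte_lock *l)
{
	__sync_lock_release(&l->lock);   /* store 0 with release semantics */
}

int main(void)
{
	struct byte_lock l = { 0 };

	printf("first try: %d\n", trylock(&l));    /* 1: acquired */
	printf("second try: %d\n", trylock(&l));   /* 0: already held */
	unlock(&l);
	return 0;
}
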
149
150static DEFINE_PER_CPU(int, lock_kicker_irq) = -1;
151static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners);
152
153/*
154 * Mark a cpu as interested in a lock. Returns the CPU's previous
155 * lock of interest, in case we got preempted by an interrupt.
156 */
157static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
158{
159 struct xen_spinlock *prev;
160
161 prev = __get_cpu_var(lock_spinners);
162 __get_cpu_var(lock_spinners) = xl;
163
164 wmb(); /* set lock of interest before count */
165
166 asm(LOCK_PREFIX " incw %0"
167 : "+m" (xl->spinners) : : "memory");
168
169 return prev;
170}
171
172/*
173 * Mark a cpu as no longer interested in a lock. Restores previous
174 * lock of interest (NULL for none).
175 */
176static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev)
177{
178 asm(LOCK_PREFIX " decw %0"
179 : "+m" (xl->spinners) : : "memory");
180 wmb(); /* decrement count before restoring lock */
181 __get_cpu_var(lock_spinners) = prev;
182}
183
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable)
185{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev;
188 int irq = __get_cpu_var(lock_kicker_irq);
189 int ret;
190 unsigned long flags;
191 u64 start;
192
193 /* If kicker interrupts not initialized yet, just spin */
194 if (irq == -1)
195 return 0;
196
197 start = spin_time_start();
198
199 /* announce we're spinning */
200 prev = spinning_lock(xl);
201
202 flags = __raw_local_save_flags();
203 if (irq_enable) {
204 ADD_STATS(taken_slow_irqenable, 1);
205 raw_local_irq_enable();
206 }
207
208 ADD_STATS(taken_slow, 1);
209 ADD_STATS(taken_slow_nested, prev != NULL);
210
211 do {
212 /* clear pending */
213 xen_clear_irq_pending(irq);
214
 215 /* check again to make sure it didn't become free while
 216 we weren't looking */
217 ret = xen_spin_trylock(lock);
218 if (ret) {
219 ADD_STATS(taken_slow_pickup, 1);
220
221 /*
222 * If we interrupted another spinlock while it
223 * was blocking, make sure it doesn't block
224 * without rechecking the lock.
225 */
226 if (prev != NULL)
227 xen_set_irq_pending(irq);
228 goto out;
229 }
230
231 /*
232 * Block until irq becomes pending. If we're
233 * interrupted at this point (after the trylock but
234 * before entering the block), then the nested lock
235 * handler guarantees that the irq will be left
236 * pending if there's any chance the lock became free;
237 * xen_poll_irq() returns immediately if the irq is
238 * pending.
239 */
240 xen_poll_irq(irq);
241 ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq));
242 } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */
243
244 kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq));
245
246out:
247 raw_local_irq_restore(flags);
248 unspinning_lock(xl, prev);
249 spin_time_accum_blocked(start);
250
251 return ret;
252}
253
254static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
255{
256 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
257 unsigned timeout;
258 u8 oldval;
259 u64 start_spin;
260
261 ADD_STATS(taken, 1);
262
263 start_spin = spin_time_start();
264
265 do {
266 u64 start_spin_fast = spin_time_start();
267
268 timeout = TIMEOUT;
269
270 asm("1: xchgb %1,%0\n"
271 " testb %1,%1\n"
272 " jz 3f\n"
273 "2: rep;nop\n"
274 " cmpb $0,%0\n"
275 " je 1b\n"
276 " dec %2\n"
277 " jnz 2b\n"
278 "3:\n"
279 : "+m" (xl->lock), "=q" (oldval), "+r" (timeout)
280 : "1" (1)
281 : "memory");
282
283 spin_time_accum_spinning(start_spin_fast);
284
285 } while (unlikely(oldval != 0 &&
286 (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable))));
287
288 spin_time_accum_total(start_spin);
289}
290
291static void xen_spin_lock(struct raw_spinlock *lock)
292{
293 __xen_spin_lock(lock, false);
294}
295
296static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags)
297{
298 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
299}
300
301static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
302{
303 int cpu;
304
305 ADD_STATS(released_slow, 1);
306
307 for_each_online_cpu(cpu) {
308 /* XXX should mix up next cpu selection */
309 if (per_cpu(lock_spinners, cpu) == xl) {
310 ADD_STATS(released_slow_kicked, 1);
311 xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR);
312 break;
313 }
314 }
315}
316
317static void xen_spin_unlock(struct raw_spinlock *lock)
318{
319 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
320
321 ADD_STATS(released, 1);
322
323 smp_wmb(); /* make sure no writes get moved after unlock */
324 xl->lock = 0; /* release lock */
325
326 /* make sure unlock happens before kick */
327 barrier();
328
329 if (unlikely(xl->spinners))
330 xen_spin_unlock_slow(xl);
331}
332
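/*
 * The per-cpu lock-kicker IPI bound below is disabled as soon as it is set
 * up and is only ever consumed via xen_poll_irq() in the lock slow path, so
 * this handler should never actually run.
 */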
333static irqreturn_t dummy_handler(int irq, void *dev_id)
334{
335 BUG();
336 return IRQ_HANDLED;
337}
338
339void __cpuinit xen_init_lock_cpu(int cpu)
340{
341 int irq;
342 const char *name;
343
344 name = kasprintf(GFP_KERNEL, "spinlock%d", cpu);
345 irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR,
346 cpu,
347 dummy_handler,
348 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING,
349 name,
350 NULL);
351
352 if (irq >= 0) {
353 disable_irq(irq); /* make sure it's never delivered */
354 per_cpu(lock_kicker_irq, cpu) = irq;
355 }
356
357	printk(KERN_DEBUG "cpu %d spinlock event irq %d\n", cpu, irq);
358}
359
360void xen_uninit_lock_cpu(int cpu)
361{
362 unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL);
363}
364
365void __init xen_init_spinlocks(void)
366{
367 pv_lock_ops.spin_is_locked = xen_spin_is_locked;
368 pv_lock_ops.spin_is_contended = xen_spin_is_contended;
369 pv_lock_ops.spin_lock = xen_spin_lock;
370 pv_lock_ops.spin_lock_flags = xen_spin_lock_flags;
371 pv_lock_ops.spin_trylock = xen_spin_trylock;
372 pv_lock_ops.spin_unlock = xen_spin_unlock;
373}
374
375#ifdef CONFIG_XEN_DEBUG_FS
376
377static struct dentry *d_spin_debug;
378
379static int __init xen_spinlock_debugfs(void)
380{
381 struct dentry *d_xen = xen_init_debugfs();
382
383 if (d_xen == NULL)
384 return -ENOMEM;
385
386 d_spin_debug = debugfs_create_dir("spinlocks", d_xen);
387
388 debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats);
389
390 debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout);
391
392 debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken);
393 debugfs_create_u32("taken_slow", 0444, d_spin_debug,
394 &spinlock_stats.taken_slow);
395 debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug,
396 &spinlock_stats.taken_slow_nested);
397 debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug,
398 &spinlock_stats.taken_slow_pickup);
399 debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug,
400 &spinlock_stats.taken_slow_spurious);
401 debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug,
402 &spinlock_stats.taken_slow_irqenable);
403
404 debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released);
405 debugfs_create_u32("released_slow", 0444, d_spin_debug,
406 &spinlock_stats.released_slow);
407 debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug,
408 &spinlock_stats.released_slow_kicked);
409
410 debugfs_create_u64("time_spinning", 0444, d_spin_debug,
411 &spinlock_stats.time_spinning);
412 debugfs_create_u64("time_blocked", 0444, d_spin_debug,
413 &spinlock_stats.time_blocked);
414 debugfs_create_u64("time_total", 0444, d_spin_debug,
415 &spinlock_stats.time_total);
416
417 xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug,
418 spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1);
419 xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug,
420 spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1);
421 xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug,
422 spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1);
423
424 return 0;
425}
426fs_initcall(xen_spinlock_debugfs);
427
428#endif /* CONFIG_XEN_DEBUG_FS */
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 251669a932d4..2a234db5949b 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled)
 	xen_cpu_initialized_map = cpu_online_map;
 #endif
 	xen_vcpu_restore();
-	xen_timer_resume();
 	}
 
 }
 
+void xen_arch_resume(void)
+{
+	/* nothing */
+}
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 685b77470fc3..c9f7cda48ed7 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -30,8 +30,6 @@
 #define TIMER_SLOP 100000
 #define NS_PER_TICK (1000000000LL / HZ)
 
-static cycle_t xen_clocksource_read(void);
-
 /* runstate info updated by Xen */
 static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate);
 
@@ -200,20 +198,13 @@ unsigned long long xen_sched_clock(void)
 /* Get the TSC speed from Xen */
 unsigned long xen_tsc_khz(void)
 {
-	u64 xen_khz = 1000000ULL << 32;
-	const struct pvclock_vcpu_time_info *info =
+	struct pvclock_vcpu_time_info *info =
 		&HYPERVISOR_shared_info->vcpu_info[0].time;
 
-	do_div(xen_khz, info->tsc_to_system_mul);
-	if (info->tsc_shift < 0)
-		xen_khz <<= -info->tsc_shift;
-	else
-		xen_khz >>= info->tsc_shift;
-
-	return xen_khz;
+	return pvclock_tsc_khz(info);
 }
 
-static cycle_t xen_clocksource_read(void)
+cycle_t xen_clocksource_read(void)
 {
 	struct pvclock_vcpu_time_info *src;
 	cycle_t ret;
@@ -452,6 +443,14 @@ void xen_setup_timer(int cpu)
 	setup_runstate_info(cpu);
 }
 
+void xen_teardown_timer(int cpu)
+{
+	struct clock_event_device *evt;
+	BUG_ON(cpu == 0);
+	evt = &per_cpu(xen_clock_events, cpu);
+	unbind_from_irqhandler(evt->irq, NULL);
+}
+
 void xen_setup_cpu_clockevents(void)
 {
 	BUG_ON(preemptible());
diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm_32.S
index 2497a30f41de..42786f59d9c0 100644
--- a/arch/x86/xen/xen-asm.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -298,7 +298,7 @@ check_events:
 	push %eax
 	push %ecx
 	push %edx
-	call force_evtchn_callback
+	call xen_force_evtchn_callback
 	pop %edx
 	pop %ecx
 	pop %eax
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
new file mode 100644
index 000000000000..05794c566e87
--- /dev/null
+++ b/arch/x86/xen/xen-asm_64.S
@@ -0,0 +1,285 @@
1/*
2 Asm versions of Xen pv-ops, suitable for either direct use or inlining.
3 The inline versions are the same as the direct-use versions, with the
4 pre- and post-amble chopped off.
5
6 This code is encoded for size rather than absolute efficiency,
7 with a view to being able to inline as much as possible.
8
9 We only bother with direct forms (ie, vcpu in pda) of the operations
10 here; the indirect forms are better handled in C, since they're
11 generally too large to inline anyway.
12 */
13
14#include <linux/linkage.h>
15
16#include <asm/asm-offsets.h>
17#include <asm/processor-flags.h>
18#include <asm/errno.h>
19#include <asm/segment.h>
20
21#include <xen/interface/xen.h>
22
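/*
   ENDPATCH marks the end of each patchable snippet; RELOC records the offset
   of a call target that needs relocating when the snippet is copied inline
   by the pv-ops patching code (0 if nothing needs fixing up).
 */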
23#define RELOC(x, v) .globl x##_reloc; x##_reloc=v
24#define ENDPATCH(x) .globl x##_end; x##_end=.
25
26/* Pseudo-flag used for virtual NMI, which we don't implement yet */
27#define XEN_EFLAGS_NMI 0x80000000
28
29#if 1
30/*
31 x86-64 does not yet support direct access to percpu variables
32 via a segment override, so we just need to make sure this code
33 never gets used
34 */
35#define BUG ud2a
36#define PER_CPU_VAR(var, off) 0xdeadbeef
37#endif
38
39/*
40 Enable events. This clears the event mask and then tests the
41 pending event status. If there are pending
42 events, then enter the hypervisor to get them handled.
43 */
44ENTRY(xen_irq_enable_direct)
45 BUG
46
47 /* Unmask events */
48 movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
49
50	/* Being preempted here doesn't matter, because the preemption
51	   will deal with any pending interrupts.  The pending check may end
52	   up being run on the wrong CPU, but that doesn't hurt. */
53
54 /* Test for pending */
55 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
56 jz 1f
57
582: call check_events
591:
60ENDPATCH(xen_irq_enable_direct)
61 ret
62 ENDPROC(xen_irq_enable_direct)
63 RELOC(xen_irq_enable_direct, 2b+1)
64
65/*
66 Disabling events is simply a matter of making the event mask
67 non-zero.
68 */
69ENTRY(xen_irq_disable_direct)
70 BUG
71
72 movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
73ENDPATCH(xen_irq_disable_direct)
74 ret
75 ENDPROC(xen_irq_disable_direct)
76 RELOC(xen_irq_disable_direct, 0)
77
78/*
79 (xen_)save_fl is used to get the current interrupt enable status.
80 Callers expect the status to be in X86_EFLAGS_IF, and other bits
81 may be set in the return value. We take advantage of this by
82 making sure that X86_EFLAGS_IF has the right value (and other bits
83 in that byte are 0), but other bits in the return value are
84 undefined. We need to toggle the state of the bit, because
85 Xen and x86 use opposite senses (mask vs enable).
86 */
87ENTRY(xen_save_fl_direct)
88 BUG
89
90 testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
91 setz %ah
92 addb %ah,%ah
93ENDPATCH(xen_save_fl_direct)
94 ret
95 ENDPROC(xen_save_fl_direct)
96 RELOC(xen_save_fl_direct, 0)
97
98/*
99 In principle the caller should be passing us a value returned
100 from xen_save_fl_direct, but for robustness' sake we test only
101 the X86_EFLAGS_IF flag rather than the whole byte. After
102 setting the interrupt mask state, it checks for unmasked
103 pending events and enters the hypervisor to get them delivered
104 if so.
105 */
106ENTRY(xen_restore_fl_direct)
107 BUG
108
109 testb $X86_EFLAGS_IF>>8, %ah
110 setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask)
111	/* Being preempted here doesn't matter, because the preemption
112	   will deal with any pending interrupts.  The pending check may end
113	   up being run on the wrong CPU, but that doesn't hurt. */
114
115 /* check for unmasked and pending */
116 cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending)
117 jz 1f
1182: call check_events
1191:
120ENDPATCH(xen_restore_fl_direct)
121 ret
122 ENDPROC(xen_restore_fl_direct)
123 RELOC(xen_restore_fl_direct, 2b+1)
124
125
126/*
127 Force an event check by making a hypercall,
128 but preserve regs before making the call.
129 */
130check_events:
131 push %rax
132 push %rcx
133 push %rdx
134 push %rsi
135 push %rdi
136 push %r8
137 push %r9
138 push %r10
139 push %r11
140 call xen_force_evtchn_callback
141 pop %r11
142 pop %r10
143 pop %r9
144 pop %r8
145 pop %rdi
146 pop %rsi
147 pop %rdx
148 pop %rcx
149 pop %rax
150 ret
151
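/*
   Xen pushes rcx and r11 on top of the hardware exception frame; reload
   them and drop the two extra words (ret $16) so the handler sees a
   native-looking frame.
 */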
152ENTRY(xen_adjust_exception_frame)
153 mov 8+0(%rsp),%rcx
154 mov 8+8(%rsp),%r11
155 ret $16
156
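/*
   Each hypercall gets a 32-byte stub in the hypercall page; this is the
   address of the iret hypercall's stub.
 */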
157hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32
158/*
159 Xen64 iret frame:
160
161 ss
162 rsp
163 rflags
164 cs
165 rip <-- standard iret frame
166
167 flags
168
169 rcx }
170 r11 }<-- pushed by hypercall page
171rsp -> rax }
172 */
173ENTRY(xen_iret)
174 pushq $0
1751: jmp hypercall_iret
176ENDPATCH(xen_iret)
177RELOC(xen_iret, 1b+1)
178
179/*
180 sysexit is not used for 64-bit processes, so it's
181 only ever used to return to 32-bit compat userspace.
182 */
183ENTRY(xen_sysexit)
184 pushq $__USER32_DS
185 pushq %rcx
186 pushq $X86_EFLAGS_IF
187 pushq $__USER32_CS
188 pushq %rdx
189
190 pushq $0
1911: jmp hypercall_iret
192ENDPATCH(xen_sysexit)
193RELOC(xen_sysexit, 1b+1)
194
195ENTRY(xen_sysret64)
196 /* We're already on the usermode stack at this point, but still
197 with the kernel gs, so we can easily switch back */
198 movq %rsp, %gs:pda_oldrsp
199 movq %gs:pda_kernelstack,%rsp
200
201 pushq $__USER_DS
202 pushq %gs:pda_oldrsp
203 pushq %r11
204 pushq $__USER_CS
205 pushq %rcx
206
207 pushq $VGCF_in_syscall
2081: jmp hypercall_iret
209ENDPATCH(xen_sysret64)
210RELOC(xen_sysret64, 1b+1)
211
212ENTRY(xen_sysret32)
213 /* We're already on the usermode stack at this point, but still
214 with the kernel gs, so we can easily switch back */
215 movq %rsp, %gs:pda_oldrsp
216 movq %gs:pda_kernelstack, %rsp
217
218 pushq $__USER32_DS
219 pushq %gs:pda_oldrsp
220 pushq %r11
221 pushq $__USER32_CS
222 pushq %rcx
223
224 pushq $VGCF_in_syscall
2251: jmp hypercall_iret
226ENDPATCH(xen_sysret32)
227RELOC(xen_sysret32, 1b+1)
228
229/*
230 Xen handles syscall callbacks much like ordinary exceptions,
231 which means we have:
232 - kernel gs
233 - kernel rsp
234 - an iret-like stack frame on the stack (including rcx and r11):
235 ss
236 rsp
237 rflags
238 cs
239 rip
240 r11
241 rsp-> rcx
242
243 In all the entrypoints, we undo all that to make it look
244 like a CPU-generated syscall/sysenter and jump to the normal
245 entrypoint.
246 */
247
248.macro undo_xen_syscall
249 mov 0*8(%rsp),%rcx
250 mov 1*8(%rsp),%r11
251 mov 5*8(%rsp),%rsp
252.endm
253
254/* Normal 64-bit system call target */
255ENTRY(xen_syscall_target)
256 undo_xen_syscall
257 jmp system_call_after_swapgs
258ENDPROC(xen_syscall_target)
259
260#ifdef CONFIG_IA32_EMULATION
261
262/* 32-bit compat syscall target */
263ENTRY(xen_syscall32_target)
264 undo_xen_syscall
265 jmp ia32_cstar_target
266ENDPROC(xen_syscall32_target)
267
268/* 32-bit compat sysenter target */
269ENTRY(xen_sysenter_target)
270 undo_xen_syscall
271 jmp ia32_sysenter_target
272ENDPROC(xen_sysenter_target)
273
274#else /* !CONFIG_IA32_EMULATION */
275
276ENTRY(xen_syscall32_target)
277ENTRY(xen_sysenter_target)
278 lea 16(%rsp), %rsp /* strip %rcx,%r11 */
279 mov $-ENOSYS, %rax
280 pushq $VGCF_in_syscall
281 jmp hypercall_iret
282ENDPROC(xen_syscall32_target)
283ENDPROC(xen_sysenter_target)
284
285#endif /* CONFIG_IA32_EMULATION */
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 7c0cf6320a0a..63d49a523ed3 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -5,15 +5,24 @@
 
 #include <linux/elfnote.h>
 #include <linux/init.h>
+
 #include <asm/boot.h>
+#include <asm/asm.h>
+#include <asm/page.h>
+
 #include <xen/interface/elfnote.h>
 #include <asm/xen/interface.h>
 
 	__INIT
 ENTRY(startup_xen)
-	movl %esi,xen_start_info
 	cld
-	movl $(init_thread_union+THREAD_SIZE),%esp
+#ifdef CONFIG_X86_32
+	mov %esi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%esp
+#else
+	mov %rsi,xen_start_info
+	mov $init_thread_union+THREAD_SIZE,%rsp
+#endif
 	jmp xen_start_kernel
 
 	__FINIT
@@ -21,21 +30,26 @@ ENTRY(startup_xen)
 .pushsection .text
 	.align PAGE_SIZE_asm
 ENTRY(hypercall_page)
-	.skip 0x1000
+	.skip PAGE_SIZE_asm
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6")
 	ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0")
-	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET)
-	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen)
-	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page)
+#ifdef CONFIG_X86_32
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET)
+#else
+	ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map)
+#endif
+	ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen)
+	ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page)
 	ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb")
 	ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes")
 	ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic")
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
-	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START)
+	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0)
 
 #endif /*CONFIG_XEN */
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 6f4b1045c1c2..d7422dc2a55c 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -2,6 +2,7 @@
 #define XEN_OPS_H
 
 #include <linux/init.h>
+#include <linux/clocksource.h>
 #include <linux/irqreturn.h>
 #include <xen/xen-ops.h>
 
@@ -26,18 +27,21 @@ char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
 void __init xen_init_IRQ(void);
 void xen_enable_sysenter(void);
+void xen_enable_syscall(void);
 void xen_vcpu_restore(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
 
+void xen_init_irq_ops(void);
 void xen_setup_timer(int cpu);
+void xen_teardown_timer(int cpu);
+cycle_t xen_clocksource_read(void);
 void xen_setup_cpu_clockevents(void);
 unsigned long xen_tsc_khz(void);
 void __init xen_time_init(void);
 unsigned long xen_get_wallclock(void);
 int xen_set_wallclock(unsigned long time);
 unsigned long long xen_sched_clock(void);
-void xen_timer_resume(void);
 
 irqreturn_t xen_debug_interrupt(int irq, void *dev_id);
 
@@ -45,20 +49,19 @@ bool xen_vcpu_stolen(int vcpu);
 
 void xen_mark_init_mm_pinned(void);
 
-void __init xen_fill_possible_map(void);
-
 void __init xen_setup_vcpu_info_placement(void);
-void xen_smp_prepare_boot_cpu(void);
-void xen_smp_prepare_cpus(unsigned int max_cpus);
-int xen_cpu_up(unsigned int cpu);
-void xen_smp_cpus_done(unsigned int max_cpus);
 
-void xen_smp_send_stop(void);
-void xen_smp_send_reschedule(int cpu);
-void xen_smp_send_call_function_ipi(cpumask_t mask);
-void xen_smp_send_call_function_single_ipi(int cpu);
+#ifdef CONFIG_SMP
+void xen_smp_init(void);
+
+void __init xen_init_spinlocks(void);
+__cpuinit void xen_init_lock_cpu(int cpu);
+void xen_uninit_lock_cpu(int cpu);
 
 extern cpumask_t xen_cpu_initialized_map;
+#else
+static inline void xen_smp_init(void) {}
+#endif
 
 
 /* Declare an asm function, along with symbols needed to make it
@@ -73,7 +76,11 @@ DECL_ASM(void, xen_irq_disable_direct, void);
 DECL_ASM(unsigned long, xen_save_fl_direct, void);
 DECL_ASM(void, xen_restore_fl_direct, unsigned long);
 
+/* These are not functions, and cannot be called normally */
 void xen_iret(void);
 void xen_sysexit(void);
+void xen_sysret32(void);
+void xen_sysret64(void);
+void xen_adjust_exception_frame(void);
 
 #endif /* XEN_OPS_H */